diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/frozenlist/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/frozenlist/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ab948122485d6eb309c822acc4b4e8bef791927d Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/frozenlist/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/hf_xet/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/hf_xet/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f7ead3a5cb30dad79188b91ea298d4e7640b5581 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/hf_xet/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/httpcore-1.0.7.dist-info/licenses/LICENSE.md b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/httpcore-1.0.7.dist-info/licenses/LICENSE.md new file mode 100644 index 0000000000000000000000000000000000000000..311b2b56c53f678ab95fc0def708c675d521a807 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/httpcore-1.0.7.dist-info/licenses/LICENSE.md @@ -0,0 +1,27 @@ +Copyright © 2020, [Encode OSS Ltd](https://www.encode.io/). +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + +* Redistributions of source code must retain the above copyright notice, this + list of conditions and the following disclaimer. + +* Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + +* Neither the name of the copyright holder nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/httpcore/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/httpcore/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c87c4ac0863bafc0f2a7fe21f6846ee99f00f1ca Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/httpcore/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/httpcore/__pycache__/_api.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/httpcore/__pycache__/_api.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..6a408d6fada549cf4afa5c41980566f1901003c5 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/httpcore/__pycache__/_api.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/httpcore/__pycache__/_exceptions.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/httpcore/__pycache__/_exceptions.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..8efa5d18e0a6f8d889fbc01f602f5aa8be1f615d Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/httpcore/__pycache__/_exceptions.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/httpcore/__pycache__/_models.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/httpcore/__pycache__/_models.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b9de136c946f8ef6af04155ea50d5233b96c0ef2 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/httpcore/__pycache__/_models.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/httpcore/__pycache__/_ssl.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/httpcore/__pycache__/_ssl.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..7b0b9fa3dc3b07fff97a2f3bc0c6405c995a7fec Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/httpcore/__pycache__/_ssl.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/httpcore/__pycache__/_synchronization.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/httpcore/__pycache__/_synchronization.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..07861b93fccdb4ca89435110c6d0301d0e3cd3f7 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/httpcore/__pycache__/_synchronization.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/httpcore/__pycache__/_trace.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/httpcore/__pycache__/_trace.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..15d50eea0481bbed2127c522efedd6170c8d0c09 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/httpcore/__pycache__/_trace.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/httpcore/__pycache__/_utils.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/httpcore/__pycache__/_utils.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..730ee9f2868eada184eead4e9f9dde3aac37eda9 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/httpcore/__pycache__/_utils.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/httpcore/_async/__init__.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/httpcore/_async/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..88dc7f01e132933728cbcf45c88ce82e85ddf65f --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/httpcore/_async/__init__.py @@ -0,0 +1,39 @@ +from .connection import AsyncHTTPConnection +from .connection_pool import AsyncConnectionPool +from .http11 import AsyncHTTP11Connection +from .http_proxy import AsyncHTTPProxy +from .interfaces import AsyncConnectionInterface + +try: + from .http2 import AsyncHTTP2Connection +except ImportError: # pragma: nocover + + class AsyncHTTP2Connection: # type: ignore + def __init__(self, *args, **kwargs) -> None: # type: ignore + raise RuntimeError( + "Attempted to use http2 support, but the `h2` package is not " + "installed. Use 'pip install httpcore[http2]'." + ) + + +try: + from .socks_proxy import AsyncSOCKSProxy +except ImportError: # pragma: nocover + + class AsyncSOCKSProxy: # type: ignore + def __init__(self, *args, **kwargs) -> None: # type: ignore + raise RuntimeError( + "Attempted to use SOCKS support, but the `socksio` package is not " + "installed. Use 'pip install httpcore[socks]'." + ) + + +__all__ = [ + "AsyncHTTPConnection", + "AsyncConnectionPool", + "AsyncHTTPProxy", + "AsyncHTTP11Connection", + "AsyncHTTP2Connection", + "AsyncConnectionInterface", + "AsyncSOCKSProxy", +] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/httpcore/_async/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/httpcore/_async/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1a02d1f8aabed8144f313a6a0dbf27b3f9741528 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/httpcore/_async/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/httpcore/_async/__pycache__/connection.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/httpcore/_async/__pycache__/connection.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a749e96dca0288365955d7709c553a5d19031c95 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/httpcore/_async/__pycache__/connection.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/httpcore/_async/__pycache__/connection_pool.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/httpcore/_async/__pycache__/connection_pool.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..98ca0600bd735026d0a76ccf5e4c3357e660c80e Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/httpcore/_async/__pycache__/connection_pool.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/httpcore/_async/__pycache__/http11.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/httpcore/_async/__pycache__/http11.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..0bf03d1fb8315290b615528fddae179555d76a60 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/httpcore/_async/__pycache__/http11.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/httpcore/_async/__pycache__/http2.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/httpcore/_async/__pycache__/http2.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a7bd2ff031db657c350fde8e6233ced459ba0c03 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/httpcore/_async/__pycache__/http2.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/httpcore/_async/__pycache__/http_proxy.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/httpcore/_async/__pycache__/http_proxy.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..fd2ade610b26b70be5e9da69b357a93f9ec181f2 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/httpcore/_async/__pycache__/http_proxy.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/httpcore/_async/__pycache__/interfaces.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/httpcore/_async/__pycache__/interfaces.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..4a1008254dbce3ef91db7ca26060a06aee7ba93c Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/httpcore/_async/__pycache__/interfaces.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/httpcore/_async/__pycache__/socks_proxy.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/httpcore/_async/__pycache__/socks_proxy.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1d17fd124aa00ab0d0775cd366188afa4dbd6f9a Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/httpcore/_async/__pycache__/socks_proxy.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/httpcore/_async/connection.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/httpcore/_async/connection.py new file mode 100644 index 0000000000000000000000000000000000000000..b42581dff8aabf4c2ef80ffda26296e1b368d693 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/httpcore/_async/connection.py @@ -0,0 +1,222 @@ +from __future__ import annotations + +import itertools +import logging +import ssl +import types +import typing + +from .._backends.auto import AutoBackend +from .._backends.base import SOCKET_OPTION, AsyncNetworkBackend, AsyncNetworkStream +from .._exceptions import ConnectError, ConnectTimeout +from .._models import Origin, Request, Response +from .._ssl import default_ssl_context +from .._synchronization import AsyncLock +from .._trace import Trace +from .http11 import AsyncHTTP11Connection +from .interfaces import AsyncConnectionInterface + +RETRIES_BACKOFF_FACTOR = 0.5 # 0s, 0.5s, 1s, 2s, 4s, etc. + + +logger = logging.getLogger("httpcore.connection") + + +def exponential_backoff(factor: float) -> typing.Iterator[float]: + """ + Generate a geometric sequence that has a ratio of 2 and starts with 0. + + For example: + - `factor = 2`: `0, 2, 4, 8, 16, 32, 64, ...` + - `factor = 3`: `0, 3, 6, 12, 24, 48, 96, ...` + """ + yield 0 + for n in itertools.count(): + yield factor * 2**n + + +class AsyncHTTPConnection(AsyncConnectionInterface): + def __init__( + self, + origin: Origin, + ssl_context: ssl.SSLContext | None = None, + keepalive_expiry: float | None = None, + http1: bool = True, + http2: bool = False, + retries: int = 0, + local_address: str | None = None, + uds: str | None = None, + network_backend: AsyncNetworkBackend | None = None, + socket_options: typing.Iterable[SOCKET_OPTION] | None = None, + ) -> None: + self._origin = origin + self._ssl_context = ssl_context + self._keepalive_expiry = keepalive_expiry + self._http1 = http1 + self._http2 = http2 + self._retries = retries + self._local_address = local_address + self._uds = uds + + self._network_backend: AsyncNetworkBackend = ( + AutoBackend() if network_backend is None else network_backend + ) + self._connection: AsyncConnectionInterface | None = None + self._connect_failed: bool = False + self._request_lock = AsyncLock() + self._socket_options = socket_options + + async def handle_async_request(self, request: Request) -> Response: + if not self.can_handle_request(request.url.origin): + raise RuntimeError( + f"Attempted to send request to {request.url.origin} on connection to {self._origin}" + ) + + try: + async with self._request_lock: + if self._connection is None: + stream = await self._connect(request) + + ssl_object = stream.get_extra_info("ssl_object") + http2_negotiated = ( + ssl_object is not None + and ssl_object.selected_alpn_protocol() == "h2" + ) + if http2_negotiated or (self._http2 and not self._http1): + from .http2 import AsyncHTTP2Connection + + self._connection = AsyncHTTP2Connection( + origin=self._origin, + stream=stream, + keepalive_expiry=self._keepalive_expiry, + ) + else: + self._connection = AsyncHTTP11Connection( + origin=self._origin, + stream=stream, + keepalive_expiry=self._keepalive_expiry, + ) + except BaseException as exc: + self._connect_failed = True + raise exc + + return await self._connection.handle_async_request(request) + + async def _connect(self, request: Request) -> AsyncNetworkStream: + timeouts = request.extensions.get("timeout", {}) + sni_hostname = request.extensions.get("sni_hostname", None) + timeout = timeouts.get("connect", None) + + retries_left = self._retries + delays = exponential_backoff(factor=RETRIES_BACKOFF_FACTOR) + + while True: + try: + if self._uds is None: + kwargs = { + "host": self._origin.host.decode("ascii"), + "port": self._origin.port, + "local_address": self._local_address, + "timeout": timeout, + "socket_options": self._socket_options, + } + async with Trace("connect_tcp", logger, request, kwargs) as trace: + stream = await self._network_backend.connect_tcp(**kwargs) + trace.return_value = stream + else: + kwargs = { + "path": self._uds, + "timeout": timeout, + "socket_options": self._socket_options, + } + async with Trace( + "connect_unix_socket", logger, request, kwargs + ) as trace: + stream = await self._network_backend.connect_unix_socket( + **kwargs + ) + trace.return_value = stream + + if self._origin.scheme in (b"https", b"wss"): + ssl_context = ( + default_ssl_context() + if self._ssl_context is None + else self._ssl_context + ) + alpn_protocols = ["http/1.1", "h2"] if self._http2 else ["http/1.1"] + ssl_context.set_alpn_protocols(alpn_protocols) + + kwargs = { + "ssl_context": ssl_context, + "server_hostname": sni_hostname + or self._origin.host.decode("ascii"), + "timeout": timeout, + } + async with Trace("start_tls", logger, request, kwargs) as trace: + stream = await stream.start_tls(**kwargs) + trace.return_value = stream + return stream + except (ConnectError, ConnectTimeout): + if retries_left <= 0: + raise + retries_left -= 1 + delay = next(delays) + async with Trace("retry", logger, request, kwargs) as trace: + await self._network_backend.sleep(delay) + + def can_handle_request(self, origin: Origin) -> bool: + return origin == self._origin + + async def aclose(self) -> None: + if self._connection is not None: + async with Trace("close", logger, None, {}): + await self._connection.aclose() + + def is_available(self) -> bool: + if self._connection is None: + # If HTTP/2 support is enabled, and the resulting connection could + # end up as HTTP/2 then we should indicate the connection as being + # available to service multiple requests. + return ( + self._http2 + and (self._origin.scheme == b"https" or not self._http1) + and not self._connect_failed + ) + return self._connection.is_available() + + def has_expired(self) -> bool: + if self._connection is None: + return self._connect_failed + return self._connection.has_expired() + + def is_idle(self) -> bool: + if self._connection is None: + return self._connect_failed + return self._connection.is_idle() + + def is_closed(self) -> bool: + if self._connection is None: + return self._connect_failed + return self._connection.is_closed() + + def info(self) -> str: + if self._connection is None: + return "CONNECTION FAILED" if self._connect_failed else "CONNECTING" + return self._connection.info() + + def __repr__(self) -> str: + return f"<{self.__class__.__name__} [{self.info()}]>" + + # These context managers are not used in the standard flow, but are + # useful for testing or working with connection instances directly. + + async def __aenter__(self) -> AsyncHTTPConnection: + return self + + async def __aexit__( + self, + exc_type: type[BaseException] | None = None, + exc_value: BaseException | None = None, + traceback: types.TracebackType | None = None, + ) -> None: + await self.aclose() diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/httpcore/_async/connection_pool.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/httpcore/_async/connection_pool.py new file mode 100644 index 0000000000000000000000000000000000000000..96e973d0ce223f6bed9be9e6a6a2f3c01622c611 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/httpcore/_async/connection_pool.py @@ -0,0 +1,420 @@ +from __future__ import annotations + +import ssl +import sys +import types +import typing + +from .._backends.auto import AutoBackend +from .._backends.base import SOCKET_OPTION, AsyncNetworkBackend +from .._exceptions import ConnectionNotAvailable, UnsupportedProtocol +from .._models import Origin, Proxy, Request, Response +from .._synchronization import AsyncEvent, AsyncShieldCancellation, AsyncThreadLock +from .connection import AsyncHTTPConnection +from .interfaces import AsyncConnectionInterface, AsyncRequestInterface + + +class AsyncPoolRequest: + def __init__(self, request: Request) -> None: + self.request = request + self.connection: AsyncConnectionInterface | None = None + self._connection_acquired = AsyncEvent() + + def assign_to_connection(self, connection: AsyncConnectionInterface | None) -> None: + self.connection = connection + self._connection_acquired.set() + + def clear_connection(self) -> None: + self.connection = None + self._connection_acquired = AsyncEvent() + + async def wait_for_connection( + self, timeout: float | None = None + ) -> AsyncConnectionInterface: + if self.connection is None: + await self._connection_acquired.wait(timeout=timeout) + assert self.connection is not None + return self.connection + + def is_queued(self) -> bool: + return self.connection is None + + +class AsyncConnectionPool(AsyncRequestInterface): + """ + A connection pool for making HTTP requests. + """ + + def __init__( + self, + ssl_context: ssl.SSLContext | None = None, + proxy: Proxy | None = None, + max_connections: int | None = 10, + max_keepalive_connections: int | None = None, + keepalive_expiry: float | None = None, + http1: bool = True, + http2: bool = False, + retries: int = 0, + local_address: str | None = None, + uds: str | None = None, + network_backend: AsyncNetworkBackend | None = None, + socket_options: typing.Iterable[SOCKET_OPTION] | None = None, + ) -> None: + """ + A connection pool for making HTTP requests. + + Parameters: + ssl_context: An SSL context to use for verifying connections. + If not specified, the default `httpcore.default_ssl_context()` + will be used. + max_connections: The maximum number of concurrent HTTP connections that + the pool should allow. Any attempt to send a request on a pool that + would exceed this amount will block until a connection is available. + max_keepalive_connections: The maximum number of idle HTTP connections + that will be maintained in the pool. + keepalive_expiry: The duration in seconds that an idle HTTP connection + may be maintained for before being expired from the pool. + http1: A boolean indicating if HTTP/1.1 requests should be supported + by the connection pool. Defaults to True. + http2: A boolean indicating if HTTP/2 requests should be supported by + the connection pool. Defaults to False. + retries: The maximum number of retries when trying to establish a + connection. + local_address: Local address to connect from. Can also be used to connect + using a particular address family. Using `local_address="0.0.0.0"` + will connect using an `AF_INET` address (IPv4), while using + `local_address="::"` will connect using an `AF_INET6` address (IPv6). + uds: Path to a Unix Domain Socket to use instead of TCP sockets. + network_backend: A backend instance to use for handling network I/O. + socket_options: Socket options that have to be included + in the TCP socket when the connection was established. + """ + self._ssl_context = ssl_context + self._proxy = proxy + self._max_connections = ( + sys.maxsize if max_connections is None else max_connections + ) + self._max_keepalive_connections = ( + sys.maxsize + if max_keepalive_connections is None + else max_keepalive_connections + ) + self._max_keepalive_connections = min( + self._max_connections, self._max_keepalive_connections + ) + + self._keepalive_expiry = keepalive_expiry + self._http1 = http1 + self._http2 = http2 + self._retries = retries + self._local_address = local_address + self._uds = uds + + self._network_backend = ( + AutoBackend() if network_backend is None else network_backend + ) + self._socket_options = socket_options + + # The mutable state on a connection pool is the queue of incoming requests, + # and the set of connections that are servicing those requests. + self._connections: list[AsyncConnectionInterface] = [] + self._requests: list[AsyncPoolRequest] = [] + + # We only mutate the state of the connection pool within an 'optional_thread_lock' + # context. This holds a threading lock unless we're running in async mode, + # in which case it is a no-op. + self._optional_thread_lock = AsyncThreadLock() + + def create_connection(self, origin: Origin) -> AsyncConnectionInterface: + if self._proxy is not None: + if self._proxy.url.scheme in (b"socks5", b"socks5h"): + from .socks_proxy import AsyncSocks5Connection + + return AsyncSocks5Connection( + proxy_origin=self._proxy.url.origin, + proxy_auth=self._proxy.auth, + remote_origin=origin, + ssl_context=self._ssl_context, + keepalive_expiry=self._keepalive_expiry, + http1=self._http1, + http2=self._http2, + network_backend=self._network_backend, + ) + elif origin.scheme == b"http": + from .http_proxy import AsyncForwardHTTPConnection + + return AsyncForwardHTTPConnection( + proxy_origin=self._proxy.url.origin, + proxy_headers=self._proxy.headers, + proxy_ssl_context=self._proxy.ssl_context, + remote_origin=origin, + keepalive_expiry=self._keepalive_expiry, + network_backend=self._network_backend, + ) + from .http_proxy import AsyncTunnelHTTPConnection + + return AsyncTunnelHTTPConnection( + proxy_origin=self._proxy.url.origin, + proxy_headers=self._proxy.headers, + proxy_ssl_context=self._proxy.ssl_context, + remote_origin=origin, + ssl_context=self._ssl_context, + keepalive_expiry=self._keepalive_expiry, + http1=self._http1, + http2=self._http2, + network_backend=self._network_backend, + ) + + return AsyncHTTPConnection( + origin=origin, + ssl_context=self._ssl_context, + keepalive_expiry=self._keepalive_expiry, + http1=self._http1, + http2=self._http2, + retries=self._retries, + local_address=self._local_address, + uds=self._uds, + network_backend=self._network_backend, + socket_options=self._socket_options, + ) + + @property + def connections(self) -> list[AsyncConnectionInterface]: + """ + Return a list of the connections currently in the pool. + + For example: + + ```python + >>> pool.connections + [ + , + , + , + ] + ``` + """ + return list(self._connections) + + async def handle_async_request(self, request: Request) -> Response: + """ + Send an HTTP request, and return an HTTP response. + + This is the core implementation that is called into by `.request()` or `.stream()`. + """ + scheme = request.url.scheme.decode() + if scheme == "": + raise UnsupportedProtocol( + "Request URL is missing an 'http://' or 'https://' protocol." + ) + if scheme not in ("http", "https", "ws", "wss"): + raise UnsupportedProtocol( + f"Request URL has an unsupported protocol '{scheme}://'." + ) + + timeouts = request.extensions.get("timeout", {}) + timeout = timeouts.get("pool", None) + + with self._optional_thread_lock: + # Add the incoming request to our request queue. + pool_request = AsyncPoolRequest(request) + self._requests.append(pool_request) + + try: + while True: + with self._optional_thread_lock: + # Assign incoming requests to available connections, + # closing or creating new connections as required. + closing = self._assign_requests_to_connections() + await self._close_connections(closing) + + # Wait until this request has an assigned connection. + connection = await pool_request.wait_for_connection(timeout=timeout) + + try: + # Send the request on the assigned connection. + response = await connection.handle_async_request( + pool_request.request + ) + except ConnectionNotAvailable: + # In some cases a connection may initially be available to + # handle a request, but then become unavailable. + # + # In this case we clear the connection and try again. + pool_request.clear_connection() + else: + break # pragma: nocover + + except BaseException as exc: + with self._optional_thread_lock: + # For any exception or cancellation we remove the request from + # the queue, and then re-assign requests to connections. + self._requests.remove(pool_request) + closing = self._assign_requests_to_connections() + + await self._close_connections(closing) + raise exc from None + + # Return the response. Note that in this case we still have to manage + # the point at which the response is closed. + assert isinstance(response.stream, typing.AsyncIterable) + return Response( + status=response.status, + headers=response.headers, + content=PoolByteStream( + stream=response.stream, pool_request=pool_request, pool=self + ), + extensions=response.extensions, + ) + + def _assign_requests_to_connections(self) -> list[AsyncConnectionInterface]: + """ + Manage the state of the connection pool, assigning incoming + requests to connections as available. + + Called whenever a new request is added or removed from the pool. + + Any closing connections are returned, allowing the I/O for closing + those connections to be handled seperately. + """ + closing_connections = [] + + # First we handle cleaning up any connections that are closed, + # have expired their keep-alive, or surplus idle connections. + for connection in list(self._connections): + if connection.is_closed(): + # log: "removing closed connection" + self._connections.remove(connection) + elif connection.has_expired(): + # log: "closing expired connection" + self._connections.remove(connection) + closing_connections.append(connection) + elif ( + connection.is_idle() + and len([connection.is_idle() for connection in self._connections]) + > self._max_keepalive_connections + ): + # log: "closing idle connection" + self._connections.remove(connection) + closing_connections.append(connection) + + # Assign queued requests to connections. + queued_requests = [request for request in self._requests if request.is_queued()] + for pool_request in queued_requests: + origin = pool_request.request.url.origin + available_connections = [ + connection + for connection in self._connections + if connection.can_handle_request(origin) and connection.is_available() + ] + idle_connections = [ + connection for connection in self._connections if connection.is_idle() + ] + + # There are three cases for how we may be able to handle the request: + # + # 1. There is an existing connection that can handle the request. + # 2. We can create a new connection to handle the request. + # 3. We can close an idle connection and then create a new connection + # to handle the request. + if available_connections: + # log: "reusing existing connection" + connection = available_connections[0] + pool_request.assign_to_connection(connection) + elif len(self._connections) < self._max_connections: + # log: "creating new connection" + connection = self.create_connection(origin) + self._connections.append(connection) + pool_request.assign_to_connection(connection) + elif idle_connections: + # log: "closing idle connection" + connection = idle_connections[0] + self._connections.remove(connection) + closing_connections.append(connection) + # log: "creating new connection" + connection = self.create_connection(origin) + self._connections.append(connection) + pool_request.assign_to_connection(connection) + + return closing_connections + + async def _close_connections(self, closing: list[AsyncConnectionInterface]) -> None: + # Close connections which have been removed from the pool. + with AsyncShieldCancellation(): + for connection in closing: + await connection.aclose() + + async def aclose(self) -> None: + # Explicitly close the connection pool. + # Clears all existing requests and connections. + with self._optional_thread_lock: + closing_connections = list(self._connections) + self._connections = [] + await self._close_connections(closing_connections) + + async def __aenter__(self) -> AsyncConnectionPool: + return self + + async def __aexit__( + self, + exc_type: type[BaseException] | None = None, + exc_value: BaseException | None = None, + traceback: types.TracebackType | None = None, + ) -> None: + await self.aclose() + + def __repr__(self) -> str: + class_name = self.__class__.__name__ + with self._optional_thread_lock: + request_is_queued = [request.is_queued() for request in self._requests] + connection_is_idle = [ + connection.is_idle() for connection in self._connections + ] + + num_active_requests = request_is_queued.count(False) + num_queued_requests = request_is_queued.count(True) + num_active_connections = connection_is_idle.count(False) + num_idle_connections = connection_is_idle.count(True) + + requests_info = ( + f"Requests: {num_active_requests} active, {num_queued_requests} queued" + ) + connection_info = ( + f"Connections: {num_active_connections} active, {num_idle_connections} idle" + ) + + return f"<{class_name} [{requests_info} | {connection_info}]>" + + +class PoolByteStream: + def __init__( + self, + stream: typing.AsyncIterable[bytes], + pool_request: AsyncPoolRequest, + pool: AsyncConnectionPool, + ) -> None: + self._stream = stream + self._pool_request = pool_request + self._pool = pool + self._closed = False + + async def __aiter__(self) -> typing.AsyncIterator[bytes]: + try: + async for part in self._stream: + yield part + except BaseException as exc: + await self.aclose() + raise exc from None + + async def aclose(self) -> None: + if not self._closed: + self._closed = True + with AsyncShieldCancellation(): + if hasattr(self._stream, "aclose"): + await self._stream.aclose() + + with self._pool._optional_thread_lock: + self._pool._requests.remove(self._pool_request) + closing = self._pool._assign_requests_to_connections() + + await self._pool._close_connections(closing) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/httpcore/_async/http11.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/httpcore/_async/http11.py new file mode 100644 index 0000000000000000000000000000000000000000..e6d6d709852b137a862cfe2b3af42dc790fa705d --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/httpcore/_async/http11.py @@ -0,0 +1,379 @@ +from __future__ import annotations + +import enum +import logging +import ssl +import time +import types +import typing + +import h11 + +from .._backends.base import AsyncNetworkStream +from .._exceptions import ( + ConnectionNotAvailable, + LocalProtocolError, + RemoteProtocolError, + WriteError, + map_exceptions, +) +from .._models import Origin, Request, Response +from .._synchronization import AsyncLock, AsyncShieldCancellation +from .._trace import Trace +from .interfaces import AsyncConnectionInterface + +logger = logging.getLogger("httpcore.http11") + + +# A subset of `h11.Event` types supported by `_send_event` +H11SendEvent = typing.Union[ + h11.Request, + h11.Data, + h11.EndOfMessage, +] + + +class HTTPConnectionState(enum.IntEnum): + NEW = 0 + ACTIVE = 1 + IDLE = 2 + CLOSED = 3 + + +class AsyncHTTP11Connection(AsyncConnectionInterface): + READ_NUM_BYTES = 64 * 1024 + MAX_INCOMPLETE_EVENT_SIZE = 100 * 1024 + + def __init__( + self, + origin: Origin, + stream: AsyncNetworkStream, + keepalive_expiry: float | None = None, + ) -> None: + self._origin = origin + self._network_stream = stream + self._keepalive_expiry: float | None = keepalive_expiry + self._expire_at: float | None = None + self._state = HTTPConnectionState.NEW + self._state_lock = AsyncLock() + self._request_count = 0 + self._h11_state = h11.Connection( + our_role=h11.CLIENT, + max_incomplete_event_size=self.MAX_INCOMPLETE_EVENT_SIZE, + ) + + async def handle_async_request(self, request: Request) -> Response: + if not self.can_handle_request(request.url.origin): + raise RuntimeError( + f"Attempted to send request to {request.url.origin} on connection " + f"to {self._origin}" + ) + + async with self._state_lock: + if self._state in (HTTPConnectionState.NEW, HTTPConnectionState.IDLE): + self._request_count += 1 + self._state = HTTPConnectionState.ACTIVE + self._expire_at = None + else: + raise ConnectionNotAvailable() + + try: + kwargs = {"request": request} + try: + async with Trace( + "send_request_headers", logger, request, kwargs + ) as trace: + await self._send_request_headers(**kwargs) + async with Trace("send_request_body", logger, request, kwargs) as trace: + await self._send_request_body(**kwargs) + except WriteError: + # If we get a write error while we're writing the request, + # then we supress this error and move on to attempting to + # read the response. Servers can sometimes close the request + # pre-emptively and then respond with a well formed HTTP + # error response. + pass + + async with Trace( + "receive_response_headers", logger, request, kwargs + ) as trace: + ( + http_version, + status, + reason_phrase, + headers, + trailing_data, + ) = await self._receive_response_headers(**kwargs) + trace.return_value = ( + http_version, + status, + reason_phrase, + headers, + ) + + network_stream = self._network_stream + + # CONNECT or Upgrade request + if (status == 101) or ( + (request.method == b"CONNECT") and (200 <= status < 300) + ): + network_stream = AsyncHTTP11UpgradeStream(network_stream, trailing_data) + + return Response( + status=status, + headers=headers, + content=HTTP11ConnectionByteStream(self, request), + extensions={ + "http_version": http_version, + "reason_phrase": reason_phrase, + "network_stream": network_stream, + }, + ) + except BaseException as exc: + with AsyncShieldCancellation(): + async with Trace("response_closed", logger, request) as trace: + await self._response_closed() + raise exc + + # Sending the request... + + async def _send_request_headers(self, request: Request) -> None: + timeouts = request.extensions.get("timeout", {}) + timeout = timeouts.get("write", None) + + with map_exceptions({h11.LocalProtocolError: LocalProtocolError}): + event = h11.Request( + method=request.method, + target=request.url.target, + headers=request.headers, + ) + await self._send_event(event, timeout=timeout) + + async def _send_request_body(self, request: Request) -> None: + timeouts = request.extensions.get("timeout", {}) + timeout = timeouts.get("write", None) + + assert isinstance(request.stream, typing.AsyncIterable) + async for chunk in request.stream: + event = h11.Data(data=chunk) + await self._send_event(event, timeout=timeout) + + await self._send_event(h11.EndOfMessage(), timeout=timeout) + + async def _send_event(self, event: h11.Event, timeout: float | None = None) -> None: + bytes_to_send = self._h11_state.send(event) + if bytes_to_send is not None: + await self._network_stream.write(bytes_to_send, timeout=timeout) + + # Receiving the response... + + async def _receive_response_headers( + self, request: Request + ) -> tuple[bytes, int, bytes, list[tuple[bytes, bytes]], bytes]: + timeouts = request.extensions.get("timeout", {}) + timeout = timeouts.get("read", None) + + while True: + event = await self._receive_event(timeout=timeout) + if isinstance(event, h11.Response): + break + if ( + isinstance(event, h11.InformationalResponse) + and event.status_code == 101 + ): + break + + http_version = b"HTTP/" + event.http_version + + # h11 version 0.11+ supports a `raw_items` interface to get the + # raw header casing, rather than the enforced lowercase headers. + headers = event.headers.raw_items() + + trailing_data, _ = self._h11_state.trailing_data + + return http_version, event.status_code, event.reason, headers, trailing_data + + async def _receive_response_body( + self, request: Request + ) -> typing.AsyncIterator[bytes]: + timeouts = request.extensions.get("timeout", {}) + timeout = timeouts.get("read", None) + + while True: + event = await self._receive_event(timeout=timeout) + if isinstance(event, h11.Data): + yield bytes(event.data) + elif isinstance(event, (h11.EndOfMessage, h11.PAUSED)): + break + + async def _receive_event( + self, timeout: float | None = None + ) -> h11.Event | type[h11.PAUSED]: + while True: + with map_exceptions({h11.RemoteProtocolError: RemoteProtocolError}): + event = self._h11_state.next_event() + + if event is h11.NEED_DATA: + data = await self._network_stream.read( + self.READ_NUM_BYTES, timeout=timeout + ) + + # If we feed this case through h11 we'll raise an exception like: + # + # httpcore.RemoteProtocolError: can't handle event type + # ConnectionClosed when role=SERVER and state=SEND_RESPONSE + # + # Which is accurate, but not very informative from an end-user + # perspective. Instead we handle this case distinctly and treat + # it as a ConnectError. + if data == b"" and self._h11_state.their_state == h11.SEND_RESPONSE: + msg = "Server disconnected without sending a response." + raise RemoteProtocolError(msg) + + self._h11_state.receive_data(data) + else: + # mypy fails to narrow the type in the above if statement above + return event # type: ignore[return-value] + + async def _response_closed(self) -> None: + async with self._state_lock: + if ( + self._h11_state.our_state is h11.DONE + and self._h11_state.their_state is h11.DONE + ): + self._state = HTTPConnectionState.IDLE + self._h11_state.start_next_cycle() + if self._keepalive_expiry is not None: + now = time.monotonic() + self._expire_at = now + self._keepalive_expiry + else: + await self.aclose() + + # Once the connection is no longer required... + + async def aclose(self) -> None: + # Note that this method unilaterally closes the connection, and does + # not have any kind of locking in place around it. + self._state = HTTPConnectionState.CLOSED + await self._network_stream.aclose() + + # The AsyncConnectionInterface methods provide information about the state of + # the connection, allowing for a connection pooling implementation to + # determine when to reuse and when to close the connection... + + def can_handle_request(self, origin: Origin) -> bool: + return origin == self._origin + + def is_available(self) -> bool: + # Note that HTTP/1.1 connections in the "NEW" state are not treated as + # being "available". The control flow which created the connection will + # be able to send an outgoing request, but the connection will not be + # acquired from the connection pool for any other request. + return self._state == HTTPConnectionState.IDLE + + def has_expired(self) -> bool: + now = time.monotonic() + keepalive_expired = self._expire_at is not None and now > self._expire_at + + # If the HTTP connection is idle but the socket is readable, then the + # only valid state is that the socket is about to return b"", indicating + # a server-initiated disconnect. + server_disconnected = ( + self._state == HTTPConnectionState.IDLE + and self._network_stream.get_extra_info("is_readable") + ) + + return keepalive_expired or server_disconnected + + def is_idle(self) -> bool: + return self._state == HTTPConnectionState.IDLE + + def is_closed(self) -> bool: + return self._state == HTTPConnectionState.CLOSED + + def info(self) -> str: + origin = str(self._origin) + return ( + f"{origin!r}, HTTP/1.1, {self._state.name}, " + f"Request Count: {self._request_count}" + ) + + def __repr__(self) -> str: + class_name = self.__class__.__name__ + origin = str(self._origin) + return ( + f"<{class_name} [{origin!r}, {self._state.name}, " + f"Request Count: {self._request_count}]>" + ) + + # These context managers are not used in the standard flow, but are + # useful for testing or working with connection instances directly. + + async def __aenter__(self) -> AsyncHTTP11Connection: + return self + + async def __aexit__( + self, + exc_type: type[BaseException] | None = None, + exc_value: BaseException | None = None, + traceback: types.TracebackType | None = None, + ) -> None: + await self.aclose() + + +class HTTP11ConnectionByteStream: + def __init__(self, connection: AsyncHTTP11Connection, request: Request) -> None: + self._connection = connection + self._request = request + self._closed = False + + async def __aiter__(self) -> typing.AsyncIterator[bytes]: + kwargs = {"request": self._request} + try: + async with Trace("receive_response_body", logger, self._request, kwargs): + async for chunk in self._connection._receive_response_body(**kwargs): + yield chunk + except BaseException as exc: + # If we get an exception while streaming the response, + # we want to close the response (and possibly the connection) + # before raising that exception. + with AsyncShieldCancellation(): + await self.aclose() + raise exc + + async def aclose(self) -> None: + if not self._closed: + self._closed = True + async with Trace("response_closed", logger, self._request): + await self._connection._response_closed() + + +class AsyncHTTP11UpgradeStream(AsyncNetworkStream): + def __init__(self, stream: AsyncNetworkStream, leading_data: bytes) -> None: + self._stream = stream + self._leading_data = leading_data + + async def read(self, max_bytes: int, timeout: float | None = None) -> bytes: + if self._leading_data: + buffer = self._leading_data[:max_bytes] + self._leading_data = self._leading_data[max_bytes:] + return buffer + else: + return await self._stream.read(max_bytes, timeout) + + async def write(self, buffer: bytes, timeout: float | None = None) -> None: + await self._stream.write(buffer, timeout) + + async def aclose(self) -> None: + await self._stream.aclose() + + async def start_tls( + self, + ssl_context: ssl.SSLContext, + server_hostname: str | None = None, + timeout: float | None = None, + ) -> AsyncNetworkStream: + return await self._stream.start_tls(ssl_context, server_hostname, timeout) + + def get_extra_info(self, info: str) -> typing.Any: + return self._stream.get_extra_info(info) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/httpcore/_async/http2.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/httpcore/_async/http2.py new file mode 100644 index 0000000000000000000000000000000000000000..c6434a049696dd43b8c0dac7fcbca2e4cb0394b1 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/httpcore/_async/http2.py @@ -0,0 +1,583 @@ +from __future__ import annotations + +import enum +import logging +import time +import types +import typing + +import h2.config +import h2.connection +import h2.events +import h2.exceptions +import h2.settings + +from .._backends.base import AsyncNetworkStream +from .._exceptions import ( + ConnectionNotAvailable, + LocalProtocolError, + RemoteProtocolError, +) +from .._models import Origin, Request, Response +from .._synchronization import AsyncLock, AsyncSemaphore, AsyncShieldCancellation +from .._trace import Trace +from .interfaces import AsyncConnectionInterface + +logger = logging.getLogger("httpcore.http2") + + +def has_body_headers(request: Request) -> bool: + return any( + k.lower() == b"content-length" or k.lower() == b"transfer-encoding" + for k, v in request.headers + ) + + +class HTTPConnectionState(enum.IntEnum): + ACTIVE = 1 + IDLE = 2 + CLOSED = 3 + + +class AsyncHTTP2Connection(AsyncConnectionInterface): + READ_NUM_BYTES = 64 * 1024 + CONFIG = h2.config.H2Configuration(validate_inbound_headers=False) + + def __init__( + self, + origin: Origin, + stream: AsyncNetworkStream, + keepalive_expiry: float | None = None, + ): + self._origin = origin + self._network_stream = stream + self._keepalive_expiry: float | None = keepalive_expiry + self._h2_state = h2.connection.H2Connection(config=self.CONFIG) + self._state = HTTPConnectionState.IDLE + self._expire_at: float | None = None + self._request_count = 0 + self._init_lock = AsyncLock() + self._state_lock = AsyncLock() + self._read_lock = AsyncLock() + self._write_lock = AsyncLock() + self._sent_connection_init = False + self._used_all_stream_ids = False + self._connection_error = False + + # Mapping from stream ID to response stream events. + self._events: dict[ + int, + h2.events.ResponseReceived + | h2.events.DataReceived + | h2.events.StreamEnded + | h2.events.StreamReset, + ] = {} + + # Connection terminated events are stored as state since + # we need to handle them for all streams. + self._connection_terminated: h2.events.ConnectionTerminated | None = None + + self._read_exception: Exception | None = None + self._write_exception: Exception | None = None + + async def handle_async_request(self, request: Request) -> Response: + if not self.can_handle_request(request.url.origin): + # This cannot occur in normal operation, since the connection pool + # will only send requests on connections that handle them. + # It's in place simply for resilience as a guard against incorrect + # usage, for anyone working directly with httpcore connections. + raise RuntimeError( + f"Attempted to send request to {request.url.origin} on connection " + f"to {self._origin}" + ) + + async with self._state_lock: + if self._state in (HTTPConnectionState.ACTIVE, HTTPConnectionState.IDLE): + self._request_count += 1 + self._expire_at = None + self._state = HTTPConnectionState.ACTIVE + else: + raise ConnectionNotAvailable() + + async with self._init_lock: + if not self._sent_connection_init: + try: + kwargs = {"request": request} + async with Trace("send_connection_init", logger, request, kwargs): + await self._send_connection_init(**kwargs) + except BaseException as exc: + with AsyncShieldCancellation(): + await self.aclose() + raise exc + + self._sent_connection_init = True + + # Initially start with just 1 until the remote server provides + # its max_concurrent_streams value + self._max_streams = 1 + + local_settings_max_streams = ( + self._h2_state.local_settings.max_concurrent_streams + ) + self._max_streams_semaphore = AsyncSemaphore(local_settings_max_streams) + + for _ in range(local_settings_max_streams - self._max_streams): + await self._max_streams_semaphore.acquire() + + await self._max_streams_semaphore.acquire() + + try: + stream_id = self._h2_state.get_next_available_stream_id() + self._events[stream_id] = [] + except h2.exceptions.NoAvailableStreamIDError: # pragma: nocover + self._used_all_stream_ids = True + self._request_count -= 1 + raise ConnectionNotAvailable() + + try: + kwargs = {"request": request, "stream_id": stream_id} + async with Trace("send_request_headers", logger, request, kwargs): + await self._send_request_headers(request=request, stream_id=stream_id) + async with Trace("send_request_body", logger, request, kwargs): + await self._send_request_body(request=request, stream_id=stream_id) + async with Trace( + "receive_response_headers", logger, request, kwargs + ) as trace: + status, headers = await self._receive_response( + request=request, stream_id=stream_id + ) + trace.return_value = (status, headers) + + return Response( + status=status, + headers=headers, + content=HTTP2ConnectionByteStream(self, request, stream_id=stream_id), + extensions={ + "http_version": b"HTTP/2", + "network_stream": self._network_stream, + "stream_id": stream_id, + }, + ) + except BaseException as exc: # noqa: PIE786 + with AsyncShieldCancellation(): + kwargs = {"stream_id": stream_id} + async with Trace("response_closed", logger, request, kwargs): + await self._response_closed(stream_id=stream_id) + + if isinstance(exc, h2.exceptions.ProtocolError): + # One case where h2 can raise a protocol error is when a + # closed frame has been seen by the state machine. + # + # This happens when one stream is reading, and encounters + # a GOAWAY event. Other flows of control may then raise + # a protocol error at any point they interact with the 'h2_state'. + # + # In this case we'll have stored the event, and should raise + # it as a RemoteProtocolError. + if self._connection_terminated: # pragma: nocover + raise RemoteProtocolError(self._connection_terminated) + # If h2 raises a protocol error in some other state then we + # must somehow have made a protocol violation. + raise LocalProtocolError(exc) # pragma: nocover + + raise exc + + async def _send_connection_init(self, request: Request) -> None: + """ + The HTTP/2 connection requires some initial setup before we can start + using individual request/response streams on it. + """ + # Need to set these manually here instead of manipulating via + # __setitem__() otherwise the H2Connection will emit SettingsUpdate + # frames in addition to sending the undesired defaults. + self._h2_state.local_settings = h2.settings.Settings( + client=True, + initial_values={ + # Disable PUSH_PROMISE frames from the server since we don't do anything + # with them for now. Maybe when we support caching? + h2.settings.SettingCodes.ENABLE_PUSH: 0, + # These two are taken from h2 for safe defaults + h2.settings.SettingCodes.MAX_CONCURRENT_STREAMS: 100, + h2.settings.SettingCodes.MAX_HEADER_LIST_SIZE: 65536, + }, + ) + + # Some websites (*cough* Yahoo *cough*) balk at this setting being + # present in the initial handshake since it's not defined in the original + # RFC despite the RFC mandating ignoring settings you don't know about. + del self._h2_state.local_settings[ + h2.settings.SettingCodes.ENABLE_CONNECT_PROTOCOL + ] + + self._h2_state.initiate_connection() + self._h2_state.increment_flow_control_window(2**24) + await self._write_outgoing_data(request) + + # Sending the request... + + async def _send_request_headers(self, request: Request, stream_id: int) -> None: + """ + Send the request headers to a given stream ID. + """ + end_stream = not has_body_headers(request) + + # In HTTP/2 the ':authority' pseudo-header is used instead of 'Host'. + # In order to gracefully handle HTTP/1.1 and HTTP/2 we always require + # HTTP/1.1 style headers, and map them appropriately if we end up on + # an HTTP/2 connection. + authority = [v for k, v in request.headers if k.lower() == b"host"][0] + + headers = [ + (b":method", request.method), + (b":authority", authority), + (b":scheme", request.url.scheme), + (b":path", request.url.target), + ] + [ + (k.lower(), v) + for k, v in request.headers + if k.lower() + not in ( + b"host", + b"transfer-encoding", + ) + ] + + self._h2_state.send_headers(stream_id, headers, end_stream=end_stream) + self._h2_state.increment_flow_control_window(2**24, stream_id=stream_id) + await self._write_outgoing_data(request) + + async def _send_request_body(self, request: Request, stream_id: int) -> None: + """ + Iterate over the request body sending it to a given stream ID. + """ + if not has_body_headers(request): + return + + assert isinstance(request.stream, typing.AsyncIterable) + async for data in request.stream: + await self._send_stream_data(request, stream_id, data) + await self._send_end_stream(request, stream_id) + + async def _send_stream_data( + self, request: Request, stream_id: int, data: bytes + ) -> None: + """ + Send a single chunk of data in one or more data frames. + """ + while data: + max_flow = await self._wait_for_outgoing_flow(request, stream_id) + chunk_size = min(len(data), max_flow) + chunk, data = data[:chunk_size], data[chunk_size:] + self._h2_state.send_data(stream_id, chunk) + await self._write_outgoing_data(request) + + async def _send_end_stream(self, request: Request, stream_id: int) -> None: + """ + Send an empty data frame on on a given stream ID with the END_STREAM flag set. + """ + self._h2_state.end_stream(stream_id) + await self._write_outgoing_data(request) + + # Receiving the response... + + async def _receive_response( + self, request: Request, stream_id: int + ) -> tuple[int, list[tuple[bytes, bytes]]]: + """ + Return the response status code and headers for a given stream ID. + """ + while True: + event = await self._receive_stream_event(request, stream_id) + if isinstance(event, h2.events.ResponseReceived): + break + + status_code = 200 + headers = [] + for k, v in event.headers: + if k == b":status": + status_code = int(v.decode("ascii", errors="ignore")) + elif not k.startswith(b":"): + headers.append((k, v)) + + return (status_code, headers) + + async def _receive_response_body( + self, request: Request, stream_id: int + ) -> typing.AsyncIterator[bytes]: + """ + Iterator that returns the bytes of the response body for a given stream ID. + """ + while True: + event = await self._receive_stream_event(request, stream_id) + if isinstance(event, h2.events.DataReceived): + amount = event.flow_controlled_length + self._h2_state.acknowledge_received_data(amount, stream_id) + await self._write_outgoing_data(request) + yield event.data + elif isinstance(event, h2.events.StreamEnded): + break + + async def _receive_stream_event( + self, request: Request, stream_id: int + ) -> h2.events.ResponseReceived | h2.events.DataReceived | h2.events.StreamEnded: + """ + Return the next available event for a given stream ID. + + Will read more data from the network if required. + """ + while not self._events.get(stream_id): + await self._receive_events(request, stream_id) + event = self._events[stream_id].pop(0) + if isinstance(event, h2.events.StreamReset): + raise RemoteProtocolError(event) + return event + + async def _receive_events( + self, request: Request, stream_id: int | None = None + ) -> None: + """ + Read some data from the network until we see one or more events + for a given stream ID. + """ + async with self._read_lock: + if self._connection_terminated is not None: + last_stream_id = self._connection_terminated.last_stream_id + if stream_id and last_stream_id and stream_id > last_stream_id: + self._request_count -= 1 + raise ConnectionNotAvailable() + raise RemoteProtocolError(self._connection_terminated) + + # This conditional is a bit icky. We don't want to block reading if we've + # actually got an event to return for a given stream. We need to do that + # check *within* the atomic read lock. Though it also need to be optional, + # because when we call it from `_wait_for_outgoing_flow` we *do* want to + # block until we've available flow control, event when we have events + # pending for the stream ID we're attempting to send on. + if stream_id is None or not self._events.get(stream_id): + events = await self._read_incoming_data(request) + for event in events: + if isinstance(event, h2.events.RemoteSettingsChanged): + async with Trace( + "receive_remote_settings", logger, request + ) as trace: + await self._receive_remote_settings_change(event) + trace.return_value = event + + elif isinstance( + event, + ( + h2.events.ResponseReceived, + h2.events.DataReceived, + h2.events.StreamEnded, + h2.events.StreamReset, + ), + ): + if event.stream_id in self._events: + self._events[event.stream_id].append(event) + + elif isinstance(event, h2.events.ConnectionTerminated): + self._connection_terminated = event + + await self._write_outgoing_data(request) + + async def _receive_remote_settings_change(self, event: h2.events.Event) -> None: + max_concurrent_streams = event.changed_settings.get( + h2.settings.SettingCodes.MAX_CONCURRENT_STREAMS + ) + if max_concurrent_streams: + new_max_streams = min( + max_concurrent_streams.new_value, + self._h2_state.local_settings.max_concurrent_streams, + ) + if new_max_streams and new_max_streams != self._max_streams: + while new_max_streams > self._max_streams: + await self._max_streams_semaphore.release() + self._max_streams += 1 + while new_max_streams < self._max_streams: + await self._max_streams_semaphore.acquire() + self._max_streams -= 1 + + async def _response_closed(self, stream_id: int) -> None: + await self._max_streams_semaphore.release() + del self._events[stream_id] + async with self._state_lock: + if self._connection_terminated and not self._events: + await self.aclose() + + elif self._state == HTTPConnectionState.ACTIVE and not self._events: + self._state = HTTPConnectionState.IDLE + if self._keepalive_expiry is not None: + now = time.monotonic() + self._expire_at = now + self._keepalive_expiry + if self._used_all_stream_ids: # pragma: nocover + await self.aclose() + + async def aclose(self) -> None: + # Note that this method unilaterally closes the connection, and does + # not have any kind of locking in place around it. + self._h2_state.close_connection() + self._state = HTTPConnectionState.CLOSED + await self._network_stream.aclose() + + # Wrappers around network read/write operations... + + async def _read_incoming_data(self, request: Request) -> list[h2.events.Event]: + timeouts = request.extensions.get("timeout", {}) + timeout = timeouts.get("read", None) + + if self._read_exception is not None: + raise self._read_exception # pragma: nocover + + try: + data = await self._network_stream.read(self.READ_NUM_BYTES, timeout) + if data == b"": + raise RemoteProtocolError("Server disconnected") + except Exception as exc: + # If we get a network error we should: + # + # 1. Save the exception and just raise it immediately on any future reads. + # (For example, this means that a single read timeout or disconnect will + # immediately close all pending streams. Without requiring multiple + # sequential timeouts.) + # 2. Mark the connection as errored, so that we don't accept any other + # incoming requests. + self._read_exception = exc + self._connection_error = True + raise exc + + events: list[h2.events.Event] = self._h2_state.receive_data(data) + + return events + + async def _write_outgoing_data(self, request: Request) -> None: + timeouts = request.extensions.get("timeout", {}) + timeout = timeouts.get("write", None) + + async with self._write_lock: + data_to_send = self._h2_state.data_to_send() + + if self._write_exception is not None: + raise self._write_exception # pragma: nocover + + try: + await self._network_stream.write(data_to_send, timeout) + except Exception as exc: # pragma: nocover + # If we get a network error we should: + # + # 1. Save the exception and just raise it immediately on any future write. + # (For example, this means that a single write timeout or disconnect will + # immediately close all pending streams. Without requiring multiple + # sequential timeouts.) + # 2. Mark the connection as errored, so that we don't accept any other + # incoming requests. + self._write_exception = exc + self._connection_error = True + raise exc + + # Flow control... + + async def _wait_for_outgoing_flow(self, request: Request, stream_id: int) -> int: + """ + Returns the maximum allowable outgoing flow for a given stream. + + If the allowable flow is zero, then waits on the network until + WindowUpdated frames have increased the flow rate. + https://tools.ietf.org/html/rfc7540#section-6.9 + """ + local_flow: int = self._h2_state.local_flow_control_window(stream_id) + max_frame_size: int = self._h2_state.max_outbound_frame_size + flow = min(local_flow, max_frame_size) + while flow == 0: + await self._receive_events(request) + local_flow = self._h2_state.local_flow_control_window(stream_id) + max_frame_size = self._h2_state.max_outbound_frame_size + flow = min(local_flow, max_frame_size) + return flow + + # Interface for connection pooling... + + def can_handle_request(self, origin: Origin) -> bool: + return origin == self._origin + + def is_available(self) -> bool: + return ( + self._state != HTTPConnectionState.CLOSED + and not self._connection_error + and not self._used_all_stream_ids + and not ( + self._h2_state.state_machine.state + == h2.connection.ConnectionState.CLOSED + ) + ) + + def has_expired(self) -> bool: + now = time.monotonic() + return self._expire_at is not None and now > self._expire_at + + def is_idle(self) -> bool: + return self._state == HTTPConnectionState.IDLE + + def is_closed(self) -> bool: + return self._state == HTTPConnectionState.CLOSED + + def info(self) -> str: + origin = str(self._origin) + return ( + f"{origin!r}, HTTP/2, {self._state.name}, " + f"Request Count: {self._request_count}" + ) + + def __repr__(self) -> str: + class_name = self.__class__.__name__ + origin = str(self._origin) + return ( + f"<{class_name} [{origin!r}, {self._state.name}, " + f"Request Count: {self._request_count}]>" + ) + + # These context managers are not used in the standard flow, but are + # useful for testing or working with connection instances directly. + + async def __aenter__(self) -> AsyncHTTP2Connection: + return self + + async def __aexit__( + self, + exc_type: type[BaseException] | None = None, + exc_value: BaseException | None = None, + traceback: types.TracebackType | None = None, + ) -> None: + await self.aclose() + + +class HTTP2ConnectionByteStream: + def __init__( + self, connection: AsyncHTTP2Connection, request: Request, stream_id: int + ) -> None: + self._connection = connection + self._request = request + self._stream_id = stream_id + self._closed = False + + async def __aiter__(self) -> typing.AsyncIterator[bytes]: + kwargs = {"request": self._request, "stream_id": self._stream_id} + try: + async with Trace("receive_response_body", logger, self._request, kwargs): + async for chunk in self._connection._receive_response_body( + request=self._request, stream_id=self._stream_id + ): + yield chunk + except BaseException as exc: + # If we get an exception while streaming the response, + # we want to close the response (and possibly the connection) + # before raising that exception. + with AsyncShieldCancellation(): + await self.aclose() + raise exc + + async def aclose(self) -> None: + if not self._closed: + self._closed = True + kwargs = {"stream_id": self._stream_id} + async with Trace("response_closed", logger, self._request, kwargs): + await self._connection._response_closed(stream_id=self._stream_id) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/httpcore/_async/http_proxy.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/httpcore/_async/http_proxy.py new file mode 100644 index 0000000000000000000000000000000000000000..cc9d92066e1680576846e46ccdf645a2b1dd5718 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/httpcore/_async/http_proxy.py @@ -0,0 +1,367 @@ +from __future__ import annotations + +import base64 +import logging +import ssl +import typing + +from .._backends.base import SOCKET_OPTION, AsyncNetworkBackend +from .._exceptions import ProxyError +from .._models import ( + URL, + Origin, + Request, + Response, + enforce_bytes, + enforce_headers, + enforce_url, +) +from .._ssl import default_ssl_context +from .._synchronization import AsyncLock +from .._trace import Trace +from .connection import AsyncHTTPConnection +from .connection_pool import AsyncConnectionPool +from .http11 import AsyncHTTP11Connection +from .interfaces import AsyncConnectionInterface + +ByteOrStr = typing.Union[bytes, str] +HeadersAsSequence = typing.Sequence[typing.Tuple[ByteOrStr, ByteOrStr]] +HeadersAsMapping = typing.Mapping[ByteOrStr, ByteOrStr] + + +logger = logging.getLogger("httpcore.proxy") + + +def merge_headers( + default_headers: typing.Sequence[tuple[bytes, bytes]] | None = None, + override_headers: typing.Sequence[tuple[bytes, bytes]] | None = None, +) -> list[tuple[bytes, bytes]]: + """ + Append default_headers and override_headers, de-duplicating if a key exists + in both cases. + """ + default_headers = [] if default_headers is None else list(default_headers) + override_headers = [] if override_headers is None else list(override_headers) + has_override = set(key.lower() for key, value in override_headers) + default_headers = [ + (key, value) + for key, value in default_headers + if key.lower() not in has_override + ] + return default_headers + override_headers + + +class AsyncHTTPProxy(AsyncConnectionPool): # pragma: nocover + """ + A connection pool that sends requests via an HTTP proxy. + """ + + def __init__( + self, + proxy_url: URL | bytes | str, + proxy_auth: tuple[bytes | str, bytes | str] | None = None, + proxy_headers: HeadersAsMapping | HeadersAsSequence | None = None, + ssl_context: ssl.SSLContext | None = None, + proxy_ssl_context: ssl.SSLContext | None = None, + max_connections: int | None = 10, + max_keepalive_connections: int | None = None, + keepalive_expiry: float | None = None, + http1: bool = True, + http2: bool = False, + retries: int = 0, + local_address: str | None = None, + uds: str | None = None, + network_backend: AsyncNetworkBackend | None = None, + socket_options: typing.Iterable[SOCKET_OPTION] | None = None, + ) -> None: + """ + A connection pool for making HTTP requests. + + Parameters: + proxy_url: The URL to use when connecting to the proxy server. + For example `"http://127.0.0.1:8080/"`. + proxy_auth: Any proxy authentication as a two-tuple of + (username, password). May be either bytes or ascii-only str. + proxy_headers: Any HTTP headers to use for the proxy requests. + For example `{"Proxy-Authorization": "Basic :"}`. + ssl_context: An SSL context to use for verifying connections. + If not specified, the default `httpcore.default_ssl_context()` + will be used. + proxy_ssl_context: The same as `ssl_context`, but for a proxy server rather than a remote origin. + max_connections: The maximum number of concurrent HTTP connections that + the pool should allow. Any attempt to send a request on a pool that + would exceed this amount will block until a connection is available. + max_keepalive_connections: The maximum number of idle HTTP connections + that will be maintained in the pool. + keepalive_expiry: The duration in seconds that an idle HTTP connection + may be maintained for before being expired from the pool. + http1: A boolean indicating if HTTP/1.1 requests should be supported + by the connection pool. Defaults to True. + http2: A boolean indicating if HTTP/2 requests should be supported by + the connection pool. Defaults to False. + retries: The maximum number of retries when trying to establish + a connection. + local_address: Local address to connect from. Can also be used to + connect using a particular address family. Using + `local_address="0.0.0.0"` will connect using an `AF_INET` address + (IPv4), while using `local_address="::"` will connect using an + `AF_INET6` address (IPv6). + uds: Path to a Unix Domain Socket to use instead of TCP sockets. + network_backend: A backend instance to use for handling network I/O. + """ + super().__init__( + ssl_context=ssl_context, + max_connections=max_connections, + max_keepalive_connections=max_keepalive_connections, + keepalive_expiry=keepalive_expiry, + http1=http1, + http2=http2, + network_backend=network_backend, + retries=retries, + local_address=local_address, + uds=uds, + socket_options=socket_options, + ) + + self._proxy_url = enforce_url(proxy_url, name="proxy_url") + if ( + self._proxy_url.scheme == b"http" and proxy_ssl_context is not None + ): # pragma: no cover + raise RuntimeError( + "The `proxy_ssl_context` argument is not allowed for the http scheme" + ) + + self._ssl_context = ssl_context + self._proxy_ssl_context = proxy_ssl_context + self._proxy_headers = enforce_headers(proxy_headers, name="proxy_headers") + if proxy_auth is not None: + username = enforce_bytes(proxy_auth[0], name="proxy_auth") + password = enforce_bytes(proxy_auth[1], name="proxy_auth") + userpass = username + b":" + password + authorization = b"Basic " + base64.b64encode(userpass) + self._proxy_headers = [ + (b"Proxy-Authorization", authorization) + ] + self._proxy_headers + + def create_connection(self, origin: Origin) -> AsyncConnectionInterface: + if origin.scheme == b"http": + return AsyncForwardHTTPConnection( + proxy_origin=self._proxy_url.origin, + proxy_headers=self._proxy_headers, + remote_origin=origin, + keepalive_expiry=self._keepalive_expiry, + network_backend=self._network_backend, + proxy_ssl_context=self._proxy_ssl_context, + ) + return AsyncTunnelHTTPConnection( + proxy_origin=self._proxy_url.origin, + proxy_headers=self._proxy_headers, + remote_origin=origin, + ssl_context=self._ssl_context, + proxy_ssl_context=self._proxy_ssl_context, + keepalive_expiry=self._keepalive_expiry, + http1=self._http1, + http2=self._http2, + network_backend=self._network_backend, + ) + + +class AsyncForwardHTTPConnection(AsyncConnectionInterface): + def __init__( + self, + proxy_origin: Origin, + remote_origin: Origin, + proxy_headers: HeadersAsMapping | HeadersAsSequence | None = None, + keepalive_expiry: float | None = None, + network_backend: AsyncNetworkBackend | None = None, + socket_options: typing.Iterable[SOCKET_OPTION] | None = None, + proxy_ssl_context: ssl.SSLContext | None = None, + ) -> None: + self._connection = AsyncHTTPConnection( + origin=proxy_origin, + keepalive_expiry=keepalive_expiry, + network_backend=network_backend, + socket_options=socket_options, + ssl_context=proxy_ssl_context, + ) + self._proxy_origin = proxy_origin + self._proxy_headers = enforce_headers(proxy_headers, name="proxy_headers") + self._remote_origin = remote_origin + + async def handle_async_request(self, request: Request) -> Response: + headers = merge_headers(self._proxy_headers, request.headers) + url = URL( + scheme=self._proxy_origin.scheme, + host=self._proxy_origin.host, + port=self._proxy_origin.port, + target=bytes(request.url), + ) + proxy_request = Request( + method=request.method, + url=url, + headers=headers, + content=request.stream, + extensions=request.extensions, + ) + return await self._connection.handle_async_request(proxy_request) + + def can_handle_request(self, origin: Origin) -> bool: + return origin == self._remote_origin + + async def aclose(self) -> None: + await self._connection.aclose() + + def info(self) -> str: + return self._connection.info() + + def is_available(self) -> bool: + return self._connection.is_available() + + def has_expired(self) -> bool: + return self._connection.has_expired() + + def is_idle(self) -> bool: + return self._connection.is_idle() + + def is_closed(self) -> bool: + return self._connection.is_closed() + + def __repr__(self) -> str: + return f"<{self.__class__.__name__} [{self.info()}]>" + + +class AsyncTunnelHTTPConnection(AsyncConnectionInterface): + def __init__( + self, + proxy_origin: Origin, + remote_origin: Origin, + ssl_context: ssl.SSLContext | None = None, + proxy_ssl_context: ssl.SSLContext | None = None, + proxy_headers: typing.Sequence[tuple[bytes, bytes]] | None = None, + keepalive_expiry: float | None = None, + http1: bool = True, + http2: bool = False, + network_backend: AsyncNetworkBackend | None = None, + socket_options: typing.Iterable[SOCKET_OPTION] | None = None, + ) -> None: + self._connection: AsyncConnectionInterface = AsyncHTTPConnection( + origin=proxy_origin, + keepalive_expiry=keepalive_expiry, + network_backend=network_backend, + socket_options=socket_options, + ssl_context=proxy_ssl_context, + ) + self._proxy_origin = proxy_origin + self._remote_origin = remote_origin + self._ssl_context = ssl_context + self._proxy_ssl_context = proxy_ssl_context + self._proxy_headers = enforce_headers(proxy_headers, name="proxy_headers") + self._keepalive_expiry = keepalive_expiry + self._http1 = http1 + self._http2 = http2 + self._connect_lock = AsyncLock() + self._connected = False + + async def handle_async_request(self, request: Request) -> Response: + timeouts = request.extensions.get("timeout", {}) + timeout = timeouts.get("connect", None) + + async with self._connect_lock: + if not self._connected: + target = b"%b:%d" % (self._remote_origin.host, self._remote_origin.port) + + connect_url = URL( + scheme=self._proxy_origin.scheme, + host=self._proxy_origin.host, + port=self._proxy_origin.port, + target=target, + ) + connect_headers = merge_headers( + [(b"Host", target), (b"Accept", b"*/*")], self._proxy_headers + ) + connect_request = Request( + method=b"CONNECT", + url=connect_url, + headers=connect_headers, + extensions=request.extensions, + ) + connect_response = await self._connection.handle_async_request( + connect_request + ) + + if connect_response.status < 200 or connect_response.status > 299: + reason_bytes = connect_response.extensions.get("reason_phrase", b"") + reason_str = reason_bytes.decode("ascii", errors="ignore") + msg = "%d %s" % (connect_response.status, reason_str) + await self._connection.aclose() + raise ProxyError(msg) + + stream = connect_response.extensions["network_stream"] + + # Upgrade the stream to SSL + ssl_context = ( + default_ssl_context() + if self._ssl_context is None + else self._ssl_context + ) + alpn_protocols = ["http/1.1", "h2"] if self._http2 else ["http/1.1"] + ssl_context.set_alpn_protocols(alpn_protocols) + + kwargs = { + "ssl_context": ssl_context, + "server_hostname": self._remote_origin.host.decode("ascii"), + "timeout": timeout, + } + async with Trace("start_tls", logger, request, kwargs) as trace: + stream = await stream.start_tls(**kwargs) + trace.return_value = stream + + # Determine if we should be using HTTP/1.1 or HTTP/2 + ssl_object = stream.get_extra_info("ssl_object") + http2_negotiated = ( + ssl_object is not None + and ssl_object.selected_alpn_protocol() == "h2" + ) + + # Create the HTTP/1.1 or HTTP/2 connection + if http2_negotiated or (self._http2 and not self._http1): + from .http2 import AsyncHTTP2Connection + + self._connection = AsyncHTTP2Connection( + origin=self._remote_origin, + stream=stream, + keepalive_expiry=self._keepalive_expiry, + ) + else: + self._connection = AsyncHTTP11Connection( + origin=self._remote_origin, + stream=stream, + keepalive_expiry=self._keepalive_expiry, + ) + + self._connected = True + return await self._connection.handle_async_request(request) + + def can_handle_request(self, origin: Origin) -> bool: + return origin == self._remote_origin + + async def aclose(self) -> None: + await self._connection.aclose() + + def info(self) -> str: + return self._connection.info() + + def is_available(self) -> bool: + return self._connection.is_available() + + def has_expired(self) -> bool: + return self._connection.has_expired() + + def is_idle(self) -> bool: + return self._connection.is_idle() + + def is_closed(self) -> bool: + return self._connection.is_closed() + + def __repr__(self) -> str: + return f"<{self.__class__.__name__} [{self.info()}]>" diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/httpcore/_async/interfaces.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/httpcore/_async/interfaces.py new file mode 100644 index 0000000000000000000000000000000000000000..361583bede6b2b84088b38054d5d8116ef9f1597 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/httpcore/_async/interfaces.py @@ -0,0 +1,137 @@ +from __future__ import annotations + +import contextlib +import typing + +from .._models import ( + URL, + Extensions, + HeaderTypes, + Origin, + Request, + Response, + enforce_bytes, + enforce_headers, + enforce_url, + include_request_headers, +) + + +class AsyncRequestInterface: + async def request( + self, + method: bytes | str, + url: URL | bytes | str, + *, + headers: HeaderTypes = None, + content: bytes | typing.AsyncIterator[bytes] | None = None, + extensions: Extensions | None = None, + ) -> Response: + # Strict type checking on our parameters. + method = enforce_bytes(method, name="method") + url = enforce_url(url, name="url") + headers = enforce_headers(headers, name="headers") + + # Include Host header, and optionally Content-Length or Transfer-Encoding. + headers = include_request_headers(headers, url=url, content=content) + + request = Request( + method=method, + url=url, + headers=headers, + content=content, + extensions=extensions, + ) + response = await self.handle_async_request(request) + try: + await response.aread() + finally: + await response.aclose() + return response + + @contextlib.asynccontextmanager + async def stream( + self, + method: bytes | str, + url: URL | bytes | str, + *, + headers: HeaderTypes = None, + content: bytes | typing.AsyncIterator[bytes] | None = None, + extensions: Extensions | None = None, + ) -> typing.AsyncIterator[Response]: + # Strict type checking on our parameters. + method = enforce_bytes(method, name="method") + url = enforce_url(url, name="url") + headers = enforce_headers(headers, name="headers") + + # Include Host header, and optionally Content-Length or Transfer-Encoding. + headers = include_request_headers(headers, url=url, content=content) + + request = Request( + method=method, + url=url, + headers=headers, + content=content, + extensions=extensions, + ) + response = await self.handle_async_request(request) + try: + yield response + finally: + await response.aclose() + + async def handle_async_request(self, request: Request) -> Response: + raise NotImplementedError() # pragma: nocover + + +class AsyncConnectionInterface(AsyncRequestInterface): + async def aclose(self) -> None: + raise NotImplementedError() # pragma: nocover + + def info(self) -> str: + raise NotImplementedError() # pragma: nocover + + def can_handle_request(self, origin: Origin) -> bool: + raise NotImplementedError() # pragma: nocover + + def is_available(self) -> bool: + """ + Return `True` if the connection is currently able to accept an + outgoing request. + + An HTTP/1.1 connection will only be available if it is currently idle. + + An HTTP/2 connection will be available so long as the stream ID space is + not yet exhausted, and the connection is not in an error state. + + While the connection is being established we may not yet know if it is going + to result in an HTTP/1.1 or HTTP/2 connection. The connection should be + treated as being available, but might ultimately raise `NewConnectionRequired` + required exceptions if multiple requests are attempted over a connection + that ends up being established as HTTP/1.1. + """ + raise NotImplementedError() # pragma: nocover + + def has_expired(self) -> bool: + """ + Return `True` if the connection is in a state where it should be closed. + + This either means that the connection is idle and it has passed the + expiry time on its keep-alive, or that server has sent an EOF. + """ + raise NotImplementedError() # pragma: nocover + + def is_idle(self) -> bool: + """ + Return `True` if the connection is currently idle. + """ + raise NotImplementedError() # pragma: nocover + + def is_closed(self) -> bool: + """ + Return `True` if the connection has been closed. + + Used when a response is closed to determine if the connection may be + returned to the connection pool or not. + """ + raise NotImplementedError() # pragma: nocover diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/httpcore/_async/socks_proxy.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/httpcore/_async/socks_proxy.py new file mode 100644 index 0000000000000000000000000000000000000000..b363f55a0b071de6c5f377726be82dc2110e373c --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/httpcore/_async/socks_proxy.py @@ -0,0 +1,341 @@ +from __future__ import annotations + +import logging +import ssl + +import socksio + +from .._backends.auto import AutoBackend +from .._backends.base import AsyncNetworkBackend, AsyncNetworkStream +from .._exceptions import ConnectionNotAvailable, ProxyError +from .._models import URL, Origin, Request, Response, enforce_bytes, enforce_url +from .._ssl import default_ssl_context +from .._synchronization import AsyncLock +from .._trace import Trace +from .connection_pool import AsyncConnectionPool +from .http11 import AsyncHTTP11Connection +from .interfaces import AsyncConnectionInterface + +logger = logging.getLogger("httpcore.socks") + + +AUTH_METHODS = { + b"\x00": "NO AUTHENTICATION REQUIRED", + b"\x01": "GSSAPI", + b"\x02": "USERNAME/PASSWORD", + b"\xff": "NO ACCEPTABLE METHODS", +} + +REPLY_CODES = { + b"\x00": "Succeeded", + b"\x01": "General SOCKS server failure", + b"\x02": "Connection not allowed by ruleset", + b"\x03": "Network unreachable", + b"\x04": "Host unreachable", + b"\x05": "Connection refused", + b"\x06": "TTL expired", + b"\x07": "Command not supported", + b"\x08": "Address type not supported", +} + + +async def _init_socks5_connection( + stream: AsyncNetworkStream, + *, + host: bytes, + port: int, + auth: tuple[bytes, bytes] | None = None, +) -> None: + conn = socksio.socks5.SOCKS5Connection() + + # Auth method request + auth_method = ( + socksio.socks5.SOCKS5AuthMethod.NO_AUTH_REQUIRED + if auth is None + else socksio.socks5.SOCKS5AuthMethod.USERNAME_PASSWORD + ) + conn.send(socksio.socks5.SOCKS5AuthMethodsRequest([auth_method])) + outgoing_bytes = conn.data_to_send() + await stream.write(outgoing_bytes) + + # Auth method response + incoming_bytes = await stream.read(max_bytes=4096) + response = conn.receive_data(incoming_bytes) + assert isinstance(response, socksio.socks5.SOCKS5AuthReply) + if response.method != auth_method: + requested = AUTH_METHODS.get(auth_method, "UNKNOWN") + responded = AUTH_METHODS.get(response.method, "UNKNOWN") + raise ProxyError( + f"Requested {requested} from proxy server, but got {responded}." + ) + + if response.method == socksio.socks5.SOCKS5AuthMethod.USERNAME_PASSWORD: + # Username/password request + assert auth is not None + username, password = auth + conn.send(socksio.socks5.SOCKS5UsernamePasswordRequest(username, password)) + outgoing_bytes = conn.data_to_send() + await stream.write(outgoing_bytes) + + # Username/password response + incoming_bytes = await stream.read(max_bytes=4096) + response = conn.receive_data(incoming_bytes) + assert isinstance(response, socksio.socks5.SOCKS5UsernamePasswordReply) + if not response.success: + raise ProxyError("Invalid username/password") + + # Connect request + conn.send( + socksio.socks5.SOCKS5CommandRequest.from_address( + socksio.socks5.SOCKS5Command.CONNECT, (host, port) + ) + ) + outgoing_bytes = conn.data_to_send() + await stream.write(outgoing_bytes) + + # Connect response + incoming_bytes = await stream.read(max_bytes=4096) + response = conn.receive_data(incoming_bytes) + assert isinstance(response, socksio.socks5.SOCKS5Reply) + if response.reply_code != socksio.socks5.SOCKS5ReplyCode.SUCCEEDED: + reply_code = REPLY_CODES.get(response.reply_code, "UNKOWN") + raise ProxyError(f"Proxy Server could not connect: {reply_code}.") + + +class AsyncSOCKSProxy(AsyncConnectionPool): # pragma: nocover + """ + A connection pool that sends requests via an HTTP proxy. + """ + + def __init__( + self, + proxy_url: URL | bytes | str, + proxy_auth: tuple[bytes | str, bytes | str] | None = None, + ssl_context: ssl.SSLContext | None = None, + max_connections: int | None = 10, + max_keepalive_connections: int | None = None, + keepalive_expiry: float | None = None, + http1: bool = True, + http2: bool = False, + retries: int = 0, + network_backend: AsyncNetworkBackend | None = None, + ) -> None: + """ + A connection pool for making HTTP requests. + + Parameters: + proxy_url: The URL to use when connecting to the proxy server. + For example `"http://127.0.0.1:8080/"`. + ssl_context: An SSL context to use for verifying connections. + If not specified, the default `httpcore.default_ssl_context()` + will be used. + max_connections: The maximum number of concurrent HTTP connections that + the pool should allow. Any attempt to send a request on a pool that + would exceed this amount will block until a connection is available. + max_keepalive_connections: The maximum number of idle HTTP connections + that will be maintained in the pool. + keepalive_expiry: The duration in seconds that an idle HTTP connection + may be maintained for before being expired from the pool. + http1: A boolean indicating if HTTP/1.1 requests should be supported + by the connection pool. Defaults to True. + http2: A boolean indicating if HTTP/2 requests should be supported by + the connection pool. Defaults to False. + retries: The maximum number of retries when trying to establish + a connection. + local_address: Local address to connect from. Can also be used to + connect using a particular address family. Using + `local_address="0.0.0.0"` will connect using an `AF_INET` address + (IPv4), while using `local_address="::"` will connect using an + `AF_INET6` address (IPv6). + uds: Path to a Unix Domain Socket to use instead of TCP sockets. + network_backend: A backend instance to use for handling network I/O. + """ + super().__init__( + ssl_context=ssl_context, + max_connections=max_connections, + max_keepalive_connections=max_keepalive_connections, + keepalive_expiry=keepalive_expiry, + http1=http1, + http2=http2, + network_backend=network_backend, + retries=retries, + ) + self._ssl_context = ssl_context + self._proxy_url = enforce_url(proxy_url, name="proxy_url") + if proxy_auth is not None: + username, password = proxy_auth + username_bytes = enforce_bytes(username, name="proxy_auth") + password_bytes = enforce_bytes(password, name="proxy_auth") + self._proxy_auth: tuple[bytes, bytes] | None = ( + username_bytes, + password_bytes, + ) + else: + self._proxy_auth = None + + def create_connection(self, origin: Origin) -> AsyncConnectionInterface: + return AsyncSocks5Connection( + proxy_origin=self._proxy_url.origin, + remote_origin=origin, + proxy_auth=self._proxy_auth, + ssl_context=self._ssl_context, + keepalive_expiry=self._keepalive_expiry, + http1=self._http1, + http2=self._http2, + network_backend=self._network_backend, + ) + + +class AsyncSocks5Connection(AsyncConnectionInterface): + def __init__( + self, + proxy_origin: Origin, + remote_origin: Origin, + proxy_auth: tuple[bytes, bytes] | None = None, + ssl_context: ssl.SSLContext | None = None, + keepalive_expiry: float | None = None, + http1: bool = True, + http2: bool = False, + network_backend: AsyncNetworkBackend | None = None, + ) -> None: + self._proxy_origin = proxy_origin + self._remote_origin = remote_origin + self._proxy_auth = proxy_auth + self._ssl_context = ssl_context + self._keepalive_expiry = keepalive_expiry + self._http1 = http1 + self._http2 = http2 + + self._network_backend: AsyncNetworkBackend = ( + AutoBackend() if network_backend is None else network_backend + ) + self._connect_lock = AsyncLock() + self._connection: AsyncConnectionInterface | None = None + self._connect_failed = False + + async def handle_async_request(self, request: Request) -> Response: + timeouts = request.extensions.get("timeout", {}) + sni_hostname = request.extensions.get("sni_hostname", None) + timeout = timeouts.get("connect", None) + + async with self._connect_lock: + if self._connection is None: + try: + # Connect to the proxy + kwargs = { + "host": self._proxy_origin.host.decode("ascii"), + "port": self._proxy_origin.port, + "timeout": timeout, + } + async with Trace("connect_tcp", logger, request, kwargs) as trace: + stream = await self._network_backend.connect_tcp(**kwargs) + trace.return_value = stream + + # Connect to the remote host using socks5 + kwargs = { + "stream": stream, + "host": self._remote_origin.host.decode("ascii"), + "port": self._remote_origin.port, + "auth": self._proxy_auth, + } + async with Trace( + "setup_socks5_connection", logger, request, kwargs + ) as trace: + await _init_socks5_connection(**kwargs) + trace.return_value = stream + + # Upgrade the stream to SSL + if self._remote_origin.scheme == b"https": + ssl_context = ( + default_ssl_context() + if self._ssl_context is None + else self._ssl_context + ) + alpn_protocols = ( + ["http/1.1", "h2"] if self._http2 else ["http/1.1"] + ) + ssl_context.set_alpn_protocols(alpn_protocols) + + kwargs = { + "ssl_context": ssl_context, + "server_hostname": sni_hostname + or self._remote_origin.host.decode("ascii"), + "timeout": timeout, + } + async with Trace("start_tls", logger, request, kwargs) as trace: + stream = await stream.start_tls(**kwargs) + trace.return_value = stream + + # Determine if we should be using HTTP/1.1 or HTTP/2 + ssl_object = stream.get_extra_info("ssl_object") + http2_negotiated = ( + ssl_object is not None + and ssl_object.selected_alpn_protocol() == "h2" + ) + + # Create the HTTP/1.1 or HTTP/2 connection + if http2_negotiated or ( + self._http2 and not self._http1 + ): # pragma: nocover + from .http2 import AsyncHTTP2Connection + + self._connection = AsyncHTTP2Connection( + origin=self._remote_origin, + stream=stream, + keepalive_expiry=self._keepalive_expiry, + ) + else: + self._connection = AsyncHTTP11Connection( + origin=self._remote_origin, + stream=stream, + keepalive_expiry=self._keepalive_expiry, + ) + except Exception as exc: + self._connect_failed = True + raise exc + elif not self._connection.is_available(): # pragma: nocover + raise ConnectionNotAvailable() + + return await self._connection.handle_async_request(request) + + def can_handle_request(self, origin: Origin) -> bool: + return origin == self._remote_origin + + async def aclose(self) -> None: + if self._connection is not None: + await self._connection.aclose() + + def is_available(self) -> bool: + if self._connection is None: # pragma: nocover + # If HTTP/2 support is enabled, and the resulting connection could + # end up as HTTP/2 then we should indicate the connection as being + # available to service multiple requests. + return ( + self._http2 + and (self._remote_origin.scheme == b"https" or not self._http1) + and not self._connect_failed + ) + return self._connection.is_available() + + def has_expired(self) -> bool: + if self._connection is None: # pragma: nocover + return self._connect_failed + return self._connection.has_expired() + + def is_idle(self) -> bool: + if self._connection is None: # pragma: nocover + return self._connect_failed + return self._connection.is_idle() + + def is_closed(self) -> bool: + if self._connection is None: # pragma: nocover + return self._connect_failed + return self._connection.is_closed() + + def info(self) -> str: + if self._connection is None: # pragma: nocover + return "CONNECTION FAILED" if self._connect_failed else "CONNECTING" + return self._connection.info() + + def __repr__(self) -> str: + return f"<{self.__class__.__name__} [{self.info()}]>" diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/httpcore/_backends/__init__.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/httpcore/_backends/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/httpcore/_backends/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/httpcore/_backends/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..97424d8e6aac55e6e3642ebeb63a8331161c7266 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/httpcore/_backends/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/httpcore/_backends/__pycache__/anyio.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/httpcore/_backends/__pycache__/anyio.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d6b445b1f5294d0bb30235e7b85363d1328c6d4e Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/httpcore/_backends/__pycache__/anyio.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/httpcore/_backends/__pycache__/auto.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/httpcore/_backends/__pycache__/auto.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..5f618f04ee6ad7319258ce0ba2eeda950c745a23 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/httpcore/_backends/__pycache__/auto.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/httpcore/_backends/__pycache__/base.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/httpcore/_backends/__pycache__/base.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f397c5938053ec01395c129210c13ddeba0ab43d Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/httpcore/_backends/__pycache__/base.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/httpcore/_backends/__pycache__/mock.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/httpcore/_backends/__pycache__/mock.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..919cfb2ad5411cdfa950c0d6bfcec716f26d4c33 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/httpcore/_backends/__pycache__/mock.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/httpcore/_backends/__pycache__/sync.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/httpcore/_backends/__pycache__/sync.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..214d433233a786533d4ec5e1261b9751049b5a7d Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/httpcore/_backends/__pycache__/sync.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/httpcore/_backends/__pycache__/trio.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/httpcore/_backends/__pycache__/trio.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..dc628b2b1a3b20fbe7df1297a2fc5a7e5bfb912d Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/httpcore/_backends/__pycache__/trio.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/httpcore/_backends/anyio.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/httpcore/_backends/anyio.py new file mode 100644 index 0000000000000000000000000000000000000000..a140095e1b8de022f321a41c0125e0e5febc0749 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/httpcore/_backends/anyio.py @@ -0,0 +1,146 @@ +from __future__ import annotations + +import ssl +import typing + +import anyio + +from .._exceptions import ( + ConnectError, + ConnectTimeout, + ReadError, + ReadTimeout, + WriteError, + WriteTimeout, + map_exceptions, +) +from .._utils import is_socket_readable +from .base import SOCKET_OPTION, AsyncNetworkBackend, AsyncNetworkStream + + +class AnyIOStream(AsyncNetworkStream): + def __init__(self, stream: anyio.abc.ByteStream) -> None: + self._stream = stream + + async def read(self, max_bytes: int, timeout: float | None = None) -> bytes: + exc_map = { + TimeoutError: ReadTimeout, + anyio.BrokenResourceError: ReadError, + anyio.ClosedResourceError: ReadError, + anyio.EndOfStream: ReadError, + } + with map_exceptions(exc_map): + with anyio.fail_after(timeout): + try: + return await self._stream.receive(max_bytes=max_bytes) + except anyio.EndOfStream: # pragma: nocover + return b"" + + async def write(self, buffer: bytes, timeout: float | None = None) -> None: + if not buffer: + return + + exc_map = { + TimeoutError: WriteTimeout, + anyio.BrokenResourceError: WriteError, + anyio.ClosedResourceError: WriteError, + } + with map_exceptions(exc_map): + with anyio.fail_after(timeout): + await self._stream.send(item=buffer) + + async def aclose(self) -> None: + await self._stream.aclose() + + async def start_tls( + self, + ssl_context: ssl.SSLContext, + server_hostname: str | None = None, + timeout: float | None = None, + ) -> AsyncNetworkStream: + exc_map = { + TimeoutError: ConnectTimeout, + anyio.BrokenResourceError: ConnectError, + anyio.EndOfStream: ConnectError, + ssl.SSLError: ConnectError, + } + with map_exceptions(exc_map): + try: + with anyio.fail_after(timeout): + ssl_stream = await anyio.streams.tls.TLSStream.wrap( + self._stream, + ssl_context=ssl_context, + hostname=server_hostname, + standard_compatible=False, + server_side=False, + ) + except Exception as exc: # pragma: nocover + await self.aclose() + raise exc + return AnyIOStream(ssl_stream) + + def get_extra_info(self, info: str) -> typing.Any: + if info == "ssl_object": + return self._stream.extra(anyio.streams.tls.TLSAttribute.ssl_object, None) + if info == "client_addr": + return self._stream.extra(anyio.abc.SocketAttribute.local_address, None) + if info == "server_addr": + return self._stream.extra(anyio.abc.SocketAttribute.remote_address, None) + if info == "socket": + return self._stream.extra(anyio.abc.SocketAttribute.raw_socket, None) + if info == "is_readable": + sock = self._stream.extra(anyio.abc.SocketAttribute.raw_socket, None) + return is_socket_readable(sock) + return None + + +class AnyIOBackend(AsyncNetworkBackend): + async def connect_tcp( + self, + host: str, + port: int, + timeout: float | None = None, + local_address: str | None = None, + socket_options: typing.Iterable[SOCKET_OPTION] | None = None, + ) -> AsyncNetworkStream: # pragma: nocover + if socket_options is None: + socket_options = [] + exc_map = { + TimeoutError: ConnectTimeout, + OSError: ConnectError, + anyio.BrokenResourceError: ConnectError, + } + with map_exceptions(exc_map): + with anyio.fail_after(timeout): + stream: anyio.abc.ByteStream = await anyio.connect_tcp( + remote_host=host, + remote_port=port, + local_host=local_address, + ) + # By default TCP sockets opened in `asyncio` include TCP_NODELAY. + for option in socket_options: + stream._raw_socket.setsockopt(*option) # type: ignore[attr-defined] # pragma: no cover + return AnyIOStream(stream) + + async def connect_unix_socket( + self, + path: str, + timeout: float | None = None, + socket_options: typing.Iterable[SOCKET_OPTION] | None = None, + ) -> AsyncNetworkStream: # pragma: nocover + if socket_options is None: + socket_options = [] + exc_map = { + TimeoutError: ConnectTimeout, + OSError: ConnectError, + anyio.BrokenResourceError: ConnectError, + } + with map_exceptions(exc_map): + with anyio.fail_after(timeout): + stream: anyio.abc.ByteStream = await anyio.connect_unix(path) + for option in socket_options: + stream._raw_socket.setsockopt(*option) # type: ignore[attr-defined] # pragma: no cover + return AnyIOStream(stream) + + async def sleep(self, seconds: float) -> None: + await anyio.sleep(seconds) # pragma: nocover diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/httpcore/_backends/auto.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/httpcore/_backends/auto.py new file mode 100644 index 0000000000000000000000000000000000000000..49f0e698c97ad5623f376d8182675352e21c2c3c --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/httpcore/_backends/auto.py @@ -0,0 +1,52 @@ +from __future__ import annotations + +import typing + +from .._synchronization import current_async_library +from .base import SOCKET_OPTION, AsyncNetworkBackend, AsyncNetworkStream + + +class AutoBackend(AsyncNetworkBackend): + async def _init_backend(self) -> None: + if not (hasattr(self, "_backend")): + backend = current_async_library() + if backend == "trio": + from .trio import TrioBackend + + self._backend: AsyncNetworkBackend = TrioBackend() + else: + from .anyio import AnyIOBackend + + self._backend = AnyIOBackend() + + async def connect_tcp( + self, + host: str, + port: int, + timeout: float | None = None, + local_address: str | None = None, + socket_options: typing.Iterable[SOCKET_OPTION] | None = None, + ) -> AsyncNetworkStream: + await self._init_backend() + return await self._backend.connect_tcp( + host, + port, + timeout=timeout, + local_address=local_address, + socket_options=socket_options, + ) + + async def connect_unix_socket( + self, + path: str, + timeout: float | None = None, + socket_options: typing.Iterable[SOCKET_OPTION] | None = None, + ) -> AsyncNetworkStream: # pragma: nocover + await self._init_backend() + return await self._backend.connect_unix_socket( + path, timeout=timeout, socket_options=socket_options + ) + + async def sleep(self, seconds: float) -> None: # pragma: nocover + await self._init_backend() + return await self._backend.sleep(seconds) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/httpcore/_backends/base.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/httpcore/_backends/base.py new file mode 100644 index 0000000000000000000000000000000000000000..cf55c8b10eb543872550be863206fe2f760d0d8d --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/httpcore/_backends/base.py @@ -0,0 +1,101 @@ +from __future__ import annotations + +import ssl +import time +import typing + +SOCKET_OPTION = typing.Union[ + typing.Tuple[int, int, int], + typing.Tuple[int, int, typing.Union[bytes, bytearray]], + typing.Tuple[int, int, None, int], +] + + +class NetworkStream: + def read(self, max_bytes: int, timeout: float | None = None) -> bytes: + raise NotImplementedError() # pragma: nocover + + def write(self, buffer: bytes, timeout: float | None = None) -> None: + raise NotImplementedError() # pragma: nocover + + def close(self) -> None: + raise NotImplementedError() # pragma: nocover + + def start_tls( + self, + ssl_context: ssl.SSLContext, + server_hostname: str | None = None, + timeout: float | None = None, + ) -> NetworkStream: + raise NotImplementedError() # pragma: nocover + + def get_extra_info(self, info: str) -> typing.Any: + return None # pragma: nocover + + +class NetworkBackend: + def connect_tcp( + self, + host: str, + port: int, + timeout: float | None = None, + local_address: str | None = None, + socket_options: typing.Iterable[SOCKET_OPTION] | None = None, + ) -> NetworkStream: + raise NotImplementedError() # pragma: nocover + + def connect_unix_socket( + self, + path: str, + timeout: float | None = None, + socket_options: typing.Iterable[SOCKET_OPTION] | None = None, + ) -> NetworkStream: + raise NotImplementedError() # pragma: nocover + + def sleep(self, seconds: float) -> None: + time.sleep(seconds) # pragma: nocover + + +class AsyncNetworkStream: + async def read(self, max_bytes: int, timeout: float | None = None) -> bytes: + raise NotImplementedError() # pragma: nocover + + async def write(self, buffer: bytes, timeout: float | None = None) -> None: + raise NotImplementedError() # pragma: nocover + + async def aclose(self) -> None: + raise NotImplementedError() # pragma: nocover + + async def start_tls( + self, + ssl_context: ssl.SSLContext, + server_hostname: str | None = None, + timeout: float | None = None, + ) -> AsyncNetworkStream: + raise NotImplementedError() # pragma: nocover + + def get_extra_info(self, info: str) -> typing.Any: + return None # pragma: nocover + + +class AsyncNetworkBackend: + async def connect_tcp( + self, + host: str, + port: int, + timeout: float | None = None, + local_address: str | None = None, + socket_options: typing.Iterable[SOCKET_OPTION] | None = None, + ) -> AsyncNetworkStream: + raise NotImplementedError() # pragma: nocover + + async def connect_unix_socket( + self, + path: str, + timeout: float | None = None, + socket_options: typing.Iterable[SOCKET_OPTION] | None = None, + ) -> AsyncNetworkStream: + raise NotImplementedError() # pragma: nocover + + async def sleep(self, seconds: float) -> None: + raise NotImplementedError() # pragma: nocover diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/httpcore/_backends/mock.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/httpcore/_backends/mock.py new file mode 100644 index 0000000000000000000000000000000000000000..9b6edca03d4d4b34f355fd53e49d4b4c699c972c --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/httpcore/_backends/mock.py @@ -0,0 +1,143 @@ +from __future__ import annotations + +import ssl +import typing + +from .._exceptions import ReadError +from .base import ( + SOCKET_OPTION, + AsyncNetworkBackend, + AsyncNetworkStream, + NetworkBackend, + NetworkStream, +) + + +class MockSSLObject: + def __init__(self, http2: bool): + self._http2 = http2 + + def selected_alpn_protocol(self) -> str: + return "h2" if self._http2 else "http/1.1" + + +class MockStream(NetworkStream): + def __init__(self, buffer: list[bytes], http2: bool = False) -> None: + self._buffer = buffer + self._http2 = http2 + self._closed = False + + def read(self, max_bytes: int, timeout: float | None = None) -> bytes: + if self._closed: + raise ReadError("Connection closed") + if not self._buffer: + return b"" + return self._buffer.pop(0) + + def write(self, buffer: bytes, timeout: float | None = None) -> None: + pass + + def close(self) -> None: + self._closed = True + + def start_tls( + self, + ssl_context: ssl.SSLContext, + server_hostname: str | None = None, + timeout: float | None = None, + ) -> NetworkStream: + return self + + def get_extra_info(self, info: str) -> typing.Any: + return MockSSLObject(http2=self._http2) if info == "ssl_object" else None + + def __repr__(self) -> str: + return "" + + +class MockBackend(NetworkBackend): + def __init__(self, buffer: list[bytes], http2: bool = False) -> None: + self._buffer = buffer + self._http2 = http2 + + def connect_tcp( + self, + host: str, + port: int, + timeout: float | None = None, + local_address: str | None = None, + socket_options: typing.Iterable[SOCKET_OPTION] | None = None, + ) -> NetworkStream: + return MockStream(list(self._buffer), http2=self._http2) + + def connect_unix_socket( + self, + path: str, + timeout: float | None = None, + socket_options: typing.Iterable[SOCKET_OPTION] | None = None, + ) -> NetworkStream: + return MockStream(list(self._buffer), http2=self._http2) + + def sleep(self, seconds: float) -> None: + pass + + +class AsyncMockStream(AsyncNetworkStream): + def __init__(self, buffer: list[bytes], http2: bool = False) -> None: + self._buffer = buffer + self._http2 = http2 + self._closed = False + + async def read(self, max_bytes: int, timeout: float | None = None) -> bytes: + if self._closed: + raise ReadError("Connection closed") + if not self._buffer: + return b"" + return self._buffer.pop(0) + + async def write(self, buffer: bytes, timeout: float | None = None) -> None: + pass + + async def aclose(self) -> None: + self._closed = True + + async def start_tls( + self, + ssl_context: ssl.SSLContext, + server_hostname: str | None = None, + timeout: float | None = None, + ) -> AsyncNetworkStream: + return self + + def get_extra_info(self, info: str) -> typing.Any: + return MockSSLObject(http2=self._http2) if info == "ssl_object" else None + + def __repr__(self) -> str: + return "" + + +class AsyncMockBackend(AsyncNetworkBackend): + def __init__(self, buffer: list[bytes], http2: bool = False) -> None: + self._buffer = buffer + self._http2 = http2 + + async def connect_tcp( + self, + host: str, + port: int, + timeout: float | None = None, + local_address: str | None = None, + socket_options: typing.Iterable[SOCKET_OPTION] | None = None, + ) -> AsyncNetworkStream: + return AsyncMockStream(list(self._buffer), http2=self._http2) + + async def connect_unix_socket( + self, + path: str, + timeout: float | None = None, + socket_options: typing.Iterable[SOCKET_OPTION] | None = None, + ) -> AsyncNetworkStream: + return AsyncMockStream(list(self._buffer), http2=self._http2) + + async def sleep(self, seconds: float) -> None: + pass diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/httpcore/_backends/sync.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/httpcore/_backends/sync.py new file mode 100644 index 0000000000000000000000000000000000000000..4018a09c6fb1e0ef1b03ab8d84b13ebef4031f7c --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/httpcore/_backends/sync.py @@ -0,0 +1,241 @@ +from __future__ import annotations + +import functools +import socket +import ssl +import sys +import typing + +from .._exceptions import ( + ConnectError, + ConnectTimeout, + ExceptionMapping, + ReadError, + ReadTimeout, + WriteError, + WriteTimeout, + map_exceptions, +) +from .._utils import is_socket_readable +from .base import SOCKET_OPTION, NetworkBackend, NetworkStream + + +class TLSinTLSStream(NetworkStream): # pragma: no cover + """ + Because the standard `SSLContext.wrap_socket` method does + not work for `SSLSocket` objects, we need this class + to implement TLS stream using an underlying `SSLObject` + instance in order to support TLS on top of TLS. + """ + + # Defined in RFC 8449 + TLS_RECORD_SIZE = 16384 + + def __init__( + self, + sock: socket.socket, + ssl_context: ssl.SSLContext, + server_hostname: str | None = None, + timeout: float | None = None, + ): + self._sock = sock + self._incoming = ssl.MemoryBIO() + self._outgoing = ssl.MemoryBIO() + + self.ssl_obj = ssl_context.wrap_bio( + incoming=self._incoming, + outgoing=self._outgoing, + server_hostname=server_hostname, + ) + + self._sock.settimeout(timeout) + self._perform_io(self.ssl_obj.do_handshake) + + def _perform_io( + self, + func: typing.Callable[..., typing.Any], + ) -> typing.Any: + ret = None + + while True: + errno = None + try: + ret = func() + except (ssl.SSLWantReadError, ssl.SSLWantWriteError) as e: + errno = e.errno + + self._sock.sendall(self._outgoing.read()) + + if errno == ssl.SSL_ERROR_WANT_READ: + buf = self._sock.recv(self.TLS_RECORD_SIZE) + + if buf: + self._incoming.write(buf) + else: + self._incoming.write_eof() + if errno is None: + return ret + + def read(self, max_bytes: int, timeout: float | None = None) -> bytes: + exc_map: ExceptionMapping = {socket.timeout: ReadTimeout, OSError: ReadError} + with map_exceptions(exc_map): + self._sock.settimeout(timeout) + return typing.cast( + bytes, self._perform_io(functools.partial(self.ssl_obj.read, max_bytes)) + ) + + def write(self, buffer: bytes, timeout: float | None = None) -> None: + exc_map: ExceptionMapping = {socket.timeout: WriteTimeout, OSError: WriteError} + with map_exceptions(exc_map): + self._sock.settimeout(timeout) + while buffer: + nsent = self._perform_io(functools.partial(self.ssl_obj.write, buffer)) + buffer = buffer[nsent:] + + def close(self) -> None: + self._sock.close() + + def start_tls( + self, + ssl_context: ssl.SSLContext, + server_hostname: str | None = None, + timeout: float | None = None, + ) -> NetworkStream: + raise NotImplementedError() + + def get_extra_info(self, info: str) -> typing.Any: + if info == "ssl_object": + return self.ssl_obj + if info == "client_addr": + return self._sock.getsockname() + if info == "server_addr": + return self._sock.getpeername() + if info == "socket": + return self._sock + if info == "is_readable": + return is_socket_readable(self._sock) + return None + + +class SyncStream(NetworkStream): + def __init__(self, sock: socket.socket) -> None: + self._sock = sock + + def read(self, max_bytes: int, timeout: float | None = None) -> bytes: + exc_map: ExceptionMapping = {socket.timeout: ReadTimeout, OSError: ReadError} + with map_exceptions(exc_map): + self._sock.settimeout(timeout) + return self._sock.recv(max_bytes) + + def write(self, buffer: bytes, timeout: float | None = None) -> None: + if not buffer: + return + + exc_map: ExceptionMapping = {socket.timeout: WriteTimeout, OSError: WriteError} + with map_exceptions(exc_map): + while buffer: + self._sock.settimeout(timeout) + n = self._sock.send(buffer) + buffer = buffer[n:] + + def close(self) -> None: + self._sock.close() + + def start_tls( + self, + ssl_context: ssl.SSLContext, + server_hostname: str | None = None, + timeout: float | None = None, + ) -> NetworkStream: + exc_map: ExceptionMapping = { + socket.timeout: ConnectTimeout, + OSError: ConnectError, + } + with map_exceptions(exc_map): + try: + if isinstance(self._sock, ssl.SSLSocket): # pragma: no cover + # If the underlying socket has already been upgraded + # to the TLS layer (i.e. is an instance of SSLSocket), + # we need some additional smarts to support TLS-in-TLS. + return TLSinTLSStream( + self._sock, ssl_context, server_hostname, timeout + ) + else: + self._sock.settimeout(timeout) + sock = ssl_context.wrap_socket( + self._sock, server_hostname=server_hostname + ) + except Exception as exc: # pragma: nocover + self.close() + raise exc + return SyncStream(sock) + + def get_extra_info(self, info: str) -> typing.Any: + if info == "ssl_object" and isinstance(self._sock, ssl.SSLSocket): + return self._sock._sslobj # type: ignore + if info == "client_addr": + return self._sock.getsockname() + if info == "server_addr": + return self._sock.getpeername() + if info == "socket": + return self._sock + if info == "is_readable": + return is_socket_readable(self._sock) + return None + + +class SyncBackend(NetworkBackend): + def connect_tcp( + self, + host: str, + port: int, + timeout: float | None = None, + local_address: str | None = None, + socket_options: typing.Iterable[SOCKET_OPTION] | None = None, + ) -> NetworkStream: + # Note that we automatically include `TCP_NODELAY` + # in addition to any other custom socket options. + if socket_options is None: + socket_options = [] # pragma: no cover + address = (host, port) + source_address = None if local_address is None else (local_address, 0) + exc_map: ExceptionMapping = { + socket.timeout: ConnectTimeout, + OSError: ConnectError, + } + + with map_exceptions(exc_map): + sock = socket.create_connection( + address, + timeout, + source_address=source_address, + ) + for option in socket_options: + sock.setsockopt(*option) # pragma: no cover + sock.setsockopt(socket.IPPROTO_TCP, socket.TCP_NODELAY, 1) + return SyncStream(sock) + + def connect_unix_socket( + self, + path: str, + timeout: float | None = None, + socket_options: typing.Iterable[SOCKET_OPTION] | None = None, + ) -> NetworkStream: # pragma: nocover + if sys.platform == "win32": + raise RuntimeError( + "Attempted to connect to a UNIX socket on a Windows system." + ) + if socket_options is None: + socket_options = [] + + exc_map: ExceptionMapping = { + socket.timeout: ConnectTimeout, + OSError: ConnectError, + } + with map_exceptions(exc_map): + sock = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM) + for option in socket_options: + sock.setsockopt(*option) + sock.settimeout(timeout) + sock.connect(path) + return SyncStream(sock) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/httpcore/_backends/trio.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/httpcore/_backends/trio.py new file mode 100644 index 0000000000000000000000000000000000000000..6f53f5f2a025e01e9949e2530bd9ca6928859251 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/httpcore/_backends/trio.py @@ -0,0 +1,159 @@ +from __future__ import annotations + +import ssl +import typing + +import trio + +from .._exceptions import ( + ConnectError, + ConnectTimeout, + ExceptionMapping, + ReadError, + ReadTimeout, + WriteError, + WriteTimeout, + map_exceptions, +) +from .base import SOCKET_OPTION, AsyncNetworkBackend, AsyncNetworkStream + + +class TrioStream(AsyncNetworkStream): + def __init__(self, stream: trio.abc.Stream) -> None: + self._stream = stream + + async def read(self, max_bytes: int, timeout: float | None = None) -> bytes: + timeout_or_inf = float("inf") if timeout is None else timeout + exc_map: ExceptionMapping = { + trio.TooSlowError: ReadTimeout, + trio.BrokenResourceError: ReadError, + trio.ClosedResourceError: ReadError, + } + with map_exceptions(exc_map): + with trio.fail_after(timeout_or_inf): + data: bytes = await self._stream.receive_some(max_bytes=max_bytes) + return data + + async def write(self, buffer: bytes, timeout: float | None = None) -> None: + if not buffer: + return + + timeout_or_inf = float("inf") if timeout is None else timeout + exc_map: ExceptionMapping = { + trio.TooSlowError: WriteTimeout, + trio.BrokenResourceError: WriteError, + trio.ClosedResourceError: WriteError, + } + with map_exceptions(exc_map): + with trio.fail_after(timeout_or_inf): + await self._stream.send_all(data=buffer) + + async def aclose(self) -> None: + await self._stream.aclose() + + async def start_tls( + self, + ssl_context: ssl.SSLContext, + server_hostname: str | None = None, + timeout: float | None = None, + ) -> AsyncNetworkStream: + timeout_or_inf = float("inf") if timeout is None else timeout + exc_map: ExceptionMapping = { + trio.TooSlowError: ConnectTimeout, + trio.BrokenResourceError: ConnectError, + } + ssl_stream = trio.SSLStream( + self._stream, + ssl_context=ssl_context, + server_hostname=server_hostname, + https_compatible=True, + server_side=False, + ) + with map_exceptions(exc_map): + try: + with trio.fail_after(timeout_or_inf): + await ssl_stream.do_handshake() + except Exception as exc: # pragma: nocover + await self.aclose() + raise exc + return TrioStream(ssl_stream) + + def get_extra_info(self, info: str) -> typing.Any: + if info == "ssl_object" and isinstance(self._stream, trio.SSLStream): + # Type checkers cannot see `_ssl_object` attribute because trio._ssl.SSLStream uses __getattr__/__setattr__. + # Tracked at https://github.com/python-trio/trio/issues/542 + return self._stream._ssl_object # type: ignore[attr-defined] + if info == "client_addr": + return self._get_socket_stream().socket.getsockname() + if info == "server_addr": + return self._get_socket_stream().socket.getpeername() + if info == "socket": + stream = self._stream + while isinstance(stream, trio.SSLStream): + stream = stream.transport_stream + assert isinstance(stream, trio.SocketStream) + return stream.socket + if info == "is_readable": + socket = self.get_extra_info("socket") + return socket.is_readable() + return None + + def _get_socket_stream(self) -> trio.SocketStream: + stream = self._stream + while isinstance(stream, trio.SSLStream): + stream = stream.transport_stream + assert isinstance(stream, trio.SocketStream) + return stream + + +class TrioBackend(AsyncNetworkBackend): + async def connect_tcp( + self, + host: str, + port: int, + timeout: float | None = None, + local_address: str | None = None, + socket_options: typing.Iterable[SOCKET_OPTION] | None = None, + ) -> AsyncNetworkStream: + # By default for TCP sockets, trio enables TCP_NODELAY. + # https://trio.readthedocs.io/en/stable/reference-io.html#trio.SocketStream + if socket_options is None: + socket_options = [] # pragma: no cover + timeout_or_inf = float("inf") if timeout is None else timeout + exc_map: ExceptionMapping = { + trio.TooSlowError: ConnectTimeout, + trio.BrokenResourceError: ConnectError, + OSError: ConnectError, + } + with map_exceptions(exc_map): + with trio.fail_after(timeout_or_inf): + stream: trio.abc.Stream = await trio.open_tcp_stream( + host=host, port=port, local_address=local_address + ) + for option in socket_options: + stream.setsockopt(*option) # type: ignore[attr-defined] # pragma: no cover + return TrioStream(stream) + + async def connect_unix_socket( + self, + path: str, + timeout: float | None = None, + socket_options: typing.Iterable[SOCKET_OPTION] | None = None, + ) -> AsyncNetworkStream: # pragma: nocover + if socket_options is None: + socket_options = [] + timeout_or_inf = float("inf") if timeout is None else timeout + exc_map: ExceptionMapping = { + trio.TooSlowError: ConnectTimeout, + trio.BrokenResourceError: ConnectError, + OSError: ConnectError, + } + with map_exceptions(exc_map): + with trio.fail_after(timeout_or_inf): + stream: trio.abc.Stream = await trio.open_unix_socket(path) + for option in socket_options: + stream.setsockopt(*option) # type: ignore[attr-defined] # pragma: no cover + return TrioStream(stream) + + async def sleep(self, seconds: float) -> None: + await trio.sleep(seconds) # pragma: nocover diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/httpcore/_sync/__init__.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/httpcore/_sync/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..b476d76d9a7ff45de8d18ec22d33d6af2982f92e --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/httpcore/_sync/__init__.py @@ -0,0 +1,39 @@ +from .connection import HTTPConnection +from .connection_pool import ConnectionPool +from .http11 import HTTP11Connection +from .http_proxy import HTTPProxy +from .interfaces import ConnectionInterface + +try: + from .http2 import HTTP2Connection +except ImportError: # pragma: nocover + + class HTTP2Connection: # type: ignore + def __init__(self, *args, **kwargs) -> None: # type: ignore + raise RuntimeError( + "Attempted to use http2 support, but the `h2` package is not " + "installed. Use 'pip install httpcore[http2]'." + ) + + +try: + from .socks_proxy import SOCKSProxy +except ImportError: # pragma: nocover + + class SOCKSProxy: # type: ignore + def __init__(self, *args, **kwargs) -> None: # type: ignore + raise RuntimeError( + "Attempted to use SOCKS support, but the `socksio` package is not " + "installed. Use 'pip install httpcore[socks]'." + ) + + +__all__ = [ + "HTTPConnection", + "ConnectionPool", + "HTTPProxy", + "HTTP11Connection", + "HTTP2Connection", + "ConnectionInterface", + "SOCKSProxy", +] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/httpcore/_sync/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/httpcore/_sync/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..93bbaa9ca51f514d5b317ab34cc9d90161922126 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/httpcore/_sync/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/httpcore/_sync/__pycache__/connection.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/httpcore/_sync/__pycache__/connection.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..cfe5aa537073c15f1da5358f36821f299f15f790 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/httpcore/_sync/__pycache__/connection.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/httpcore/_sync/__pycache__/connection_pool.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/httpcore/_sync/__pycache__/connection_pool.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..29a30459663747857a474fe41d84e3453b271a5e Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/httpcore/_sync/__pycache__/connection_pool.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/httpcore/_sync/__pycache__/http11.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/httpcore/_sync/__pycache__/http11.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..bef829e6c5086b57b1a5c8ef520ce252d4f69c3d Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/httpcore/_sync/__pycache__/http11.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/httpcore/_sync/__pycache__/http2.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/httpcore/_sync/__pycache__/http2.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e7df0928aa8190146ab48b6781ff75d5c3454b7a Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/httpcore/_sync/__pycache__/http2.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/httpcore/_sync/__pycache__/http_proxy.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/httpcore/_sync/__pycache__/http_proxy.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..66fff4e2af38384054bd046bffabda66ea65931e Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/httpcore/_sync/__pycache__/http_proxy.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/httpcore/_sync/__pycache__/interfaces.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/httpcore/_sync/__pycache__/interfaces.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..43d964f8d30cb75a7d676d419178f190ea07d443 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/httpcore/_sync/__pycache__/interfaces.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/httpcore/_sync/__pycache__/socks_proxy.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/httpcore/_sync/__pycache__/socks_proxy.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..8d74e040b1057fe97b8246d3a0f9eee07a4e0011 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/httpcore/_sync/__pycache__/socks_proxy.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/httpcore/_sync/connection.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/httpcore/_sync/connection.py new file mode 100644 index 0000000000000000000000000000000000000000..363f8be819d2576ea65365e625dd1596ea40429a --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/httpcore/_sync/connection.py @@ -0,0 +1,222 @@ +from __future__ import annotations + +import itertools +import logging +import ssl +import types +import typing + +from .._backends.sync import SyncBackend +from .._backends.base import SOCKET_OPTION, NetworkBackend, NetworkStream +from .._exceptions import ConnectError, ConnectTimeout +from .._models import Origin, Request, Response +from .._ssl import default_ssl_context +from .._synchronization import Lock +from .._trace import Trace +from .http11 import HTTP11Connection +from .interfaces import ConnectionInterface + +RETRIES_BACKOFF_FACTOR = 0.5 # 0s, 0.5s, 1s, 2s, 4s, etc. + + +logger = logging.getLogger("httpcore.connection") + + +def exponential_backoff(factor: float) -> typing.Iterator[float]: + """ + Generate a geometric sequence that has a ratio of 2 and starts with 0. + + For example: + - `factor = 2`: `0, 2, 4, 8, 16, 32, 64, ...` + - `factor = 3`: `0, 3, 6, 12, 24, 48, 96, ...` + """ + yield 0 + for n in itertools.count(): + yield factor * 2**n + + +class HTTPConnection(ConnectionInterface): + def __init__( + self, + origin: Origin, + ssl_context: ssl.SSLContext | None = None, + keepalive_expiry: float | None = None, + http1: bool = True, + http2: bool = False, + retries: int = 0, + local_address: str | None = None, + uds: str | None = None, + network_backend: NetworkBackend | None = None, + socket_options: typing.Iterable[SOCKET_OPTION] | None = None, + ) -> None: + self._origin = origin + self._ssl_context = ssl_context + self._keepalive_expiry = keepalive_expiry + self._http1 = http1 + self._http2 = http2 + self._retries = retries + self._local_address = local_address + self._uds = uds + + self._network_backend: NetworkBackend = ( + SyncBackend() if network_backend is None else network_backend + ) + self._connection: ConnectionInterface | None = None + self._connect_failed: bool = False + self._request_lock = Lock() + self._socket_options = socket_options + + def handle_request(self, request: Request) -> Response: + if not self.can_handle_request(request.url.origin): + raise RuntimeError( + f"Attempted to send request to {request.url.origin} on connection to {self._origin}" + ) + + try: + with self._request_lock: + if self._connection is None: + stream = self._connect(request) + + ssl_object = stream.get_extra_info("ssl_object") + http2_negotiated = ( + ssl_object is not None + and ssl_object.selected_alpn_protocol() == "h2" + ) + if http2_negotiated or (self._http2 and not self._http1): + from .http2 import HTTP2Connection + + self._connection = HTTP2Connection( + origin=self._origin, + stream=stream, + keepalive_expiry=self._keepalive_expiry, + ) + else: + self._connection = HTTP11Connection( + origin=self._origin, + stream=stream, + keepalive_expiry=self._keepalive_expiry, + ) + except BaseException as exc: + self._connect_failed = True + raise exc + + return self._connection.handle_request(request) + + def _connect(self, request: Request) -> NetworkStream: + timeouts = request.extensions.get("timeout", {}) + sni_hostname = request.extensions.get("sni_hostname", None) + timeout = timeouts.get("connect", None) + + retries_left = self._retries + delays = exponential_backoff(factor=RETRIES_BACKOFF_FACTOR) + + while True: + try: + if self._uds is None: + kwargs = { + "host": self._origin.host.decode("ascii"), + "port": self._origin.port, + "local_address": self._local_address, + "timeout": timeout, + "socket_options": self._socket_options, + } + with Trace("connect_tcp", logger, request, kwargs) as trace: + stream = self._network_backend.connect_tcp(**kwargs) + trace.return_value = stream + else: + kwargs = { + "path": self._uds, + "timeout": timeout, + "socket_options": self._socket_options, + } + with Trace( + "connect_unix_socket", logger, request, kwargs + ) as trace: + stream = self._network_backend.connect_unix_socket( + **kwargs + ) + trace.return_value = stream + + if self._origin.scheme in (b"https", b"wss"): + ssl_context = ( + default_ssl_context() + if self._ssl_context is None + else self._ssl_context + ) + alpn_protocols = ["http/1.1", "h2"] if self._http2 else ["http/1.1"] + ssl_context.set_alpn_protocols(alpn_protocols) + + kwargs = { + "ssl_context": ssl_context, + "server_hostname": sni_hostname + or self._origin.host.decode("ascii"), + "timeout": timeout, + } + with Trace("start_tls", logger, request, kwargs) as trace: + stream = stream.start_tls(**kwargs) + trace.return_value = stream + return stream + except (ConnectError, ConnectTimeout): + if retries_left <= 0: + raise + retries_left -= 1 + delay = next(delays) + with Trace("retry", logger, request, kwargs) as trace: + self._network_backend.sleep(delay) + + def can_handle_request(self, origin: Origin) -> bool: + return origin == self._origin + + def close(self) -> None: + if self._connection is not None: + with Trace("close", logger, None, {}): + self._connection.close() + + def is_available(self) -> bool: + if self._connection is None: + # If HTTP/2 support is enabled, and the resulting connection could + # end up as HTTP/2 then we should indicate the connection as being + # available to service multiple requests. + return ( + self._http2 + and (self._origin.scheme == b"https" or not self._http1) + and not self._connect_failed + ) + return self._connection.is_available() + + def has_expired(self) -> bool: + if self._connection is None: + return self._connect_failed + return self._connection.has_expired() + + def is_idle(self) -> bool: + if self._connection is None: + return self._connect_failed + return self._connection.is_idle() + + def is_closed(self) -> bool: + if self._connection is None: + return self._connect_failed + return self._connection.is_closed() + + def info(self) -> str: + if self._connection is None: + return "CONNECTION FAILED" if self._connect_failed else "CONNECTING" + return self._connection.info() + + def __repr__(self) -> str: + return f"<{self.__class__.__name__} [{self.info()}]>" + + # These context managers are not used in the standard flow, but are + # useful for testing or working with connection instances directly. + + def __enter__(self) -> HTTPConnection: + return self + + def __exit__( + self, + exc_type: type[BaseException] | None = None, + exc_value: BaseException | None = None, + traceback: types.TracebackType | None = None, + ) -> None: + self.close() diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/httpcore/_sync/connection_pool.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/httpcore/_sync/connection_pool.py new file mode 100644 index 0000000000000000000000000000000000000000..9ccfa53e597a29ee387f9d16f3af4f695ac0d33a --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/httpcore/_sync/connection_pool.py @@ -0,0 +1,420 @@ +from __future__ import annotations + +import ssl +import sys +import types +import typing + +from .._backends.sync import SyncBackend +from .._backends.base import SOCKET_OPTION, NetworkBackend +from .._exceptions import ConnectionNotAvailable, UnsupportedProtocol +from .._models import Origin, Proxy, Request, Response +from .._synchronization import Event, ShieldCancellation, ThreadLock +from .connection import HTTPConnection +from .interfaces import ConnectionInterface, RequestInterface + + +class PoolRequest: + def __init__(self, request: Request) -> None: + self.request = request + self.connection: ConnectionInterface | None = None + self._connection_acquired = Event() + + def assign_to_connection(self, connection: ConnectionInterface | None) -> None: + self.connection = connection + self._connection_acquired.set() + + def clear_connection(self) -> None: + self.connection = None + self._connection_acquired = Event() + + def wait_for_connection( + self, timeout: float | None = None + ) -> ConnectionInterface: + if self.connection is None: + self._connection_acquired.wait(timeout=timeout) + assert self.connection is not None + return self.connection + + def is_queued(self) -> bool: + return self.connection is None + + +class ConnectionPool(RequestInterface): + """ + A connection pool for making HTTP requests. + """ + + def __init__( + self, + ssl_context: ssl.SSLContext | None = None, + proxy: Proxy | None = None, + max_connections: int | None = 10, + max_keepalive_connections: int | None = None, + keepalive_expiry: float | None = None, + http1: bool = True, + http2: bool = False, + retries: int = 0, + local_address: str | None = None, + uds: str | None = None, + network_backend: NetworkBackend | None = None, + socket_options: typing.Iterable[SOCKET_OPTION] | None = None, + ) -> None: + """ + A connection pool for making HTTP requests. + + Parameters: + ssl_context: An SSL context to use for verifying connections. + If not specified, the default `httpcore.default_ssl_context()` + will be used. + max_connections: The maximum number of concurrent HTTP connections that + the pool should allow. Any attempt to send a request on a pool that + would exceed this amount will block until a connection is available. + max_keepalive_connections: The maximum number of idle HTTP connections + that will be maintained in the pool. + keepalive_expiry: The duration in seconds that an idle HTTP connection + may be maintained for before being expired from the pool. + http1: A boolean indicating if HTTP/1.1 requests should be supported + by the connection pool. Defaults to True. + http2: A boolean indicating if HTTP/2 requests should be supported by + the connection pool. Defaults to False. + retries: The maximum number of retries when trying to establish a + connection. + local_address: Local address to connect from. Can also be used to connect + using a particular address family. Using `local_address="0.0.0.0"` + will connect using an `AF_INET` address (IPv4), while using + `local_address="::"` will connect using an `AF_INET6` address (IPv6). + uds: Path to a Unix Domain Socket to use instead of TCP sockets. + network_backend: A backend instance to use for handling network I/O. + socket_options: Socket options that have to be included + in the TCP socket when the connection was established. + """ + self._ssl_context = ssl_context + self._proxy = proxy + self._max_connections = ( + sys.maxsize if max_connections is None else max_connections + ) + self._max_keepalive_connections = ( + sys.maxsize + if max_keepalive_connections is None + else max_keepalive_connections + ) + self._max_keepalive_connections = min( + self._max_connections, self._max_keepalive_connections + ) + + self._keepalive_expiry = keepalive_expiry + self._http1 = http1 + self._http2 = http2 + self._retries = retries + self._local_address = local_address + self._uds = uds + + self._network_backend = ( + SyncBackend() if network_backend is None else network_backend + ) + self._socket_options = socket_options + + # The mutable state on a connection pool is the queue of incoming requests, + # and the set of connections that are servicing those requests. + self._connections: list[ConnectionInterface] = [] + self._requests: list[PoolRequest] = [] + + # We only mutate the state of the connection pool within an 'optional_thread_lock' + # context. This holds a threading lock unless we're running in async mode, + # in which case it is a no-op. + self._optional_thread_lock = ThreadLock() + + def create_connection(self, origin: Origin) -> ConnectionInterface: + if self._proxy is not None: + if self._proxy.url.scheme in (b"socks5", b"socks5h"): + from .socks_proxy import Socks5Connection + + return Socks5Connection( + proxy_origin=self._proxy.url.origin, + proxy_auth=self._proxy.auth, + remote_origin=origin, + ssl_context=self._ssl_context, + keepalive_expiry=self._keepalive_expiry, + http1=self._http1, + http2=self._http2, + network_backend=self._network_backend, + ) + elif origin.scheme == b"http": + from .http_proxy import ForwardHTTPConnection + + return ForwardHTTPConnection( + proxy_origin=self._proxy.url.origin, + proxy_headers=self._proxy.headers, + proxy_ssl_context=self._proxy.ssl_context, + remote_origin=origin, + keepalive_expiry=self._keepalive_expiry, + network_backend=self._network_backend, + ) + from .http_proxy import TunnelHTTPConnection + + return TunnelHTTPConnection( + proxy_origin=self._proxy.url.origin, + proxy_headers=self._proxy.headers, + proxy_ssl_context=self._proxy.ssl_context, + remote_origin=origin, + ssl_context=self._ssl_context, + keepalive_expiry=self._keepalive_expiry, + http1=self._http1, + http2=self._http2, + network_backend=self._network_backend, + ) + + return HTTPConnection( + origin=origin, + ssl_context=self._ssl_context, + keepalive_expiry=self._keepalive_expiry, + http1=self._http1, + http2=self._http2, + retries=self._retries, + local_address=self._local_address, + uds=self._uds, + network_backend=self._network_backend, + socket_options=self._socket_options, + ) + + @property + def connections(self) -> list[ConnectionInterface]: + """ + Return a list of the connections currently in the pool. + + For example: + + ```python + >>> pool.connections + [ + , + , + , + ] + ``` + """ + return list(self._connections) + + def handle_request(self, request: Request) -> Response: + """ + Send an HTTP request, and return an HTTP response. + + This is the core implementation that is called into by `.request()` or `.stream()`. + """ + scheme = request.url.scheme.decode() + if scheme == "": + raise UnsupportedProtocol( + "Request URL is missing an 'http://' or 'https://' protocol." + ) + if scheme not in ("http", "https", "ws", "wss"): + raise UnsupportedProtocol( + f"Request URL has an unsupported protocol '{scheme}://'." + ) + + timeouts = request.extensions.get("timeout", {}) + timeout = timeouts.get("pool", None) + + with self._optional_thread_lock: + # Add the incoming request to our request queue. + pool_request = PoolRequest(request) + self._requests.append(pool_request) + + try: + while True: + with self._optional_thread_lock: + # Assign incoming requests to available connections, + # closing or creating new connections as required. + closing = self._assign_requests_to_connections() + self._close_connections(closing) + + # Wait until this request has an assigned connection. + connection = pool_request.wait_for_connection(timeout=timeout) + + try: + # Send the request on the assigned connection. + response = connection.handle_request( + pool_request.request + ) + except ConnectionNotAvailable: + # In some cases a connection may initially be available to + # handle a request, but then become unavailable. + # + # In this case we clear the connection and try again. + pool_request.clear_connection() + else: + break # pragma: nocover + + except BaseException as exc: + with self._optional_thread_lock: + # For any exception or cancellation we remove the request from + # the queue, and then re-assign requests to connections. + self._requests.remove(pool_request) + closing = self._assign_requests_to_connections() + + self._close_connections(closing) + raise exc from None + + # Return the response. Note that in this case we still have to manage + # the point at which the response is closed. + assert isinstance(response.stream, typing.Iterable) + return Response( + status=response.status, + headers=response.headers, + content=PoolByteStream( + stream=response.stream, pool_request=pool_request, pool=self + ), + extensions=response.extensions, + ) + + def _assign_requests_to_connections(self) -> list[ConnectionInterface]: + """ + Manage the state of the connection pool, assigning incoming + requests to connections as available. + + Called whenever a new request is added or removed from the pool. + + Any closing connections are returned, allowing the I/O for closing + those connections to be handled seperately. + """ + closing_connections = [] + + # First we handle cleaning up any connections that are closed, + # have expired their keep-alive, or surplus idle connections. + for connection in list(self._connections): + if connection.is_closed(): + # log: "removing closed connection" + self._connections.remove(connection) + elif connection.has_expired(): + # log: "closing expired connection" + self._connections.remove(connection) + closing_connections.append(connection) + elif ( + connection.is_idle() + and len([connection.is_idle() for connection in self._connections]) + > self._max_keepalive_connections + ): + # log: "closing idle connection" + self._connections.remove(connection) + closing_connections.append(connection) + + # Assign queued requests to connections. + queued_requests = [request for request in self._requests if request.is_queued()] + for pool_request in queued_requests: + origin = pool_request.request.url.origin + available_connections = [ + connection + for connection in self._connections + if connection.can_handle_request(origin) and connection.is_available() + ] + idle_connections = [ + connection for connection in self._connections if connection.is_idle() + ] + + # There are three cases for how we may be able to handle the request: + # + # 1. There is an existing connection that can handle the request. + # 2. We can create a new connection to handle the request. + # 3. We can close an idle connection and then create a new connection + # to handle the request. + if available_connections: + # log: "reusing existing connection" + connection = available_connections[0] + pool_request.assign_to_connection(connection) + elif len(self._connections) < self._max_connections: + # log: "creating new connection" + connection = self.create_connection(origin) + self._connections.append(connection) + pool_request.assign_to_connection(connection) + elif idle_connections: + # log: "closing idle connection" + connection = idle_connections[0] + self._connections.remove(connection) + closing_connections.append(connection) + # log: "creating new connection" + connection = self.create_connection(origin) + self._connections.append(connection) + pool_request.assign_to_connection(connection) + + return closing_connections + + def _close_connections(self, closing: list[ConnectionInterface]) -> None: + # Close connections which have been removed from the pool. + with ShieldCancellation(): + for connection in closing: + connection.close() + + def close(self) -> None: + # Explicitly close the connection pool. + # Clears all existing requests and connections. + with self._optional_thread_lock: + closing_connections = list(self._connections) + self._connections = [] + self._close_connections(closing_connections) + + def __enter__(self) -> ConnectionPool: + return self + + def __exit__( + self, + exc_type: type[BaseException] | None = None, + exc_value: BaseException | None = None, + traceback: types.TracebackType | None = None, + ) -> None: + self.close() + + def __repr__(self) -> str: + class_name = self.__class__.__name__ + with self._optional_thread_lock: + request_is_queued = [request.is_queued() for request in self._requests] + connection_is_idle = [ + connection.is_idle() for connection in self._connections + ] + + num_active_requests = request_is_queued.count(False) + num_queued_requests = request_is_queued.count(True) + num_active_connections = connection_is_idle.count(False) + num_idle_connections = connection_is_idle.count(True) + + requests_info = ( + f"Requests: {num_active_requests} active, {num_queued_requests} queued" + ) + connection_info = ( + f"Connections: {num_active_connections} active, {num_idle_connections} idle" + ) + + return f"<{class_name} [{requests_info} | {connection_info}]>" + + +class PoolByteStream: + def __init__( + self, + stream: typing.Iterable[bytes], + pool_request: PoolRequest, + pool: ConnectionPool, + ) -> None: + self._stream = stream + self._pool_request = pool_request + self._pool = pool + self._closed = False + + def __iter__(self) -> typing.Iterator[bytes]: + try: + for part in self._stream: + yield part + except BaseException as exc: + self.close() + raise exc from None + + def close(self) -> None: + if not self._closed: + self._closed = True + with ShieldCancellation(): + if hasattr(self._stream, "close"): + self._stream.close() + + with self._pool._optional_thread_lock: + self._pool._requests.remove(self._pool_request) + closing = self._pool._assign_requests_to_connections() + + self._pool._close_connections(closing) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/httpcore/_sync/http11.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/httpcore/_sync/http11.py new file mode 100644 index 0000000000000000000000000000000000000000..ebd3a97480c720d418acb1285a7b75da19b62c8c --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/httpcore/_sync/http11.py @@ -0,0 +1,379 @@ +from __future__ import annotations + +import enum +import logging +import ssl +import time +import types +import typing + +import h11 + +from .._backends.base import NetworkStream +from .._exceptions import ( + ConnectionNotAvailable, + LocalProtocolError, + RemoteProtocolError, + WriteError, + map_exceptions, +) +from .._models import Origin, Request, Response +from .._synchronization import Lock, ShieldCancellation +from .._trace import Trace +from .interfaces import ConnectionInterface + +logger = logging.getLogger("httpcore.http11") + + +# A subset of `h11.Event` types supported by `_send_event` +H11SendEvent = typing.Union[ + h11.Request, + h11.Data, + h11.EndOfMessage, +] + + +class HTTPConnectionState(enum.IntEnum): + NEW = 0 + ACTIVE = 1 + IDLE = 2 + CLOSED = 3 + + +class HTTP11Connection(ConnectionInterface): + READ_NUM_BYTES = 64 * 1024 + MAX_INCOMPLETE_EVENT_SIZE = 100 * 1024 + + def __init__( + self, + origin: Origin, + stream: NetworkStream, + keepalive_expiry: float | None = None, + ) -> None: + self._origin = origin + self._network_stream = stream + self._keepalive_expiry: float | None = keepalive_expiry + self._expire_at: float | None = None + self._state = HTTPConnectionState.NEW + self._state_lock = Lock() + self._request_count = 0 + self._h11_state = h11.Connection( + our_role=h11.CLIENT, + max_incomplete_event_size=self.MAX_INCOMPLETE_EVENT_SIZE, + ) + + def handle_request(self, request: Request) -> Response: + if not self.can_handle_request(request.url.origin): + raise RuntimeError( + f"Attempted to send request to {request.url.origin} on connection " + f"to {self._origin}" + ) + + with self._state_lock: + if self._state in (HTTPConnectionState.NEW, HTTPConnectionState.IDLE): + self._request_count += 1 + self._state = HTTPConnectionState.ACTIVE + self._expire_at = None + else: + raise ConnectionNotAvailable() + + try: + kwargs = {"request": request} + try: + with Trace( + "send_request_headers", logger, request, kwargs + ) as trace: + self._send_request_headers(**kwargs) + with Trace("send_request_body", logger, request, kwargs) as trace: + self._send_request_body(**kwargs) + except WriteError: + # If we get a write error while we're writing the request, + # then we supress this error and move on to attempting to + # read the response. Servers can sometimes close the request + # pre-emptively and then respond with a well formed HTTP + # error response. + pass + + with Trace( + "receive_response_headers", logger, request, kwargs + ) as trace: + ( + http_version, + status, + reason_phrase, + headers, + trailing_data, + ) = self._receive_response_headers(**kwargs) + trace.return_value = ( + http_version, + status, + reason_phrase, + headers, + ) + + network_stream = self._network_stream + + # CONNECT or Upgrade request + if (status == 101) or ( + (request.method == b"CONNECT") and (200 <= status < 300) + ): + network_stream = HTTP11UpgradeStream(network_stream, trailing_data) + + return Response( + status=status, + headers=headers, + content=HTTP11ConnectionByteStream(self, request), + extensions={ + "http_version": http_version, + "reason_phrase": reason_phrase, + "network_stream": network_stream, + }, + ) + except BaseException as exc: + with ShieldCancellation(): + with Trace("response_closed", logger, request) as trace: + self._response_closed() + raise exc + + # Sending the request... + + def _send_request_headers(self, request: Request) -> None: + timeouts = request.extensions.get("timeout", {}) + timeout = timeouts.get("write", None) + + with map_exceptions({h11.LocalProtocolError: LocalProtocolError}): + event = h11.Request( + method=request.method, + target=request.url.target, + headers=request.headers, + ) + self._send_event(event, timeout=timeout) + + def _send_request_body(self, request: Request) -> None: + timeouts = request.extensions.get("timeout", {}) + timeout = timeouts.get("write", None) + + assert isinstance(request.stream, typing.Iterable) + for chunk in request.stream: + event = h11.Data(data=chunk) + self._send_event(event, timeout=timeout) + + self._send_event(h11.EndOfMessage(), timeout=timeout) + + def _send_event(self, event: h11.Event, timeout: float | None = None) -> None: + bytes_to_send = self._h11_state.send(event) + if bytes_to_send is not None: + self._network_stream.write(bytes_to_send, timeout=timeout) + + # Receiving the response... + + def _receive_response_headers( + self, request: Request + ) -> tuple[bytes, int, bytes, list[tuple[bytes, bytes]], bytes]: + timeouts = request.extensions.get("timeout", {}) + timeout = timeouts.get("read", None) + + while True: + event = self._receive_event(timeout=timeout) + if isinstance(event, h11.Response): + break + if ( + isinstance(event, h11.InformationalResponse) + and event.status_code == 101 + ): + break + + http_version = b"HTTP/" + event.http_version + + # h11 version 0.11+ supports a `raw_items` interface to get the + # raw header casing, rather than the enforced lowercase headers. + headers = event.headers.raw_items() + + trailing_data, _ = self._h11_state.trailing_data + + return http_version, event.status_code, event.reason, headers, trailing_data + + def _receive_response_body( + self, request: Request + ) -> typing.Iterator[bytes]: + timeouts = request.extensions.get("timeout", {}) + timeout = timeouts.get("read", None) + + while True: + event = self._receive_event(timeout=timeout) + if isinstance(event, h11.Data): + yield bytes(event.data) + elif isinstance(event, (h11.EndOfMessage, h11.PAUSED)): + break + + def _receive_event( + self, timeout: float | None = None + ) -> h11.Event | type[h11.PAUSED]: + while True: + with map_exceptions({h11.RemoteProtocolError: RemoteProtocolError}): + event = self._h11_state.next_event() + + if event is h11.NEED_DATA: + data = self._network_stream.read( + self.READ_NUM_BYTES, timeout=timeout + ) + + # If we feed this case through h11 we'll raise an exception like: + # + # httpcore.RemoteProtocolError: can't handle event type + # ConnectionClosed when role=SERVER and state=SEND_RESPONSE + # + # Which is accurate, but not very informative from an end-user + # perspective. Instead we handle this case distinctly and treat + # it as a ConnectError. + if data == b"" and self._h11_state.their_state == h11.SEND_RESPONSE: + msg = "Server disconnected without sending a response." + raise RemoteProtocolError(msg) + + self._h11_state.receive_data(data) + else: + # mypy fails to narrow the type in the above if statement above + return event # type: ignore[return-value] + + def _response_closed(self) -> None: + with self._state_lock: + if ( + self._h11_state.our_state is h11.DONE + and self._h11_state.their_state is h11.DONE + ): + self._state = HTTPConnectionState.IDLE + self._h11_state.start_next_cycle() + if self._keepalive_expiry is not None: + now = time.monotonic() + self._expire_at = now + self._keepalive_expiry + else: + self.close() + + # Once the connection is no longer required... + + def close(self) -> None: + # Note that this method unilaterally closes the connection, and does + # not have any kind of locking in place around it. + self._state = HTTPConnectionState.CLOSED + self._network_stream.close() + + # The ConnectionInterface methods provide information about the state of + # the connection, allowing for a connection pooling implementation to + # determine when to reuse and when to close the connection... + + def can_handle_request(self, origin: Origin) -> bool: + return origin == self._origin + + def is_available(self) -> bool: + # Note that HTTP/1.1 connections in the "NEW" state are not treated as + # being "available". The control flow which created the connection will + # be able to send an outgoing request, but the connection will not be + # acquired from the connection pool for any other request. + return self._state == HTTPConnectionState.IDLE + + def has_expired(self) -> bool: + now = time.monotonic() + keepalive_expired = self._expire_at is not None and now > self._expire_at + + # If the HTTP connection is idle but the socket is readable, then the + # only valid state is that the socket is about to return b"", indicating + # a server-initiated disconnect. + server_disconnected = ( + self._state == HTTPConnectionState.IDLE + and self._network_stream.get_extra_info("is_readable") + ) + + return keepalive_expired or server_disconnected + + def is_idle(self) -> bool: + return self._state == HTTPConnectionState.IDLE + + def is_closed(self) -> bool: + return self._state == HTTPConnectionState.CLOSED + + def info(self) -> str: + origin = str(self._origin) + return ( + f"{origin!r}, HTTP/1.1, {self._state.name}, " + f"Request Count: {self._request_count}" + ) + + def __repr__(self) -> str: + class_name = self.__class__.__name__ + origin = str(self._origin) + return ( + f"<{class_name} [{origin!r}, {self._state.name}, " + f"Request Count: {self._request_count}]>" + ) + + # These context managers are not used in the standard flow, but are + # useful for testing or working with connection instances directly. + + def __enter__(self) -> HTTP11Connection: + return self + + def __exit__( + self, + exc_type: type[BaseException] | None = None, + exc_value: BaseException | None = None, + traceback: types.TracebackType | None = None, + ) -> None: + self.close() + + +class HTTP11ConnectionByteStream: + def __init__(self, connection: HTTP11Connection, request: Request) -> None: + self._connection = connection + self._request = request + self._closed = False + + def __iter__(self) -> typing.Iterator[bytes]: + kwargs = {"request": self._request} + try: + with Trace("receive_response_body", logger, self._request, kwargs): + for chunk in self._connection._receive_response_body(**kwargs): + yield chunk + except BaseException as exc: + # If we get an exception while streaming the response, + # we want to close the response (and possibly the connection) + # before raising that exception. + with ShieldCancellation(): + self.close() + raise exc + + def close(self) -> None: + if not self._closed: + self._closed = True + with Trace("response_closed", logger, self._request): + self._connection._response_closed() + + +class HTTP11UpgradeStream(NetworkStream): + def __init__(self, stream: NetworkStream, leading_data: bytes) -> None: + self._stream = stream + self._leading_data = leading_data + + def read(self, max_bytes: int, timeout: float | None = None) -> bytes: + if self._leading_data: + buffer = self._leading_data[:max_bytes] + self._leading_data = self._leading_data[max_bytes:] + return buffer + else: + return self._stream.read(max_bytes, timeout) + + def write(self, buffer: bytes, timeout: float | None = None) -> None: + self._stream.write(buffer, timeout) + + def close(self) -> None: + self._stream.close() + + def start_tls( + self, + ssl_context: ssl.SSLContext, + server_hostname: str | None = None, + timeout: float | None = None, + ) -> NetworkStream: + return self._stream.start_tls(ssl_context, server_hostname, timeout) + + def get_extra_info(self, info: str) -> typing.Any: + return self._stream.get_extra_info(info) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/httpcore/_sync/http2.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/httpcore/_sync/http2.py new file mode 100644 index 0000000000000000000000000000000000000000..ca4dd724325cc7be23e667c2a4d46c63c5e78a80 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/httpcore/_sync/http2.py @@ -0,0 +1,583 @@ +from __future__ import annotations + +import enum +import logging +import time +import types +import typing + +import h2.config +import h2.connection +import h2.events +import h2.exceptions +import h2.settings + +from .._backends.base import NetworkStream +from .._exceptions import ( + ConnectionNotAvailable, + LocalProtocolError, + RemoteProtocolError, +) +from .._models import Origin, Request, Response +from .._synchronization import Lock, Semaphore, ShieldCancellation +from .._trace import Trace +from .interfaces import ConnectionInterface + +logger = logging.getLogger("httpcore.http2") + + +def has_body_headers(request: Request) -> bool: + return any( + k.lower() == b"content-length" or k.lower() == b"transfer-encoding" + for k, v in request.headers + ) + + +class HTTPConnectionState(enum.IntEnum): + ACTIVE = 1 + IDLE = 2 + CLOSED = 3 + + +class HTTP2Connection(ConnectionInterface): + READ_NUM_BYTES = 64 * 1024 + CONFIG = h2.config.H2Configuration(validate_inbound_headers=False) + + def __init__( + self, + origin: Origin, + stream: NetworkStream, + keepalive_expiry: float | None = None, + ): + self._origin = origin + self._network_stream = stream + self._keepalive_expiry: float | None = keepalive_expiry + self._h2_state = h2.connection.H2Connection(config=self.CONFIG) + self._state = HTTPConnectionState.IDLE + self._expire_at: float | None = None + self._request_count = 0 + self._init_lock = Lock() + self._state_lock = Lock() + self._read_lock = Lock() + self._write_lock = Lock() + self._sent_connection_init = False + self._used_all_stream_ids = False + self._connection_error = False + + # Mapping from stream ID to response stream events. + self._events: dict[ + int, + h2.events.ResponseReceived + | h2.events.DataReceived + | h2.events.StreamEnded + | h2.events.StreamReset, + ] = {} + + # Connection terminated events are stored as state since + # we need to handle them for all streams. + self._connection_terminated: h2.events.ConnectionTerminated | None = None + + self._read_exception: Exception | None = None + self._write_exception: Exception | None = None + + def handle_request(self, request: Request) -> Response: + if not self.can_handle_request(request.url.origin): + # This cannot occur in normal operation, since the connection pool + # will only send requests on connections that handle them. + # It's in place simply for resilience as a guard against incorrect + # usage, for anyone working directly with httpcore connections. + raise RuntimeError( + f"Attempted to send request to {request.url.origin} on connection " + f"to {self._origin}" + ) + + with self._state_lock: + if self._state in (HTTPConnectionState.ACTIVE, HTTPConnectionState.IDLE): + self._request_count += 1 + self._expire_at = None + self._state = HTTPConnectionState.ACTIVE + else: + raise ConnectionNotAvailable() + + with self._init_lock: + if not self._sent_connection_init: + try: + kwargs = {"request": request} + with Trace("send_connection_init", logger, request, kwargs): + self._send_connection_init(**kwargs) + except BaseException as exc: + with ShieldCancellation(): + self.close() + raise exc + + self._sent_connection_init = True + + # Initially start with just 1 until the remote server provides + # its max_concurrent_streams value + self._max_streams = 1 + + local_settings_max_streams = ( + self._h2_state.local_settings.max_concurrent_streams + ) + self._max_streams_semaphore = Semaphore(local_settings_max_streams) + + for _ in range(local_settings_max_streams - self._max_streams): + self._max_streams_semaphore.acquire() + + self._max_streams_semaphore.acquire() + + try: + stream_id = self._h2_state.get_next_available_stream_id() + self._events[stream_id] = [] + except h2.exceptions.NoAvailableStreamIDError: # pragma: nocover + self._used_all_stream_ids = True + self._request_count -= 1 + raise ConnectionNotAvailable() + + try: + kwargs = {"request": request, "stream_id": stream_id} + with Trace("send_request_headers", logger, request, kwargs): + self._send_request_headers(request=request, stream_id=stream_id) + with Trace("send_request_body", logger, request, kwargs): + self._send_request_body(request=request, stream_id=stream_id) + with Trace( + "receive_response_headers", logger, request, kwargs + ) as trace: + status, headers = self._receive_response( + request=request, stream_id=stream_id + ) + trace.return_value = (status, headers) + + return Response( + status=status, + headers=headers, + content=HTTP2ConnectionByteStream(self, request, stream_id=stream_id), + extensions={ + "http_version": b"HTTP/2", + "network_stream": self._network_stream, + "stream_id": stream_id, + }, + ) + except BaseException as exc: # noqa: PIE786 + with ShieldCancellation(): + kwargs = {"stream_id": stream_id} + with Trace("response_closed", logger, request, kwargs): + self._response_closed(stream_id=stream_id) + + if isinstance(exc, h2.exceptions.ProtocolError): + # One case where h2 can raise a protocol error is when a + # closed frame has been seen by the state machine. + # + # This happens when one stream is reading, and encounters + # a GOAWAY event. Other flows of control may then raise + # a protocol error at any point they interact with the 'h2_state'. + # + # In this case we'll have stored the event, and should raise + # it as a RemoteProtocolError. + if self._connection_terminated: # pragma: nocover + raise RemoteProtocolError(self._connection_terminated) + # If h2 raises a protocol error in some other state then we + # must somehow have made a protocol violation. + raise LocalProtocolError(exc) # pragma: nocover + + raise exc + + def _send_connection_init(self, request: Request) -> None: + """ + The HTTP/2 connection requires some initial setup before we can start + using individual request/response streams on it. + """ + # Need to set these manually here instead of manipulating via + # __setitem__() otherwise the H2Connection will emit SettingsUpdate + # frames in addition to sending the undesired defaults. + self._h2_state.local_settings = h2.settings.Settings( + client=True, + initial_values={ + # Disable PUSH_PROMISE frames from the server since we don't do anything + # with them for now. Maybe when we support caching? + h2.settings.SettingCodes.ENABLE_PUSH: 0, + # These two are taken from h2 for safe defaults + h2.settings.SettingCodes.MAX_CONCURRENT_STREAMS: 100, + h2.settings.SettingCodes.MAX_HEADER_LIST_SIZE: 65536, + }, + ) + + # Some websites (*cough* Yahoo *cough*) balk at this setting being + # present in the initial handshake since it's not defined in the original + # RFC despite the RFC mandating ignoring settings you don't know about. + del self._h2_state.local_settings[ + h2.settings.SettingCodes.ENABLE_CONNECT_PROTOCOL + ] + + self._h2_state.initiate_connection() + self._h2_state.increment_flow_control_window(2**24) + self._write_outgoing_data(request) + + # Sending the request... + + def _send_request_headers(self, request: Request, stream_id: int) -> None: + """ + Send the request headers to a given stream ID. + """ + end_stream = not has_body_headers(request) + + # In HTTP/2 the ':authority' pseudo-header is used instead of 'Host'. + # In order to gracefully handle HTTP/1.1 and HTTP/2 we always require + # HTTP/1.1 style headers, and map them appropriately if we end up on + # an HTTP/2 connection. + authority = [v for k, v in request.headers if k.lower() == b"host"][0] + + headers = [ + (b":method", request.method), + (b":authority", authority), + (b":scheme", request.url.scheme), + (b":path", request.url.target), + ] + [ + (k.lower(), v) + for k, v in request.headers + if k.lower() + not in ( + b"host", + b"transfer-encoding", + ) + ] + + self._h2_state.send_headers(stream_id, headers, end_stream=end_stream) + self._h2_state.increment_flow_control_window(2**24, stream_id=stream_id) + self._write_outgoing_data(request) + + def _send_request_body(self, request: Request, stream_id: int) -> None: + """ + Iterate over the request body sending it to a given stream ID. + """ + if not has_body_headers(request): + return + + assert isinstance(request.stream, typing.Iterable) + for data in request.stream: + self._send_stream_data(request, stream_id, data) + self._send_end_stream(request, stream_id) + + def _send_stream_data( + self, request: Request, stream_id: int, data: bytes + ) -> None: + """ + Send a single chunk of data in one or more data frames. + """ + while data: + max_flow = self._wait_for_outgoing_flow(request, stream_id) + chunk_size = min(len(data), max_flow) + chunk, data = data[:chunk_size], data[chunk_size:] + self._h2_state.send_data(stream_id, chunk) + self._write_outgoing_data(request) + + def _send_end_stream(self, request: Request, stream_id: int) -> None: + """ + Send an empty data frame on on a given stream ID with the END_STREAM flag set. + """ + self._h2_state.end_stream(stream_id) + self._write_outgoing_data(request) + + # Receiving the response... + + def _receive_response( + self, request: Request, stream_id: int + ) -> tuple[int, list[tuple[bytes, bytes]]]: + """ + Return the response status code and headers for a given stream ID. + """ + while True: + event = self._receive_stream_event(request, stream_id) + if isinstance(event, h2.events.ResponseReceived): + break + + status_code = 200 + headers = [] + for k, v in event.headers: + if k == b":status": + status_code = int(v.decode("ascii", errors="ignore")) + elif not k.startswith(b":"): + headers.append((k, v)) + + return (status_code, headers) + + def _receive_response_body( + self, request: Request, stream_id: int + ) -> typing.Iterator[bytes]: + """ + Iterator that returns the bytes of the response body for a given stream ID. + """ + while True: + event = self._receive_stream_event(request, stream_id) + if isinstance(event, h2.events.DataReceived): + amount = event.flow_controlled_length + self._h2_state.acknowledge_received_data(amount, stream_id) + self._write_outgoing_data(request) + yield event.data + elif isinstance(event, h2.events.StreamEnded): + break + + def _receive_stream_event( + self, request: Request, stream_id: int + ) -> h2.events.ResponseReceived | h2.events.DataReceived | h2.events.StreamEnded: + """ + Return the next available event for a given stream ID. + + Will read more data from the network if required. + """ + while not self._events.get(stream_id): + self._receive_events(request, stream_id) + event = self._events[stream_id].pop(0) + if isinstance(event, h2.events.StreamReset): + raise RemoteProtocolError(event) + return event + + def _receive_events( + self, request: Request, stream_id: int | None = None + ) -> None: + """ + Read some data from the network until we see one or more events + for a given stream ID. + """ + with self._read_lock: + if self._connection_terminated is not None: + last_stream_id = self._connection_terminated.last_stream_id + if stream_id and last_stream_id and stream_id > last_stream_id: + self._request_count -= 1 + raise ConnectionNotAvailable() + raise RemoteProtocolError(self._connection_terminated) + + # This conditional is a bit icky. We don't want to block reading if we've + # actually got an event to return for a given stream. We need to do that + # check *within* the atomic read lock. Though it also need to be optional, + # because when we call it from `_wait_for_outgoing_flow` we *do* want to + # block until we've available flow control, event when we have events + # pending for the stream ID we're attempting to send on. + if stream_id is None or not self._events.get(stream_id): + events = self._read_incoming_data(request) + for event in events: + if isinstance(event, h2.events.RemoteSettingsChanged): + with Trace( + "receive_remote_settings", logger, request + ) as trace: + self._receive_remote_settings_change(event) + trace.return_value = event + + elif isinstance( + event, + ( + h2.events.ResponseReceived, + h2.events.DataReceived, + h2.events.StreamEnded, + h2.events.StreamReset, + ), + ): + if event.stream_id in self._events: + self._events[event.stream_id].append(event) + + elif isinstance(event, h2.events.ConnectionTerminated): + self._connection_terminated = event + + self._write_outgoing_data(request) + + def _receive_remote_settings_change(self, event: h2.events.Event) -> None: + max_concurrent_streams = event.changed_settings.get( + h2.settings.SettingCodes.MAX_CONCURRENT_STREAMS + ) + if max_concurrent_streams: + new_max_streams = min( + max_concurrent_streams.new_value, + self._h2_state.local_settings.max_concurrent_streams, + ) + if new_max_streams and new_max_streams != self._max_streams: + while new_max_streams > self._max_streams: + self._max_streams_semaphore.release() + self._max_streams += 1 + while new_max_streams < self._max_streams: + self._max_streams_semaphore.acquire() + self._max_streams -= 1 + + def _response_closed(self, stream_id: int) -> None: + self._max_streams_semaphore.release() + del self._events[stream_id] + with self._state_lock: + if self._connection_terminated and not self._events: + self.close() + + elif self._state == HTTPConnectionState.ACTIVE and not self._events: + self._state = HTTPConnectionState.IDLE + if self._keepalive_expiry is not None: + now = time.monotonic() + self._expire_at = now + self._keepalive_expiry + if self._used_all_stream_ids: # pragma: nocover + self.close() + + def close(self) -> None: + # Note that this method unilaterally closes the connection, and does + # not have any kind of locking in place around it. + self._h2_state.close_connection() + self._state = HTTPConnectionState.CLOSED + self._network_stream.close() + + # Wrappers around network read/write operations... + + def _read_incoming_data(self, request: Request) -> list[h2.events.Event]: + timeouts = request.extensions.get("timeout", {}) + timeout = timeouts.get("read", None) + + if self._read_exception is not None: + raise self._read_exception # pragma: nocover + + try: + data = self._network_stream.read(self.READ_NUM_BYTES, timeout) + if data == b"": + raise RemoteProtocolError("Server disconnected") + except Exception as exc: + # If we get a network error we should: + # + # 1. Save the exception and just raise it immediately on any future reads. + # (For example, this means that a single read timeout or disconnect will + # immediately close all pending streams. Without requiring multiple + # sequential timeouts.) + # 2. Mark the connection as errored, so that we don't accept any other + # incoming requests. + self._read_exception = exc + self._connection_error = True + raise exc + + events: list[h2.events.Event] = self._h2_state.receive_data(data) + + return events + + def _write_outgoing_data(self, request: Request) -> None: + timeouts = request.extensions.get("timeout", {}) + timeout = timeouts.get("write", None) + + with self._write_lock: + data_to_send = self._h2_state.data_to_send() + + if self._write_exception is not None: + raise self._write_exception # pragma: nocover + + try: + self._network_stream.write(data_to_send, timeout) + except Exception as exc: # pragma: nocover + # If we get a network error we should: + # + # 1. Save the exception and just raise it immediately on any future write. + # (For example, this means that a single write timeout or disconnect will + # immediately close all pending streams. Without requiring multiple + # sequential timeouts.) + # 2. Mark the connection as errored, so that we don't accept any other + # incoming requests. + self._write_exception = exc + self._connection_error = True + raise exc + + # Flow control... + + def _wait_for_outgoing_flow(self, request: Request, stream_id: int) -> int: + """ + Returns the maximum allowable outgoing flow for a given stream. + + If the allowable flow is zero, then waits on the network until + WindowUpdated frames have increased the flow rate. + https://tools.ietf.org/html/rfc7540#section-6.9 + """ + local_flow: int = self._h2_state.local_flow_control_window(stream_id) + max_frame_size: int = self._h2_state.max_outbound_frame_size + flow = min(local_flow, max_frame_size) + while flow == 0: + self._receive_events(request) + local_flow = self._h2_state.local_flow_control_window(stream_id) + max_frame_size = self._h2_state.max_outbound_frame_size + flow = min(local_flow, max_frame_size) + return flow + + # Interface for connection pooling... + + def can_handle_request(self, origin: Origin) -> bool: + return origin == self._origin + + def is_available(self) -> bool: + return ( + self._state != HTTPConnectionState.CLOSED + and not self._connection_error + and not self._used_all_stream_ids + and not ( + self._h2_state.state_machine.state + == h2.connection.ConnectionState.CLOSED + ) + ) + + def has_expired(self) -> bool: + now = time.monotonic() + return self._expire_at is not None and now > self._expire_at + + def is_idle(self) -> bool: + return self._state == HTTPConnectionState.IDLE + + def is_closed(self) -> bool: + return self._state == HTTPConnectionState.CLOSED + + def info(self) -> str: + origin = str(self._origin) + return ( + f"{origin!r}, HTTP/2, {self._state.name}, " + f"Request Count: {self._request_count}" + ) + + def __repr__(self) -> str: + class_name = self.__class__.__name__ + origin = str(self._origin) + return ( + f"<{class_name} [{origin!r}, {self._state.name}, " + f"Request Count: {self._request_count}]>" + ) + + # These context managers are not used in the standard flow, but are + # useful for testing or working with connection instances directly. + + def __enter__(self) -> HTTP2Connection: + return self + + def __exit__( + self, + exc_type: type[BaseException] | None = None, + exc_value: BaseException | None = None, + traceback: types.TracebackType | None = None, + ) -> None: + self.close() + + +class HTTP2ConnectionByteStream: + def __init__( + self, connection: HTTP2Connection, request: Request, stream_id: int + ) -> None: + self._connection = connection + self._request = request + self._stream_id = stream_id + self._closed = False + + def __iter__(self) -> typing.Iterator[bytes]: + kwargs = {"request": self._request, "stream_id": self._stream_id} + try: + with Trace("receive_response_body", logger, self._request, kwargs): + for chunk in self._connection._receive_response_body( + request=self._request, stream_id=self._stream_id + ): + yield chunk + except BaseException as exc: + # If we get an exception while streaming the response, + # we want to close the response (and possibly the connection) + # before raising that exception. + with ShieldCancellation(): + self.close() + raise exc + + def close(self) -> None: + if not self._closed: + self._closed = True + kwargs = {"stream_id": self._stream_id} + with Trace("response_closed", logger, self._request, kwargs): + self._connection._response_closed(stream_id=self._stream_id) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/httpcore/_sync/http_proxy.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/httpcore/_sync/http_proxy.py new file mode 100644 index 0000000000000000000000000000000000000000..ecca88f7dc93b78f2aa26f16cf29d17a8a83ae27 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/httpcore/_sync/http_proxy.py @@ -0,0 +1,367 @@ +from __future__ import annotations + +import base64 +import logging +import ssl +import typing + +from .._backends.base import SOCKET_OPTION, NetworkBackend +from .._exceptions import ProxyError +from .._models import ( + URL, + Origin, + Request, + Response, + enforce_bytes, + enforce_headers, + enforce_url, +) +from .._ssl import default_ssl_context +from .._synchronization import Lock +from .._trace import Trace +from .connection import HTTPConnection +from .connection_pool import ConnectionPool +from .http11 import HTTP11Connection +from .interfaces import ConnectionInterface + +ByteOrStr = typing.Union[bytes, str] +HeadersAsSequence = typing.Sequence[typing.Tuple[ByteOrStr, ByteOrStr]] +HeadersAsMapping = typing.Mapping[ByteOrStr, ByteOrStr] + + +logger = logging.getLogger("httpcore.proxy") + + +def merge_headers( + default_headers: typing.Sequence[tuple[bytes, bytes]] | None = None, + override_headers: typing.Sequence[tuple[bytes, bytes]] | None = None, +) -> list[tuple[bytes, bytes]]: + """ + Append default_headers and override_headers, de-duplicating if a key exists + in both cases. + """ + default_headers = [] if default_headers is None else list(default_headers) + override_headers = [] if override_headers is None else list(override_headers) + has_override = set(key.lower() for key, value in override_headers) + default_headers = [ + (key, value) + for key, value in default_headers + if key.lower() not in has_override + ] + return default_headers + override_headers + + +class HTTPProxy(ConnectionPool): # pragma: nocover + """ + A connection pool that sends requests via an HTTP proxy. + """ + + def __init__( + self, + proxy_url: URL | bytes | str, + proxy_auth: tuple[bytes | str, bytes | str] | None = None, + proxy_headers: HeadersAsMapping | HeadersAsSequence | None = None, + ssl_context: ssl.SSLContext | None = None, + proxy_ssl_context: ssl.SSLContext | None = None, + max_connections: int | None = 10, + max_keepalive_connections: int | None = None, + keepalive_expiry: float | None = None, + http1: bool = True, + http2: bool = False, + retries: int = 0, + local_address: str | None = None, + uds: str | None = None, + network_backend: NetworkBackend | None = None, + socket_options: typing.Iterable[SOCKET_OPTION] | None = None, + ) -> None: + """ + A connection pool for making HTTP requests. + + Parameters: + proxy_url: The URL to use when connecting to the proxy server. + For example `"http://127.0.0.1:8080/"`. + proxy_auth: Any proxy authentication as a two-tuple of + (username, password). May be either bytes or ascii-only str. + proxy_headers: Any HTTP headers to use for the proxy requests. + For example `{"Proxy-Authorization": "Basic :"}`. + ssl_context: An SSL context to use for verifying connections. + If not specified, the default `httpcore.default_ssl_context()` + will be used. + proxy_ssl_context: The same as `ssl_context`, but for a proxy server rather than a remote origin. + max_connections: The maximum number of concurrent HTTP connections that + the pool should allow. Any attempt to send a request on a pool that + would exceed this amount will block until a connection is available. + max_keepalive_connections: The maximum number of idle HTTP connections + that will be maintained in the pool. + keepalive_expiry: The duration in seconds that an idle HTTP connection + may be maintained for before being expired from the pool. + http1: A boolean indicating if HTTP/1.1 requests should be supported + by the connection pool. Defaults to True. + http2: A boolean indicating if HTTP/2 requests should be supported by + the connection pool. Defaults to False. + retries: The maximum number of retries when trying to establish + a connection. + local_address: Local address to connect from. Can also be used to + connect using a particular address family. Using + `local_address="0.0.0.0"` will connect using an `AF_INET` address + (IPv4), while using `local_address="::"` will connect using an + `AF_INET6` address (IPv6). + uds: Path to a Unix Domain Socket to use instead of TCP sockets. + network_backend: A backend instance to use for handling network I/O. + """ + super().__init__( + ssl_context=ssl_context, + max_connections=max_connections, + max_keepalive_connections=max_keepalive_connections, + keepalive_expiry=keepalive_expiry, + http1=http1, + http2=http2, + network_backend=network_backend, + retries=retries, + local_address=local_address, + uds=uds, + socket_options=socket_options, + ) + + self._proxy_url = enforce_url(proxy_url, name="proxy_url") + if ( + self._proxy_url.scheme == b"http" and proxy_ssl_context is not None + ): # pragma: no cover + raise RuntimeError( + "The `proxy_ssl_context` argument is not allowed for the http scheme" + ) + + self._ssl_context = ssl_context + self._proxy_ssl_context = proxy_ssl_context + self._proxy_headers = enforce_headers(proxy_headers, name="proxy_headers") + if proxy_auth is not None: + username = enforce_bytes(proxy_auth[0], name="proxy_auth") + password = enforce_bytes(proxy_auth[1], name="proxy_auth") + userpass = username + b":" + password + authorization = b"Basic " + base64.b64encode(userpass) + self._proxy_headers = [ + (b"Proxy-Authorization", authorization) + ] + self._proxy_headers + + def create_connection(self, origin: Origin) -> ConnectionInterface: + if origin.scheme == b"http": + return ForwardHTTPConnection( + proxy_origin=self._proxy_url.origin, + proxy_headers=self._proxy_headers, + remote_origin=origin, + keepalive_expiry=self._keepalive_expiry, + network_backend=self._network_backend, + proxy_ssl_context=self._proxy_ssl_context, + ) + return TunnelHTTPConnection( + proxy_origin=self._proxy_url.origin, + proxy_headers=self._proxy_headers, + remote_origin=origin, + ssl_context=self._ssl_context, + proxy_ssl_context=self._proxy_ssl_context, + keepalive_expiry=self._keepalive_expiry, + http1=self._http1, + http2=self._http2, + network_backend=self._network_backend, + ) + + +class ForwardHTTPConnection(ConnectionInterface): + def __init__( + self, + proxy_origin: Origin, + remote_origin: Origin, + proxy_headers: HeadersAsMapping | HeadersAsSequence | None = None, + keepalive_expiry: float | None = None, + network_backend: NetworkBackend | None = None, + socket_options: typing.Iterable[SOCKET_OPTION] | None = None, + proxy_ssl_context: ssl.SSLContext | None = None, + ) -> None: + self._connection = HTTPConnection( + origin=proxy_origin, + keepalive_expiry=keepalive_expiry, + network_backend=network_backend, + socket_options=socket_options, + ssl_context=proxy_ssl_context, + ) + self._proxy_origin = proxy_origin + self._proxy_headers = enforce_headers(proxy_headers, name="proxy_headers") + self._remote_origin = remote_origin + + def handle_request(self, request: Request) -> Response: + headers = merge_headers(self._proxy_headers, request.headers) + url = URL( + scheme=self._proxy_origin.scheme, + host=self._proxy_origin.host, + port=self._proxy_origin.port, + target=bytes(request.url), + ) + proxy_request = Request( + method=request.method, + url=url, + headers=headers, + content=request.stream, + extensions=request.extensions, + ) + return self._connection.handle_request(proxy_request) + + def can_handle_request(self, origin: Origin) -> bool: + return origin == self._remote_origin + + def close(self) -> None: + self._connection.close() + + def info(self) -> str: + return self._connection.info() + + def is_available(self) -> bool: + return self._connection.is_available() + + def has_expired(self) -> bool: + return self._connection.has_expired() + + def is_idle(self) -> bool: + return self._connection.is_idle() + + def is_closed(self) -> bool: + return self._connection.is_closed() + + def __repr__(self) -> str: + return f"<{self.__class__.__name__} [{self.info()}]>" + + +class TunnelHTTPConnection(ConnectionInterface): + def __init__( + self, + proxy_origin: Origin, + remote_origin: Origin, + ssl_context: ssl.SSLContext | None = None, + proxy_ssl_context: ssl.SSLContext | None = None, + proxy_headers: typing.Sequence[tuple[bytes, bytes]] | None = None, + keepalive_expiry: float | None = None, + http1: bool = True, + http2: bool = False, + network_backend: NetworkBackend | None = None, + socket_options: typing.Iterable[SOCKET_OPTION] | None = None, + ) -> None: + self._connection: ConnectionInterface = HTTPConnection( + origin=proxy_origin, + keepalive_expiry=keepalive_expiry, + network_backend=network_backend, + socket_options=socket_options, + ssl_context=proxy_ssl_context, + ) + self._proxy_origin = proxy_origin + self._remote_origin = remote_origin + self._ssl_context = ssl_context + self._proxy_ssl_context = proxy_ssl_context + self._proxy_headers = enforce_headers(proxy_headers, name="proxy_headers") + self._keepalive_expiry = keepalive_expiry + self._http1 = http1 + self._http2 = http2 + self._connect_lock = Lock() + self._connected = False + + def handle_request(self, request: Request) -> Response: + timeouts = request.extensions.get("timeout", {}) + timeout = timeouts.get("connect", None) + + with self._connect_lock: + if not self._connected: + target = b"%b:%d" % (self._remote_origin.host, self._remote_origin.port) + + connect_url = URL( + scheme=self._proxy_origin.scheme, + host=self._proxy_origin.host, + port=self._proxy_origin.port, + target=target, + ) + connect_headers = merge_headers( + [(b"Host", target), (b"Accept", b"*/*")], self._proxy_headers + ) + connect_request = Request( + method=b"CONNECT", + url=connect_url, + headers=connect_headers, + extensions=request.extensions, + ) + connect_response = self._connection.handle_request( + connect_request + ) + + if connect_response.status < 200 or connect_response.status > 299: + reason_bytes = connect_response.extensions.get("reason_phrase", b"") + reason_str = reason_bytes.decode("ascii", errors="ignore") + msg = "%d %s" % (connect_response.status, reason_str) + self._connection.close() + raise ProxyError(msg) + + stream = connect_response.extensions["network_stream"] + + # Upgrade the stream to SSL + ssl_context = ( + default_ssl_context() + if self._ssl_context is None + else self._ssl_context + ) + alpn_protocols = ["http/1.1", "h2"] if self._http2 else ["http/1.1"] + ssl_context.set_alpn_protocols(alpn_protocols) + + kwargs = { + "ssl_context": ssl_context, + "server_hostname": self._remote_origin.host.decode("ascii"), + "timeout": timeout, + } + with Trace("start_tls", logger, request, kwargs) as trace: + stream = stream.start_tls(**kwargs) + trace.return_value = stream + + # Determine if we should be using HTTP/1.1 or HTTP/2 + ssl_object = stream.get_extra_info("ssl_object") + http2_negotiated = ( + ssl_object is not None + and ssl_object.selected_alpn_protocol() == "h2" + ) + + # Create the HTTP/1.1 or HTTP/2 connection + if http2_negotiated or (self._http2 and not self._http1): + from .http2 import HTTP2Connection + + self._connection = HTTP2Connection( + origin=self._remote_origin, + stream=stream, + keepalive_expiry=self._keepalive_expiry, + ) + else: + self._connection = HTTP11Connection( + origin=self._remote_origin, + stream=stream, + keepalive_expiry=self._keepalive_expiry, + ) + + self._connected = True + return self._connection.handle_request(request) + + def can_handle_request(self, origin: Origin) -> bool: + return origin == self._remote_origin + + def close(self) -> None: + self._connection.close() + + def info(self) -> str: + return self._connection.info() + + def is_available(self) -> bool: + return self._connection.is_available() + + def has_expired(self) -> bool: + return self._connection.has_expired() + + def is_idle(self) -> bool: + return self._connection.is_idle() + + def is_closed(self) -> bool: + return self._connection.is_closed() + + def __repr__(self) -> str: + return f"<{self.__class__.__name__} [{self.info()}]>" diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/httpcore/_sync/interfaces.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/httpcore/_sync/interfaces.py new file mode 100644 index 0000000000000000000000000000000000000000..e673d4cc1b1dd7e7ecdbde91fd6ada386c3de03f --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/httpcore/_sync/interfaces.py @@ -0,0 +1,137 @@ +from __future__ import annotations + +import contextlib +import typing + +from .._models import ( + URL, + Extensions, + HeaderTypes, + Origin, + Request, + Response, + enforce_bytes, + enforce_headers, + enforce_url, + include_request_headers, +) + + +class RequestInterface: + def request( + self, + method: bytes | str, + url: URL | bytes | str, + *, + headers: HeaderTypes = None, + content: bytes | typing.Iterator[bytes] | None = None, + extensions: Extensions | None = None, + ) -> Response: + # Strict type checking on our parameters. + method = enforce_bytes(method, name="method") + url = enforce_url(url, name="url") + headers = enforce_headers(headers, name="headers") + + # Include Host header, and optionally Content-Length or Transfer-Encoding. + headers = include_request_headers(headers, url=url, content=content) + + request = Request( + method=method, + url=url, + headers=headers, + content=content, + extensions=extensions, + ) + response = self.handle_request(request) + try: + response.read() + finally: + response.close() + return response + + @contextlib.contextmanager + def stream( + self, + method: bytes | str, + url: URL | bytes | str, + *, + headers: HeaderTypes = None, + content: bytes | typing.Iterator[bytes] | None = None, + extensions: Extensions | None = None, + ) -> typing.Iterator[Response]: + # Strict type checking on our parameters. + method = enforce_bytes(method, name="method") + url = enforce_url(url, name="url") + headers = enforce_headers(headers, name="headers") + + # Include Host header, and optionally Content-Length or Transfer-Encoding. + headers = include_request_headers(headers, url=url, content=content) + + request = Request( + method=method, + url=url, + headers=headers, + content=content, + extensions=extensions, + ) + response = self.handle_request(request) + try: + yield response + finally: + response.close() + + def handle_request(self, request: Request) -> Response: + raise NotImplementedError() # pragma: nocover + + +class ConnectionInterface(RequestInterface): + def close(self) -> None: + raise NotImplementedError() # pragma: nocover + + def info(self) -> str: + raise NotImplementedError() # pragma: nocover + + def can_handle_request(self, origin: Origin) -> bool: + raise NotImplementedError() # pragma: nocover + + def is_available(self) -> bool: + """ + Return `True` if the connection is currently able to accept an + outgoing request. + + An HTTP/1.1 connection will only be available if it is currently idle. + + An HTTP/2 connection will be available so long as the stream ID space is + not yet exhausted, and the connection is not in an error state. + + While the connection is being established we may not yet know if it is going + to result in an HTTP/1.1 or HTTP/2 connection. The connection should be + treated as being available, but might ultimately raise `NewConnectionRequired` + required exceptions if multiple requests are attempted over a connection + that ends up being established as HTTP/1.1. + """ + raise NotImplementedError() # pragma: nocover + + def has_expired(self) -> bool: + """ + Return `True` if the connection is in a state where it should be closed. + + This either means that the connection is idle and it has passed the + expiry time on its keep-alive, or that server has sent an EOF. + """ + raise NotImplementedError() # pragma: nocover + + def is_idle(self) -> bool: + """ + Return `True` if the connection is currently idle. + """ + raise NotImplementedError() # pragma: nocover + + def is_closed(self) -> bool: + """ + Return `True` if the connection has been closed. + + Used when a response is closed to determine if the connection may be + returned to the connection pool or not. + """ + raise NotImplementedError() # pragma: nocover diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/httpcore/_sync/socks_proxy.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/httpcore/_sync/socks_proxy.py new file mode 100644 index 0000000000000000000000000000000000000000..0ca96ddfb580b19413797f41e79f7abcecdd9d79 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/httpcore/_sync/socks_proxy.py @@ -0,0 +1,341 @@ +from __future__ import annotations + +import logging +import ssl + +import socksio + +from .._backends.sync import SyncBackend +from .._backends.base import NetworkBackend, NetworkStream +from .._exceptions import ConnectionNotAvailable, ProxyError +from .._models import URL, Origin, Request, Response, enforce_bytes, enforce_url +from .._ssl import default_ssl_context +from .._synchronization import Lock +from .._trace import Trace +from .connection_pool import ConnectionPool +from .http11 import HTTP11Connection +from .interfaces import ConnectionInterface + +logger = logging.getLogger("httpcore.socks") + + +AUTH_METHODS = { + b"\x00": "NO AUTHENTICATION REQUIRED", + b"\x01": "GSSAPI", + b"\x02": "USERNAME/PASSWORD", + b"\xff": "NO ACCEPTABLE METHODS", +} + +REPLY_CODES = { + b"\x00": "Succeeded", + b"\x01": "General SOCKS server failure", + b"\x02": "Connection not allowed by ruleset", + b"\x03": "Network unreachable", + b"\x04": "Host unreachable", + b"\x05": "Connection refused", + b"\x06": "TTL expired", + b"\x07": "Command not supported", + b"\x08": "Address type not supported", +} + + +def _init_socks5_connection( + stream: NetworkStream, + *, + host: bytes, + port: int, + auth: tuple[bytes, bytes] | None = None, +) -> None: + conn = socksio.socks5.SOCKS5Connection() + + # Auth method request + auth_method = ( + socksio.socks5.SOCKS5AuthMethod.NO_AUTH_REQUIRED + if auth is None + else socksio.socks5.SOCKS5AuthMethod.USERNAME_PASSWORD + ) + conn.send(socksio.socks5.SOCKS5AuthMethodsRequest([auth_method])) + outgoing_bytes = conn.data_to_send() + stream.write(outgoing_bytes) + + # Auth method response + incoming_bytes = stream.read(max_bytes=4096) + response = conn.receive_data(incoming_bytes) + assert isinstance(response, socksio.socks5.SOCKS5AuthReply) + if response.method != auth_method: + requested = AUTH_METHODS.get(auth_method, "UNKNOWN") + responded = AUTH_METHODS.get(response.method, "UNKNOWN") + raise ProxyError( + f"Requested {requested} from proxy server, but got {responded}." + ) + + if response.method == socksio.socks5.SOCKS5AuthMethod.USERNAME_PASSWORD: + # Username/password request + assert auth is not None + username, password = auth + conn.send(socksio.socks5.SOCKS5UsernamePasswordRequest(username, password)) + outgoing_bytes = conn.data_to_send() + stream.write(outgoing_bytes) + + # Username/password response + incoming_bytes = stream.read(max_bytes=4096) + response = conn.receive_data(incoming_bytes) + assert isinstance(response, socksio.socks5.SOCKS5UsernamePasswordReply) + if not response.success: + raise ProxyError("Invalid username/password") + + # Connect request + conn.send( + socksio.socks5.SOCKS5CommandRequest.from_address( + socksio.socks5.SOCKS5Command.CONNECT, (host, port) + ) + ) + outgoing_bytes = conn.data_to_send() + stream.write(outgoing_bytes) + + # Connect response + incoming_bytes = stream.read(max_bytes=4096) + response = conn.receive_data(incoming_bytes) + assert isinstance(response, socksio.socks5.SOCKS5Reply) + if response.reply_code != socksio.socks5.SOCKS5ReplyCode.SUCCEEDED: + reply_code = REPLY_CODES.get(response.reply_code, "UNKOWN") + raise ProxyError(f"Proxy Server could not connect: {reply_code}.") + + +class SOCKSProxy(ConnectionPool): # pragma: nocover + """ + A connection pool that sends requests via an HTTP proxy. + """ + + def __init__( + self, + proxy_url: URL | bytes | str, + proxy_auth: tuple[bytes | str, bytes | str] | None = None, + ssl_context: ssl.SSLContext | None = None, + max_connections: int | None = 10, + max_keepalive_connections: int | None = None, + keepalive_expiry: float | None = None, + http1: bool = True, + http2: bool = False, + retries: int = 0, + network_backend: NetworkBackend | None = None, + ) -> None: + """ + A connection pool for making HTTP requests. + + Parameters: + proxy_url: The URL to use when connecting to the proxy server. + For example `"http://127.0.0.1:8080/"`. + ssl_context: An SSL context to use for verifying connections. + If not specified, the default `httpcore.default_ssl_context()` + will be used. + max_connections: The maximum number of concurrent HTTP connections that + the pool should allow. Any attempt to send a request on a pool that + would exceed this amount will block until a connection is available. + max_keepalive_connections: The maximum number of idle HTTP connections + that will be maintained in the pool. + keepalive_expiry: The duration in seconds that an idle HTTP connection + may be maintained for before being expired from the pool. + http1: A boolean indicating if HTTP/1.1 requests should be supported + by the connection pool. Defaults to True. + http2: A boolean indicating if HTTP/2 requests should be supported by + the connection pool. Defaults to False. + retries: The maximum number of retries when trying to establish + a connection. + local_address: Local address to connect from. Can also be used to + connect using a particular address family. Using + `local_address="0.0.0.0"` will connect using an `AF_INET` address + (IPv4), while using `local_address="::"` will connect using an + `AF_INET6` address (IPv6). + uds: Path to a Unix Domain Socket to use instead of TCP sockets. + network_backend: A backend instance to use for handling network I/O. + """ + super().__init__( + ssl_context=ssl_context, + max_connections=max_connections, + max_keepalive_connections=max_keepalive_connections, + keepalive_expiry=keepalive_expiry, + http1=http1, + http2=http2, + network_backend=network_backend, + retries=retries, + ) + self._ssl_context = ssl_context + self._proxy_url = enforce_url(proxy_url, name="proxy_url") + if proxy_auth is not None: + username, password = proxy_auth + username_bytes = enforce_bytes(username, name="proxy_auth") + password_bytes = enforce_bytes(password, name="proxy_auth") + self._proxy_auth: tuple[bytes, bytes] | None = ( + username_bytes, + password_bytes, + ) + else: + self._proxy_auth = None + + def create_connection(self, origin: Origin) -> ConnectionInterface: + return Socks5Connection( + proxy_origin=self._proxy_url.origin, + remote_origin=origin, + proxy_auth=self._proxy_auth, + ssl_context=self._ssl_context, + keepalive_expiry=self._keepalive_expiry, + http1=self._http1, + http2=self._http2, + network_backend=self._network_backend, + ) + + +class Socks5Connection(ConnectionInterface): + def __init__( + self, + proxy_origin: Origin, + remote_origin: Origin, + proxy_auth: tuple[bytes, bytes] | None = None, + ssl_context: ssl.SSLContext | None = None, + keepalive_expiry: float | None = None, + http1: bool = True, + http2: bool = False, + network_backend: NetworkBackend | None = None, + ) -> None: + self._proxy_origin = proxy_origin + self._remote_origin = remote_origin + self._proxy_auth = proxy_auth + self._ssl_context = ssl_context + self._keepalive_expiry = keepalive_expiry + self._http1 = http1 + self._http2 = http2 + + self._network_backend: NetworkBackend = ( + SyncBackend() if network_backend is None else network_backend + ) + self._connect_lock = Lock() + self._connection: ConnectionInterface | None = None + self._connect_failed = False + + def handle_request(self, request: Request) -> Response: + timeouts = request.extensions.get("timeout", {}) + sni_hostname = request.extensions.get("sni_hostname", None) + timeout = timeouts.get("connect", None) + + with self._connect_lock: + if self._connection is None: + try: + # Connect to the proxy + kwargs = { + "host": self._proxy_origin.host.decode("ascii"), + "port": self._proxy_origin.port, + "timeout": timeout, + } + with Trace("connect_tcp", logger, request, kwargs) as trace: + stream = self._network_backend.connect_tcp(**kwargs) + trace.return_value = stream + + # Connect to the remote host using socks5 + kwargs = { + "stream": stream, + "host": self._remote_origin.host.decode("ascii"), + "port": self._remote_origin.port, + "auth": self._proxy_auth, + } + with Trace( + "setup_socks5_connection", logger, request, kwargs + ) as trace: + _init_socks5_connection(**kwargs) + trace.return_value = stream + + # Upgrade the stream to SSL + if self._remote_origin.scheme == b"https": + ssl_context = ( + default_ssl_context() + if self._ssl_context is None + else self._ssl_context + ) + alpn_protocols = ( + ["http/1.1", "h2"] if self._http2 else ["http/1.1"] + ) + ssl_context.set_alpn_protocols(alpn_protocols) + + kwargs = { + "ssl_context": ssl_context, + "server_hostname": sni_hostname + or self._remote_origin.host.decode("ascii"), + "timeout": timeout, + } + with Trace("start_tls", logger, request, kwargs) as trace: + stream = stream.start_tls(**kwargs) + trace.return_value = stream + + # Determine if we should be using HTTP/1.1 or HTTP/2 + ssl_object = stream.get_extra_info("ssl_object") + http2_negotiated = ( + ssl_object is not None + and ssl_object.selected_alpn_protocol() == "h2" + ) + + # Create the HTTP/1.1 or HTTP/2 connection + if http2_negotiated or ( + self._http2 and not self._http1 + ): # pragma: nocover + from .http2 import HTTP2Connection + + self._connection = HTTP2Connection( + origin=self._remote_origin, + stream=stream, + keepalive_expiry=self._keepalive_expiry, + ) + else: + self._connection = HTTP11Connection( + origin=self._remote_origin, + stream=stream, + keepalive_expiry=self._keepalive_expiry, + ) + except Exception as exc: + self._connect_failed = True + raise exc + elif not self._connection.is_available(): # pragma: nocover + raise ConnectionNotAvailable() + + return self._connection.handle_request(request) + + def can_handle_request(self, origin: Origin) -> bool: + return origin == self._remote_origin + + def close(self) -> None: + if self._connection is not None: + self._connection.close() + + def is_available(self) -> bool: + if self._connection is None: # pragma: nocover + # If HTTP/2 support is enabled, and the resulting connection could + # end up as HTTP/2 then we should indicate the connection as being + # available to service multiple requests. + return ( + self._http2 + and (self._remote_origin.scheme == b"https" or not self._http1) + and not self._connect_failed + ) + return self._connection.is_available() + + def has_expired(self) -> bool: + if self._connection is None: # pragma: nocover + return self._connect_failed + return self._connection.has_expired() + + def is_idle(self) -> bool: + if self._connection is None: # pragma: nocover + return self._connect_failed + return self._connection.is_idle() + + def is_closed(self) -> bool: + if self._connection is None: # pragma: nocover + return self._connect_failed + return self._connection.is_closed() + + def info(self) -> str: + if self._connection is None: # pragma: nocover + return "CONNECTION FAILED" if self._connect_failed else "CONNECTING" + return self._connection.info() + + def __repr__(self) -> str: + return f"<{self.__class__.__name__} [{self.info()}]>" diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/lxml-6.0.2.dist-info/licenses/LICENSE.txt b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/lxml-6.0.2.dist-info/licenses/LICENSE.txt new file mode 100644 index 0000000000000000000000000000000000000000..0bdf03913ffdfd09be5a524c9303bd54a132d450 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/lxml-6.0.2.dist-info/licenses/LICENSE.txt @@ -0,0 +1,31 @@ +BSD 3-Clause License + +Copyright (c) 2004 Infrae. All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + + 3. Neither the name of Infrae nor the names of its contributors may + be used to endorse or promote products derived from this software + without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INFRAE OR +CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/lxml-6.0.2.dist-info/licenses/LICENSES.txt b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/lxml-6.0.2.dist-info/licenses/LICENSES.txt new file mode 100644 index 0000000000000000000000000000000000000000..9f97c18aa0b162ffcc3e745d80efb9def1c82515 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/lxml-6.0.2.dist-info/licenses/LICENSES.txt @@ -0,0 +1,29 @@ +lxml is copyright Infrae and distributed under the BSD license (see +doc/licenses/BSD.txt), with the following exceptions: + +Some code, such a selftest.py, selftest2.py and +src/lxml/_elementpath.py are derived from ElementTree and +cElementTree. See doc/licenses/elementtree.txt for the license text. + +lxml.cssselect and lxml.html are copyright Ian Bicking and distributed +under the BSD license (see doc/licenses/BSD.txt). + +test.py, the test-runner script, is GPL and copyright Shuttleworth +Foundation. See doc/licenses/GPL.txt. It is believed the unchanged +inclusion of test.py to run the unit test suite falls under the +"aggregation" clause of the GPL and thus does not affect the license +of the rest of the package. + +The isoschematron implementation uses several XSL and RelaxNG resources: + * The (XML syntax) RelaxNG schema for schematron, copyright International + Organization for Standardization (see + src/lxml/isoschematron/resources/rng/iso-schematron.rng for the license + text) + * The skeleton iso-schematron-xlt1 pure-xslt schematron implementation + xsl stylesheets, copyright Rick Jelliffe and Academia Sinica Computing + Center, Taiwan (see the xsl files here for the license text: + src/lxml/isoschematron/resources/xsl/iso-schematron-xslt1/) + * The xsd/rng schema schematron extraction xsl transformations are unlicensed + and copyright the respective authors as noted (see + src/lxml/isoschematron/resources/xsl/RNG2Schtrn.xsl and + src/lxml/isoschematron/resources/xsl/XSD2Schtrn.xsl) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/_config/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/_config/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..5e348ed93931521b46748e318f3ab1548dd77fc8 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/_config/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/_config/__pycache__/config.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/_config/__pycache__/config.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a121fd9a2f36c4a4770948789fe221ee9a45dd37 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/_config/__pycache__/config.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/_config/__pycache__/dates.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/_config/__pycache__/dates.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f6ec55b23077db21fc7f69b4424fe05d4cad6621 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/_config/__pycache__/dates.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/_config/__pycache__/display.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/_config/__pycache__/display.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..936d8ce7de0cd25079572ec032e21132a21d29ba Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/_config/__pycache__/display.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/_config/__pycache__/localization.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/_config/__pycache__/localization.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..8cf94680b61eef462167c3f6d64cf74496150f4e Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/_config/__pycache__/localization.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/_libs/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/_libs/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..96569df2463aca9d5fb54295edcc438cbdf9cfe8 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/_libs/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/_libs/tslibs/__init__.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/_libs/tslibs/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..88a9a259ac8ec87255b25b3e15375b1c92099999 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/_libs/tslibs/__init__.py @@ -0,0 +1,87 @@ +__all__ = [ + "dtypes", + "localize_pydatetime", + "NaT", + "NaTType", + "iNaT", + "nat_strings", + "OutOfBoundsDatetime", + "OutOfBoundsTimedelta", + "IncompatibleFrequency", + "Period", + "Resolution", + "Timedelta", + "normalize_i8_timestamps", + "is_date_array_normalized", + "dt64arr_to_periodarr", + "delta_to_nanoseconds", + "ints_to_pydatetime", + "ints_to_pytimedelta", + "get_resolution", + "Timestamp", + "tz_convert_from_utc_single", + "tz_convert_from_utc", + "to_offset", + "Tick", + "BaseOffset", + "tz_compare", + "is_unitless", + "astype_overflowsafe", + "get_unit_from_dtype", + "periods_per_day", + "periods_per_second", + "guess_datetime_format", + "add_overflowsafe", + "get_supported_dtype", + "is_supported_dtype", +] + +from pandas._libs.tslibs import dtypes # pylint: disable=import-self +from pandas._libs.tslibs.conversion import localize_pydatetime +from pandas._libs.tslibs.dtypes import ( + Resolution, + periods_per_day, + periods_per_second, +) +from pandas._libs.tslibs.nattype import ( + NaT, + NaTType, + iNaT, + nat_strings, +) +from pandas._libs.tslibs.np_datetime import ( + OutOfBoundsDatetime, + OutOfBoundsTimedelta, + add_overflowsafe, + astype_overflowsafe, + get_supported_dtype, + is_supported_dtype, + is_unitless, + py_get_unit_from_dtype as get_unit_from_dtype, +) +from pandas._libs.tslibs.offsets import ( + BaseOffset, + Tick, + to_offset, +) +from pandas._libs.tslibs.parsing import guess_datetime_format +from pandas._libs.tslibs.period import ( + IncompatibleFrequency, + Period, +) +from pandas._libs.tslibs.timedeltas import ( + Timedelta, + delta_to_nanoseconds, + ints_to_pytimedelta, +) +from pandas._libs.tslibs.timestamps import Timestamp +from pandas._libs.tslibs.timezones import tz_compare +from pandas._libs.tslibs.tzconversion import tz_convert_from_utc_single +from pandas._libs.tslibs.vectorized import ( + dt64arr_to_periodarr, + get_resolution, + ints_to_pydatetime, + is_date_array_normalized, + normalize_i8_timestamps, + tz_convert_from_utc, +) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/_libs/tslibs/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/_libs/tslibs/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..0698b54d0baa05859f44978c5a1f41d563bb01c1 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/_libs/tslibs/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/_libs/tslibs/base.cpython-312-x86_64-linux-gnu.so b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/_libs/tslibs/base.cpython-312-x86_64-linux-gnu.so new file mode 100644 index 0000000000000000000000000000000000000000..4139d012e3a30225bd1320e57953625ff4c12f13 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/_libs/tslibs/base.cpython-312-x86_64-linux-gnu.so differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/_libs/tslibs/ccalendar.pyi b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/_libs/tslibs/ccalendar.pyi new file mode 100644 index 0000000000000000000000000000000000000000..993f18a61d74aaa643e9790df70ede618b917223 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/_libs/tslibs/ccalendar.pyi @@ -0,0 +1,12 @@ +DAYS: list[str] +MONTH_ALIASES: dict[int, str] +MONTH_NUMBERS: dict[str, int] +MONTHS: list[str] +int_to_weekday: dict[int, str] + +def get_firstbday(year: int, month: int) -> int: ... +def get_lastbday(year: int, month: int) -> int: ... +def get_day_of_year(year: int, month: int, day: int) -> int: ... +def get_iso_calendar(year: int, month: int, day: int) -> tuple[int, int, int]: ... +def get_week_of_year(year: int, month: int, day: int) -> int: ... +def get_days_in_month(year: int, month: int) -> int: ... diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/_libs/tslibs/conversion.pyi b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/_libs/tslibs/conversion.pyi new file mode 100644 index 0000000000000000000000000000000000000000..26affae577f4d3f4ecda2ac9c1bc0cb748a35d4d --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/_libs/tslibs/conversion.pyi @@ -0,0 +1,14 @@ +from datetime import ( + datetime, + tzinfo, +) + +import numpy as np + +DT64NS_DTYPE: np.dtype +TD64NS_DTYPE: np.dtype + +def localize_pydatetime(dt: datetime, tz: tzinfo | None) -> datetime: ... +def cast_from_unit_vectorized( + values: np.ndarray, unit: str, out_unit: str = ... +) -> np.ndarray: ... diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/_libs/tslibs/dtypes.pyi b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/_libs/tslibs/dtypes.pyi new file mode 100644 index 0000000000000000000000000000000000000000..7fdeb88d498ac88b6d023e25409e8f756b12c07a --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/_libs/tslibs/dtypes.pyi @@ -0,0 +1,83 @@ +from enum import Enum + +OFFSET_TO_PERIOD_FREQSTR: dict[str, str] + +def periods_per_day(reso: int = ...) -> int: ... +def periods_per_second(reso: int) -> int: ... +def abbrev_to_npy_unit(abbrev: str | None) -> int: ... +def freq_to_period_freqstr(freq_n: int, freq_name: str) -> str: ... + +class PeriodDtypeBase: + _dtype_code: int # PeriodDtypeCode + _n: int + + # actually __cinit__ + def __new__(cls, code: int, n: int): ... + @property + def _freq_group_code(self) -> int: ... + @property + def _resolution_obj(self) -> Resolution: ... + def _get_to_timestamp_base(self) -> int: ... + @property + def _freqstr(self) -> str: ... + def __hash__(self) -> int: ... + def _is_tick_like(self) -> bool: ... + @property + def _creso(self) -> int: ... + @property + def _td64_unit(self) -> str: ... + +class FreqGroup(Enum): + FR_ANN: int + FR_QTR: int + FR_MTH: int + FR_WK: int + FR_BUS: int + FR_DAY: int + FR_HR: int + FR_MIN: int + FR_SEC: int + FR_MS: int + FR_US: int + FR_NS: int + FR_UND: int + @staticmethod + def from_period_dtype_code(code: int) -> FreqGroup: ... + +class Resolution(Enum): + RESO_NS: int + RESO_US: int + RESO_MS: int + RESO_SEC: int + RESO_MIN: int + RESO_HR: int + RESO_DAY: int + RESO_MTH: int + RESO_QTR: int + RESO_YR: int + def __lt__(self, other: Resolution) -> bool: ... + def __ge__(self, other: Resolution) -> bool: ... + @property + def attrname(self) -> str: ... + @classmethod + def from_attrname(cls, attrname: str) -> Resolution: ... + @classmethod + def get_reso_from_freqstr(cls, freq: str) -> Resolution: ... + @property + def attr_abbrev(self) -> str: ... + +class NpyDatetimeUnit(Enum): + NPY_FR_Y: int + NPY_FR_M: int + NPY_FR_W: int + NPY_FR_D: int + NPY_FR_h: int + NPY_FR_m: int + NPY_FR_s: int + NPY_FR_ms: int + NPY_FR_us: int + NPY_FR_ns: int + NPY_FR_ps: int + NPY_FR_fs: int + NPY_FR_as: int + NPY_FR_GENERIC: int diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/_libs/tslibs/fields.pyi b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/_libs/tslibs/fields.pyi new file mode 100644 index 0000000000000000000000000000000000000000..c6cfd44e9f6ab76ee4bc3be7c765569991514d18 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/_libs/tslibs/fields.pyi @@ -0,0 +1,62 @@ +import numpy as np + +from pandas._typing import npt + +def build_field_sarray( + dtindex: npt.NDArray[np.int64], # const int64_t[:] + reso: int, # NPY_DATETIMEUNIT +) -> np.ndarray: ... +def month_position_check(fields, weekdays) -> str | None: ... +def get_date_name_field( + dtindex: npt.NDArray[np.int64], # const int64_t[:] + field: str, + locale: str | None = ..., + reso: int = ..., # NPY_DATETIMEUNIT +) -> npt.NDArray[np.object_]: ... +def get_start_end_field( + dtindex: npt.NDArray[np.int64], + field: str, + freqstr: str | None = ..., + month_kw: int = ..., + reso: int = ..., # NPY_DATETIMEUNIT +) -> npt.NDArray[np.bool_]: ... +def get_date_field( + dtindex: npt.NDArray[np.int64], # const int64_t[:] + field: str, + reso: int = ..., # NPY_DATETIMEUNIT +) -> npt.NDArray[np.int32]: ... +def get_timedelta_field( + tdindex: npt.NDArray[np.int64], # const int64_t[:] + field: str, + reso: int = ..., # NPY_DATETIMEUNIT +) -> npt.NDArray[np.int32]: ... +def get_timedelta_days( + tdindex: npt.NDArray[np.int64], # const int64_t[:] + reso: int = ..., # NPY_DATETIMEUNIT +) -> npt.NDArray[np.int64]: ... +def isleapyear_arr( + years: np.ndarray, +) -> npt.NDArray[np.bool_]: ... +def build_isocalendar_sarray( + dtindex: npt.NDArray[np.int64], # const int64_t[:] + reso: int, # NPY_DATETIMEUNIT +) -> np.ndarray: ... +def _get_locale_names(name_type: str, locale: str | None = ...): ... + +class RoundTo: + @property + def MINUS_INFTY(self) -> int: ... + @property + def PLUS_INFTY(self) -> int: ... + @property + def NEAREST_HALF_EVEN(self) -> int: ... + @property + def NEAREST_HALF_PLUS_INFTY(self) -> int: ... + @property + def NEAREST_HALF_MINUS_INFTY(self) -> int: ... + +def round_nsint64( + values: npt.NDArray[np.int64], + mode: RoundTo, + nanos: int, +) -> npt.NDArray[np.int64]: ... diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/_libs/tslibs/nattype.pyi b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/_libs/tslibs/nattype.pyi new file mode 100644 index 0000000000000000000000000000000000000000..bd86a6fdc2174a38bdd0845019e0afd6f2aecbf1 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/_libs/tslibs/nattype.pyi @@ -0,0 +1,141 @@ +from datetime import ( + datetime, + timedelta, + tzinfo as _tzinfo, +) +import typing + +import numpy as np + +from pandas._libs.tslibs.period import Period +from pandas._typing import Self + +NaT: NaTType +iNaT: int +nat_strings: set[str] + +_NaTComparisonTypes: typing.TypeAlias = ( + datetime | timedelta | Period | np.datetime64 | np.timedelta64 +) + +class _NatComparison: + def __call__(self, other: _NaTComparisonTypes) -> bool: ... + +class NaTType: + _value: np.int64 + @property + def value(self) -> int: ... + @property + def asm8(self) -> np.datetime64: ... + def to_datetime64(self) -> np.datetime64: ... + def to_numpy( + self, dtype: np.dtype | str | None = ..., copy: bool = ... + ) -> np.datetime64 | np.timedelta64: ... + @property + def is_leap_year(self) -> bool: ... + @property + def is_month_start(self) -> bool: ... + @property + def is_quarter_start(self) -> bool: ... + @property + def is_year_start(self) -> bool: ... + @property + def is_month_end(self) -> bool: ... + @property + def is_quarter_end(self) -> bool: ... + @property + def is_year_end(self) -> bool: ... + @property + def day_of_year(self) -> float: ... + @property + def dayofyear(self) -> float: ... + @property + def days_in_month(self) -> float: ... + @property + def daysinmonth(self) -> float: ... + @property + def day_of_week(self) -> float: ... + @property + def dayofweek(self) -> float: ... + @property + def week(self) -> float: ... + @property + def weekofyear(self) -> float: ... + def day_name(self) -> float: ... + def month_name(self) -> float: ... + def weekday(self) -> float: ... + def isoweekday(self) -> float: ... + def total_seconds(self) -> float: ... + def today(self, *args, **kwargs) -> NaTType: ... + def now(self, *args, **kwargs) -> NaTType: ... + def to_pydatetime(self) -> NaTType: ... + def date(self) -> NaTType: ... + def round(self) -> NaTType: ... + def floor(self) -> NaTType: ... + def ceil(self) -> NaTType: ... + @property + def tzinfo(self) -> None: ... + @property + def tz(self) -> None: ... + def tz_convert(self, tz: _tzinfo | str | None) -> NaTType: ... + def tz_localize( + self, + tz: _tzinfo | str | None, + ambiguous: str = ..., + nonexistent: str = ..., + ) -> NaTType: ... + def replace( + self, + year: int | None = ..., + month: int | None = ..., + day: int | None = ..., + hour: int | None = ..., + minute: int | None = ..., + second: int | None = ..., + microsecond: int | None = ..., + nanosecond: int | None = ..., + tzinfo: _tzinfo | None = ..., + fold: int | None = ..., + ) -> NaTType: ... + @property + def year(self) -> float: ... + @property + def quarter(self) -> float: ... + @property + def month(self) -> float: ... + @property + def day(self) -> float: ... + @property + def hour(self) -> float: ... + @property + def minute(self) -> float: ... + @property + def second(self) -> float: ... + @property + def millisecond(self) -> float: ... + @property + def microsecond(self) -> float: ... + @property + def nanosecond(self) -> float: ... + # inject Timedelta properties + @property + def days(self) -> float: ... + @property + def microseconds(self) -> float: ... + @property + def nanoseconds(self) -> float: ... + # inject Period properties + @property + def qyear(self) -> float: ... + def __eq__(self, other: object) -> bool: ... + def __ne__(self, other: object) -> bool: ... + __lt__: _NatComparison + __le__: _NatComparison + __gt__: _NatComparison + __ge__: _NatComparison + def __sub__(self, other: Self | timedelta | datetime) -> Self: ... + def __rsub__(self, other: Self | timedelta | datetime) -> Self: ... + def __add__(self, other: Self | timedelta | datetime) -> Self: ... + def __radd__(self, other: Self | timedelta | datetime) -> Self: ... + def __hash__(self) -> int: ... + def as_unit(self, unit: str, round_ok: bool = ...) -> NaTType: ... diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/_libs/tslibs/np_datetime.pyi b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/_libs/tslibs/np_datetime.pyi new file mode 100644 index 0000000000000000000000000000000000000000..00ef35c50e53251d5ca6f6c6d5ad28a67a695a21 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/_libs/tslibs/np_datetime.pyi @@ -0,0 +1,27 @@ +import numpy as np + +from pandas._typing import npt + +class OutOfBoundsDatetime(ValueError): ... +class OutOfBoundsTimedelta(ValueError): ... + +# only exposed for testing +def py_get_unit_from_dtype(dtype: np.dtype): ... +def py_td64_to_tdstruct(td64: int, unit: int) -> dict: ... +def astype_overflowsafe( + values: np.ndarray, + dtype: np.dtype, + copy: bool = ..., + round_ok: bool = ..., + is_coerce: bool = ..., +) -> np.ndarray: ... +def is_unitless(dtype: np.dtype) -> bool: ... +def compare_mismatched_resolutions( + left: np.ndarray, right: np.ndarray, op +) -> npt.NDArray[np.bool_]: ... +def add_overflowsafe( + left: npt.NDArray[np.int64], + right: npt.NDArray[np.int64], +) -> npt.NDArray[np.int64]: ... +def get_supported_dtype(dtype: np.dtype) -> np.dtype: ... +def is_supported_dtype(dtype: np.dtype) -> bool: ... diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/_libs/tslibs/offsets.pyi b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/_libs/tslibs/offsets.pyi new file mode 100644 index 0000000000000000000000000000000000000000..7eb8dc0813868de9ca96c086f5506619719937f4 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/_libs/tslibs/offsets.pyi @@ -0,0 +1,287 @@ +from datetime import ( + datetime, + time, + timedelta, +) +from typing import ( + Any, + Collection, + Literal, + TypeVar, + overload, +) + +import numpy as np + +from pandas._libs.tslibs.nattype import NaTType +from pandas._typing import ( + OffsetCalendar, + Self, + npt, +) + +from .timedeltas import Timedelta + +_BaseOffsetT = TypeVar("_BaseOffsetT", bound=BaseOffset) +_DatetimeT = TypeVar("_DatetimeT", bound=datetime) +_TimedeltaT = TypeVar("_TimedeltaT", bound=timedelta) + +_relativedelta_kwds: set[str] +prefix_mapping: dict[str, type] + +class ApplyTypeError(TypeError): ... + +class BaseOffset: + n: int + normalize: bool + def __init__(self, n: int = ..., normalize: bool = ...) -> None: ... + def __eq__(self, other) -> bool: ... + def __ne__(self, other) -> bool: ... + def __hash__(self) -> int: ... + @property + def kwds(self) -> dict: ... + @property + def base(self) -> BaseOffset: ... + @overload + def __add__(self, other: npt.NDArray[np.object_]) -> npt.NDArray[np.object_]: ... + @overload + def __add__(self, other: BaseOffset) -> Self: ... + @overload + def __add__(self, other: _DatetimeT) -> _DatetimeT: ... + @overload + def __add__(self, other: _TimedeltaT) -> _TimedeltaT: ... + @overload + def __radd__(self, other: npt.NDArray[np.object_]) -> npt.NDArray[np.object_]: ... + @overload + def __radd__(self, other: BaseOffset) -> Self: ... + @overload + def __radd__(self, other: _DatetimeT) -> _DatetimeT: ... + @overload + def __radd__(self, other: _TimedeltaT) -> _TimedeltaT: ... + @overload + def __radd__(self, other: NaTType) -> NaTType: ... + def __sub__(self, other: BaseOffset) -> Self: ... + @overload + def __rsub__(self, other: npt.NDArray[np.object_]) -> npt.NDArray[np.object_]: ... + @overload + def __rsub__(self, other: BaseOffset): ... + @overload + def __rsub__(self, other: _DatetimeT) -> _DatetimeT: ... + @overload + def __rsub__(self, other: _TimedeltaT) -> _TimedeltaT: ... + @overload + def __mul__(self, other: np.ndarray) -> np.ndarray: ... + @overload + def __mul__(self, other: int): ... + @overload + def __rmul__(self, other: np.ndarray) -> np.ndarray: ... + @overload + def __rmul__(self, other: int) -> Self: ... + def __neg__(self) -> Self: ... + def copy(self) -> Self: ... + @property + def name(self) -> str: ... + @property + def rule_code(self) -> str: ... + @property + def freqstr(self) -> str: ... + def _apply(self, other): ... + def _apply_array(self, dtarr: np.ndarray) -> np.ndarray: ... + def rollback(self, dt: datetime) -> datetime: ... + def rollforward(self, dt: datetime) -> datetime: ... + def is_on_offset(self, dt: datetime) -> bool: ... + def __setstate__(self, state) -> None: ... + def __getstate__(self): ... + @property + def nanos(self) -> int: ... + def is_anchored(self) -> bool: ... + +def _get_offset(name: str) -> BaseOffset: ... + +class SingleConstructorOffset(BaseOffset): + @classmethod + def _from_name(cls, suffix: None = ...): ... + def __reduce__(self): ... + +@overload +def to_offset(freq: None, is_period: bool = ...) -> None: ... +@overload +def to_offset(freq: _BaseOffsetT, is_period: bool = ...) -> _BaseOffsetT: ... +@overload +def to_offset(freq: timedelta | str, is_period: bool = ...) -> BaseOffset: ... + +class Tick(SingleConstructorOffset): + _creso: int + _prefix: str + def __init__(self, n: int = ..., normalize: bool = ...) -> None: ... + @property + def delta(self) -> Timedelta: ... + @property + def nanos(self) -> int: ... + +def delta_to_tick(delta: timedelta) -> Tick: ... + +class Day(Tick): ... +class Hour(Tick): ... +class Minute(Tick): ... +class Second(Tick): ... +class Milli(Tick): ... +class Micro(Tick): ... +class Nano(Tick): ... + +class RelativeDeltaOffset(BaseOffset): + def __init__(self, n: int = ..., normalize: bool = ..., **kwds: Any) -> None: ... + +class BusinessMixin(SingleConstructorOffset): + def __init__( + self, n: int = ..., normalize: bool = ..., offset: timedelta = ... + ) -> None: ... + +class BusinessDay(BusinessMixin): ... + +class BusinessHour(BusinessMixin): + def __init__( + self, + n: int = ..., + normalize: bool = ..., + start: str | time | Collection[str | time] = ..., + end: str | time | Collection[str | time] = ..., + offset: timedelta = ..., + ) -> None: ... + +class WeekOfMonthMixin(SingleConstructorOffset): + def __init__( + self, n: int = ..., normalize: bool = ..., weekday: int = ... + ) -> None: ... + +class YearOffset(SingleConstructorOffset): + def __init__( + self, n: int = ..., normalize: bool = ..., month: int | None = ... + ) -> None: ... + +class BYearEnd(YearOffset): ... +class BYearBegin(YearOffset): ... +class YearEnd(YearOffset): ... +class YearBegin(YearOffset): ... + +class QuarterOffset(SingleConstructorOffset): + def __init__( + self, n: int = ..., normalize: bool = ..., startingMonth: int | None = ... + ) -> None: ... + +class BQuarterEnd(QuarterOffset): ... +class BQuarterBegin(QuarterOffset): ... +class QuarterEnd(QuarterOffset): ... +class QuarterBegin(QuarterOffset): ... +class MonthOffset(SingleConstructorOffset): ... +class MonthEnd(MonthOffset): ... +class MonthBegin(MonthOffset): ... +class BusinessMonthEnd(MonthOffset): ... +class BusinessMonthBegin(MonthOffset): ... + +class SemiMonthOffset(SingleConstructorOffset): + def __init__( + self, n: int = ..., normalize: bool = ..., day_of_month: int | None = ... + ) -> None: ... + +class SemiMonthEnd(SemiMonthOffset): ... +class SemiMonthBegin(SemiMonthOffset): ... + +class Week(SingleConstructorOffset): + def __init__( + self, n: int = ..., normalize: bool = ..., weekday: int | None = ... + ) -> None: ... + +class WeekOfMonth(WeekOfMonthMixin): + def __init__( + self, n: int = ..., normalize: bool = ..., week: int = ..., weekday: int = ... + ) -> None: ... + +class LastWeekOfMonth(WeekOfMonthMixin): ... + +class FY5253Mixin(SingleConstructorOffset): + def __init__( + self, + n: int = ..., + normalize: bool = ..., + weekday: int = ..., + startingMonth: int = ..., + variation: Literal["nearest", "last"] = ..., + ) -> None: ... + +class FY5253(FY5253Mixin): ... + +class FY5253Quarter(FY5253Mixin): + def __init__( + self, + n: int = ..., + normalize: bool = ..., + weekday: int = ..., + startingMonth: int = ..., + qtr_with_extra_week: int = ..., + variation: Literal["nearest", "last"] = ..., + ) -> None: ... + +class Easter(SingleConstructorOffset): ... + +class _CustomBusinessMonth(BusinessMixin): + def __init__( + self, + n: int = ..., + normalize: bool = ..., + weekmask: str = ..., + holidays: list | None = ..., + calendar: OffsetCalendar | None = ..., + offset: timedelta = ..., + ) -> None: ... + +class CustomBusinessDay(BusinessDay): + def __init__( + self, + n: int = ..., + normalize: bool = ..., + weekmask: str = ..., + holidays: list | None = ..., + calendar: OffsetCalendar | None = ..., + offset: timedelta = ..., + ) -> None: ... + +class CustomBusinessHour(BusinessHour): + def __init__( + self, + n: int = ..., + normalize: bool = ..., + weekmask: str = ..., + holidays: list | None = ..., + calendar: OffsetCalendar | None = ..., + start: str | time | Collection[str | time] = ..., + end: str | time | Collection[str | time] = ..., + offset: timedelta = ..., + ) -> None: ... + +class CustomBusinessMonthEnd(_CustomBusinessMonth): ... +class CustomBusinessMonthBegin(_CustomBusinessMonth): ... +class OffsetMeta(type): ... +class DateOffset(RelativeDeltaOffset, metaclass=OffsetMeta): ... + +BDay = BusinessDay +BMonthEnd = BusinessMonthEnd +BMonthBegin = BusinessMonthBegin +CBMonthEnd = CustomBusinessMonthEnd +CBMonthBegin = CustomBusinessMonthBegin +CDay = CustomBusinessDay + +def roll_qtrday( + other: datetime, n: int, month: int, day_opt: str, modby: int +) -> int: ... + +INVALID_FREQ_ERR_MSG: Literal["Invalid frequency: {0}"] + +def shift_months( + dtindex: npt.NDArray[np.int64], + months: int, + day_opt: str | None = ..., + reso: int = ..., +) -> npt.NDArray[np.int64]: ... + +_offset_map: dict[str, BaseOffset] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/_libs/tslibs/parsing.pyi b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/_libs/tslibs/parsing.pyi new file mode 100644 index 0000000000000000000000000000000000000000..40394f915d4b0f12d3631d997c46381757152bfe --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/_libs/tslibs/parsing.pyi @@ -0,0 +1,33 @@ +from datetime import datetime + +import numpy as np + +from pandas._typing import npt + +class DateParseError(ValueError): ... + +def py_parse_datetime_string( + date_string: str, + dayfirst: bool = ..., + yearfirst: bool = ..., +) -> datetime: ... +def parse_datetime_string_with_reso( + date_string: str, + freq: str | None = ..., + dayfirst: bool | None = ..., + yearfirst: bool | None = ..., +) -> tuple[datetime, str]: ... +def _does_string_look_like_datetime(py_string: str) -> bool: ... +def quarter_to_myear(year: int, quarter: int, freq: str) -> tuple[int, int]: ... +def try_parse_dates( + values: npt.NDArray[np.object_], # object[:] + parser, +) -> npt.NDArray[np.object_]: ... +def guess_datetime_format( + dt_str: str, + dayfirst: bool | None = ..., +) -> str | None: ... +def concat_date_cols( + date_cols: tuple, +) -> npt.NDArray[np.object_]: ... +def get_rule_month(source: str) -> str: ... diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/_libs/tslibs/period.pyi b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/_libs/tslibs/period.pyi new file mode 100644 index 0000000000000000000000000000000000000000..22f3bdbe668decaac0c53cf080ae5c3f098d7a48 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/_libs/tslibs/period.pyi @@ -0,0 +1,135 @@ +from datetime import timedelta +from typing import Literal + +import numpy as np + +from pandas._libs.tslibs.dtypes import PeriodDtypeBase +from pandas._libs.tslibs.nattype import NaTType +from pandas._libs.tslibs.offsets import BaseOffset +from pandas._libs.tslibs.timestamps import Timestamp +from pandas._typing import ( + Frequency, + npt, +) + +INVALID_FREQ_ERR_MSG: str +DIFFERENT_FREQ: str + +class IncompatibleFrequency(ValueError): ... + +def periodarr_to_dt64arr( + periodarr: npt.NDArray[np.int64], # const int64_t[:] + freq: int, +) -> npt.NDArray[np.int64]: ... +def period_asfreq_arr( + arr: npt.NDArray[np.int64], + freq1: int, + freq2: int, + end: bool, +) -> npt.NDArray[np.int64]: ... +def get_period_field_arr( + field: str, + arr: npt.NDArray[np.int64], # const int64_t[:] + freq: int, +) -> npt.NDArray[np.int64]: ... +def from_ordinals( + values: npt.NDArray[np.int64], # const int64_t[:] + freq: timedelta | BaseOffset | str, +) -> npt.NDArray[np.int64]: ... +def extract_ordinals( + values: npt.NDArray[np.object_], + freq: Frequency | int, +) -> npt.NDArray[np.int64]: ... +def extract_freq( + values: npt.NDArray[np.object_], +) -> BaseOffset: ... +def period_array_strftime( + values: npt.NDArray[np.int64], + dtype_code: int, + na_rep, + date_format: str | None, +) -> npt.NDArray[np.object_]: ... + +# exposed for tests +def period_asfreq(ordinal: int, freq1: int, freq2: int, end: bool) -> int: ... +def period_ordinal( + y: int, m: int, d: int, h: int, min: int, s: int, us: int, ps: int, freq: int +) -> int: ... +def freq_to_dtype_code(freq: BaseOffset) -> int: ... +def validate_end_alias(how: str) -> Literal["E", "S"]: ... + +class PeriodMixin: + @property + def end_time(self) -> Timestamp: ... + @property + def start_time(self) -> Timestamp: ... + def _require_matching_freq(self, other: BaseOffset, base: bool = ...) -> None: ... + +class Period(PeriodMixin): + ordinal: int # int64_t + freq: BaseOffset + _dtype: PeriodDtypeBase + + # error: "__new__" must return a class instance (got "Union[Period, NaTType]") + def __new__( # type: ignore[misc] + cls, + value=..., + freq: int | str | BaseOffset | None = ..., + ordinal: int | None = ..., + year: int | None = ..., + month: int | None = ..., + quarter: int | None = ..., + day: int | None = ..., + hour: int | None = ..., + minute: int | None = ..., + second: int | None = ..., + ) -> Period | NaTType: ... + @classmethod + def _maybe_convert_freq(cls, freq) -> BaseOffset: ... + @classmethod + def _from_ordinal(cls, ordinal: int, freq: BaseOffset) -> Period: ... + @classmethod + def now(cls, freq: Frequency) -> Period: ... + def strftime(self, fmt: str | None) -> str: ... + def to_timestamp( + self, + freq: str | BaseOffset | None = ..., + how: str = ..., + ) -> Timestamp: ... + def asfreq(self, freq: str | BaseOffset, how: str = ...) -> Period: ... + @property + def freqstr(self) -> str: ... + @property + def is_leap_year(self) -> bool: ... + @property + def daysinmonth(self) -> int: ... + @property + def days_in_month(self) -> int: ... + @property + def qyear(self) -> int: ... + @property + def quarter(self) -> int: ... + @property + def day_of_year(self) -> int: ... + @property + def weekday(self) -> int: ... + @property + def day_of_week(self) -> int: ... + @property + def week(self) -> int: ... + @property + def weekofyear(self) -> int: ... + @property + def second(self) -> int: ... + @property + def minute(self) -> int: ... + @property + def hour(self) -> int: ... + @property + def day(self) -> int: ... + @property + def month(self) -> int: ... + @property + def year(self) -> int: ... + def __sub__(self, other) -> Period | BaseOffset: ... + def __add__(self, other) -> Period: ... diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/_libs/tslibs/strptime.pyi b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/_libs/tslibs/strptime.pyi new file mode 100644 index 0000000000000000000000000000000000000000..0ec1a1e25a2b3cfe974baebfe32d686435f73e11 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/_libs/tslibs/strptime.pyi @@ -0,0 +1,14 @@ +import numpy as np + +from pandas._typing import npt + +def array_strptime( + values: npt.NDArray[np.object_], + fmt: str | None, + exact: bool = ..., + errors: str = ..., + utc: bool = ..., + creso: int = ..., # NPY_DATETIMEUNIT +) -> tuple[np.ndarray, np.ndarray]: ... + +# first ndarray is M8[ns], second is object ndarray of tzinfo | None diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/_libs/tslibs/timedeltas.pyi b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/_libs/tslibs/timedeltas.pyi new file mode 100644 index 0000000000000000000000000000000000000000..24ec6c8891a89a7af4042df066fec6bc9d7b0e04 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/_libs/tslibs/timedeltas.pyi @@ -0,0 +1,174 @@ +from datetime import timedelta +from typing import ( + ClassVar, + Literal, + TypeAlias, + TypeVar, + overload, +) + +import numpy as np + +from pandas._libs.tslibs import ( + NaTType, + Tick, +) +from pandas._typing import ( + Frequency, + Self, + npt, +) + +# This should be kept consistent with the keys in the dict timedelta_abbrevs +# in pandas/_libs/tslibs/timedeltas.pyx +UnitChoices: TypeAlias = Literal[ + "Y", + "y", + "M", + "W", + "w", + "D", + "d", + "days", + "day", + "hours", + "hour", + "hr", + "h", + "m", + "minute", + "min", + "minutes", + "T", + "t", + "s", + "seconds", + "sec", + "second", + "ms", + "milliseconds", + "millisecond", + "milli", + "millis", + "L", + "l", + "us", + "microseconds", + "microsecond", + "µs", + "micro", + "micros", + "u", + "ns", + "nanoseconds", + "nano", + "nanos", + "nanosecond", + "n", +] +_S = TypeVar("_S", bound=timedelta) + +def get_unit_for_round(freq, creso: int) -> int: ... +def disallow_ambiguous_unit(unit: str | None) -> None: ... +def ints_to_pytimedelta( + m8values: npt.NDArray[np.timedelta64], + box: bool = ..., +) -> npt.NDArray[np.object_]: ... +def array_to_timedelta64( + values: npt.NDArray[np.object_], + unit: str | None = ..., + errors: str = ..., +) -> np.ndarray: ... # np.ndarray[m8ns] +def parse_timedelta_unit(unit: str | None) -> UnitChoices: ... +def delta_to_nanoseconds( + delta: np.timedelta64 | timedelta | Tick, + reso: int = ..., # NPY_DATETIMEUNIT + round_ok: bool = ..., +) -> int: ... +def floordiv_object_array( + left: np.ndarray, right: npt.NDArray[np.object_] +) -> np.ndarray: ... +def truediv_object_array( + left: np.ndarray, right: npt.NDArray[np.object_] +) -> np.ndarray: ... + +class Timedelta(timedelta): + _creso: int + min: ClassVar[Timedelta] + max: ClassVar[Timedelta] + resolution: ClassVar[Timedelta] + value: int # np.int64 + _value: int # np.int64 + # error: "__new__" must return a class instance (got "Union[Timestamp, NaTType]") + def __new__( # type: ignore[misc] + cls: type[_S], + value=..., + unit: str | None = ..., + **kwargs: float | np.integer | np.floating, + ) -> _S | NaTType: ... + @classmethod + def _from_value_and_reso(cls, value: np.int64, reso: int) -> Timedelta: ... + @property + def days(self) -> int: ... + @property + def seconds(self) -> int: ... + @property + def microseconds(self) -> int: ... + def total_seconds(self) -> float: ... + def to_pytimedelta(self) -> timedelta: ... + def to_timedelta64(self) -> np.timedelta64: ... + @property + def asm8(self) -> np.timedelta64: ... + # TODO: round/floor/ceil could return NaT? + def round(self, freq: Frequency) -> Self: ... + def floor(self, freq: Frequency) -> Self: ... + def ceil(self, freq: Frequency) -> Self: ... + @property + def resolution_string(self) -> str: ... + def __add__(self, other: timedelta) -> Timedelta: ... + def __radd__(self, other: timedelta) -> Timedelta: ... + def __sub__(self, other: timedelta) -> Timedelta: ... + def __rsub__(self, other: timedelta) -> Timedelta: ... + def __neg__(self) -> Timedelta: ... + def __pos__(self) -> Timedelta: ... + def __abs__(self) -> Timedelta: ... + def __mul__(self, other: float) -> Timedelta: ... + def __rmul__(self, other: float) -> Timedelta: ... + # error: Signature of "__floordiv__" incompatible with supertype "timedelta" + @overload # type: ignore[override] + def __floordiv__(self, other: timedelta) -> int: ... + @overload + def __floordiv__(self, other: float) -> Timedelta: ... + @overload + def __floordiv__( + self, other: npt.NDArray[np.timedelta64] + ) -> npt.NDArray[np.intp]: ... + @overload + def __floordiv__( + self, other: npt.NDArray[np.number] + ) -> npt.NDArray[np.timedelta64] | Timedelta: ... + @overload + def __rfloordiv__(self, other: timedelta | str) -> int: ... + @overload + def __rfloordiv__(self, other: None | NaTType) -> NaTType: ... + @overload + def __rfloordiv__(self, other: np.ndarray) -> npt.NDArray[np.timedelta64]: ... + @overload + def __truediv__(self, other: timedelta) -> float: ... + @overload + def __truediv__(self, other: float) -> Timedelta: ... + def __mod__(self, other: timedelta) -> Timedelta: ... + def __divmod__(self, other: timedelta) -> tuple[int, Timedelta]: ... + def __le__(self, other: timedelta) -> bool: ... + def __lt__(self, other: timedelta) -> bool: ... + def __ge__(self, other: timedelta) -> bool: ... + def __gt__(self, other: timedelta) -> bool: ... + def __hash__(self) -> int: ... + def isoformat(self) -> str: ... + def to_numpy( + self, dtype: npt.DTypeLike = ..., copy: bool = False + ) -> np.timedelta64: ... + def view(self, dtype: npt.DTypeLike) -> object: ... + @property + def unit(self) -> str: ... + def as_unit(self, unit: str, round_ok: bool = ...) -> Timedelta: ... diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/_libs/tslibs/timestamps.pyi b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/_libs/tslibs/timestamps.pyi new file mode 100644 index 0000000000000000000000000000000000000000..c769b09d1b7a1adc4532dd367bcbc4cf522419cd --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/_libs/tslibs/timestamps.pyi @@ -0,0 +1,241 @@ +from datetime import ( + date as _date, + datetime, + time as _time, + timedelta, + tzinfo as _tzinfo, +) +from time import struct_time +from typing import ( + ClassVar, + Literal, + TypeAlias, + overload, +) + +import numpy as np + +from pandas._libs.tslibs import ( + BaseOffset, + NaTType, + Period, + Tick, + Timedelta, +) +from pandas._typing import ( + Self, + TimestampNonexistent, +) + +_TimeZones: TypeAlias = str | _tzinfo | None | int + +def integer_op_not_supported(obj: object) -> TypeError: ... + +class Timestamp(datetime): + _creso: int + min: ClassVar[Timestamp] + max: ClassVar[Timestamp] + + resolution: ClassVar[Timedelta] + _value: int # np.int64 + # error: "__new__" must return a class instance (got "Union[Timestamp, NaTType]") + def __new__( # type: ignore[misc] + cls: type[Self], + ts_input: np.integer | float | str | _date | datetime | np.datetime64 = ..., + year: int | None = ..., + month: int | None = ..., + day: int | None = ..., + hour: int | None = ..., + minute: int | None = ..., + second: int | None = ..., + microsecond: int | None = ..., + tzinfo: _tzinfo | None = ..., + *, + nanosecond: int | None = ..., + tz: _TimeZones = ..., + unit: str | int | None = ..., + fold: int | None = ..., + ) -> Self | NaTType: ... + @classmethod + def _from_value_and_reso( + cls, value: int, reso: int, tz: _TimeZones + ) -> Timestamp: ... + @property + def value(self) -> int: ... # np.int64 + @property + def year(self) -> int: ... + @property + def month(self) -> int: ... + @property + def day(self) -> int: ... + @property + def hour(self) -> int: ... + @property + def minute(self) -> int: ... + @property + def second(self) -> int: ... + @property + def microsecond(self) -> int: ... + @property + def nanosecond(self) -> int: ... + @property + def tzinfo(self) -> _tzinfo | None: ... + @property + def tz(self) -> _tzinfo | None: ... + @property + def fold(self) -> int: ... + @classmethod + def fromtimestamp(cls, ts: float, tz: _TimeZones = ...) -> Self: ... + @classmethod + def utcfromtimestamp(cls, ts: float) -> Self: ... + @classmethod + def today(cls, tz: _TimeZones = ...) -> Self: ... + @classmethod + def fromordinal( + cls, + ordinal: int, + tz: _TimeZones = ..., + ) -> Self: ... + @classmethod + def now(cls, tz: _TimeZones = ...) -> Self: ... + @classmethod + def utcnow(cls) -> Self: ... + # error: Signature of "combine" incompatible with supertype "datetime" + @classmethod + def combine( # type: ignore[override] + cls, date: _date, time: _time + ) -> datetime: ... + @classmethod + def fromisoformat(cls, date_string: str) -> Self: ... + def strftime(self, format: str) -> str: ... + def __format__(self, fmt: str) -> str: ... + def toordinal(self) -> int: ... + def timetuple(self) -> struct_time: ... + def timestamp(self) -> float: ... + def utctimetuple(self) -> struct_time: ... + def date(self) -> _date: ... + def time(self) -> _time: ... + def timetz(self) -> _time: ... + # LSP violation: nanosecond is not present in datetime.datetime.replace + # and has positional args following it + def replace( # type: ignore[override] + self, + year: int | None = ..., + month: int | None = ..., + day: int | None = ..., + hour: int | None = ..., + minute: int | None = ..., + second: int | None = ..., + microsecond: int | None = ..., + nanosecond: int | None = ..., + tzinfo: _tzinfo | type[object] | None = ..., + fold: int | None = ..., + ) -> Self: ... + # LSP violation: datetime.datetime.astimezone has a default value for tz + def astimezone(self, tz: _TimeZones) -> Self: ... # type: ignore[override] + def ctime(self) -> str: ... + def isoformat(self, sep: str = ..., timespec: str = ...) -> str: ... + @classmethod + def strptime( + # Note: strptime is actually disabled and raises NotImplementedError + cls, + date_string: str, + format: str, + ) -> Self: ... + def utcoffset(self) -> timedelta | None: ... + def tzname(self) -> str | None: ... + def dst(self) -> timedelta | None: ... + def __le__(self, other: datetime) -> bool: ... # type: ignore[override] + def __lt__(self, other: datetime) -> bool: ... # type: ignore[override] + def __ge__(self, other: datetime) -> bool: ... # type: ignore[override] + def __gt__(self, other: datetime) -> bool: ... # type: ignore[override] + # error: Signature of "__add__" incompatible with supertype "date"/"datetime" + @overload # type: ignore[override] + def __add__(self, other: np.ndarray) -> np.ndarray: ... + @overload + def __add__(self, other: timedelta | np.timedelta64 | Tick) -> Self: ... + def __radd__(self, other: timedelta) -> Self: ... + @overload # type: ignore[override] + def __sub__(self, other: datetime) -> Timedelta: ... + @overload + def __sub__(self, other: timedelta | np.timedelta64 | Tick) -> Self: ... + def __hash__(self) -> int: ... + def weekday(self) -> int: ... + def isoweekday(self) -> int: ... + # Return type "Tuple[int, int, int]" of "isocalendar" incompatible with return + # type "_IsoCalendarDate" in supertype "date" + def isocalendar(self) -> tuple[int, int, int]: ... # type: ignore[override] + @property + def is_leap_year(self) -> bool: ... + @property + def is_month_start(self) -> bool: ... + @property + def is_quarter_start(self) -> bool: ... + @property + def is_year_start(self) -> bool: ... + @property + def is_month_end(self) -> bool: ... + @property + def is_quarter_end(self) -> bool: ... + @property + def is_year_end(self) -> bool: ... + def to_pydatetime(self, warn: bool = ...) -> datetime: ... + def to_datetime64(self) -> np.datetime64: ... + def to_period(self, freq: BaseOffset | str | None = None) -> Period: ... + def to_julian_date(self) -> np.float64: ... + @property + def asm8(self) -> np.datetime64: ... + def tz_convert(self, tz: _TimeZones) -> Self: ... + # TODO: could return NaT? + def tz_localize( + self, + tz: _TimeZones, + ambiguous: bool | Literal["raise", "NaT"] = ..., + nonexistent: TimestampNonexistent = ..., + ) -> Self: ... + def normalize(self) -> Self: ... + # TODO: round/floor/ceil could return NaT? + def round( + self, + freq: str, + ambiguous: bool | Literal["raise", "NaT"] = ..., + nonexistent: TimestampNonexistent = ..., + ) -> Self: ... + def floor( + self, + freq: str, + ambiguous: bool | Literal["raise", "NaT"] = ..., + nonexistent: TimestampNonexistent = ..., + ) -> Self: ... + def ceil( + self, + freq: str, + ambiguous: bool | Literal["raise", "NaT"] = ..., + nonexistent: TimestampNonexistent = ..., + ) -> Self: ... + def day_name(self, locale: str | None = ...) -> str: ... + def month_name(self, locale: str | None = ...) -> str: ... + @property + def day_of_week(self) -> int: ... + @property + def dayofweek(self) -> int: ... + @property + def day_of_year(self) -> int: ... + @property + def dayofyear(self) -> int: ... + @property + def quarter(self) -> int: ... + @property + def week(self) -> int: ... + def to_numpy( + self, dtype: np.dtype | None = ..., copy: bool = ... + ) -> np.datetime64: ... + @property + def _date_repr(self) -> str: ... + @property + def days_in_month(self) -> int: ... + @property + def daysinmonth(self) -> int: ... + @property + def unit(self) -> str: ... + def as_unit(self, unit: str, round_ok: bool = ...) -> Timestamp: ... diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/_libs/tslibs/timezones.pyi b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/_libs/tslibs/timezones.pyi new file mode 100644 index 0000000000000000000000000000000000000000..4e9f0c6ae6c33447ebc86d3daf5bf5cedbe5b0cb --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/_libs/tslibs/timezones.pyi @@ -0,0 +1,21 @@ +from datetime import ( + datetime, + tzinfo, +) +from typing import Callable + +import numpy as np + +# imported from dateutil.tz +dateutil_gettz: Callable[[str], tzinfo] + +def tz_standardize(tz: tzinfo) -> tzinfo: ... +def tz_compare(start: tzinfo | None, end: tzinfo | None) -> bool: ... +def infer_tzinfo( + start: datetime | None, + end: datetime | None, +) -> tzinfo | None: ... +def maybe_get_tz(tz: str | int | np.int64 | tzinfo | None) -> tzinfo | None: ... +def get_timezone(tz: tzinfo) -> tzinfo | str: ... +def is_utc(tz: tzinfo | None) -> bool: ... +def is_fixed_offset(tz: tzinfo) -> bool: ... diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/_libs/tslibs/tzconversion.pyi b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/_libs/tslibs/tzconversion.pyi new file mode 100644 index 0000000000000000000000000000000000000000..2108fa0f35547191c3db683f8cc1015e5bda4abb --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/_libs/tslibs/tzconversion.pyi @@ -0,0 +1,21 @@ +from datetime import ( + timedelta, + tzinfo, +) +from typing import Iterable + +import numpy as np + +from pandas._typing import npt + +# tz_convert_from_utc_single exposed for testing +def tz_convert_from_utc_single( + utc_val: np.int64, tz: tzinfo, creso: int = ... +) -> np.int64: ... +def tz_localize_to_utc( + vals: npt.NDArray[np.int64], + tz: tzinfo | None, + ambiguous: str | bool | Iterable[bool] | None = ..., + nonexistent: str | timedelta | np.timedelta64 | None = ..., + creso: int = ..., # NPY_DATETIMEUNIT +) -> npt.NDArray[np.int64]: ... diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/_libs/tslibs/vectorized.pyi b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/_libs/tslibs/vectorized.pyi new file mode 100644 index 0000000000000000000000000000000000000000..de19f592da62bbeb0b3ab267039abd902d4bb854 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/_libs/tslibs/vectorized.pyi @@ -0,0 +1,43 @@ +""" +For cython types that cannot be represented precisely, closest-available +python equivalents are used, and the precise types kept as adjacent comments. +""" +from datetime import tzinfo + +import numpy as np + +from pandas._libs.tslibs.dtypes import Resolution +from pandas._typing import npt + +def dt64arr_to_periodarr( + stamps: npt.NDArray[np.int64], + freq: int, + tz: tzinfo | None, + reso: int = ..., # NPY_DATETIMEUNIT +) -> npt.NDArray[np.int64]: ... +def is_date_array_normalized( + stamps: npt.NDArray[np.int64], + tz: tzinfo | None, + reso: int, # NPY_DATETIMEUNIT +) -> bool: ... +def normalize_i8_timestamps( + stamps: npt.NDArray[np.int64], + tz: tzinfo | None, + reso: int, # NPY_DATETIMEUNIT +) -> npt.NDArray[np.int64]: ... +def get_resolution( + stamps: npt.NDArray[np.int64], + tz: tzinfo | None = ..., + reso: int = ..., # NPY_DATETIMEUNIT +) -> Resolution: ... +def ints_to_pydatetime( + stamps: npt.NDArray[np.int64], + tz: tzinfo | None = ..., + box: str = ..., + reso: int = ..., # NPY_DATETIMEUNIT +) -> npt.NDArray[np.object_]: ... +def tz_convert_from_utc( + stamps: npt.NDArray[np.int64], + tz: tzinfo | None, + reso: int = ..., # NPY_DATETIMEUNIT +) -> npt.NDArray[np.int64]: ... diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/_libs/window/__init__.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/_libs/window/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/_libs/window/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/_libs/window/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..4b7e2dd8ffc097bde930538e93650571f63ccb21 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/_libs/window/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/_libs/window/aggregations.pyi b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/_libs/window/aggregations.pyi new file mode 100644 index 0000000000000000000000000000000000000000..a6cfbec9b15b9aa13a15dab3ff35b0d8761f1036 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/_libs/window/aggregations.pyi @@ -0,0 +1,127 @@ +from typing import ( + Any, + Callable, + Literal, +) + +import numpy as np + +from pandas._typing import ( + WindowingRankType, + npt, +) + +def roll_sum( + values: np.ndarray, # const float64_t[:] + start: np.ndarray, # np.ndarray[np.int64] + end: np.ndarray, # np.ndarray[np.int64] + minp: int, # int64_t +) -> np.ndarray: ... # np.ndarray[float] +def roll_mean( + values: np.ndarray, # const float64_t[:] + start: np.ndarray, # np.ndarray[np.int64] + end: np.ndarray, # np.ndarray[np.int64] + minp: int, # int64_t +) -> np.ndarray: ... # np.ndarray[float] +def roll_var( + values: np.ndarray, # const float64_t[:] + start: np.ndarray, # np.ndarray[np.int64] + end: np.ndarray, # np.ndarray[np.int64] + minp: int, # int64_t + ddof: int = ..., +) -> np.ndarray: ... # np.ndarray[float] +def roll_skew( + values: np.ndarray, # np.ndarray[np.float64] + start: np.ndarray, # np.ndarray[np.int64] + end: np.ndarray, # np.ndarray[np.int64] + minp: int, # int64_t +) -> np.ndarray: ... # np.ndarray[float] +def roll_kurt( + values: np.ndarray, # np.ndarray[np.float64] + start: np.ndarray, # np.ndarray[np.int64] + end: np.ndarray, # np.ndarray[np.int64] + minp: int, # int64_t +) -> np.ndarray: ... # np.ndarray[float] +def roll_median_c( + values: np.ndarray, # np.ndarray[np.float64] + start: np.ndarray, # np.ndarray[np.int64] + end: np.ndarray, # np.ndarray[np.int64] + minp: int, # int64_t +) -> np.ndarray: ... # np.ndarray[float] +def roll_max( + values: np.ndarray, # np.ndarray[np.float64] + start: np.ndarray, # np.ndarray[np.int64] + end: np.ndarray, # np.ndarray[np.int64] + minp: int, # int64_t +) -> np.ndarray: ... # np.ndarray[float] +def roll_min( + values: np.ndarray, # np.ndarray[np.float64] + start: np.ndarray, # np.ndarray[np.int64] + end: np.ndarray, # np.ndarray[np.int64] + minp: int, # int64_t +) -> np.ndarray: ... # np.ndarray[float] +def roll_quantile( + values: np.ndarray, # const float64_t[:] + start: np.ndarray, # np.ndarray[np.int64] + end: np.ndarray, # np.ndarray[np.int64] + minp: int, # int64_t + quantile: float, # float64_t + interpolation: Literal["linear", "lower", "higher", "nearest", "midpoint"], +) -> np.ndarray: ... # np.ndarray[float] +def roll_rank( + values: np.ndarray, + start: np.ndarray, + end: np.ndarray, + minp: int, + percentile: bool, + method: WindowingRankType, + ascending: bool, +) -> np.ndarray: ... # np.ndarray[float] +def roll_apply( + obj: object, + start: np.ndarray, # np.ndarray[np.int64] + end: np.ndarray, # np.ndarray[np.int64] + minp: int, # int64_t + function: Callable[..., Any], + raw: bool, + args: tuple[Any, ...], + kwargs: dict[str, Any], +) -> npt.NDArray[np.float64]: ... +def roll_weighted_sum( + values: np.ndarray, # const float64_t[:] + weights: np.ndarray, # const float64_t[:] + minp: int, +) -> np.ndarray: ... # np.ndarray[np.float64] +def roll_weighted_mean( + values: np.ndarray, # const float64_t[:] + weights: np.ndarray, # const float64_t[:] + minp: int, +) -> np.ndarray: ... # np.ndarray[np.float64] +def roll_weighted_var( + values: np.ndarray, # const float64_t[:] + weights: np.ndarray, # const float64_t[:] + minp: int, # int64_t + ddof: int, # unsigned int +) -> np.ndarray: ... # np.ndarray[np.float64] +def ewm( + vals: np.ndarray, # const float64_t[:] + start: np.ndarray, # const int64_t[:] + end: np.ndarray, # const int64_t[:] + minp: int, + com: float, # float64_t + adjust: bool, + ignore_na: bool, + deltas: np.ndarray | None = None, # const float64_t[:] + normalize: bool = True, +) -> np.ndarray: ... # np.ndarray[np.float64] +def ewmcov( + input_x: np.ndarray, # const float64_t[:] + start: np.ndarray, # const int64_t[:] + end: np.ndarray, # const int64_t[:] + minp: int, + input_y: np.ndarray, # const float64_t[:] + com: float, # float64_t + adjust: bool, + ignore_na: bool, + bias: bool, +) -> np.ndarray: ... # np.ndarray[np.float64] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/_libs/window/indexers.pyi b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/_libs/window/indexers.pyi new file mode 100644 index 0000000000000000000000000000000000000000..c9bc64be34ac9a41d14fef33b0fc76bdf66527e9 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/_libs/window/indexers.pyi @@ -0,0 +1,12 @@ +import numpy as np + +from pandas._typing import npt + +def calculate_variable_window_bounds( + num_values: int, # int64_t + window_size: int, # int64_t + min_periods, + center: bool, + closed: str | None, + index: np.ndarray, # const int64_t[:] +) -> tuple[npt.NDArray[np.int64], npt.NDArray[np.int64]]: ... diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/api/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/api/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..79ed4d59953353eb7923da7fcb5a150d0434f64e Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/api/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/api/extensions/__init__.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/api/extensions/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..ea5f1ba926899f9d11e34e70181ed77cae7ead1d --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/api/extensions/__init__.py @@ -0,0 +1,33 @@ +""" +Public API for extending pandas objects. +""" + +from pandas._libs.lib import no_default + +from pandas.core.dtypes.base import ( + ExtensionDtype, + register_extension_dtype, +) + +from pandas.core.accessor import ( + register_dataframe_accessor, + register_index_accessor, + register_series_accessor, +) +from pandas.core.algorithms import take +from pandas.core.arrays import ( + ExtensionArray, + ExtensionScalarOpsMixin, +) + +__all__ = [ + "no_default", + "ExtensionDtype", + "register_extension_dtype", + "register_dataframe_accessor", + "register_index_accessor", + "register_series_accessor", + "take", + "ExtensionArray", + "ExtensionScalarOpsMixin", +] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/api/extensions/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/api/extensions/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..8e9a3e9d47104728adb27d94f61faa415619bb80 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/api/extensions/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/api/indexers/__init__.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/api/indexers/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..78357f11dc3b79f13490b91c69ef5457fbfa9768 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/api/indexers/__init__.py @@ -0,0 +1,17 @@ +""" +Public API for Rolling Window Indexers. +""" + +from pandas.core.indexers import check_array_indexer +from pandas.core.indexers.objects import ( + BaseIndexer, + FixedForwardWindowIndexer, + VariableOffsetWindowIndexer, +) + +__all__ = [ + "check_array_indexer", + "BaseIndexer", + "FixedForwardWindowIndexer", + "VariableOffsetWindowIndexer", +] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/api/indexers/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/api/indexers/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c34cc8e6a46d838d44637b5936994b90d332ed35 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/api/indexers/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/api/interchange/__init__.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/api/interchange/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..2f3a73bc46b3109c3c13e1a3468a69aed2ffb2e8 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/api/interchange/__init__.py @@ -0,0 +1,8 @@ +""" +Public API for DataFrame interchange protocol. +""" + +from pandas.core.interchange.dataframe_protocol import DataFrame +from pandas.core.interchange.from_dataframe import from_dataframe + +__all__ = ["from_dataframe", "DataFrame"] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/api/interchange/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/api/interchange/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..28ffe52f40d122ceaabafeb0039a9172328db536 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/api/interchange/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/api/types/__init__.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/api/types/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..c601086bb9f86a633d6a3f2245779fea9428bf95 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/api/types/__init__.py @@ -0,0 +1,23 @@ +""" +Public toolkit API. +""" + +from pandas._libs.lib import infer_dtype + +from pandas.core.dtypes.api import * # noqa: F403 +from pandas.core.dtypes.concat import union_categoricals +from pandas.core.dtypes.dtypes import ( + CategoricalDtype, + DatetimeTZDtype, + IntervalDtype, + PeriodDtype, +) + +__all__ = [ + "infer_dtype", + "union_categoricals", + "CategoricalDtype", + "DatetimeTZDtype", + "IntervalDtype", + "PeriodDtype", +] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/api/types/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/api/types/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..32911b6963206f085c302b66ce8e509094a5130d Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/api/types/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/api/typing/__init__.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/api/typing/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..9b5d2cb06b523508b15025be804a66daaaaf7a45 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/api/typing/__init__.py @@ -0,0 +1,55 @@ +""" +Public API classes that store intermediate results useful for type-hinting. +""" + +from pandas._libs import NaTType +from pandas._libs.missing import NAType + +from pandas.core.groupby import ( + DataFrameGroupBy, + SeriesGroupBy, +) +from pandas.core.resample import ( + DatetimeIndexResamplerGroupby, + PeriodIndexResamplerGroupby, + Resampler, + TimedeltaIndexResamplerGroupby, + TimeGrouper, +) +from pandas.core.window import ( + Expanding, + ExpandingGroupby, + ExponentialMovingWindow, + ExponentialMovingWindowGroupby, + Rolling, + RollingGroupby, + Window, +) + +# TODO: Can't import Styler without importing jinja2 +# from pandas.io.formats.style import Styler +from pandas.io.json._json import JsonReader +from pandas.io.stata import StataReader + +__all__ = [ + "DataFrameGroupBy", + "DatetimeIndexResamplerGroupby", + "Expanding", + "ExpandingGroupby", + "ExponentialMovingWindow", + "ExponentialMovingWindowGroupby", + "JsonReader", + "NaTType", + "NAType", + "PeriodIndexResamplerGroupby", + "Resampler", + "Rolling", + "RollingGroupby", + "SeriesGroupBy", + "StataReader", + # See TODO above + # "Styler", + "TimedeltaIndexResamplerGroupby", + "TimeGrouper", + "Window", +] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/api/typing/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/api/typing/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c7b3718c00b578babb84c1ac398821434e108583 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/api/typing/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/compat/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/compat/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..deb8405bc9f34835fbbb03c882d1ba1e7f7dc5fe Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/compat/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/compat/__pycache__/_constants.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/compat/__pycache__/_constants.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..60459b828f7a88e3fdbee522ec626fcea3d0788d Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/compat/__pycache__/_constants.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/compat/__pycache__/_optional.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/compat/__pycache__/_optional.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..7cc82458cd116c58469fc87632b5edb588e1a436 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/compat/__pycache__/_optional.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/compat/__pycache__/compressors.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/compat/__pycache__/compressors.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f826dcd9ef5f26eff72b4a8835dd7c09aa2ce7cd Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/compat/__pycache__/compressors.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/compat/__pycache__/pickle_compat.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/compat/__pycache__/pickle_compat.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a4f321be725bbae3d7528378f8d60f2a4f437814 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/compat/__pycache__/pickle_compat.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/compat/__pycache__/pyarrow.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/compat/__pycache__/pyarrow.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..34aa76c557e6b3acbdac9f464bc92d92d708e47c Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/compat/__pycache__/pyarrow.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/compat/numpy/__init__.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/compat/numpy/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..a06761d03887bc2a2dd186fe2d3ac781f7222aeb --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/compat/numpy/__init__.py @@ -0,0 +1,53 @@ +""" support numpy compatibility across versions """ +import warnings + +import numpy as np + +from pandas.util.version import Version + +# numpy versioning +_np_version = np.__version__ +_nlv = Version(_np_version) +np_version_lt1p23 = _nlv < Version("1.23") +np_version_gte1p24 = _nlv >= Version("1.24") +np_version_gte1p24p3 = _nlv >= Version("1.24.3") +np_version_gte1p25 = _nlv >= Version("1.25") +np_version_gt2 = _nlv >= Version("2.0.0") +is_numpy_dev = _nlv.dev is not None +_min_numpy_ver = "1.22.4" + + +if _nlv < Version(_min_numpy_ver): + raise ImportError( + f"this version of pandas is incompatible with numpy < {_min_numpy_ver}\n" + f"your numpy version is {_np_version}.\n" + f"Please upgrade numpy to >= {_min_numpy_ver} to use this pandas version" + ) + + +np_long: type +np_ulong: type + +if np_version_gt2: + try: + with warnings.catch_warnings(): + warnings.filterwarnings( + "ignore", + r".*In the future `np\.long` will be defined as.*", + FutureWarning, + ) + np_long = np.long # type: ignore[attr-defined] + np_ulong = np.ulong # type: ignore[attr-defined] + except AttributeError: + np_long = np.int_ + np_ulong = np.uint +else: + np_long = np.int_ + np_ulong = np.uint + + +__all__ = [ + "np", + "_np_version", + "is_numpy_dev", +] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/compat/numpy/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/compat/numpy/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c874b91362457e5c5368005748f7f2333ad9bbb7 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/compat/numpy/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/compat/numpy/__pycache__/function.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/compat/numpy/__pycache__/function.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..45e9bc357f65dc8f7594e82adf311f99fb49bb54 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/compat/numpy/__pycache__/function.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/compat/numpy/function.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/compat/numpy/function.py new file mode 100644 index 0000000000000000000000000000000000000000..4df30f7f4a8a79984ca6de521ac058bd30fd8faf --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/compat/numpy/function.py @@ -0,0 +1,418 @@ +""" +For compatibility with numpy libraries, pandas functions or methods have to +accept '*args' and '**kwargs' parameters to accommodate numpy arguments that +are not actually used or respected in the pandas implementation. + +To ensure that users do not abuse these parameters, validation is performed in +'validators.py' to make sure that any extra parameters passed correspond ONLY +to those in the numpy signature. Part of that validation includes whether or +not the user attempted to pass in non-default values for these extraneous +parameters. As we want to discourage users from relying on these parameters +when calling the pandas implementation, we want them only to pass in the +default values for these parameters. + +This module provides a set of commonly used default arguments for functions and +methods that are spread throughout the codebase. This module will make it +easier to adjust to future upstream changes in the analogous numpy signatures. +""" +from __future__ import annotations + +from typing import ( + TYPE_CHECKING, + Any, + TypeVar, + cast, + overload, +) + +import numpy as np +from numpy import ndarray + +from pandas._libs.lib import ( + is_bool, + is_integer, +) +from pandas.errors import UnsupportedFunctionCall +from pandas.util._validators import ( + validate_args, + validate_args_and_kwargs, + validate_kwargs, +) + +if TYPE_CHECKING: + from pandas._typing import ( + Axis, + AxisInt, + ) + + AxisNoneT = TypeVar("AxisNoneT", Axis, None) + + +class CompatValidator: + def __init__( + self, + defaults, + fname=None, + method: str | None = None, + max_fname_arg_count=None, + ) -> None: + self.fname = fname + self.method = method + self.defaults = defaults + self.max_fname_arg_count = max_fname_arg_count + + def __call__( + self, + args, + kwargs, + fname=None, + max_fname_arg_count=None, + method: str | None = None, + ) -> None: + if not args and not kwargs: + return None + + fname = self.fname if fname is None else fname + max_fname_arg_count = ( + self.max_fname_arg_count + if max_fname_arg_count is None + else max_fname_arg_count + ) + method = self.method if method is None else method + + if method == "args": + validate_args(fname, args, max_fname_arg_count, self.defaults) + elif method == "kwargs": + validate_kwargs(fname, kwargs, self.defaults) + elif method == "both": + validate_args_and_kwargs( + fname, args, kwargs, max_fname_arg_count, self.defaults + ) + else: + raise ValueError(f"invalid validation method '{method}'") + + +ARGMINMAX_DEFAULTS = {"out": None} +validate_argmin = CompatValidator( + ARGMINMAX_DEFAULTS, fname="argmin", method="both", max_fname_arg_count=1 +) +validate_argmax = CompatValidator( + ARGMINMAX_DEFAULTS, fname="argmax", method="both", max_fname_arg_count=1 +) + + +def process_skipna(skipna: bool | ndarray | None, args) -> tuple[bool, Any]: + if isinstance(skipna, ndarray) or skipna is None: + args = (skipna,) + args + skipna = True + + return skipna, args + + +def validate_argmin_with_skipna(skipna: bool | ndarray | None, args, kwargs) -> bool: + """ + If 'Series.argmin' is called via the 'numpy' library, the third parameter + in its signature is 'out', which takes either an ndarray or 'None', so + check if the 'skipna' parameter is either an instance of ndarray or is + None, since 'skipna' itself should be a boolean + """ + skipna, args = process_skipna(skipna, args) + validate_argmin(args, kwargs) + return skipna + + +def validate_argmax_with_skipna(skipna: bool | ndarray | None, args, kwargs) -> bool: + """ + If 'Series.argmax' is called via the 'numpy' library, the third parameter + in its signature is 'out', which takes either an ndarray or 'None', so + check if the 'skipna' parameter is either an instance of ndarray or is + None, since 'skipna' itself should be a boolean + """ + skipna, args = process_skipna(skipna, args) + validate_argmax(args, kwargs) + return skipna + + +ARGSORT_DEFAULTS: dict[str, int | str | None] = {} +ARGSORT_DEFAULTS["axis"] = -1 +ARGSORT_DEFAULTS["kind"] = "quicksort" +ARGSORT_DEFAULTS["order"] = None +ARGSORT_DEFAULTS["kind"] = None +ARGSORT_DEFAULTS["stable"] = None + + +validate_argsort = CompatValidator( + ARGSORT_DEFAULTS, fname="argsort", max_fname_arg_count=0, method="both" +) + +# two different signatures of argsort, this second validation for when the +# `kind` param is supported +ARGSORT_DEFAULTS_KIND: dict[str, int | None] = {} +ARGSORT_DEFAULTS_KIND["axis"] = -1 +ARGSORT_DEFAULTS_KIND["order"] = None +ARGSORT_DEFAULTS_KIND["stable"] = None +validate_argsort_kind = CompatValidator( + ARGSORT_DEFAULTS_KIND, fname="argsort", max_fname_arg_count=0, method="both" +) + + +def validate_argsort_with_ascending(ascending: bool | int | None, args, kwargs) -> bool: + """ + If 'Categorical.argsort' is called via the 'numpy' library, the first + parameter in its signature is 'axis', which takes either an integer or + 'None', so check if the 'ascending' parameter has either integer type or is + None, since 'ascending' itself should be a boolean + """ + if is_integer(ascending) or ascending is None: + args = (ascending,) + args + ascending = True + + validate_argsort_kind(args, kwargs, max_fname_arg_count=3) + ascending = cast(bool, ascending) + return ascending + + +CLIP_DEFAULTS: dict[str, Any] = {"out": None} +validate_clip = CompatValidator( + CLIP_DEFAULTS, fname="clip", method="both", max_fname_arg_count=3 +) + + +@overload +def validate_clip_with_axis(axis: ndarray, args, kwargs) -> None: + ... + + +@overload +def validate_clip_with_axis(axis: AxisNoneT, args, kwargs) -> AxisNoneT: + ... + + +def validate_clip_with_axis( + axis: ndarray | AxisNoneT, args, kwargs +) -> AxisNoneT | None: + """ + If 'NDFrame.clip' is called via the numpy library, the third parameter in + its signature is 'out', which can takes an ndarray, so check if the 'axis' + parameter is an instance of ndarray, since 'axis' itself should either be + an integer or None + """ + if isinstance(axis, ndarray): + args = (axis,) + args + # error: Incompatible types in assignment (expression has type "None", + # variable has type "Union[ndarray[Any, Any], str, int]") + axis = None # type: ignore[assignment] + + validate_clip(args, kwargs) + # error: Incompatible return value type (got "Union[ndarray[Any, Any], + # str, int]", expected "Union[str, int, None]") + return axis # type: ignore[return-value] + + +CUM_FUNC_DEFAULTS: dict[str, Any] = {} +CUM_FUNC_DEFAULTS["dtype"] = None +CUM_FUNC_DEFAULTS["out"] = None +validate_cum_func = CompatValidator( + CUM_FUNC_DEFAULTS, method="both", max_fname_arg_count=1 +) +validate_cumsum = CompatValidator( + CUM_FUNC_DEFAULTS, fname="cumsum", method="both", max_fname_arg_count=1 +) + + +def validate_cum_func_with_skipna(skipna: bool, args, kwargs, name) -> bool: + """ + If this function is called via the 'numpy' library, the third parameter in + its signature is 'dtype', which takes either a 'numpy' dtype or 'None', so + check if the 'skipna' parameter is a boolean or not + """ + if not is_bool(skipna): + args = (skipna,) + args + skipna = True + elif isinstance(skipna, np.bool_): + skipna = bool(skipna) + + validate_cum_func(args, kwargs, fname=name) + return skipna + + +ALLANY_DEFAULTS: dict[str, bool | None] = {} +ALLANY_DEFAULTS["dtype"] = None +ALLANY_DEFAULTS["out"] = None +ALLANY_DEFAULTS["keepdims"] = False +ALLANY_DEFAULTS["axis"] = None +validate_all = CompatValidator( + ALLANY_DEFAULTS, fname="all", method="both", max_fname_arg_count=1 +) +validate_any = CompatValidator( + ALLANY_DEFAULTS, fname="any", method="both", max_fname_arg_count=1 +) + +LOGICAL_FUNC_DEFAULTS = {"out": None, "keepdims": False} +validate_logical_func = CompatValidator(LOGICAL_FUNC_DEFAULTS, method="kwargs") + +MINMAX_DEFAULTS = {"axis": None, "dtype": None, "out": None, "keepdims": False} +validate_min = CompatValidator( + MINMAX_DEFAULTS, fname="min", method="both", max_fname_arg_count=1 +) +validate_max = CompatValidator( + MINMAX_DEFAULTS, fname="max", method="both", max_fname_arg_count=1 +) + +RESHAPE_DEFAULTS: dict[str, str] = {"order": "C"} +validate_reshape = CompatValidator( + RESHAPE_DEFAULTS, fname="reshape", method="both", max_fname_arg_count=1 +) + +REPEAT_DEFAULTS: dict[str, Any] = {"axis": None} +validate_repeat = CompatValidator( + REPEAT_DEFAULTS, fname="repeat", method="both", max_fname_arg_count=1 +) + +ROUND_DEFAULTS: dict[str, Any] = {"out": None} +validate_round = CompatValidator( + ROUND_DEFAULTS, fname="round", method="both", max_fname_arg_count=1 +) + +SORT_DEFAULTS: dict[str, int | str | None] = {} +SORT_DEFAULTS["axis"] = -1 +SORT_DEFAULTS["kind"] = "quicksort" +SORT_DEFAULTS["order"] = None +validate_sort = CompatValidator(SORT_DEFAULTS, fname="sort", method="kwargs") + +STAT_FUNC_DEFAULTS: dict[str, Any | None] = {} +STAT_FUNC_DEFAULTS["dtype"] = None +STAT_FUNC_DEFAULTS["out"] = None + +SUM_DEFAULTS = STAT_FUNC_DEFAULTS.copy() +SUM_DEFAULTS["axis"] = None +SUM_DEFAULTS["keepdims"] = False +SUM_DEFAULTS["initial"] = None + +PROD_DEFAULTS = SUM_DEFAULTS.copy() + +MEAN_DEFAULTS = SUM_DEFAULTS.copy() + +MEDIAN_DEFAULTS = STAT_FUNC_DEFAULTS.copy() +MEDIAN_DEFAULTS["overwrite_input"] = False +MEDIAN_DEFAULTS["keepdims"] = False + +STAT_FUNC_DEFAULTS["keepdims"] = False + +validate_stat_func = CompatValidator(STAT_FUNC_DEFAULTS, method="kwargs") +validate_sum = CompatValidator( + SUM_DEFAULTS, fname="sum", method="both", max_fname_arg_count=1 +) +validate_prod = CompatValidator( + PROD_DEFAULTS, fname="prod", method="both", max_fname_arg_count=1 +) +validate_mean = CompatValidator( + MEAN_DEFAULTS, fname="mean", method="both", max_fname_arg_count=1 +) +validate_median = CompatValidator( + MEDIAN_DEFAULTS, fname="median", method="both", max_fname_arg_count=1 +) + +STAT_DDOF_FUNC_DEFAULTS: dict[str, bool | None] = {} +STAT_DDOF_FUNC_DEFAULTS["dtype"] = None +STAT_DDOF_FUNC_DEFAULTS["out"] = None +STAT_DDOF_FUNC_DEFAULTS["keepdims"] = False +validate_stat_ddof_func = CompatValidator(STAT_DDOF_FUNC_DEFAULTS, method="kwargs") + +TAKE_DEFAULTS: dict[str, str | None] = {} +TAKE_DEFAULTS["out"] = None +TAKE_DEFAULTS["mode"] = "raise" +validate_take = CompatValidator(TAKE_DEFAULTS, fname="take", method="kwargs") + + +def validate_take_with_convert(convert: ndarray | bool | None, args, kwargs) -> bool: + """ + If this function is called via the 'numpy' library, the third parameter in + its signature is 'axis', which takes either an ndarray or 'None', so check + if the 'convert' parameter is either an instance of ndarray or is None + """ + if isinstance(convert, ndarray) or convert is None: + args = (convert,) + args + convert = True + + validate_take(args, kwargs, max_fname_arg_count=3, method="both") + return convert + + +TRANSPOSE_DEFAULTS = {"axes": None} +validate_transpose = CompatValidator( + TRANSPOSE_DEFAULTS, fname="transpose", method="both", max_fname_arg_count=0 +) + + +def validate_groupby_func(name: str, args, kwargs, allowed=None) -> None: + """ + 'args' and 'kwargs' should be empty, except for allowed kwargs because all + of their necessary parameters are explicitly listed in the function + signature + """ + if allowed is None: + allowed = [] + + kwargs = set(kwargs) - set(allowed) + + if len(args) + len(kwargs) > 0: + raise UnsupportedFunctionCall( + "numpy operations are not valid with groupby. " + f"Use .groupby(...).{name}() instead" + ) + + +RESAMPLER_NUMPY_OPS = ("min", "max", "sum", "prod", "mean", "std", "var") + + +def validate_resampler_func(method: str, args, kwargs) -> None: + """ + 'args' and 'kwargs' should be empty because all of their necessary + parameters are explicitly listed in the function signature + """ + if len(args) + len(kwargs) > 0: + if method in RESAMPLER_NUMPY_OPS: + raise UnsupportedFunctionCall( + "numpy operations are not valid with resample. " + f"Use .resample(...).{method}() instead" + ) + raise TypeError("too many arguments passed in") + + +def validate_minmax_axis(axis: AxisInt | None, ndim: int = 1) -> None: + """ + Ensure that the axis argument passed to min, max, argmin, or argmax is zero + or None, as otherwise it will be incorrectly ignored. + + Parameters + ---------- + axis : int or None + ndim : int, default 1 + + Raises + ------ + ValueError + """ + if axis is None: + return + if axis >= ndim or (axis < 0 and ndim + axis < 0): + raise ValueError(f"`axis` must be fewer than the number of dimensions ({ndim})") + + +_validation_funcs = { + "median": validate_median, + "mean": validate_mean, + "min": validate_min, + "max": validate_max, + "sum": validate_sum, + "prod": validate_prod, +} + + +def validate_func(fname, args, kwargs) -> None: + if fname not in _validation_funcs: + return validate_stat_func(args, kwargs, fname=fname) + + validation_func = _validation_funcs[fname] + return validation_func(args, kwargs) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..3e58a082dc3a7569d5c3d37f4d1a9a414d02bafe Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/__pycache__/accessor.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/__pycache__/accessor.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b481921a0229bdbbb8635fb619eb398415afad46 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/__pycache__/accessor.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/__pycache__/algorithms.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/__pycache__/algorithms.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..bb67e9f224d9e8cbb705f78bc0e37b79cb942ada Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/__pycache__/algorithms.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/__pycache__/api.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/__pycache__/api.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b232959e1db5c19823988cea22b31cd4ebee90f1 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/__pycache__/api.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/__pycache__/apply.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/__pycache__/apply.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b1c872b344389e4ad09fee34e539eabfe0908367 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/__pycache__/apply.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/__pycache__/arraylike.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/__pycache__/arraylike.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..cd36d373a3d2f55b5d4c1a3f56f539a3c007364a Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/__pycache__/arraylike.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/__pycache__/base.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/__pycache__/base.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f1233a7762b0728e39207140f4104d6eee5e1616 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/__pycache__/base.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/__pycache__/common.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/__pycache__/common.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e22d301070b2f2a4af0b44bdd6778d31934357bb Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/__pycache__/common.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/__pycache__/config_init.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/__pycache__/config_init.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..adba106d372a7bc33eccafaf624e57f60318375d Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/__pycache__/config_init.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/__pycache__/construction.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/__pycache__/construction.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..17d7778600d181ba2a8eb9a61272636684e26795 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/__pycache__/construction.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/__pycache__/flags.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/__pycache__/flags.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..23f4bff795df635a84fa825a3fbd28fb9ec17044 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/__pycache__/flags.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/__pycache__/missing.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/__pycache__/missing.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c3628a6dd192f37d49f2426e2db5158bbab0669b Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/__pycache__/missing.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/__pycache__/nanops.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/__pycache__/nanops.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..0baf028ee872f25d35ef82a590e6ce6522dad1e7 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/__pycache__/nanops.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/__pycache__/roperator.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/__pycache__/roperator.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1b2acf5efe1fb819ad9ec16c5477fc175018cd23 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/__pycache__/roperator.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/__pycache__/sample.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/__pycache__/sample.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c977934f16349a0e6795051cf0dc17247090157e Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/__pycache__/sample.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/__pycache__/shared_docs.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/__pycache__/shared_docs.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..13e6b81ddda0ccb821bac69c178b23a3a26048f0 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/__pycache__/shared_docs.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/__pycache__/sorting.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/__pycache__/sorting.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..02b21523cc0b633dfbcfc7ace083c878dee8afc3 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/__pycache__/sorting.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/_numba/__init__.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/_numba/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/_numba/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/_numba/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c25e2b401bd5c18afed38e5daf4d8613d822bb2d Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/_numba/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/_numba/__pycache__/executor.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/_numba/__pycache__/executor.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d72f4fe478b70760ea3fd8fbf795fc339cd10cf5 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/_numba/__pycache__/executor.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/_numba/__pycache__/extensions.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/_numba/__pycache__/extensions.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..5bd914538f20a60d9d558274325808172d87fb15 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/_numba/__pycache__/extensions.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/_numba/executor.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/_numba/executor.py new file mode 100644 index 0000000000000000000000000000000000000000..0a26acb7df60a20a253ca3b864568c984bb86e3d --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/_numba/executor.py @@ -0,0 +1,239 @@ +from __future__ import annotations + +import functools +from typing import ( + TYPE_CHECKING, + Any, + Callable, +) + +if TYPE_CHECKING: + from pandas._typing import Scalar + +import numpy as np + +from pandas.compat._optional import import_optional_dependency + + +@functools.cache +def generate_apply_looper(func, nopython=True, nogil=True, parallel=False): + if TYPE_CHECKING: + import numba + else: + numba = import_optional_dependency("numba") + nb_compat_func = numba.extending.register_jitable(func) + + @numba.jit(nopython=nopython, nogil=nogil, parallel=parallel) + def nb_looper(values, axis): + # Operate on the first row/col in order to get + # the output shape + if axis == 0: + first_elem = values[:, 0] + dim0 = values.shape[1] + else: + first_elem = values[0] + dim0 = values.shape[0] + res0 = nb_compat_func(first_elem) + # Use np.asarray to get shape for + # https://github.com/numba/numba/issues/4202#issuecomment-1185981507 + buf_shape = (dim0,) + np.atleast_1d(np.asarray(res0)).shape + if axis == 0: + buf_shape = buf_shape[::-1] + buff = np.empty(buf_shape) + + if axis == 1: + buff[0] = res0 + for i in numba.prange(1, values.shape[0]): + buff[i] = nb_compat_func(values[i]) + else: + buff[:, 0] = res0 + for j in numba.prange(1, values.shape[1]): + buff[:, j] = nb_compat_func(values[:, j]) + return buff + + return nb_looper + + +@functools.cache +def make_looper(func, result_dtype, is_grouped_kernel, nopython, nogil, parallel): + if TYPE_CHECKING: + import numba + else: + numba = import_optional_dependency("numba") + + if is_grouped_kernel: + + @numba.jit(nopython=nopython, nogil=nogil, parallel=parallel) + def column_looper( + values: np.ndarray, + labels: np.ndarray, + ngroups: int, + min_periods: int, + *args, + ): + result = np.empty((values.shape[0], ngroups), dtype=result_dtype) + na_positions = {} + for i in numba.prange(values.shape[0]): + output, na_pos = func( + values[i], result_dtype, labels, ngroups, min_periods, *args + ) + result[i] = output + if len(na_pos) > 0: + na_positions[i] = np.array(na_pos) + return result, na_positions + + else: + + @numba.jit(nopython=nopython, nogil=nogil, parallel=parallel) + def column_looper( + values: np.ndarray, + start: np.ndarray, + end: np.ndarray, + min_periods: int, + *args, + ): + result = np.empty((values.shape[0], len(start)), dtype=result_dtype) + na_positions = {} + for i in numba.prange(values.shape[0]): + output, na_pos = func( + values[i], result_dtype, start, end, min_periods, *args + ) + result[i] = output + if len(na_pos) > 0: + na_positions[i] = np.array(na_pos) + return result, na_positions + + return column_looper + + +default_dtype_mapping: dict[np.dtype, Any] = { + np.dtype("int8"): np.int64, + np.dtype("int16"): np.int64, + np.dtype("int32"): np.int64, + np.dtype("int64"): np.int64, + np.dtype("uint8"): np.uint64, + np.dtype("uint16"): np.uint64, + np.dtype("uint32"): np.uint64, + np.dtype("uint64"): np.uint64, + np.dtype("float32"): np.float64, + np.dtype("float64"): np.float64, + np.dtype("complex64"): np.complex128, + np.dtype("complex128"): np.complex128, +} + + +# TODO: Preserve complex dtypes + +float_dtype_mapping: dict[np.dtype, Any] = { + np.dtype("int8"): np.float64, + np.dtype("int16"): np.float64, + np.dtype("int32"): np.float64, + np.dtype("int64"): np.float64, + np.dtype("uint8"): np.float64, + np.dtype("uint16"): np.float64, + np.dtype("uint32"): np.float64, + np.dtype("uint64"): np.float64, + np.dtype("float32"): np.float64, + np.dtype("float64"): np.float64, + np.dtype("complex64"): np.float64, + np.dtype("complex128"): np.float64, +} + +identity_dtype_mapping: dict[np.dtype, Any] = { + np.dtype("int8"): np.int8, + np.dtype("int16"): np.int16, + np.dtype("int32"): np.int32, + np.dtype("int64"): np.int64, + np.dtype("uint8"): np.uint8, + np.dtype("uint16"): np.uint16, + np.dtype("uint32"): np.uint32, + np.dtype("uint64"): np.uint64, + np.dtype("float32"): np.float32, + np.dtype("float64"): np.float64, + np.dtype("complex64"): np.complex64, + np.dtype("complex128"): np.complex128, +} + + +def generate_shared_aggregator( + func: Callable[..., Scalar], + dtype_mapping: dict[np.dtype, np.dtype], + is_grouped_kernel: bool, + nopython: bool, + nogil: bool, + parallel: bool, +): + """ + Generate a Numba function that loops over the columns 2D object and applies + a 1D numba kernel over each column. + + Parameters + ---------- + func : function + aggregation function to be applied to each column + dtype_mapping: dict or None + If not None, maps a dtype to a result dtype. + Otherwise, will fall back to default mapping. + is_grouped_kernel: bool, default False + Whether func operates using the group labels (True) + or using starts/ends arrays + + If true, you also need to pass the number of groups to this function + nopython : bool + nopython to be passed into numba.jit + nogil : bool + nogil to be passed into numba.jit + parallel : bool + parallel to be passed into numba.jit + + Returns + ------- + Numba function + """ + + # A wrapper around the looper function, + # to dispatch based on dtype since numba is unable to do that in nopython mode + + # It also post-processes the values by inserting nans where number of observations + # is less than min_periods + # Cannot do this in numba nopython mode + # (you'll run into type-unification error when you cast int -> float) + def looper_wrapper( + values, + start=None, + end=None, + labels=None, + ngroups=None, + min_periods: int = 0, + **kwargs, + ): + result_dtype = dtype_mapping[values.dtype] + column_looper = make_looper( + func, result_dtype, is_grouped_kernel, nopython, nogil, parallel + ) + # Need to unpack kwargs since numba only supports *args + if is_grouped_kernel: + result, na_positions = column_looper( + values, labels, ngroups, min_periods, *kwargs.values() + ) + else: + result, na_positions = column_looper( + values, start, end, min_periods, *kwargs.values() + ) + if result.dtype.kind == "i": + # Look if na_positions is not empty + # If so, convert the whole block + # This is OK since int dtype cannot hold nan, + # so if min_periods not satisfied for 1 col, it is not satisfied for + # all columns at that index + for na_pos in na_positions.values(): + if len(na_pos) > 0: + result = result.astype("float64") + break + # TODO: Optimize this + for i, na_pos in na_positions.items(): + if len(na_pos) > 0: + result[i, na_pos] = np.nan + return result + + return looper_wrapper diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/_numba/extensions.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/_numba/extensions.py new file mode 100644 index 0000000000000000000000000000000000000000..ee09c9380fb0f2bbc2b84d2ad6fa44b21a0ab8ce --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/_numba/extensions.py @@ -0,0 +1,584 @@ +# Disable type checking for this module since numba's internals +# are not typed, and we use numba's internals via its extension API +# mypy: ignore-errors +""" +Utility classes/functions to let numba recognize +pandas Index/Series/DataFrame + +Mostly vendored from https://github.com/numba/numba/blob/main/numba/tests/pdlike_usecase.py +""" + +from __future__ import annotations + +from contextlib import contextmanager +import operator + +import numba +from numba import types +from numba.core import cgutils +from numba.core.datamodel import models +from numba.core.extending import ( + NativeValue, + box, + lower_builtin, + make_attribute_wrapper, + overload, + overload_attribute, + overload_method, + register_model, + type_callable, + typeof_impl, + unbox, +) +from numba.core.imputils import impl_ret_borrowed +import numpy as np + +from pandas._libs import lib + +from pandas.core.indexes.base import Index +from pandas.core.indexing import _iLocIndexer +from pandas.core.internals import SingleBlockManager +from pandas.core.series import Series + + +# Helper function to hack around fact that Index casts numpy string dtype to object +# +# Idea is to set an attribute on a Index called _numba_data +# that is the original data, or the object data casted to numpy string dtype, +# with a context manager that is unset afterwards +@contextmanager +def set_numba_data(index: Index): + numba_data = index._data + if numba_data.dtype == object: + if not lib.is_string_array(numba_data): + raise ValueError( + "The numba engine only supports using string or numeric column names" + ) + numba_data = numba_data.astype("U") + try: + index._numba_data = numba_data + yield index + finally: + del index._numba_data + + +# TODO: Range index support +# (this currently lowers OK, but does not round-trip) +class IndexType(types.Type): + """ + The type class for Index objects. + """ + + def __init__(self, dtype, layout, pyclass: any) -> None: + self.pyclass = pyclass + name = f"index({dtype}, {layout})" + self.dtype = dtype + self.layout = layout + super().__init__(name) + + @property + def key(self): + return self.pyclass, self.dtype, self.layout + + @property + def as_array(self): + return types.Array(self.dtype, 1, self.layout) + + def copy(self, dtype=None, ndim: int = 1, layout=None): + assert ndim == 1 + if dtype is None: + dtype = self.dtype + layout = layout or self.layout + return type(self)(dtype, layout, self.pyclass) + + +class SeriesType(types.Type): + """ + The type class for Series objects. + """ + + def __init__(self, dtype, index, namety) -> None: + assert isinstance(index, IndexType) + self.dtype = dtype + self.index = index + self.values = types.Array(self.dtype, 1, "C") + self.namety = namety + name = f"series({dtype}, {index}, {namety})" + super().__init__(name) + + @property + def key(self): + return self.dtype, self.index, self.namety + + @property + def as_array(self): + return self.values + + def copy(self, dtype=None, ndim: int = 1, layout: str = "C"): + assert ndim == 1 + assert layout == "C" + if dtype is None: + dtype = self.dtype + return type(self)(dtype, self.index, self.namety) + + +@typeof_impl.register(Index) +def typeof_index(val, c): + """ + This will assume that only strings are in object dtype + index. + (you should check this before this gets lowered down to numba) + """ + # arrty = typeof_impl(val._data, c) + arrty = typeof_impl(val._numba_data, c) + assert arrty.ndim == 1 + return IndexType(arrty.dtype, arrty.layout, type(val)) + + +@typeof_impl.register(Series) +def typeof_series(val, c): + index = typeof_impl(val.index, c) + arrty = typeof_impl(val.values, c) + namety = typeof_impl(val.name, c) + assert arrty.ndim == 1 + assert arrty.layout == "C" + return SeriesType(arrty.dtype, index, namety) + + +@type_callable(Series) +def type_series_constructor(context): + def typer(data, index, name=None): + if isinstance(index, IndexType) and isinstance(data, types.Array): + assert data.ndim == 1 + if name is None: + name = types.intp + return SeriesType(data.dtype, index, name) + + return typer + + +@type_callable(Index) +def type_index_constructor(context): + def typer(data, hashmap=None): + if isinstance(data, types.Array): + assert data.layout == "C" + assert data.ndim == 1 + assert hashmap is None or isinstance(hashmap, types.DictType) + return IndexType(data.dtype, layout=data.layout, pyclass=Index) + + return typer + + +# Backend extensions for Index and Series and Frame +@register_model(IndexType) +class IndexModel(models.StructModel): + def __init__(self, dmm, fe_type) -> None: + # We don't want the numpy string scalar type in our hashmap + members = [ + ("data", fe_type.as_array), + # This is an attempt to emulate our hashtable code with a numba + # typed dict + # It maps from values in the index to their integer positions in the array + ("hashmap", types.DictType(fe_type.dtype, types.intp)), + # Pointer to the Index object this was created from, or that it + # boxes to + # https://numba.discourse.group/t/qst-how-to-cache-the-boxing-of-an-object/2128/2?u=lithomas1 + ("parent", types.pyobject), + ] + models.StructModel.__init__(self, dmm, fe_type, members) + + +@register_model(SeriesType) +class SeriesModel(models.StructModel): + def __init__(self, dmm, fe_type) -> None: + members = [ + ("index", fe_type.index), + ("values", fe_type.as_array), + ("name", fe_type.namety), + ] + models.StructModel.__init__(self, dmm, fe_type, members) + + +make_attribute_wrapper(IndexType, "data", "_data") +make_attribute_wrapper(IndexType, "hashmap", "hashmap") + +make_attribute_wrapper(SeriesType, "index", "index") +make_attribute_wrapper(SeriesType, "values", "values") +make_attribute_wrapper(SeriesType, "name", "name") + + +@lower_builtin(Series, types.Array, IndexType) +def pdseries_constructor(context, builder, sig, args): + data, index = args + series = cgutils.create_struct_proxy(sig.return_type)(context, builder) + series.index = index + series.values = data + series.name = context.get_constant(types.intp, 0) + return impl_ret_borrowed(context, builder, sig.return_type, series._getvalue()) + + +@lower_builtin(Series, types.Array, IndexType, types.intp) +@lower_builtin(Series, types.Array, IndexType, types.float64) +@lower_builtin(Series, types.Array, IndexType, types.unicode_type) +def pdseries_constructor_with_name(context, builder, sig, args): + data, index, name = args + series = cgutils.create_struct_proxy(sig.return_type)(context, builder) + series.index = index + series.values = data + series.name = name + return impl_ret_borrowed(context, builder, sig.return_type, series._getvalue()) + + +@lower_builtin(Index, types.Array, types.DictType, types.pyobject) +def index_constructor_2arg(context, builder, sig, args): + (data, hashmap, parent) = args + index = cgutils.create_struct_proxy(sig.return_type)(context, builder) + + index.data = data + index.hashmap = hashmap + index.parent = parent + return impl_ret_borrowed(context, builder, sig.return_type, index._getvalue()) + + +@lower_builtin(Index, types.Array, types.DictType) +def index_constructor_2arg_parent(context, builder, sig, args): + # Basically same as index_constructor_1arg, but also lets you specify the + # parent object + (data, hashmap) = args + index = cgutils.create_struct_proxy(sig.return_type)(context, builder) + + index.data = data + index.hashmap = hashmap + return impl_ret_borrowed(context, builder, sig.return_type, index._getvalue()) + + +@lower_builtin(Index, types.Array) +def index_constructor_1arg(context, builder, sig, args): + from numba.typed import Dict + + key_type = sig.return_type.dtype + value_type = types.intp + + def index_impl(data): + return Index(data, Dict.empty(key_type, value_type)) + + return context.compile_internal(builder, index_impl, sig, args) + + +# Helper to convert the unicodecharseq (numpy string scalar) into a unicode_type +# (regular string) +def maybe_cast_str(x): + # Dummy function that numba can overload + pass + + +@overload(maybe_cast_str) +def maybe_cast_str_impl(x): + """Converts numba UnicodeCharSeq (numpy string scalar) -> unicode type (string). + Is a no-op for other types.""" + if isinstance(x, types.UnicodeCharSeq): + return lambda x: str(x) + else: + return lambda x: x + + +@unbox(IndexType) +def unbox_index(typ, obj, c): + """ + Convert a Index object to a native structure. + + Note: Object dtype is not allowed here + """ + data_obj = c.pyapi.object_getattr_string(obj, "_numba_data") + index = cgutils.create_struct_proxy(typ)(c.context, c.builder) + # If we see an object array, assume its been validated as only containing strings + # We still need to do the conversion though + index.data = c.unbox(typ.as_array, data_obj).value + typed_dict_obj = c.pyapi.unserialize(c.pyapi.serialize_object(numba.typed.Dict)) + # Create an empty typed dict in numba for the hashmap for indexing + # equiv of numba.typed.Dict.empty(typ.dtype, types.intp) + arr_type_obj = c.pyapi.unserialize(c.pyapi.serialize_object(typ.dtype)) + intp_type_obj = c.pyapi.unserialize(c.pyapi.serialize_object(types.intp)) + hashmap_obj = c.pyapi.call_method( + typed_dict_obj, "empty", (arr_type_obj, intp_type_obj) + ) + index.hashmap = c.unbox(types.DictType(typ.dtype, types.intp), hashmap_obj).value + # Set the parent for speedy boxing. + index.parent = obj + + # Decrefs + c.pyapi.decref(data_obj) + c.pyapi.decref(arr_type_obj) + c.pyapi.decref(intp_type_obj) + c.pyapi.decref(typed_dict_obj) + + return NativeValue(index._getvalue()) + + +@unbox(SeriesType) +def unbox_series(typ, obj, c): + """ + Convert a Series object to a native structure. + """ + index_obj = c.pyapi.object_getattr_string(obj, "index") + values_obj = c.pyapi.object_getattr_string(obj, "values") + name_obj = c.pyapi.object_getattr_string(obj, "name") + + series = cgutils.create_struct_proxy(typ)(c.context, c.builder) + series.index = c.unbox(typ.index, index_obj).value + series.values = c.unbox(typ.values, values_obj).value + series.name = c.unbox(typ.namety, name_obj).value + + # Decrefs + c.pyapi.decref(index_obj) + c.pyapi.decref(values_obj) + c.pyapi.decref(name_obj) + + return NativeValue(series._getvalue()) + + +@box(IndexType) +def box_index(typ, val, c): + """ + Convert a native index structure to a Index object. + + If our native index is of a numpy string dtype, we'll cast it to + object. + """ + # First build a Numpy array object, then wrap it in a Index + index = cgutils.create_struct_proxy(typ)(c.context, c.builder, value=val) + + res = cgutils.alloca_once_value(c.builder, index.parent) + + # Does parent exist? + # (it means already boxed once, or Index same as original df.index or df.columns) + # xref https://github.com/numba/numba/blob/596e8a55334cc46854e3192766e643767bd7c934/numba/core/boxing.py#L593C17-L593C17 + with c.builder.if_else(cgutils.is_not_null(c.builder, index.parent)) as ( + has_parent, + otherwise, + ): + with has_parent: + c.pyapi.incref(index.parent) + with otherwise: + # TODO: preserve the original class for the index + # Also need preserve the name of the Index + # class_obj = c.pyapi.unserialize(c.pyapi.serialize_object(typ.pyclass)) + class_obj = c.pyapi.unserialize(c.pyapi.serialize_object(Index)) + array_obj = c.box(typ.as_array, index.data) + if isinstance(typ.dtype, types.UnicodeCharSeq): + # We converted to numpy string dtype, convert back + # to object since _simple_new won't do that for uss + object_str_obj = c.pyapi.unserialize(c.pyapi.serialize_object("object")) + array_obj = c.pyapi.call_method(array_obj, "astype", (object_str_obj,)) + c.pyapi.decref(object_str_obj) + # this is basically Index._simple_new(array_obj, name_obj) in python + index_obj = c.pyapi.call_method(class_obj, "_simple_new", (array_obj,)) + index.parent = index_obj + c.builder.store(index_obj, res) + + # Decrefs + c.pyapi.decref(class_obj) + c.pyapi.decref(array_obj) + return c.builder.load(res) + + +@box(SeriesType) +def box_series(typ, val, c): + """ + Convert a native series structure to a Series object. + """ + series = cgutils.create_struct_proxy(typ)(c.context, c.builder, value=val) + series_const_obj = c.pyapi.unserialize(c.pyapi.serialize_object(Series._from_mgr)) + mgr_const_obj = c.pyapi.unserialize( + c.pyapi.serialize_object(SingleBlockManager.from_array) + ) + index_obj = c.box(typ.index, series.index) + array_obj = c.box(typ.as_array, series.values) + name_obj = c.box(typ.namety, series.name) + # This is basically equivalent of + # pd.Series(data=array_obj, index=index_obj) + # To improve perf, we will construct the Series from a manager + # object to avoid checks. + # We'll also set the name attribute manually to avoid validation + mgr_obj = c.pyapi.call_function_objargs( + mgr_const_obj, + ( + array_obj, + index_obj, + ), + ) + mgr_axes_obj = c.pyapi.object_getattr_string(mgr_obj, "axes") + # Series._constructor_from_mgr(mgr, axes) + series_obj = c.pyapi.call_function_objargs( + series_const_obj, (mgr_obj, mgr_axes_obj) + ) + c.pyapi.object_setattr_string(series_obj, "_name", name_obj) + + # Decrefs + c.pyapi.decref(series_const_obj) + c.pyapi.decref(mgr_axes_obj) + c.pyapi.decref(mgr_obj) + c.pyapi.decref(mgr_const_obj) + c.pyapi.decref(index_obj) + c.pyapi.decref(array_obj) + c.pyapi.decref(name_obj) + + return series_obj + + +# Add common series reductions (e.g. mean, sum), +# and also add common binops (e.g. add, sub, mul, div) +def generate_series_reduction(ser_reduction, ser_method): + @overload_method(SeriesType, ser_reduction) + def series_reduction(series): + def series_reduction_impl(series): + return ser_method(series.values) + + return series_reduction_impl + + return series_reduction + + +def generate_series_binop(binop): + @overload(binop) + def series_binop(series1, value): + if isinstance(series1, SeriesType): + if isinstance(value, SeriesType): + + def series_binop_impl(series1, series2): + # TODO: Check index matching? + return Series( + binop(series1.values, series2.values), + series1.index, + series1.name, + ) + + return series_binop_impl + else: + + def series_binop_impl(series1, value): + return Series( + binop(series1.values, value), series1.index, series1.name + ) + + return series_binop_impl + + return series_binop + + +series_reductions = [ + ("sum", np.sum), + ("mean", np.mean), + # Disabled due to discrepancies between numba std. dev + # and pandas std. dev (no way to specify dof) + # ("std", np.std), + # ("var", np.var), + ("min", np.min), + ("max", np.max), +] +for reduction, reduction_method in series_reductions: + generate_series_reduction(reduction, reduction_method) + +series_binops = [operator.add, operator.sub, operator.mul, operator.truediv] + +for ser_binop in series_binops: + generate_series_binop(ser_binop) + + +# get_loc on Index +@overload_method(IndexType, "get_loc") +def index_get_loc(index, item): + def index_get_loc_impl(index, item): + # Initialize the hash table if not initialized + if len(index.hashmap) == 0: + for i, val in enumerate(index._data): + index.hashmap[val] = i + return index.hashmap[item] + + return index_get_loc_impl + + +# Indexing for Series/Index +@overload(operator.getitem) +def series_indexing(series, item): + if isinstance(series, SeriesType): + + def series_getitem(series, item): + loc = series.index.get_loc(item) + return series.iloc[loc] + + return series_getitem + + +@overload(operator.getitem) +def index_indexing(index, idx): + if isinstance(index, IndexType): + + def index_getitem(index, idx): + return index._data[idx] + + return index_getitem + + +class IlocType(types.Type): + def __init__(self, obj_type) -> None: + self.obj_type = obj_type + name = f"iLocIndexer({obj_type})" + super().__init__(name=name) + + @property + def key(self): + return self.obj_type + + +@typeof_impl.register(_iLocIndexer) +def typeof_iloc(val, c): + objtype = typeof_impl(val.obj, c) + return IlocType(objtype) + + +@type_callable(_iLocIndexer) +def type_iloc_constructor(context): + def typer(obj): + if isinstance(obj, SeriesType): + return IlocType(obj) + + return typer + + +@lower_builtin(_iLocIndexer, SeriesType) +def iloc_constructor(context, builder, sig, args): + (obj,) = args + iloc_indexer = cgutils.create_struct_proxy(sig.return_type)(context, builder) + iloc_indexer.obj = obj + return impl_ret_borrowed( + context, builder, sig.return_type, iloc_indexer._getvalue() + ) + + +@register_model(IlocType) +class ILocModel(models.StructModel): + def __init__(self, dmm, fe_type) -> None: + members = [("obj", fe_type.obj_type)] + models.StructModel.__init__(self, dmm, fe_type, members) + + +make_attribute_wrapper(IlocType, "obj", "obj") + + +@overload_attribute(SeriesType, "iloc") +def series_iloc(series): + def get(series): + return _iLocIndexer(series) + + return get + + +@overload(operator.getitem) +def iloc_getitem(iloc_indexer, i): + if isinstance(iloc_indexer, IlocType): + + def getitem_impl(iloc_indexer, i): + return iloc_indexer.obj.values[i] + + return getitem_impl diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/_numba/kernels/__init__.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/_numba/kernels/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..1116c61c4ca8e48d94a6c9c6222aaf545d989e86 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/_numba/kernels/__init__.py @@ -0,0 +1,27 @@ +from pandas.core._numba.kernels.mean_ import ( + grouped_mean, + sliding_mean, +) +from pandas.core._numba.kernels.min_max_ import ( + grouped_min_max, + sliding_min_max, +) +from pandas.core._numba.kernels.sum_ import ( + grouped_sum, + sliding_sum, +) +from pandas.core._numba.kernels.var_ import ( + grouped_var, + sliding_var, +) + +__all__ = [ + "sliding_mean", + "grouped_mean", + "sliding_sum", + "grouped_sum", + "sliding_var", + "grouped_var", + "sliding_min_max", + "grouped_min_max", +] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/_numba/kernels/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/_numba/kernels/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c7ee8ddd19634f4f69ddc04e149e6127c16208b3 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/_numba/kernels/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/_numba/kernels/__pycache__/mean_.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/_numba/kernels/__pycache__/mean_.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..955c3d86cc82b0f4e48660c8bdab0ac4b8575b6e Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/_numba/kernels/__pycache__/mean_.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/_numba/kernels/__pycache__/min_max_.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/_numba/kernels/__pycache__/min_max_.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c27acfa185c8b372a0ecb52d59b9871152ad43de Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/_numba/kernels/__pycache__/min_max_.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/_numba/kernels/__pycache__/shared.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/_numba/kernels/__pycache__/shared.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f2cb3c3e8047a70fe22fec575d12375a4eca4eda Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/_numba/kernels/__pycache__/shared.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/_numba/kernels/__pycache__/sum_.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/_numba/kernels/__pycache__/sum_.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..065665335120b207dbe59d0c304b0865e6743187 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/_numba/kernels/__pycache__/sum_.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/_numba/kernels/__pycache__/var_.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/_numba/kernels/__pycache__/var_.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..6bed10b9aef89096d2b2cef0a6975784c24ee87c Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/_numba/kernels/__pycache__/var_.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/_numba/kernels/mean_.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/_numba/kernels/mean_.py new file mode 100644 index 0000000000000000000000000000000000000000..f415804781753372a5715b6ffee6a7ab8cc70b64 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/_numba/kernels/mean_.py @@ -0,0 +1,196 @@ +""" +Numba 1D mean kernels that can be shared by +* Dataframe / Series +* groupby +* rolling / expanding + +Mirrors pandas/_libs/window/aggregation.pyx +""" +from __future__ import annotations + +from typing import TYPE_CHECKING + +import numba +import numpy as np + +from pandas.core._numba.kernels.shared import is_monotonic_increasing +from pandas.core._numba.kernels.sum_ import grouped_kahan_sum + +if TYPE_CHECKING: + from pandas._typing import npt + + +@numba.jit(nopython=True, nogil=True, parallel=False) +def add_mean( + val: float, + nobs: int, + sum_x: float, + neg_ct: int, + compensation: float, + num_consecutive_same_value: int, + prev_value: float, +) -> tuple[int, float, int, float, int, float]: + if not np.isnan(val): + nobs += 1 + y = val - compensation + t = sum_x + y + compensation = t - sum_x - y + sum_x = t + if val < 0: + neg_ct += 1 + + if val == prev_value: + num_consecutive_same_value += 1 + else: + num_consecutive_same_value = 1 + prev_value = val + + return nobs, sum_x, neg_ct, compensation, num_consecutive_same_value, prev_value + + +@numba.jit(nopython=True, nogil=True, parallel=False) +def remove_mean( + val: float, nobs: int, sum_x: float, neg_ct: int, compensation: float +) -> tuple[int, float, int, float]: + if not np.isnan(val): + nobs -= 1 + y = -val - compensation + t = sum_x + y + compensation = t - sum_x - y + sum_x = t + if val < 0: + neg_ct -= 1 + return nobs, sum_x, neg_ct, compensation + + +@numba.jit(nopython=True, nogil=True, parallel=False) +def sliding_mean( + values: np.ndarray, + result_dtype: np.dtype, + start: np.ndarray, + end: np.ndarray, + min_periods: int, +) -> tuple[np.ndarray, list[int]]: + N = len(start) + nobs = 0 + sum_x = 0.0 + neg_ct = 0 + compensation_add = 0.0 + compensation_remove = 0.0 + + is_monotonic_increasing_bounds = is_monotonic_increasing( + start + ) and is_monotonic_increasing(end) + + output = np.empty(N, dtype=result_dtype) + + for i in range(N): + s = start[i] + e = end[i] + if i == 0 or not is_monotonic_increasing_bounds: + prev_value = values[s] + num_consecutive_same_value = 0 + + for j in range(s, e): + val = values[j] + ( + nobs, + sum_x, + neg_ct, + compensation_add, + num_consecutive_same_value, + prev_value, + ) = add_mean( + val, + nobs, + sum_x, + neg_ct, + compensation_add, + num_consecutive_same_value, + prev_value, # pyright: ignore[reportGeneralTypeIssues] + ) + else: + for j in range(start[i - 1], s): + val = values[j] + nobs, sum_x, neg_ct, compensation_remove = remove_mean( + val, nobs, sum_x, neg_ct, compensation_remove + ) + + for j in range(end[i - 1], e): + val = values[j] + ( + nobs, + sum_x, + neg_ct, + compensation_add, + num_consecutive_same_value, + prev_value, + ) = add_mean( + val, + nobs, + sum_x, + neg_ct, + compensation_add, + num_consecutive_same_value, + prev_value, # pyright: ignore[reportGeneralTypeIssues] + ) + + if nobs >= min_periods and nobs > 0: + result = sum_x / nobs + if num_consecutive_same_value >= nobs: + result = prev_value + elif neg_ct == 0 and result < 0: + result = 0 + elif neg_ct == nobs and result > 0: + result = 0 + else: + result = np.nan + + output[i] = result + + if not is_monotonic_increasing_bounds: + nobs = 0 + sum_x = 0.0 + neg_ct = 0 + compensation_remove = 0.0 + + # na_position is empty list since float64 can already hold nans + # Do list comprehension, since numba cannot figure out that na_pos is + # empty list of ints on its own + na_pos = [0 for i in range(0)] + return output, na_pos + + +@numba.jit(nopython=True, nogil=True, parallel=False) +def grouped_mean( + values: np.ndarray, + result_dtype: np.dtype, + labels: npt.NDArray[np.intp], + ngroups: int, + min_periods: int, +) -> tuple[np.ndarray, list[int]]: + output, nobs_arr, comp_arr, consecutive_counts, prev_vals = grouped_kahan_sum( + values, result_dtype, labels, ngroups + ) + + # Post-processing, replace sums that don't satisfy min_periods + for lab in range(ngroups): + nobs = nobs_arr[lab] + num_consecutive_same_value = consecutive_counts[lab] + prev_value = prev_vals[lab] + sum_x = output[lab] + if nobs >= min_periods: + if num_consecutive_same_value >= nobs: + result = prev_value * nobs + else: + result = sum_x + else: + result = np.nan + result /= nobs + output[lab] = result + + # na_position is empty list since float64 can already hold nans + # Do list comprehension, since numba cannot figure out that na_pos is + # empty list of ints on its own + na_pos = [0 for i in range(0)] + return output, na_pos diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/_numba/kernels/min_max_.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/_numba/kernels/min_max_.py new file mode 100644 index 0000000000000000000000000000000000000000..c9803980e64a6bf2ec5b274acd582850e3a07420 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/_numba/kernels/min_max_.py @@ -0,0 +1,125 @@ +""" +Numba 1D min/max kernels that can be shared by +* Dataframe / Series +* groupby +* rolling / expanding + +Mirrors pandas/_libs/window/aggregation.pyx +""" +from __future__ import annotations + +from typing import TYPE_CHECKING + +import numba +import numpy as np + +if TYPE_CHECKING: + from pandas._typing import npt + + +@numba.jit(nopython=True, nogil=True, parallel=False) +def sliding_min_max( + values: np.ndarray, + result_dtype: np.dtype, + start: np.ndarray, + end: np.ndarray, + min_periods: int, + is_max: bool, +) -> tuple[np.ndarray, list[int]]: + N = len(start) + nobs = 0 + output = np.empty(N, dtype=result_dtype) + na_pos = [] + # Use deque once numba supports it + # https://github.com/numba/numba/issues/7417 + Q: list = [] + W: list = [] + for i in range(N): + curr_win_size = end[i] - start[i] + if i == 0: + st = start[i] + else: + st = end[i - 1] + + for k in range(st, end[i]): + ai = values[k] + if not np.isnan(ai): + nobs += 1 + elif is_max: + ai = -np.inf + else: + ai = np.inf + # Discard previous entries if we find new min or max + if is_max: + while Q and ((ai >= values[Q[-1]]) or values[Q[-1]] != values[Q[-1]]): + Q.pop() + else: + while Q and ((ai <= values[Q[-1]]) or values[Q[-1]] != values[Q[-1]]): + Q.pop() + Q.append(k) + W.append(k) + + # Discard entries outside and left of current window + while Q and Q[0] <= start[i] - 1: + Q.pop(0) + while W and W[0] <= start[i] - 1: + if not np.isnan(values[W[0]]): + nobs -= 1 + W.pop(0) + + # Save output based on index in input value array + if Q and curr_win_size > 0 and nobs >= min_periods: + output[i] = values[Q[0]] + else: + if values.dtype.kind != "i": + output[i] = np.nan + else: + na_pos.append(i) + + return output, na_pos + + +@numba.jit(nopython=True, nogil=True, parallel=False) +def grouped_min_max( + values: np.ndarray, + result_dtype: np.dtype, + labels: npt.NDArray[np.intp], + ngroups: int, + min_periods: int, + is_max: bool, +) -> tuple[np.ndarray, list[int]]: + N = len(labels) + nobs = np.zeros(ngroups, dtype=np.int64) + na_pos = [] + output = np.empty(ngroups, dtype=result_dtype) + + for i in range(N): + lab = labels[i] + val = values[i] + if lab < 0: + continue + + if values.dtype.kind == "i" or not np.isnan(val): + nobs[lab] += 1 + else: + # NaN value cannot be a min/max value + continue + + if nobs[lab] == 1: + # First element in group, set output equal to this + output[lab] = val + continue + + if is_max: + if val > output[lab]: + output[lab] = val + else: + if val < output[lab]: + output[lab] = val + + # Set labels that don't satisfy min_periods as np.nan + for lab, count in enumerate(nobs): + if count < min_periods: + na_pos.append(lab) + + return output, na_pos diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/_numba/kernels/shared.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/_numba/kernels/shared.py new file mode 100644 index 0000000000000000000000000000000000000000..c52372fe6b08f3ec9ec6e836341e08ac804d50f3 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/_numba/kernels/shared.py @@ -0,0 +1,29 @@ +from __future__ import annotations + +from typing import TYPE_CHECKING + +import numba + +if TYPE_CHECKING: + import numpy as np + + +@numba.jit( + # error: Any? not callable + numba.boolean(numba.int64[:]), # type: ignore[misc] + nopython=True, + nogil=True, + parallel=False, +) +def is_monotonic_increasing(bounds: np.ndarray) -> bool: + """Check if int64 values are monotonically increasing.""" + n = len(bounds) + if n < 2: + return True + prev = bounds[0] + for i in range(1, n): + cur = bounds[i] + if cur < prev: + return False + prev = cur + return True diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/_numba/kernels/sum_.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/_numba/kernels/sum_.py new file mode 100644 index 0000000000000000000000000000000000000000..94db84267ceecb83234fc2e0b231566a2fdffd66 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/_numba/kernels/sum_.py @@ -0,0 +1,244 @@ +""" +Numba 1D sum kernels that can be shared by +* Dataframe / Series +* groupby +* rolling / expanding + +Mirrors pandas/_libs/window/aggregation.pyx +""" +from __future__ import annotations + +from typing import ( + TYPE_CHECKING, + Any, +) + +import numba +from numba.extending import register_jitable +import numpy as np + +if TYPE_CHECKING: + from pandas._typing import npt + +from pandas.core._numba.kernels.shared import is_monotonic_increasing + + +@numba.jit(nopython=True, nogil=True, parallel=False) +def add_sum( + val: Any, + nobs: int, + sum_x: Any, + compensation: Any, + num_consecutive_same_value: int, + prev_value: Any, +) -> tuple[int, Any, Any, int, Any]: + if not np.isnan(val): + nobs += 1 + y = val - compensation + t = sum_x + y + compensation = t - sum_x - y + sum_x = t + + if val == prev_value: + num_consecutive_same_value += 1 + else: + num_consecutive_same_value = 1 + prev_value = val + + return nobs, sum_x, compensation, num_consecutive_same_value, prev_value + + +@numba.jit(nopython=True, nogil=True, parallel=False) +def remove_sum( + val: Any, nobs: int, sum_x: Any, compensation: Any +) -> tuple[int, Any, Any]: + if not np.isnan(val): + nobs -= 1 + y = -val - compensation + t = sum_x + y + compensation = t - sum_x - y + sum_x = t + return nobs, sum_x, compensation + + +@numba.jit(nopython=True, nogil=True, parallel=False) +def sliding_sum( + values: np.ndarray, + result_dtype: np.dtype, + start: np.ndarray, + end: np.ndarray, + min_periods: int, +) -> tuple[np.ndarray, list[int]]: + dtype = values.dtype + + na_val: object = np.nan + if dtype.kind == "i": + na_val = 0 + + N = len(start) + nobs = 0 + sum_x = 0 + compensation_add = 0 + compensation_remove = 0 + na_pos = [] + + is_monotonic_increasing_bounds = is_monotonic_increasing( + start + ) and is_monotonic_increasing(end) + + output = np.empty(N, dtype=result_dtype) + + for i in range(N): + s = start[i] + e = end[i] + if i == 0 or not is_monotonic_increasing_bounds: + prev_value = values[s] + num_consecutive_same_value = 0 + + for j in range(s, e): + val = values[j] + ( + nobs, + sum_x, + compensation_add, + num_consecutive_same_value, + prev_value, + ) = add_sum( + val, + nobs, + sum_x, + compensation_add, + num_consecutive_same_value, + prev_value, + ) + else: + for j in range(start[i - 1], s): + val = values[j] + nobs, sum_x, compensation_remove = remove_sum( + val, nobs, sum_x, compensation_remove + ) + + for j in range(end[i - 1], e): + val = values[j] + ( + nobs, + sum_x, + compensation_add, + num_consecutive_same_value, + prev_value, + ) = add_sum( + val, + nobs, + sum_x, + compensation_add, + num_consecutive_same_value, + prev_value, + ) + + if nobs == 0 == min_periods: + result: object = 0 + elif nobs >= min_periods: + if num_consecutive_same_value >= nobs: + result = prev_value * nobs + else: + result = sum_x + else: + result = na_val + if dtype.kind == "i": + na_pos.append(i) + + output[i] = result + + if not is_monotonic_increasing_bounds: + nobs = 0 + sum_x = 0 + compensation_remove = 0 + + return output, na_pos + + +# Mypy/pyright don't like the fact that the decorator is untyped +@register_jitable # type: ignore[misc] +def grouped_kahan_sum( + values: np.ndarray, + result_dtype: np.dtype, + labels: npt.NDArray[np.intp], + ngroups: int, +) -> tuple[ + np.ndarray, npt.NDArray[np.int64], np.ndarray, npt.NDArray[np.int64], np.ndarray +]: + N = len(labels) + + nobs_arr = np.zeros(ngroups, dtype=np.int64) + comp_arr = np.zeros(ngroups, dtype=values.dtype) + consecutive_counts = np.zeros(ngroups, dtype=np.int64) + prev_vals = np.zeros(ngroups, dtype=values.dtype) + output = np.zeros(ngroups, dtype=result_dtype) + + for i in range(N): + lab = labels[i] + val = values[i] + + if lab < 0: + continue + + sum_x = output[lab] + nobs = nobs_arr[lab] + compensation_add = comp_arr[lab] + num_consecutive_same_value = consecutive_counts[lab] + prev_value = prev_vals[lab] + + ( + nobs, + sum_x, + compensation_add, + num_consecutive_same_value, + prev_value, + ) = add_sum( + val, + nobs, + sum_x, + compensation_add, + num_consecutive_same_value, + prev_value, + ) + + output[lab] = sum_x + consecutive_counts[lab] = num_consecutive_same_value + prev_vals[lab] = prev_value + comp_arr[lab] = compensation_add + nobs_arr[lab] = nobs + return output, nobs_arr, comp_arr, consecutive_counts, prev_vals + + +@numba.jit(nopython=True, nogil=True, parallel=False) +def grouped_sum( + values: np.ndarray, + result_dtype: np.dtype, + labels: npt.NDArray[np.intp], + ngroups: int, + min_periods: int, +) -> tuple[np.ndarray, list[int]]: + na_pos = [] + + output, nobs_arr, comp_arr, consecutive_counts, prev_vals = grouped_kahan_sum( + values, result_dtype, labels, ngroups + ) + + # Post-processing, replace sums that don't satisfy min_periods + for lab in range(ngroups): + nobs = nobs_arr[lab] + num_consecutive_same_value = consecutive_counts[lab] + prev_value = prev_vals[lab] + sum_x = output[lab] + if nobs >= min_periods: + if num_consecutive_same_value >= nobs: + result = prev_value * nobs + else: + result = sum_x + else: + result = sum_x # Don't change val, will be replaced by nan later + na_pos.append(lab) + output[lab] = result + + return output, na_pos diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/_numba/kernels/var_.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/_numba/kernels/var_.py new file mode 100644 index 0000000000000000000000000000000000000000..c63d0b90b0fc3f4a4a532b9ce85e57d8bd823fb1 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/_numba/kernels/var_.py @@ -0,0 +1,245 @@ +""" +Numba 1D var kernels that can be shared by +* Dataframe / Series +* groupby +* rolling / expanding + +Mirrors pandas/_libs/window/aggregation.pyx +""" +from __future__ import annotations + +from typing import TYPE_CHECKING + +import numba +import numpy as np + +if TYPE_CHECKING: + from pandas._typing import npt + +from pandas.core._numba.kernels.shared import is_monotonic_increasing + + +@numba.jit(nopython=True, nogil=True, parallel=False) +def add_var( + val: float, + nobs: int, + mean_x: float, + ssqdm_x: float, + compensation: float, + num_consecutive_same_value: int, + prev_value: float, +) -> tuple[int, float, float, float, int, float]: + if not np.isnan(val): + if val == prev_value: + num_consecutive_same_value += 1 + else: + num_consecutive_same_value = 1 + prev_value = val + + nobs += 1 + prev_mean = mean_x - compensation + y = val - compensation + t = y - mean_x + compensation = t + mean_x - y + delta = t + if nobs: + mean_x += delta / nobs + else: + mean_x = 0 + ssqdm_x += (val - prev_mean) * (val - mean_x) + return nobs, mean_x, ssqdm_x, compensation, num_consecutive_same_value, prev_value + + +@numba.jit(nopython=True, nogil=True, parallel=False) +def remove_var( + val: float, nobs: int, mean_x: float, ssqdm_x: float, compensation: float +) -> tuple[int, float, float, float]: + if not np.isnan(val): + nobs -= 1 + if nobs: + prev_mean = mean_x - compensation + y = val - compensation + t = y - mean_x + compensation = t + mean_x - y + delta = t + mean_x -= delta / nobs + ssqdm_x -= (val - prev_mean) * (val - mean_x) + else: + mean_x = 0 + ssqdm_x = 0 + return nobs, mean_x, ssqdm_x, compensation + + +@numba.jit(nopython=True, nogil=True, parallel=False) +def sliding_var( + values: np.ndarray, + result_dtype: np.dtype, + start: np.ndarray, + end: np.ndarray, + min_periods: int, + ddof: int = 1, +) -> tuple[np.ndarray, list[int]]: + N = len(start) + nobs = 0 + mean_x = 0.0 + ssqdm_x = 0.0 + compensation_add = 0.0 + compensation_remove = 0.0 + + min_periods = max(min_periods, 1) + is_monotonic_increasing_bounds = is_monotonic_increasing( + start + ) and is_monotonic_increasing(end) + + output = np.empty(N, dtype=result_dtype) + + for i in range(N): + s = start[i] + e = end[i] + if i == 0 or not is_monotonic_increasing_bounds: + prev_value = values[s] + num_consecutive_same_value = 0 + + for j in range(s, e): + val = values[j] + ( + nobs, + mean_x, + ssqdm_x, + compensation_add, + num_consecutive_same_value, + prev_value, + ) = add_var( + val, + nobs, + mean_x, + ssqdm_x, + compensation_add, + num_consecutive_same_value, + prev_value, + ) + else: + for j in range(start[i - 1], s): + val = values[j] + nobs, mean_x, ssqdm_x, compensation_remove = remove_var( + val, nobs, mean_x, ssqdm_x, compensation_remove + ) + + for j in range(end[i - 1], e): + val = values[j] + ( + nobs, + mean_x, + ssqdm_x, + compensation_add, + num_consecutive_same_value, + prev_value, + ) = add_var( + val, + nobs, + mean_x, + ssqdm_x, + compensation_add, + num_consecutive_same_value, + prev_value, + ) + + if nobs >= min_periods and nobs > ddof: + if nobs == 1 or num_consecutive_same_value >= nobs: + result = 0.0 + else: + result = ssqdm_x / (nobs - ddof) + else: + result = np.nan + + output[i] = result + + if not is_monotonic_increasing_bounds: + nobs = 0 + mean_x = 0.0 + ssqdm_x = 0.0 + compensation_remove = 0.0 + + # na_position is empty list since float64 can already hold nans + # Do list comprehension, since numba cannot figure out that na_pos is + # empty list of ints on its own + na_pos = [0 for i in range(0)] + return output, na_pos + + +@numba.jit(nopython=True, nogil=True, parallel=False) +def grouped_var( + values: np.ndarray, + result_dtype: np.dtype, + labels: npt.NDArray[np.intp], + ngroups: int, + min_periods: int, + ddof: int = 1, +) -> tuple[np.ndarray, list[int]]: + N = len(labels) + + nobs_arr = np.zeros(ngroups, dtype=np.int64) + comp_arr = np.zeros(ngroups, dtype=values.dtype) + consecutive_counts = np.zeros(ngroups, dtype=np.int64) + prev_vals = np.zeros(ngroups, dtype=values.dtype) + output = np.zeros(ngroups, dtype=result_dtype) + means = np.zeros(ngroups, dtype=result_dtype) + + for i in range(N): + lab = labels[i] + val = values[i] + + if lab < 0: + continue + + mean_x = means[lab] + ssqdm_x = output[lab] + nobs = nobs_arr[lab] + compensation_add = comp_arr[lab] + num_consecutive_same_value = consecutive_counts[lab] + prev_value = prev_vals[lab] + + ( + nobs, + mean_x, + ssqdm_x, + compensation_add, + num_consecutive_same_value, + prev_value, + ) = add_var( + val, + nobs, + mean_x, + ssqdm_x, + compensation_add, + num_consecutive_same_value, + prev_value, + ) + + output[lab] = ssqdm_x + means[lab] = mean_x + consecutive_counts[lab] = num_consecutive_same_value + prev_vals[lab] = prev_value + comp_arr[lab] = compensation_add + nobs_arr[lab] = nobs + + # Post-processing, replace vars that don't satisfy min_periods + for lab in range(ngroups): + nobs = nobs_arr[lab] + num_consecutive_same_value = consecutive_counts[lab] + ssqdm_x = output[lab] + if nobs >= min_periods and nobs > ddof: + if nobs == 1 or num_consecutive_same_value >= nobs: + result = 0.0 + else: + result = ssqdm_x / (nobs - ddof) + else: + result = np.nan + output[lab] = result + + # Second pass to get the std.dev + # na_position is empty list since float64 can already hold nans + # Do list comprehension, since numba cannot figure out that na_pos is + # empty list of ints on its own + na_pos = [0 for i in range(0)] + return output, na_pos diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/array_algos/__init__.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/array_algos/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..a7655a013c6cf3fca754086fdeb29b806220d5e4 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/array_algos/__init__.py @@ -0,0 +1,9 @@ +""" +core.array_algos is for algorithms that operate on ndarray and ExtensionArray. +These should: + +- Assume that any Index, Series, or DataFrame objects have already been unwrapped. +- Assume that any list arguments have already been cast to ndarray/EA. +- Not depend on Index, Series, or DataFrame, nor import any of these. +- May dispatch to ExtensionArray methods, but should not import from core.arrays. +""" diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/array_algos/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/array_algos/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..64700d3439a9ab633c28f3b441bdffe13447108a Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/array_algos/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/array_algos/__pycache__/datetimelike_accumulations.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/array_algos/__pycache__/datetimelike_accumulations.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f45cfb86bf265a9adb5768f4c597fcb138eedbd2 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/array_algos/__pycache__/datetimelike_accumulations.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/array_algos/__pycache__/masked_accumulations.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/array_algos/__pycache__/masked_accumulations.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b0cd52cc8d2671f3e5b4fd62ead5abbbc5f7ba9a Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/array_algos/__pycache__/masked_accumulations.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/array_algos/__pycache__/masked_reductions.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/array_algos/__pycache__/masked_reductions.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..68970f1b4befcef0bce17877a862d34851319d86 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/array_algos/__pycache__/masked_reductions.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/array_algos/__pycache__/putmask.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/array_algos/__pycache__/putmask.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..419f8f3f78ace8d5813cd4a05c85f7550d503b0d Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/array_algos/__pycache__/putmask.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/array_algos/__pycache__/quantile.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/array_algos/__pycache__/quantile.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..9df188424667a372a1ca1ae51bf778201ff95094 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/array_algos/__pycache__/quantile.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/array_algos/__pycache__/replace.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/array_algos/__pycache__/replace.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..cac04127457da570fe615e34557178447ac72199 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/array_algos/__pycache__/replace.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/array_algos/__pycache__/take.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/array_algos/__pycache__/take.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1c04e34d6a0c22fd852dfa8298feab2936c3a24c Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/array_algos/__pycache__/take.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/array_algos/__pycache__/transforms.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/array_algos/__pycache__/transforms.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..af72401d1c8ebe6cd46da7a8de681f416a42212f Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/array_algos/__pycache__/transforms.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/array_algos/datetimelike_accumulations.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/array_algos/datetimelike_accumulations.py new file mode 100644 index 0000000000000000000000000000000000000000..825fe60ee6cf88a2186a5f501c8696cecaf2657d --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/array_algos/datetimelike_accumulations.py @@ -0,0 +1,67 @@ +""" +datetimelke_accumulations.py is for accumulations of datetimelike extension arrays +""" + +from __future__ import annotations + +from typing import Callable + +import numpy as np + +from pandas._libs import iNaT + +from pandas.core.dtypes.missing import isna + + +def _cum_func( + func: Callable, + values: np.ndarray, + *, + skipna: bool = True, +): + """ + Accumulations for 1D datetimelike arrays. + + Parameters + ---------- + func : np.cumsum, np.maximum.accumulate, np.minimum.accumulate + values : np.ndarray + Numpy array with the values (can be of any dtype that support the + operation). Values is changed is modified inplace. + skipna : bool, default True + Whether to skip NA. + """ + try: + fill_value = { + np.maximum.accumulate: np.iinfo(np.int64).min, + np.cumsum: 0, + np.minimum.accumulate: np.iinfo(np.int64).max, + }[func] + except KeyError: + raise ValueError(f"No accumulation for {func} implemented on BaseMaskedArray") + + mask = isna(values) + y = values.view("i8") + y[mask] = fill_value + + if not skipna: + mask = np.maximum.accumulate(mask) + + result = func(y) + result[mask] = iNaT + + if values.dtype.kind in "mM": + return result.view(values.dtype.base) + return result + + +def cumsum(values: np.ndarray, *, skipna: bool = True) -> np.ndarray: + return _cum_func(np.cumsum, values, skipna=skipna) + + +def cummin(values: np.ndarray, *, skipna: bool = True): + return _cum_func(np.minimum.accumulate, values, skipna=skipna) + + +def cummax(values: np.ndarray, *, skipna: bool = True): + return _cum_func(np.maximum.accumulate, values, skipna=skipna) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/array_algos/masked_accumulations.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/array_algos/masked_accumulations.py new file mode 100644 index 0000000000000000000000000000000000000000..ad9e96d398a242dc64de2018b749fd2dbca7ed78 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/array_algos/masked_accumulations.py @@ -0,0 +1,90 @@ +""" +masked_accumulations.py is for accumulation algorithms using a mask-based approach +for missing values. +""" + +from __future__ import annotations + +from typing import ( + TYPE_CHECKING, + Callable, +) + +import numpy as np + +if TYPE_CHECKING: + from pandas._typing import npt + + +def _cum_func( + func: Callable, + values: np.ndarray, + mask: npt.NDArray[np.bool_], + *, + skipna: bool = True, +): + """ + Accumulations for 1D masked array. + + We will modify values in place to replace NAs with the appropriate fill value. + + Parameters + ---------- + func : np.cumsum, np.cumprod, np.maximum.accumulate, np.minimum.accumulate + values : np.ndarray + Numpy array with the values (can be of any dtype that support the + operation). + mask : np.ndarray + Boolean numpy array (True values indicate missing values). + skipna : bool, default True + Whether to skip NA. + """ + dtype_info: np.iinfo | np.finfo + if values.dtype.kind == "f": + dtype_info = np.finfo(values.dtype.type) + elif values.dtype.kind in "iu": + dtype_info = np.iinfo(values.dtype.type) + elif values.dtype.kind == "b": + # Max value of bool is 1, but since we are setting into a boolean + # array, 255 is fine as well. Min value has to be 0 when setting + # into the boolean array. + dtype_info = np.iinfo(np.uint8) + else: + raise NotImplementedError( + f"No masked accumulation defined for dtype {values.dtype.type}" + ) + try: + fill_value = { + np.cumprod: 1, + np.maximum.accumulate: dtype_info.min, + np.cumsum: 0, + np.minimum.accumulate: dtype_info.max, + }[func] + except KeyError: + raise NotImplementedError( + f"No accumulation for {func} implemented on BaseMaskedArray" + ) + + values[mask] = fill_value + + if not skipna: + mask = np.maximum.accumulate(mask) + + values = func(values) + return values, mask + + +def cumsum(values: np.ndarray, mask: npt.NDArray[np.bool_], *, skipna: bool = True): + return _cum_func(np.cumsum, values, mask, skipna=skipna) + + +def cumprod(values: np.ndarray, mask: npt.NDArray[np.bool_], *, skipna: bool = True): + return _cum_func(np.cumprod, values, mask, skipna=skipna) + + +def cummin(values: np.ndarray, mask: npt.NDArray[np.bool_], *, skipna: bool = True): + return _cum_func(np.minimum.accumulate, values, mask, skipna=skipna) + + +def cummax(values: np.ndarray, mask: npt.NDArray[np.bool_], *, skipna: bool = True): + return _cum_func(np.maximum.accumulate, values, mask, skipna=skipna) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/array_algos/masked_reductions.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/array_algos/masked_reductions.py new file mode 100644 index 0000000000000000000000000000000000000000..335fa1afc0f4e39956a05b567dcc98f0b98c66e3 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/array_algos/masked_reductions.py @@ -0,0 +1,197 @@ +""" +masked_reductions.py is for reduction algorithms using a mask-based approach +for missing values. +""" +from __future__ import annotations + +from typing import ( + TYPE_CHECKING, + Callable, +) +import warnings + +import numpy as np + +from pandas._libs import missing as libmissing + +from pandas.core.nanops import check_below_min_count + +if TYPE_CHECKING: + from pandas._typing import ( + AxisInt, + npt, + ) + + +def _reductions( + func: Callable, + values: np.ndarray, + mask: npt.NDArray[np.bool_], + *, + skipna: bool = True, + min_count: int = 0, + axis: AxisInt | None = None, + **kwargs, +): + """ + Sum, mean or product for 1D masked array. + + Parameters + ---------- + func : np.sum or np.prod + values : np.ndarray + Numpy array with the values (can be of any dtype that support the + operation). + mask : np.ndarray[bool] + Boolean numpy array (True values indicate missing values). + skipna : bool, default True + Whether to skip NA. + min_count : int, default 0 + The required number of valid values to perform the operation. If fewer than + ``min_count`` non-NA values are present the result will be NA. + axis : int, optional, default None + """ + if not skipna: + if mask.any() or check_below_min_count(values.shape, None, min_count): + return libmissing.NA + else: + return func(values, axis=axis, **kwargs) + else: + if check_below_min_count(values.shape, mask, min_count) and ( + axis is None or values.ndim == 1 + ): + return libmissing.NA + + return func(values, where=~mask, axis=axis, **kwargs) + + +def sum( + values: np.ndarray, + mask: npt.NDArray[np.bool_], + *, + skipna: bool = True, + min_count: int = 0, + axis: AxisInt | None = None, +): + return _reductions( + np.sum, values=values, mask=mask, skipna=skipna, min_count=min_count, axis=axis + ) + + +def prod( + values: np.ndarray, + mask: npt.NDArray[np.bool_], + *, + skipna: bool = True, + min_count: int = 0, + axis: AxisInt | None = None, +): + return _reductions( + np.prod, values=values, mask=mask, skipna=skipna, min_count=min_count, axis=axis + ) + + +def _minmax( + func: Callable, + values: np.ndarray, + mask: npt.NDArray[np.bool_], + *, + skipna: bool = True, + axis: AxisInt | None = None, +): + """ + Reduction for 1D masked array. + + Parameters + ---------- + func : np.min or np.max + values : np.ndarray + Numpy array with the values (can be of any dtype that support the + operation). + mask : np.ndarray[bool] + Boolean numpy array (True values indicate missing values). + skipna : bool, default True + Whether to skip NA. + axis : int, optional, default None + """ + if not skipna: + if mask.any() or not values.size: + # min/max with empty array raise in numpy, pandas returns NA + return libmissing.NA + else: + return func(values, axis=axis) + else: + subset = values[~mask] + if subset.size: + return func(subset, axis=axis) + else: + # min/max with empty array raise in numpy, pandas returns NA + return libmissing.NA + + +def min( + values: np.ndarray, + mask: npt.NDArray[np.bool_], + *, + skipna: bool = True, + axis: AxisInt | None = None, +): + return _minmax(np.min, values=values, mask=mask, skipna=skipna, axis=axis) + + +def max( + values: np.ndarray, + mask: npt.NDArray[np.bool_], + *, + skipna: bool = True, + axis: AxisInt | None = None, +): + return _minmax(np.max, values=values, mask=mask, skipna=skipna, axis=axis) + + +def mean( + values: np.ndarray, + mask: npt.NDArray[np.bool_], + *, + skipna: bool = True, + axis: AxisInt | None = None, +): + if not values.size or mask.all(): + return libmissing.NA + return _reductions(np.mean, values=values, mask=mask, skipna=skipna, axis=axis) + + +def var( + values: np.ndarray, + mask: npt.NDArray[np.bool_], + *, + skipna: bool = True, + axis: AxisInt | None = None, + ddof: int = 1, +): + if not values.size or mask.all(): + return libmissing.NA + + with warnings.catch_warnings(): + warnings.simplefilter("ignore", RuntimeWarning) + return _reductions( + np.var, values=values, mask=mask, skipna=skipna, axis=axis, ddof=ddof + ) + + +def std( + values: np.ndarray, + mask: npt.NDArray[np.bool_], + *, + skipna: bool = True, + axis: AxisInt | None = None, + ddof: int = 1, +): + if not values.size or mask.all(): + return libmissing.NA + + with warnings.catch_warnings(): + warnings.simplefilter("ignore", RuntimeWarning) + return _reductions( + np.std, values=values, mask=mask, skipna=skipna, axis=axis, ddof=ddof + ) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/array_algos/putmask.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/array_algos/putmask.py new file mode 100644 index 0000000000000000000000000000000000000000..f65d2d20e028e36b35a397d8ac973f184ce1412c --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/array_algos/putmask.py @@ -0,0 +1,149 @@ +""" +EA-compatible analogue to np.putmask +""" +from __future__ import annotations + +from typing import ( + TYPE_CHECKING, + Any, +) + +import numpy as np + +from pandas._libs import lib + +from pandas.core.dtypes.cast import infer_dtype_from +from pandas.core.dtypes.common import is_list_like + +from pandas.core.arrays import ExtensionArray + +if TYPE_CHECKING: + from pandas._typing import ( + ArrayLike, + npt, + ) + + from pandas import MultiIndex + + +def putmask_inplace(values: ArrayLike, mask: npt.NDArray[np.bool_], value: Any) -> None: + """ + ExtensionArray-compatible implementation of np.putmask. The main + difference is we do not handle repeating or truncating like numpy. + + Parameters + ---------- + values: np.ndarray or ExtensionArray + mask : np.ndarray[bool] + We assume extract_bool_array has already been called. + value : Any + """ + + if ( + not isinstance(values, np.ndarray) + or (values.dtype == object and not lib.is_scalar(value)) + # GH#43424: np.putmask raises TypeError if we cannot cast between types with + # rule = "safe", a stricter guarantee we may not have here + or ( + isinstance(value, np.ndarray) and not np.can_cast(value.dtype, values.dtype) + ) + ): + # GH#19266 using np.putmask gives unexpected results with listlike value + # along with object dtype + if is_list_like(value) and len(value) == len(values): + values[mask] = value[mask] + else: + values[mask] = value + else: + # GH#37833 np.putmask is more performant than __setitem__ + np.putmask(values, mask, value) + + +def putmask_without_repeat( + values: np.ndarray, mask: npt.NDArray[np.bool_], new: Any +) -> None: + """ + np.putmask will truncate or repeat if `new` is a listlike with + len(new) != len(values). We require an exact match. + + Parameters + ---------- + values : np.ndarray + mask : np.ndarray[bool] + new : Any + """ + if getattr(new, "ndim", 0) >= 1: + new = new.astype(values.dtype, copy=False) + + # TODO: this prob needs some better checking for 2D cases + nlocs = mask.sum() + if nlocs > 0 and is_list_like(new) and getattr(new, "ndim", 1) == 1: + shape = np.shape(new) + # np.shape compat for if setitem_datetimelike_compat + # changed arraylike to list e.g. test_where_dt64_2d + if nlocs == shape[-1]: + # GH#30567 + # If length of ``new`` is less than the length of ``values``, + # `np.putmask` would first repeat the ``new`` array and then + # assign the masked values hence produces incorrect result. + # `np.place` on the other hand uses the ``new`` values at it is + # to place in the masked locations of ``values`` + np.place(values, mask, new) + # i.e. values[mask] = new + elif mask.shape[-1] == shape[-1] or shape[-1] == 1: + np.putmask(values, mask, new) + else: + raise ValueError("cannot assign mismatch length to masked array") + else: + np.putmask(values, mask, new) + + +def validate_putmask( + values: ArrayLike | MultiIndex, mask: np.ndarray +) -> tuple[npt.NDArray[np.bool_], bool]: + """ + Validate mask and check if this putmask operation is a no-op. + """ + mask = extract_bool_array(mask) + if mask.shape != values.shape: + raise ValueError("putmask: mask and data must be the same size") + + noop = not mask.any() + return mask, noop + + +def extract_bool_array(mask: ArrayLike) -> npt.NDArray[np.bool_]: + """ + If we have a SparseArray or BooleanArray, convert it to ndarray[bool]. + """ + if isinstance(mask, ExtensionArray): + # We could have BooleanArray, Sparse[bool], ... + # Except for BooleanArray, this is equivalent to just + # np.asarray(mask, dtype=bool) + mask = mask.to_numpy(dtype=bool, na_value=False) + + mask = np.asarray(mask, dtype=bool) + return mask + + +def setitem_datetimelike_compat(values: np.ndarray, num_set: int, other): + """ + Parameters + ---------- + values : np.ndarray + num_set : int + For putmask, this is mask.sum() + other : Any + """ + if values.dtype == object: + dtype, _ = infer_dtype_from(other) + + if lib.is_np_dtype(dtype, "mM"): + # https://github.com/numpy/numpy/issues/12550 + # timedelta64 will incorrectly cast to int + if not is_list_like(other): + other = [other] * num_set + else: + other = list(other) + + return other diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/array_algos/quantile.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/array_algos/quantile.py new file mode 100644 index 0000000000000000000000000000000000000000..5c933294fb944f04dd3e9a64e4731ea4349254f9 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/array_algos/quantile.py @@ -0,0 +1,226 @@ +from __future__ import annotations + +from typing import TYPE_CHECKING + +import numpy as np + +from pandas.core.dtypes.missing import ( + isna, + na_value_for_dtype, +) + +if TYPE_CHECKING: + from pandas._typing import ( + ArrayLike, + Scalar, + npt, + ) + + +def quantile_compat( + values: ArrayLike, qs: npt.NDArray[np.float64], interpolation: str +) -> ArrayLike: + """ + Compute the quantiles of the given values for each quantile in `qs`. + + Parameters + ---------- + values : np.ndarray or ExtensionArray + qs : np.ndarray[float64] + interpolation : str + + Returns + ------- + np.ndarray or ExtensionArray + """ + if isinstance(values, np.ndarray): + fill_value = na_value_for_dtype(values.dtype, compat=False) + mask = isna(values) + return quantile_with_mask(values, mask, fill_value, qs, interpolation) + else: + return values._quantile(qs, interpolation) + + +def quantile_with_mask( + values: np.ndarray, + mask: npt.NDArray[np.bool_], + fill_value, + qs: npt.NDArray[np.float64], + interpolation: str, +) -> np.ndarray: + """ + Compute the quantiles of the given values for each quantile in `qs`. + + Parameters + ---------- + values : np.ndarray + For ExtensionArray, this is _values_for_factorize()[0] + mask : np.ndarray[bool] + mask = isna(values) + For ExtensionArray, this is computed before calling _value_for_factorize + fill_value : Scalar + The value to interpret fill NA entries with + For ExtensionArray, this is _values_for_factorize()[1] + qs : np.ndarray[float64] + interpolation : str + Type of interpolation + + Returns + ------- + np.ndarray + + Notes + ----- + Assumes values is already 2D. For ExtensionArray this means np.atleast_2d + has been called on _values_for_factorize()[0] + + Quantile is computed along axis=1. + """ + assert values.shape == mask.shape + if values.ndim == 1: + # unsqueeze, operate, re-squeeze + values = np.atleast_2d(values) + mask = np.atleast_2d(mask) + res_values = quantile_with_mask(values, mask, fill_value, qs, interpolation) + return res_values[0] + + assert values.ndim == 2 + + is_empty = values.shape[1] == 0 + + if is_empty: + # create the array of na_values + # 2d len(values) * len(qs) + flat = np.array([fill_value] * len(qs)) + result = np.repeat(flat, len(values)).reshape(len(values), len(qs)) + else: + result = _nanpercentile( + values, + qs * 100.0, + na_value=fill_value, + mask=mask, + interpolation=interpolation, + ) + + result = np.asarray(result) + result = result.T + + return result + + +def _nanpercentile_1d( + values: np.ndarray, + mask: npt.NDArray[np.bool_], + qs: npt.NDArray[np.float64], + na_value: Scalar, + interpolation: str, +) -> Scalar | np.ndarray: + """ + Wrapper for np.percentile that skips missing values, specialized to + 1-dimensional case. + + Parameters + ---------- + values : array over which to find quantiles + mask : ndarray[bool] + locations in values that should be considered missing + qs : np.ndarray[float64] of quantile indices to find + na_value : scalar + value to return for empty or all-null values + interpolation : str + + Returns + ------- + quantiles : scalar or array + """ + # mask is Union[ExtensionArray, ndarray] + values = values[~mask] + + if len(values) == 0: + # Can't pass dtype=values.dtype here bc we might have na_value=np.nan + # with values.dtype=int64 see test_quantile_empty + # equiv: 'np.array([na_value] * len(qs))' but much faster + return np.full(len(qs), na_value) + + return np.percentile( + values, + qs, + # error: No overload variant of "percentile" matches argument + # types "ndarray[Any, Any]", "ndarray[Any, dtype[floating[_64Bit]]]" + # , "Dict[str, str]" [call-overload] + method=interpolation, # type: ignore[call-overload] + ) + + +def _nanpercentile( + values: np.ndarray, + qs: npt.NDArray[np.float64], + *, + na_value, + mask: npt.NDArray[np.bool_], + interpolation: str, +): + """ + Wrapper for np.percentile that skips missing values. + + Parameters + ---------- + values : np.ndarray[ndim=2] over which to find quantiles + qs : np.ndarray[float64] of quantile indices to find + na_value : scalar + value to return for empty or all-null values + mask : np.ndarray[bool] + locations in values that should be considered missing + interpolation : str + + Returns + ------- + quantiles : scalar or array + """ + + if values.dtype.kind in "mM": + # need to cast to integer to avoid rounding errors in numpy + result = _nanpercentile( + values.view("i8"), + qs=qs, + na_value=na_value.view("i8"), + mask=mask, + interpolation=interpolation, + ) + + # Note: we have to do `astype` and not view because in general we + # have float result at this point, not i8 + return result.astype(values.dtype) + + if mask.any(): + # Caller is responsible for ensuring mask shape match + assert mask.shape == values.shape + result = [ + _nanpercentile_1d(val, m, qs, na_value, interpolation=interpolation) + for (val, m) in zip(list(values), list(mask)) + ] + if values.dtype.kind == "f": + # preserve itemsize + result = np.asarray(result, dtype=values.dtype).T + else: + result = np.asarray(result).T + if ( + result.dtype != values.dtype + and not mask.all() + and (result == result.astype(values.dtype, copy=False)).all() + ): + # mask.all() will never get cast back to int + # e.g. values id integer dtype and result is floating dtype, + # only cast back to integer dtype if result values are all-integer. + result = result.astype(values.dtype, copy=False) + return result + else: + return np.percentile( + values, + qs, + axis=1, + # error: No overload variant of "percentile" matches argument types + # "ndarray[Any, Any]", "ndarray[Any, dtype[floating[_64Bit]]]", + # "int", "Dict[str, str]" [call-overload] + method=interpolation, # type: ignore[call-overload] + ) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/array_algos/replace.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/array_algos/replace.py new file mode 100644 index 0000000000000000000000000000000000000000..5f377276be480ec4d01c8cd1671fa95f9504c7c6 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/array_algos/replace.py @@ -0,0 +1,152 @@ +""" +Methods used by Block.replace and related methods. +""" +from __future__ import annotations + +import operator +import re +from re import Pattern +from typing import ( + TYPE_CHECKING, + Any, +) + +import numpy as np + +from pandas.core.dtypes.common import ( + is_bool, + is_re, + is_re_compilable, +) +from pandas.core.dtypes.missing import isna + +if TYPE_CHECKING: + from pandas._typing import ( + ArrayLike, + Scalar, + npt, + ) + + +def should_use_regex(regex: bool, to_replace: Any) -> bool: + """ + Decide whether to treat `to_replace` as a regular expression. + """ + if is_re(to_replace): + regex = True + + regex = regex and is_re_compilable(to_replace) + + # Don't use regex if the pattern is empty. + regex = regex and re.compile(to_replace).pattern != "" + return regex + + +def compare_or_regex_search( + a: ArrayLike, b: Scalar | Pattern, regex: bool, mask: npt.NDArray[np.bool_] +) -> ArrayLike: + """ + Compare two array-like inputs of the same shape or two scalar values + + Calls operator.eq or re.search, depending on regex argument. If regex is + True, perform an element-wise regex matching. + + Parameters + ---------- + a : array-like + b : scalar or regex pattern + regex : bool + mask : np.ndarray[bool] + + Returns + ------- + mask : array-like of bool + """ + if isna(b): + return ~mask + + def _check_comparison_types( + result: ArrayLike | bool, a: ArrayLike, b: Scalar | Pattern + ): + """ + Raises an error if the two arrays (a,b) cannot be compared. + Otherwise, returns the comparison result as expected. + """ + if is_bool(result) and isinstance(a, np.ndarray): + type_names = [type(a).__name__, type(b).__name__] + + type_names[0] = f"ndarray(dtype={a.dtype})" + + raise TypeError( + f"Cannot compare types {repr(type_names[0])} and {repr(type_names[1])}" + ) + + if not regex or not should_use_regex(regex, b): + # TODO: should use missing.mask_missing? + op = lambda x: operator.eq(x, b) + else: + op = np.vectorize( + lambda x: bool(re.search(b, x)) + if isinstance(x, str) and isinstance(b, (str, Pattern)) + else False + ) + + # GH#32621 use mask to avoid comparing to NAs + if isinstance(a, np.ndarray): + a = a[mask] + + result = op(a) + + if isinstance(result, np.ndarray) and mask is not None: + # The shape of the mask can differ to that of the result + # since we may compare only a subset of a's or b's elements + tmp = np.zeros(mask.shape, dtype=np.bool_) + np.place(tmp, mask, result) + result = tmp + + _check_comparison_types(result, a, b) + return result + + +def replace_regex( + values: ArrayLike, rx: re.Pattern, value, mask: npt.NDArray[np.bool_] | None +) -> None: + """ + Parameters + ---------- + values : ArrayLike + Object dtype. + rx : re.Pattern + value : Any + mask : np.ndarray[bool], optional + + Notes + ----- + Alters values in-place. + """ + + # deal with replacing values with objects (strings) that match but + # whose replacement is not a string (numeric, nan, object) + if isna(value) or not isinstance(value, str): + + def re_replacer(s): + if is_re(rx) and isinstance(s, str): + return value if rx.search(s) is not None else s + else: + return s + + else: + # value is guaranteed to be a string here, s can be either a string + # or null if it's null it gets returned + def re_replacer(s): + if is_re(rx) and isinstance(s, str): + return rx.sub(value, s) + else: + return s + + f = np.vectorize(re_replacer, otypes=[np.object_]) + + if mask is None: + values[:] = f(values) + else: + values[mask] = f(values[mask]) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/array_algos/take.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/array_algos/take.py new file mode 100644 index 0000000000000000000000000000000000000000..ac674e31586e72040a8e1313f232a00299b961ee --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/array_algos/take.py @@ -0,0 +1,594 @@ +from __future__ import annotations + +import functools +from typing import ( + TYPE_CHECKING, + cast, + overload, +) + +import numpy as np + +from pandas._libs import ( + algos as libalgos, + lib, +) + +from pandas.core.dtypes.cast import maybe_promote +from pandas.core.dtypes.common import ( + ensure_platform_int, + is_1d_only_ea_dtype, +) +from pandas.core.dtypes.missing import na_value_for_dtype + +from pandas.core.construction import ensure_wrapped_if_datetimelike + +if TYPE_CHECKING: + from pandas._typing import ( + ArrayLike, + AxisInt, + npt, + ) + + from pandas.core.arrays._mixins import NDArrayBackedExtensionArray + from pandas.core.arrays.base import ExtensionArray + + +@overload +def take_nd( + arr: np.ndarray, + indexer, + axis: AxisInt = ..., + fill_value=..., + allow_fill: bool = ..., +) -> np.ndarray: + ... + + +@overload +def take_nd( + arr: ExtensionArray, + indexer, + axis: AxisInt = ..., + fill_value=..., + allow_fill: bool = ..., +) -> ArrayLike: + ... + + +def take_nd( + arr: ArrayLike, + indexer, + axis: AxisInt = 0, + fill_value=lib.no_default, + allow_fill: bool = True, +) -> ArrayLike: + """ + Specialized Cython take which sets NaN values in one pass + + This dispatches to ``take`` defined on ExtensionArrays. + + Note: this function assumes that the indexer is a valid(ated) indexer with + no out of bound indices. + + Parameters + ---------- + arr : np.ndarray or ExtensionArray + Input array. + indexer : ndarray + 1-D array of indices to take, subarrays corresponding to -1 value + indices are filed with fill_value + axis : int, default 0 + Axis to take from + fill_value : any, default np.nan + Fill value to replace -1 values with + allow_fill : bool, default True + If False, indexer is assumed to contain no -1 values so no filling + will be done. This short-circuits computation of a mask. Result is + undefined if allow_fill == False and -1 is present in indexer. + + Returns + ------- + subarray : np.ndarray or ExtensionArray + May be the same type as the input, or cast to an ndarray. + """ + if fill_value is lib.no_default: + fill_value = na_value_for_dtype(arr.dtype, compat=False) + elif lib.is_np_dtype(arr.dtype, "mM"): + dtype, fill_value = maybe_promote(arr.dtype, fill_value) + if arr.dtype != dtype: + # EA.take is strict about returning a new object of the same type + # so for that case cast upfront + arr = arr.astype(dtype) + + if not isinstance(arr, np.ndarray): + # i.e. ExtensionArray, + # includes for EA to catch DatetimeArray, TimedeltaArray + if not is_1d_only_ea_dtype(arr.dtype): + # i.e. DatetimeArray, TimedeltaArray + arr = cast("NDArrayBackedExtensionArray", arr) + return arr.take( + indexer, fill_value=fill_value, allow_fill=allow_fill, axis=axis + ) + + return arr.take(indexer, fill_value=fill_value, allow_fill=allow_fill) + + arr = np.asarray(arr) + return _take_nd_ndarray(arr, indexer, axis, fill_value, allow_fill) + + +def _take_nd_ndarray( + arr: np.ndarray, + indexer: npt.NDArray[np.intp] | None, + axis: AxisInt, + fill_value, + allow_fill: bool, +) -> np.ndarray: + if indexer is None: + indexer = np.arange(arr.shape[axis], dtype=np.intp) + dtype, fill_value = arr.dtype, arr.dtype.type() + else: + indexer = ensure_platform_int(indexer) + + dtype, fill_value, mask_info = _take_preprocess_indexer_and_fill_value( + arr, indexer, fill_value, allow_fill + ) + + flip_order = False + if arr.ndim == 2 and arr.flags.f_contiguous: + flip_order = True + + if flip_order: + arr = arr.T + axis = arr.ndim - axis - 1 + + # at this point, it's guaranteed that dtype can hold both the arr values + # and the fill_value + out_shape_ = list(arr.shape) + out_shape_[axis] = len(indexer) + out_shape = tuple(out_shape_) + if arr.flags.f_contiguous and axis == arr.ndim - 1: + # minor tweak that can make an order-of-magnitude difference + # for dataframes initialized directly from 2-d ndarrays + # (s.t. df.values is c-contiguous and df._mgr.blocks[0] is its + # f-contiguous transpose) + out = np.empty(out_shape, dtype=dtype, order="F") + else: + out = np.empty(out_shape, dtype=dtype) + + func = _get_take_nd_function( + arr.ndim, arr.dtype, out.dtype, axis=axis, mask_info=mask_info + ) + func(arr, indexer, out, fill_value) + + if flip_order: + out = out.T + return out + + +def take_1d( + arr: ArrayLike, + indexer: npt.NDArray[np.intp], + fill_value=None, + allow_fill: bool = True, + mask: npt.NDArray[np.bool_] | None = None, +) -> ArrayLike: + """ + Specialized version for 1D arrays. Differences compared to `take_nd`: + + - Assumes input array has already been converted to numpy array / EA + - Assumes indexer is already guaranteed to be intp dtype ndarray + - Only works for 1D arrays + + To ensure the lowest possible overhead. + + Note: similarly to `take_nd`, this function assumes that the indexer is + a valid(ated) indexer with no out of bound indices. + + Parameters + ---------- + arr : np.ndarray or ExtensionArray + Input array. + indexer : ndarray + 1-D array of indices to take (validated indices, intp dtype). + fill_value : any, default np.nan + Fill value to replace -1 values with + allow_fill : bool, default True + If False, indexer is assumed to contain no -1 values so no filling + will be done. This short-circuits computation of a mask. Result is + undefined if allow_fill == False and -1 is present in indexer. + mask : np.ndarray, optional, default None + If `allow_fill` is True, and the mask (where indexer == -1) is already + known, it can be passed to avoid recomputation. + """ + if not isinstance(arr, np.ndarray): + # ExtensionArray -> dispatch to their method + return arr.take(indexer, fill_value=fill_value, allow_fill=allow_fill) + + if not allow_fill: + return arr.take(indexer) + + dtype, fill_value, mask_info = _take_preprocess_indexer_and_fill_value( + arr, indexer, fill_value, True, mask + ) + + # at this point, it's guaranteed that dtype can hold both the arr values + # and the fill_value + out = np.empty(indexer.shape, dtype=dtype) + + func = _get_take_nd_function( + arr.ndim, arr.dtype, out.dtype, axis=0, mask_info=mask_info + ) + func(arr, indexer, out, fill_value) + + return out + + +def take_2d_multi( + arr: np.ndarray, + indexer: tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]], + fill_value=np.nan, +) -> np.ndarray: + """ + Specialized Cython take which sets NaN values in one pass. + """ + # This is only called from one place in DataFrame._reindex_multi, + # so we know indexer is well-behaved. + assert indexer is not None + assert indexer[0] is not None + assert indexer[1] is not None + + row_idx, col_idx = indexer + + row_idx = ensure_platform_int(row_idx) + col_idx = ensure_platform_int(col_idx) + indexer = row_idx, col_idx + mask_info = None + + # check for promotion based on types only (do this first because + # it's faster than computing a mask) + dtype, fill_value = maybe_promote(arr.dtype, fill_value) + if dtype != arr.dtype: + # check if promotion is actually required based on indexer + row_mask = row_idx == -1 + col_mask = col_idx == -1 + row_needs = row_mask.any() + col_needs = col_mask.any() + mask_info = (row_mask, col_mask), (row_needs, col_needs) + + if not (row_needs or col_needs): + # if not, then depromote, set fill_value to dummy + # (it won't be used but we don't want the cython code + # to crash when trying to cast it to dtype) + dtype, fill_value = arr.dtype, arr.dtype.type() + + # at this point, it's guaranteed that dtype can hold both the arr values + # and the fill_value + out_shape = len(row_idx), len(col_idx) + out = np.empty(out_shape, dtype=dtype) + + func = _take_2d_multi_dict.get((arr.dtype.name, out.dtype.name), None) + if func is None and arr.dtype != out.dtype: + func = _take_2d_multi_dict.get((out.dtype.name, out.dtype.name), None) + if func is not None: + func = _convert_wrapper(func, out.dtype) + + if func is not None: + func(arr, indexer, out=out, fill_value=fill_value) + else: + # test_reindex_multi + _take_2d_multi_object( + arr, indexer, out, fill_value=fill_value, mask_info=mask_info + ) + + return out + + +@functools.lru_cache +def _get_take_nd_function_cached( + ndim: int, arr_dtype: np.dtype, out_dtype: np.dtype, axis: AxisInt +): + """ + Part of _get_take_nd_function below that doesn't need `mask_info` and thus + can be cached (mask_info potentially contains a numpy ndarray which is not + hashable and thus cannot be used as argument for cached function). + """ + tup = (arr_dtype.name, out_dtype.name) + if ndim == 1: + func = _take_1d_dict.get(tup, None) + elif ndim == 2: + if axis == 0: + func = _take_2d_axis0_dict.get(tup, None) + else: + func = _take_2d_axis1_dict.get(tup, None) + if func is not None: + return func + + # We get here with string, uint, float16, and complex dtypes that could + # potentially be handled in algos_take_helper. + # Also a couple with (M8[ns], object) and (m8[ns], object) + tup = (out_dtype.name, out_dtype.name) + if ndim == 1: + func = _take_1d_dict.get(tup, None) + elif ndim == 2: + if axis == 0: + func = _take_2d_axis0_dict.get(tup, None) + else: + func = _take_2d_axis1_dict.get(tup, None) + if func is not None: + func = _convert_wrapper(func, out_dtype) + return func + + return None + + +def _get_take_nd_function( + ndim: int, + arr_dtype: np.dtype, + out_dtype: np.dtype, + axis: AxisInt = 0, + mask_info=None, +): + """ + Get the appropriate "take" implementation for the given dimension, axis + and dtypes. + """ + func = None + if ndim <= 2: + # for this part we don't need `mask_info` -> use the cached algo lookup + func = _get_take_nd_function_cached(ndim, arr_dtype, out_dtype, axis) + + if func is None: + + def func(arr, indexer, out, fill_value=np.nan) -> None: + indexer = ensure_platform_int(indexer) + _take_nd_object( + arr, indexer, out, axis=axis, fill_value=fill_value, mask_info=mask_info + ) + + return func + + +def _view_wrapper(f, arr_dtype=None, out_dtype=None, fill_wrap=None): + def wrapper( + arr: np.ndarray, indexer: np.ndarray, out: np.ndarray, fill_value=np.nan + ) -> None: + if arr_dtype is not None: + arr = arr.view(arr_dtype) + if out_dtype is not None: + out = out.view(out_dtype) + if fill_wrap is not None: + # FIXME: if we get here with dt64/td64 we need to be sure we have + # matching resos + if fill_value.dtype.kind == "m": + fill_value = fill_value.astype("m8[ns]") + else: + fill_value = fill_value.astype("M8[ns]") + fill_value = fill_wrap(fill_value) + + f(arr, indexer, out, fill_value=fill_value) + + return wrapper + + +def _convert_wrapper(f, conv_dtype): + def wrapper( + arr: np.ndarray, indexer: np.ndarray, out: np.ndarray, fill_value=np.nan + ) -> None: + if conv_dtype == object: + # GH#39755 avoid casting dt64/td64 to integers + arr = ensure_wrapped_if_datetimelike(arr) + arr = arr.astype(conv_dtype) + f(arr, indexer, out, fill_value=fill_value) + + return wrapper + + +_take_1d_dict = { + ("int8", "int8"): libalgos.take_1d_int8_int8, + ("int8", "int32"): libalgos.take_1d_int8_int32, + ("int8", "int64"): libalgos.take_1d_int8_int64, + ("int8", "float64"): libalgos.take_1d_int8_float64, + ("int16", "int16"): libalgos.take_1d_int16_int16, + ("int16", "int32"): libalgos.take_1d_int16_int32, + ("int16", "int64"): libalgos.take_1d_int16_int64, + ("int16", "float64"): libalgos.take_1d_int16_float64, + ("int32", "int32"): libalgos.take_1d_int32_int32, + ("int32", "int64"): libalgos.take_1d_int32_int64, + ("int32", "float64"): libalgos.take_1d_int32_float64, + ("int64", "int64"): libalgos.take_1d_int64_int64, + ("int64", "float64"): libalgos.take_1d_int64_float64, + ("float32", "float32"): libalgos.take_1d_float32_float32, + ("float32", "float64"): libalgos.take_1d_float32_float64, + ("float64", "float64"): libalgos.take_1d_float64_float64, + ("object", "object"): libalgos.take_1d_object_object, + ("bool", "bool"): _view_wrapper(libalgos.take_1d_bool_bool, np.uint8, np.uint8), + ("bool", "object"): _view_wrapper(libalgos.take_1d_bool_object, np.uint8, None), + ("datetime64[ns]", "datetime64[ns]"): _view_wrapper( + libalgos.take_1d_int64_int64, np.int64, np.int64, np.int64 + ), + ("timedelta64[ns]", "timedelta64[ns]"): _view_wrapper( + libalgos.take_1d_int64_int64, np.int64, np.int64, np.int64 + ), +} + +_take_2d_axis0_dict = { + ("int8", "int8"): libalgos.take_2d_axis0_int8_int8, + ("int8", "int32"): libalgos.take_2d_axis0_int8_int32, + ("int8", "int64"): libalgos.take_2d_axis0_int8_int64, + ("int8", "float64"): libalgos.take_2d_axis0_int8_float64, + ("int16", "int16"): libalgos.take_2d_axis0_int16_int16, + ("int16", "int32"): libalgos.take_2d_axis0_int16_int32, + ("int16", "int64"): libalgos.take_2d_axis0_int16_int64, + ("int16", "float64"): libalgos.take_2d_axis0_int16_float64, + ("int32", "int32"): libalgos.take_2d_axis0_int32_int32, + ("int32", "int64"): libalgos.take_2d_axis0_int32_int64, + ("int32", "float64"): libalgos.take_2d_axis0_int32_float64, + ("int64", "int64"): libalgos.take_2d_axis0_int64_int64, + ("int64", "float64"): libalgos.take_2d_axis0_int64_float64, + ("float32", "float32"): libalgos.take_2d_axis0_float32_float32, + ("float32", "float64"): libalgos.take_2d_axis0_float32_float64, + ("float64", "float64"): libalgos.take_2d_axis0_float64_float64, + ("object", "object"): libalgos.take_2d_axis0_object_object, + ("bool", "bool"): _view_wrapper( + libalgos.take_2d_axis0_bool_bool, np.uint8, np.uint8 + ), + ("bool", "object"): _view_wrapper( + libalgos.take_2d_axis0_bool_object, np.uint8, None + ), + ("datetime64[ns]", "datetime64[ns]"): _view_wrapper( + libalgos.take_2d_axis0_int64_int64, np.int64, np.int64, fill_wrap=np.int64 + ), + ("timedelta64[ns]", "timedelta64[ns]"): _view_wrapper( + libalgos.take_2d_axis0_int64_int64, np.int64, np.int64, fill_wrap=np.int64 + ), +} + +_take_2d_axis1_dict = { + ("int8", "int8"): libalgos.take_2d_axis1_int8_int8, + ("int8", "int32"): libalgos.take_2d_axis1_int8_int32, + ("int8", "int64"): libalgos.take_2d_axis1_int8_int64, + ("int8", "float64"): libalgos.take_2d_axis1_int8_float64, + ("int16", "int16"): libalgos.take_2d_axis1_int16_int16, + ("int16", "int32"): libalgos.take_2d_axis1_int16_int32, + ("int16", "int64"): libalgos.take_2d_axis1_int16_int64, + ("int16", "float64"): libalgos.take_2d_axis1_int16_float64, + ("int32", "int32"): libalgos.take_2d_axis1_int32_int32, + ("int32", "int64"): libalgos.take_2d_axis1_int32_int64, + ("int32", "float64"): libalgos.take_2d_axis1_int32_float64, + ("int64", "int64"): libalgos.take_2d_axis1_int64_int64, + ("int64", "float64"): libalgos.take_2d_axis1_int64_float64, + ("float32", "float32"): libalgos.take_2d_axis1_float32_float32, + ("float32", "float64"): libalgos.take_2d_axis1_float32_float64, + ("float64", "float64"): libalgos.take_2d_axis1_float64_float64, + ("object", "object"): libalgos.take_2d_axis1_object_object, + ("bool", "bool"): _view_wrapper( + libalgos.take_2d_axis1_bool_bool, np.uint8, np.uint8 + ), + ("bool", "object"): _view_wrapper( + libalgos.take_2d_axis1_bool_object, np.uint8, None + ), + ("datetime64[ns]", "datetime64[ns]"): _view_wrapper( + libalgos.take_2d_axis1_int64_int64, np.int64, np.int64, fill_wrap=np.int64 + ), + ("timedelta64[ns]", "timedelta64[ns]"): _view_wrapper( + libalgos.take_2d_axis1_int64_int64, np.int64, np.int64, fill_wrap=np.int64 + ), +} + +_take_2d_multi_dict = { + ("int8", "int8"): libalgos.take_2d_multi_int8_int8, + ("int8", "int32"): libalgos.take_2d_multi_int8_int32, + ("int8", "int64"): libalgos.take_2d_multi_int8_int64, + ("int8", "float64"): libalgos.take_2d_multi_int8_float64, + ("int16", "int16"): libalgos.take_2d_multi_int16_int16, + ("int16", "int32"): libalgos.take_2d_multi_int16_int32, + ("int16", "int64"): libalgos.take_2d_multi_int16_int64, + ("int16", "float64"): libalgos.take_2d_multi_int16_float64, + ("int32", "int32"): libalgos.take_2d_multi_int32_int32, + ("int32", "int64"): libalgos.take_2d_multi_int32_int64, + ("int32", "float64"): libalgos.take_2d_multi_int32_float64, + ("int64", "int64"): libalgos.take_2d_multi_int64_int64, + ("int64", "float64"): libalgos.take_2d_multi_int64_float64, + ("float32", "float32"): libalgos.take_2d_multi_float32_float32, + ("float32", "float64"): libalgos.take_2d_multi_float32_float64, + ("float64", "float64"): libalgos.take_2d_multi_float64_float64, + ("object", "object"): libalgos.take_2d_multi_object_object, + ("bool", "bool"): _view_wrapper( + libalgos.take_2d_multi_bool_bool, np.uint8, np.uint8 + ), + ("bool", "object"): _view_wrapper( + libalgos.take_2d_multi_bool_object, np.uint8, None + ), + ("datetime64[ns]", "datetime64[ns]"): _view_wrapper( + libalgos.take_2d_multi_int64_int64, np.int64, np.int64, fill_wrap=np.int64 + ), + ("timedelta64[ns]", "timedelta64[ns]"): _view_wrapper( + libalgos.take_2d_multi_int64_int64, np.int64, np.int64, fill_wrap=np.int64 + ), +} + + +def _take_nd_object( + arr: np.ndarray, + indexer: npt.NDArray[np.intp], + out: np.ndarray, + axis: AxisInt, + fill_value, + mask_info, +) -> None: + if mask_info is not None: + mask, needs_masking = mask_info + else: + mask = indexer == -1 + needs_masking = mask.any() + if arr.dtype != out.dtype: + arr = arr.astype(out.dtype) + if arr.shape[axis] > 0: + arr.take(indexer, axis=axis, out=out) + if needs_masking: + outindexer = [slice(None)] * arr.ndim + outindexer[axis] = mask + out[tuple(outindexer)] = fill_value + + +def _take_2d_multi_object( + arr: np.ndarray, + indexer: tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]], + out: np.ndarray, + fill_value, + mask_info, +) -> None: + # this is not ideal, performance-wise, but it's better than raising + # an exception (best to optimize in Cython to avoid getting here) + row_idx, col_idx = indexer # both np.intp + if mask_info is not None: + (row_mask, col_mask), (row_needs, col_needs) = mask_info + else: + row_mask = row_idx == -1 + col_mask = col_idx == -1 + row_needs = row_mask.any() + col_needs = col_mask.any() + if fill_value is not None: + if row_needs: + out[row_mask, :] = fill_value + if col_needs: + out[:, col_mask] = fill_value + for i, u_ in enumerate(row_idx): + if u_ != -1: + for j, v in enumerate(col_idx): + if v != -1: + out[i, j] = arr[u_, v] + + +def _take_preprocess_indexer_and_fill_value( + arr: np.ndarray, + indexer: npt.NDArray[np.intp], + fill_value, + allow_fill: bool, + mask: npt.NDArray[np.bool_] | None = None, +): + mask_info: tuple[np.ndarray | None, bool] | None = None + + if not allow_fill: + dtype, fill_value = arr.dtype, arr.dtype.type() + mask_info = None, False + else: + # check for promotion based on types only (do this first because + # it's faster than computing a mask) + dtype, fill_value = maybe_promote(arr.dtype, fill_value) + if dtype != arr.dtype: + # check if promotion is actually required based on indexer + if mask is not None: + needs_masking = True + else: + mask = indexer == -1 + needs_masking = bool(mask.any()) + mask_info = mask, needs_masking + if not needs_masking: + # if not, then depromote, set fill_value to dummy + # (it won't be used but we don't want the cython code + # to crash when trying to cast it to dtype) + dtype, fill_value = arr.dtype, arr.dtype.type() + + return dtype, fill_value, mask_info diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/array_algos/transforms.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/array_algos/transforms.py new file mode 100644 index 0000000000000000000000000000000000000000..ec67244949e3db92cc811b19cdfcd5d1dd2b4de8 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/array_algos/transforms.py @@ -0,0 +1,50 @@ +""" +transforms.py is for shape-preserving functions. +""" + +from __future__ import annotations + +from typing import TYPE_CHECKING + +import numpy as np + +if TYPE_CHECKING: + from pandas._typing import ( + AxisInt, + Scalar, + ) + + +def shift( + values: np.ndarray, periods: int, axis: AxisInt, fill_value: Scalar +) -> np.ndarray: + new_values = values + + if periods == 0 or values.size == 0: + return new_values.copy() + + # make sure array sent to np.roll is c_contiguous + f_ordered = values.flags.f_contiguous + if f_ordered: + new_values = new_values.T + axis = new_values.ndim - axis - 1 + + if new_values.size: + new_values = np.roll( + new_values, + np.intp(periods), + axis=axis, + ) + + axis_indexer = [slice(None)] * values.ndim + if periods > 0: + axis_indexer[axis] = slice(None, periods) + else: + axis_indexer[axis] = slice(periods, None) + new_values[tuple(axis_indexer)] = fill_value + + # restore original order + if f_ordered: + new_values = new_values.T + + return new_values diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/arrays/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/arrays/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..8fc8e9bb7af722d4b66ae9b124c60ad9bf5c374d Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/arrays/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/arrays/__pycache__/_arrow_string_mixins.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/arrays/__pycache__/_arrow_string_mixins.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..7c062b7b351fee84580e8b003355cb31df28b787 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/arrays/__pycache__/_arrow_string_mixins.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/arrays/__pycache__/_mixins.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/arrays/__pycache__/_mixins.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..145507a2388d0fad1b78b6b5771c93fe698d8be2 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/arrays/__pycache__/_mixins.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/arrays/__pycache__/_ranges.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/arrays/__pycache__/_ranges.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..11e68d30222211c86ba1d5b7638007c800681eab Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/arrays/__pycache__/_ranges.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/arrays/__pycache__/_utils.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/arrays/__pycache__/_utils.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..bbfb0ceae63591f26885cd748ba0e2f0d856affc Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/arrays/__pycache__/_utils.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/arrays/__pycache__/base.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/arrays/__pycache__/base.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..cb50e2f8ed96147abbf41a26578a50b98ad02141 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/arrays/__pycache__/base.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/arrays/__pycache__/boolean.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/arrays/__pycache__/boolean.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c59b4c1634e4ef5cbf4e173852c0bb186391c4a9 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/arrays/__pycache__/boolean.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/arrays/__pycache__/datetimelike.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/arrays/__pycache__/datetimelike.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..7eaa5dcc0397a3428d6a0627b280a86476c44c53 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/arrays/__pycache__/datetimelike.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/arrays/__pycache__/datetimes.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/arrays/__pycache__/datetimes.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..7cdc879d4d48ffb994b485f57ca43f86a55c7be9 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/arrays/__pycache__/datetimes.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/arrays/__pycache__/floating.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/arrays/__pycache__/floating.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..89a465ad5b9e132790f19d3f97d2cc393ece41da Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/arrays/__pycache__/floating.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/arrays/__pycache__/integer.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/arrays/__pycache__/integer.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ae65ad58c825c27d53d6f6595fd7569df1290a1b Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/arrays/__pycache__/integer.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/arrays/__pycache__/interval.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/arrays/__pycache__/interval.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..466147ac50783a9c3b9ca44014f3fd5f4a88819b Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/arrays/__pycache__/interval.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/arrays/__pycache__/masked.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/arrays/__pycache__/masked.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..4a47981cc5d64469c4fb6dd647cf9e568cb8e7ca Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/arrays/__pycache__/masked.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/arrays/__pycache__/numeric.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/arrays/__pycache__/numeric.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..0d991e8ce5b4be210d7db99f2cd6660c4618af6e Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/arrays/__pycache__/numeric.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/arrays/__pycache__/numpy_.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/arrays/__pycache__/numpy_.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..924328ec808fd3db598708a05ea65b89b2a44a13 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/arrays/__pycache__/numpy_.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/arrays/__pycache__/period.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/arrays/__pycache__/period.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..05d4d11cd54a291a2b54812937277a4b7423231d Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/arrays/__pycache__/period.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/arrays/__pycache__/string_.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/arrays/__pycache__/string_.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..8d314b6e40f012ddd879927e562463a8e0d254ca Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/arrays/__pycache__/string_.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/arrays/__pycache__/string_arrow.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/arrays/__pycache__/string_arrow.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..12661f10cfe813ba35db4fde53d615e02be7f621 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/arrays/__pycache__/string_arrow.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/arrays/__pycache__/timedeltas.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/arrays/__pycache__/timedeltas.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..85cc1ccab490a5814debcb13856fc2fe7e80ae23 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/arrays/__pycache__/timedeltas.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/arrays/_utils.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/arrays/_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..6b46396d5efdfa4301a5362c8a5a71678345479b --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/arrays/_utils.py @@ -0,0 +1,63 @@ +from __future__ import annotations + +from typing import ( + TYPE_CHECKING, + Any, +) + +import numpy as np + +from pandas._libs import lib +from pandas.errors import LossySetitemError + +from pandas.core.dtypes.cast import np_can_hold_element +from pandas.core.dtypes.common import is_numeric_dtype + +if TYPE_CHECKING: + from pandas._typing import ( + ArrayLike, + npt, + ) + + +def to_numpy_dtype_inference( + arr: ArrayLike, dtype: npt.DTypeLike | None, na_value, hasna: bool +) -> tuple[npt.DTypeLike, Any]: + if dtype is None and is_numeric_dtype(arr.dtype): + dtype_given = False + if hasna: + if arr.dtype.kind == "b": + dtype = np.dtype(np.object_) + else: + if arr.dtype.kind in "iu": + dtype = np.dtype(np.float64) + else: + dtype = arr.dtype.numpy_dtype # type: ignore[union-attr] + if na_value is lib.no_default: + na_value = np.nan + else: + dtype = arr.dtype.numpy_dtype # type: ignore[union-attr] + elif dtype is not None: + dtype = np.dtype(dtype) + dtype_given = True + else: + dtype_given = True + + if na_value is lib.no_default: + if dtype is None or not hasna: + na_value = arr.dtype.na_value + elif dtype.kind == "f": # type: ignore[union-attr] + na_value = np.nan + elif dtype.kind == "M": # type: ignore[union-attr] + na_value = np.datetime64("nat") + elif dtype.kind == "m": # type: ignore[union-attr] + na_value = np.timedelta64("nat") + else: + na_value = arr.dtype.na_value + + if not dtype_given and hasna: + try: + np_can_hold_element(dtype, na_value) # type: ignore[arg-type] + except LossySetitemError: + dtype = np.dtype(np.object_) + return dtype, na_value diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/arrays/arrow/__init__.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/arrays/arrow/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..5fc50f786fc6a6c51f78ef9ebd4ee6ed26a2bab3 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/arrays/arrow/__init__.py @@ -0,0 +1,7 @@ +from pandas.core.arrays.arrow.accessors import ( + ListAccessor, + StructAccessor, +) +from pandas.core.arrays.arrow.array import ArrowExtensionArray + +__all__ = ["ArrowExtensionArray", "StructAccessor", "ListAccessor"] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/arrays/arrow/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/arrays/arrow/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..085c42d1c4b557758cb9f27729a41c02eb4b8301 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/arrays/arrow/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/arrays/arrow/__pycache__/_arrow_utils.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/arrays/arrow/__pycache__/_arrow_utils.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ad7601f1b64f7891ab9f07524d14fa9b8bc099e1 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/arrays/arrow/__pycache__/_arrow_utils.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/arrays/arrow/__pycache__/accessors.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/arrays/arrow/__pycache__/accessors.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..4bbaa3343c6886aab621809d9d88f91fea43e44d Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/arrays/arrow/__pycache__/accessors.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/arrays/arrow/__pycache__/extension_types.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/arrays/arrow/__pycache__/extension_types.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b893bcb654c7ca88489b198260f4cf925cbe468b Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/arrays/arrow/__pycache__/extension_types.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/arrays/arrow/_arrow_utils.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/arrays/arrow/_arrow_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..2a053fac2985c7d5a311ff896dcce846b6aa6e97 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/arrays/arrow/_arrow_utils.py @@ -0,0 +1,66 @@ +from __future__ import annotations + +import warnings + +import numpy as np +import pyarrow + +from pandas.errors import PerformanceWarning +from pandas.util._exceptions import find_stack_level + + +def fallback_performancewarning(version: str | None = None) -> None: + """ + Raise a PerformanceWarning for falling back to ExtensionArray's + non-pyarrow method + """ + msg = "Falling back on a non-pyarrow code path which may decrease performance." + if version is not None: + msg += f" Upgrade to pyarrow >={version} to possibly suppress this warning." + warnings.warn(msg, PerformanceWarning, stacklevel=find_stack_level()) + + +def pyarrow_array_to_numpy_and_mask( + arr, dtype: np.dtype +) -> tuple[np.ndarray, np.ndarray]: + """ + Convert a primitive pyarrow.Array to a numpy array and boolean mask based + on the buffers of the Array. + + At the moment pyarrow.BooleanArray is not supported. + + Parameters + ---------- + arr : pyarrow.Array + dtype : numpy.dtype + + Returns + ------- + (data, mask) + Tuple of two numpy arrays with the raw data (with specified dtype) and + a boolean mask (validity mask, so False means missing) + """ + dtype = np.dtype(dtype) + + if pyarrow.types.is_null(arr.type): + # No initialization of data is needed since everything is null + data = np.empty(len(arr), dtype=dtype) + mask = np.zeros(len(arr), dtype=bool) + return data, mask + buflist = arr.buffers() + # Since Arrow buffers might contain padding and the data might be offset, + # the buffer gets sliced here before handing it to numpy. + # See also https://github.com/pandas-dev/pandas/issues/40896 + offset = arr.offset * dtype.itemsize + length = len(arr) * dtype.itemsize + data_buf = buflist[1][offset : offset + length] + data = np.frombuffer(data_buf, dtype=dtype) + bitmask = buflist[0] + if bitmask is not None: + mask = pyarrow.BooleanArray.from_buffers( + pyarrow.bool_(), len(arr), [None, bitmask], offset=arr.offset + ) + mask = np.asarray(mask) + else: + mask = np.ones(len(arr), dtype=bool) + return data, mask diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/arrays/arrow/accessors.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/arrays/arrow/accessors.py new file mode 100644 index 0000000000000000000000000000000000000000..124f8fb6ad8bce4b55703e99d730def1dbff90f9 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/arrays/arrow/accessors.py @@ -0,0 +1,473 @@ +"""Accessors for arrow-backed data.""" + +from __future__ import annotations + +from abc import ( + ABCMeta, + abstractmethod, +) +from typing import ( + TYPE_CHECKING, + cast, +) + +from pandas.compat import ( + pa_version_under10p1, + pa_version_under11p0, +) + +from pandas.core.dtypes.common import is_list_like + +if not pa_version_under10p1: + import pyarrow as pa + import pyarrow.compute as pc + + from pandas.core.dtypes.dtypes import ArrowDtype + +if TYPE_CHECKING: + from collections.abc import Iterator + + from pandas import ( + DataFrame, + Series, + ) + + +class ArrowAccessor(metaclass=ABCMeta): + @abstractmethod + def __init__(self, data, validation_msg: str) -> None: + self._data = data + self._validation_msg = validation_msg + self._validate(data) + + @abstractmethod + def _is_valid_pyarrow_dtype(self, pyarrow_dtype) -> bool: + pass + + def _validate(self, data): + dtype = data.dtype + if not isinstance(dtype, ArrowDtype): + # Raise AttributeError so that inspect can handle non-struct Series. + raise AttributeError(self._validation_msg.format(dtype=dtype)) + + if not self._is_valid_pyarrow_dtype(dtype.pyarrow_dtype): + # Raise AttributeError so that inspect can handle invalid Series. + raise AttributeError(self._validation_msg.format(dtype=dtype)) + + @property + def _pa_array(self): + return self._data.array._pa_array + + +class ListAccessor(ArrowAccessor): + """ + Accessor object for list data properties of the Series values. + + Parameters + ---------- + data : Series + Series containing Arrow list data. + """ + + def __init__(self, data=None) -> None: + super().__init__( + data, + validation_msg="Can only use the '.list' accessor with " + "'list[pyarrow]' dtype, not {dtype}.", + ) + + def _is_valid_pyarrow_dtype(self, pyarrow_dtype) -> bool: + return ( + pa.types.is_list(pyarrow_dtype) + or pa.types.is_fixed_size_list(pyarrow_dtype) + or pa.types.is_large_list(pyarrow_dtype) + ) + + def len(self) -> Series: + """ + Return the length of each list in the Series. + + Returns + ------- + pandas.Series + The length of each list. + + Examples + -------- + >>> import pyarrow as pa + >>> s = pd.Series( + ... [ + ... [1, 2, 3], + ... [3], + ... ], + ... dtype=pd.ArrowDtype(pa.list_( + ... pa.int64() + ... )) + ... ) + >>> s.list.len() + 0 3 + 1 1 + dtype: int32[pyarrow] + """ + from pandas import Series + + value_lengths = pc.list_value_length(self._pa_array) + return Series(value_lengths, dtype=ArrowDtype(value_lengths.type)) + + def __getitem__(self, key: int | slice) -> Series: + """ + Index or slice lists in the Series. + + Parameters + ---------- + key : int | slice + Index or slice of indices to access from each list. + + Returns + ------- + pandas.Series + The list at requested index. + + Examples + -------- + >>> import pyarrow as pa + >>> s = pd.Series( + ... [ + ... [1, 2, 3], + ... [3], + ... ], + ... dtype=pd.ArrowDtype(pa.list_( + ... pa.int64() + ... )) + ... ) + >>> s.list[0] + 0 1 + 1 3 + dtype: int64[pyarrow] + """ + from pandas import Series + + if isinstance(key, int): + # TODO: Support negative key but pyarrow does not allow + # element index to be an array. + # if key < 0: + # key = pc.add(key, pc.list_value_length(self._pa_array)) + element = pc.list_element(self._pa_array, key) + return Series(element, dtype=ArrowDtype(element.type)) + elif isinstance(key, slice): + if pa_version_under11p0: + raise NotImplementedError( + f"List slice not supported by pyarrow {pa.__version__}." + ) + + # TODO: Support negative start/stop/step, ideally this would be added + # upstream in pyarrow. + start, stop, step = key.start, key.stop, key.step + if start is None: + # TODO: When adding negative step support + # this should be setto last element of array + # when step is negative. + start = 0 + if step is None: + step = 1 + sliced = pc.list_slice(self._pa_array, start, stop, step) + return Series(sliced, dtype=ArrowDtype(sliced.type)) + else: + raise ValueError(f"key must be an int or slice, got {type(key).__name__}") + + def __iter__(self) -> Iterator: + raise TypeError(f"'{type(self).__name__}' object is not iterable") + + def flatten(self) -> Series: + """ + Flatten list values. + + Returns + ------- + pandas.Series + The data from all lists in the series flattened. + + Examples + -------- + >>> import pyarrow as pa + >>> s = pd.Series( + ... [ + ... [1, 2, 3], + ... [3], + ... ], + ... dtype=pd.ArrowDtype(pa.list_( + ... pa.int64() + ... )) + ... ) + >>> s.list.flatten() + 0 1 + 1 2 + 2 3 + 3 3 + dtype: int64[pyarrow] + """ + from pandas import Series + + flattened = pc.list_flatten(self._pa_array) + return Series(flattened, dtype=ArrowDtype(flattened.type)) + + +class StructAccessor(ArrowAccessor): + """ + Accessor object for structured data properties of the Series values. + + Parameters + ---------- + data : Series + Series containing Arrow struct data. + """ + + def __init__(self, data=None) -> None: + super().__init__( + data, + validation_msg=( + "Can only use the '.struct' accessor with 'struct[pyarrow]' " + "dtype, not {dtype}." + ), + ) + + def _is_valid_pyarrow_dtype(self, pyarrow_dtype) -> bool: + return pa.types.is_struct(pyarrow_dtype) + + @property + def dtypes(self) -> Series: + """ + Return the dtype object of each child field of the struct. + + Returns + ------- + pandas.Series + The data type of each child field. + + Examples + -------- + >>> import pyarrow as pa + >>> s = pd.Series( + ... [ + ... {"version": 1, "project": "pandas"}, + ... {"version": 2, "project": "pandas"}, + ... {"version": 1, "project": "numpy"}, + ... ], + ... dtype=pd.ArrowDtype(pa.struct( + ... [("version", pa.int64()), ("project", pa.string())] + ... )) + ... ) + >>> s.struct.dtypes + version int64[pyarrow] + project string[pyarrow] + dtype: object + """ + from pandas import ( + Index, + Series, + ) + + pa_type = self._data.dtype.pyarrow_dtype + types = [ArrowDtype(struct.type) for struct in pa_type] + names = [struct.name for struct in pa_type] + return Series(types, index=Index(names)) + + def field( + self, + name_or_index: list[str] + | list[bytes] + | list[int] + | pc.Expression + | bytes + | str + | int, + ) -> Series: + """ + Extract a child field of a struct as a Series. + + Parameters + ---------- + name_or_index : str | bytes | int | expression | list + Name or index of the child field to extract. + + For list-like inputs, this will index into a nested + struct. + + Returns + ------- + pandas.Series + The data corresponding to the selected child field. + + See Also + -------- + Series.struct.explode : Return all child fields as a DataFrame. + + Notes + ----- + The name of the resulting Series will be set using the following + rules: + + - For string, bytes, or integer `name_or_index` (or a list of these, for + a nested selection), the Series name is set to the selected + field's name. + - For a :class:`pyarrow.compute.Expression`, this is set to + the string form of the expression. + - For list-like `name_or_index`, the name will be set to the + name of the final field selected. + + Examples + -------- + >>> import pyarrow as pa + >>> s = pd.Series( + ... [ + ... {"version": 1, "project": "pandas"}, + ... {"version": 2, "project": "pandas"}, + ... {"version": 1, "project": "numpy"}, + ... ], + ... dtype=pd.ArrowDtype(pa.struct( + ... [("version", pa.int64()), ("project", pa.string())] + ... )) + ... ) + + Extract by field name. + + >>> s.struct.field("project") + 0 pandas + 1 pandas + 2 numpy + Name: project, dtype: string[pyarrow] + + Extract by field index. + + >>> s.struct.field(0) + 0 1 + 1 2 + 2 1 + Name: version, dtype: int64[pyarrow] + + Or an expression + + >>> import pyarrow.compute as pc + >>> s.struct.field(pc.field("project")) + 0 pandas + 1 pandas + 2 numpy + Name: project, dtype: string[pyarrow] + + For nested struct types, you can pass a list of values to index + multiple levels: + + >>> version_type = pa.struct([ + ... ("major", pa.int64()), + ... ("minor", pa.int64()), + ... ]) + >>> s = pd.Series( + ... [ + ... {"version": {"major": 1, "minor": 5}, "project": "pandas"}, + ... {"version": {"major": 2, "minor": 1}, "project": "pandas"}, + ... {"version": {"major": 1, "minor": 26}, "project": "numpy"}, + ... ], + ... dtype=pd.ArrowDtype(pa.struct( + ... [("version", version_type), ("project", pa.string())] + ... )) + ... ) + >>> s.struct.field(["version", "minor"]) + 0 5 + 1 1 + 2 26 + Name: minor, dtype: int64[pyarrow] + >>> s.struct.field([0, 0]) + 0 1 + 1 2 + 2 1 + Name: major, dtype: int64[pyarrow] + """ + from pandas import Series + + def get_name( + level_name_or_index: list[str] + | list[bytes] + | list[int] + | pc.Expression + | bytes + | str + | int, + data: pa.ChunkedArray, + ): + if isinstance(level_name_or_index, int): + name = data.type.field(level_name_or_index).name + elif isinstance(level_name_or_index, (str, bytes)): + name = level_name_or_index + elif isinstance(level_name_or_index, pc.Expression): + name = str(level_name_or_index) + elif is_list_like(level_name_or_index): + # For nested input like [2, 1, 2] + # iteratively get the struct and field name. The last + # one is used for the name of the index. + level_name_or_index = list(reversed(level_name_or_index)) + selected = data + while level_name_or_index: + # we need the cast, otherwise mypy complains about + # getting ints, bytes, or str here, which isn't possible. + level_name_or_index = cast(list, level_name_or_index) + name_or_index = level_name_or_index.pop() + name = get_name(name_or_index, selected) + selected = selected.type.field(selected.type.get_field_index(name)) + name = selected.name + else: + raise ValueError( + "name_or_index must be an int, str, bytes, " + "pyarrow.compute.Expression, or list of those" + ) + return name + + pa_arr = self._data.array._pa_array + name = get_name(name_or_index, pa_arr) + field_arr = pc.struct_field(pa_arr, name_or_index) + + return Series( + field_arr, + dtype=ArrowDtype(field_arr.type), + index=self._data.index, + name=name, + ) + + def explode(self) -> DataFrame: + """ + Extract all child fields of a struct as a DataFrame. + + Returns + ------- + pandas.DataFrame + The data corresponding to all child fields. + + See Also + -------- + Series.struct.field : Return a single child field as a Series. + + Examples + -------- + >>> import pyarrow as pa + >>> s = pd.Series( + ... [ + ... {"version": 1, "project": "pandas"}, + ... {"version": 2, "project": "pandas"}, + ... {"version": 1, "project": "numpy"}, + ... ], + ... dtype=pd.ArrowDtype(pa.struct( + ... [("version", pa.int64()), ("project", pa.string())] + ... )) + ... ) + + >>> s.struct.explode() + version project + 0 1 pandas + 1 2 pandas + 2 1 numpy + """ + from pandas import concat + + pa_type = self._pa_array.type + return concat( + [self.field(i) for i in range(pa_type.num_fields)], axis="columns" + ) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/arrays/arrow/array.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/arrays/arrow/array.py new file mode 100644 index 0000000000000000000000000000000000000000..f2b8aa75ca5bfa0a860fe6709539cb19b316758d --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/arrays/arrow/array.py @@ -0,0 +1,2942 @@ +from __future__ import annotations + +import functools +import operator +import re +import textwrap +from typing import ( + TYPE_CHECKING, + Any, + Callable, + Literal, + cast, +) +import unicodedata + +import numpy as np + +from pandas._libs import lib +from pandas._libs.tslibs import ( + NaT, + Timedelta, + Timestamp, + timezones, +) +from pandas.compat import ( + pa_version_under10p1, + pa_version_under11p0, + pa_version_under13p0, +) +from pandas.util._decorators import doc +from pandas.util._validators import validate_fillna_kwargs + +from pandas.core.dtypes.cast import ( + can_hold_element, + infer_dtype_from_scalar, +) +from pandas.core.dtypes.common import ( + CategoricalDtype, + is_array_like, + is_bool_dtype, + is_float_dtype, + is_integer, + is_list_like, + is_numeric_dtype, + is_scalar, +) +from pandas.core.dtypes.dtypes import DatetimeTZDtype +from pandas.core.dtypes.missing import isna + +from pandas.core import ( + algorithms as algos, + missing, + ops, + roperator, +) +from pandas.core.algorithms import map_array +from pandas.core.arraylike import OpsMixin +from pandas.core.arrays._arrow_string_mixins import ArrowStringArrayMixin +from pandas.core.arrays._utils import to_numpy_dtype_inference +from pandas.core.arrays.base import ( + ExtensionArray, + ExtensionArraySupportsAnyAll, +) +from pandas.core.arrays.masked import BaseMaskedArray +from pandas.core.arrays.string_ import StringDtype +import pandas.core.common as com +from pandas.core.indexers import ( + check_array_indexer, + unpack_tuple_and_ellipses, + validate_indices, +) +from pandas.core.strings.base import BaseStringArrayMethods + +from pandas.io._util import _arrow_dtype_mapping +from pandas.tseries.frequencies import to_offset + +if not pa_version_under10p1: + import pyarrow as pa + import pyarrow.compute as pc + + from pandas.core.dtypes.dtypes import ArrowDtype + + ARROW_CMP_FUNCS = { + "eq": pc.equal, + "ne": pc.not_equal, + "lt": pc.less, + "gt": pc.greater, + "le": pc.less_equal, + "ge": pc.greater_equal, + } + + ARROW_LOGICAL_FUNCS = { + "and_": pc.and_kleene, + "rand_": lambda x, y: pc.and_kleene(y, x), + "or_": pc.or_kleene, + "ror_": lambda x, y: pc.or_kleene(y, x), + "xor": pc.xor, + "rxor": lambda x, y: pc.xor(y, x), + } + + ARROW_BIT_WISE_FUNCS = { + "and_": pc.bit_wise_and, + "rand_": lambda x, y: pc.bit_wise_and(y, x), + "or_": pc.bit_wise_or, + "ror_": lambda x, y: pc.bit_wise_or(y, x), + "xor": pc.bit_wise_xor, + "rxor": lambda x, y: pc.bit_wise_xor(y, x), + } + + def cast_for_truediv( + arrow_array: pa.ChunkedArray, pa_object: pa.Array | pa.Scalar + ) -> tuple[pa.ChunkedArray, pa.Array | pa.Scalar]: + # Ensure int / int -> float mirroring Python/Numpy behavior + # as pc.divide_checked(int, int) -> int + if pa.types.is_integer(arrow_array.type) and pa.types.is_integer( + pa_object.type + ): + # GH: 56645. + # https://github.com/apache/arrow/issues/35563 + return pc.cast(arrow_array, pa.float64(), safe=False), pc.cast( + pa_object, pa.float64(), safe=False + ) + + return arrow_array, pa_object + + def floordiv_compat( + left: pa.ChunkedArray | pa.Array | pa.Scalar, + right: pa.ChunkedArray | pa.Array | pa.Scalar, + ) -> pa.ChunkedArray: + # TODO: Replace with pyarrow floordiv kernel. + # https://github.com/apache/arrow/issues/39386 + if pa.types.is_integer(left.type) and pa.types.is_integer(right.type): + divided = pc.divide_checked(left, right) + if pa.types.is_signed_integer(divided.type): + # GH 56676 + has_remainder = pc.not_equal(pc.multiply(divided, right), left) + has_one_negative_operand = pc.less( + pc.bit_wise_xor(left, right), + pa.scalar(0, type=divided.type), + ) + result = pc.if_else( + pc.and_( + has_remainder, + has_one_negative_operand, + ), + # GH: 55561 + pc.subtract(divided, pa.scalar(1, type=divided.type)), + divided, + ) + else: + result = divided + result = result.cast(left.type) + else: + divided = pc.divide(left, right) + result = pc.floor(divided) + return result + + ARROW_ARITHMETIC_FUNCS = { + "add": pc.add_checked, + "radd": lambda x, y: pc.add_checked(y, x), + "sub": pc.subtract_checked, + "rsub": lambda x, y: pc.subtract_checked(y, x), + "mul": pc.multiply_checked, + "rmul": lambda x, y: pc.multiply_checked(y, x), + "truediv": lambda x, y: pc.divide(*cast_for_truediv(x, y)), + "rtruediv": lambda x, y: pc.divide(*cast_for_truediv(y, x)), + "floordiv": lambda x, y: floordiv_compat(x, y), + "rfloordiv": lambda x, y: floordiv_compat(y, x), + "mod": NotImplemented, + "rmod": NotImplemented, + "divmod": NotImplemented, + "rdivmod": NotImplemented, + "pow": pc.power_checked, + "rpow": lambda x, y: pc.power_checked(y, x), + } + +if TYPE_CHECKING: + from collections.abc import Sequence + + from pandas._typing import ( + ArrayLike, + AxisInt, + Dtype, + FillnaOptions, + InterpolateOptions, + Iterator, + NpDtype, + NumpySorter, + NumpyValueArrayLike, + PositionalIndexer, + Scalar, + Self, + SortKind, + TakeIndexer, + TimeAmbiguous, + TimeNonexistent, + npt, + ) + + from pandas import Series + from pandas.core.arrays.datetimes import DatetimeArray + from pandas.core.arrays.timedeltas import TimedeltaArray + + +def get_unit_from_pa_dtype(pa_dtype): + # https://github.com/pandas-dev/pandas/pull/50998#discussion_r1100344804 + if pa_version_under11p0: + unit = str(pa_dtype).split("[", 1)[-1][:-1] + if unit not in ["s", "ms", "us", "ns"]: + raise ValueError(pa_dtype) + return unit + return pa_dtype.unit + + +def to_pyarrow_type( + dtype: ArrowDtype | pa.DataType | Dtype | None, +) -> pa.DataType | None: + """ + Convert dtype to a pyarrow type instance. + """ + if isinstance(dtype, ArrowDtype): + return dtype.pyarrow_dtype + elif isinstance(dtype, pa.DataType): + return dtype + elif isinstance(dtype, DatetimeTZDtype): + return pa.timestamp(dtype.unit, dtype.tz) + elif dtype: + try: + # Accepts python types too + # Doesn't handle all numpy types + return pa.from_numpy_dtype(dtype) + except pa.ArrowNotImplementedError: + pass + return None + + +class ArrowExtensionArray( + OpsMixin, + ExtensionArraySupportsAnyAll, + ArrowStringArrayMixin, + BaseStringArrayMethods, +): + """ + Pandas ExtensionArray backed by a PyArrow ChunkedArray. + + .. warning:: + + ArrowExtensionArray is considered experimental. The implementation and + parts of the API may change without warning. + + Parameters + ---------- + values : pyarrow.Array or pyarrow.ChunkedArray + + Attributes + ---------- + None + + Methods + ------- + None + + Returns + ------- + ArrowExtensionArray + + Notes + ----- + Most methods are implemented using `pyarrow compute functions. `__ + Some methods may either raise an exception or raise a ``PerformanceWarning`` if an + associated compute function is not available based on the installed version of PyArrow. + + Please install the latest version of PyArrow to enable the best functionality and avoid + potential bugs in prior versions of PyArrow. + + Examples + -------- + Create an ArrowExtensionArray with :func:`pandas.array`: + + >>> pd.array([1, 1, None], dtype="int64[pyarrow]") + + [1, 1, ] + Length: 3, dtype: int64[pyarrow] + """ # noqa: E501 (http link too long) + + _pa_array: pa.ChunkedArray + _dtype: ArrowDtype + + def __init__(self, values: pa.Array | pa.ChunkedArray) -> None: + if pa_version_under10p1: + msg = "pyarrow>=10.0.1 is required for PyArrow backed ArrowExtensionArray." + raise ImportError(msg) + if isinstance(values, pa.Array): + self._pa_array = pa.chunked_array([values]) + elif isinstance(values, pa.ChunkedArray): + self._pa_array = values + else: + raise ValueError( + f"Unsupported type '{type(values)}' for ArrowExtensionArray" + ) + self._dtype = ArrowDtype(self._pa_array.type) + + @classmethod + def _from_sequence(cls, scalars, *, dtype: Dtype | None = None, copy: bool = False): + """ + Construct a new ExtensionArray from a sequence of scalars. + """ + pa_type = to_pyarrow_type(dtype) + pa_array = cls._box_pa_array(scalars, pa_type=pa_type, copy=copy) + arr = cls(pa_array) + return arr + + @classmethod + def _from_sequence_of_strings( + cls, strings, *, dtype: Dtype | None = None, copy: bool = False + ): + """ + Construct a new ExtensionArray from a sequence of strings. + """ + pa_type = to_pyarrow_type(dtype) + if ( + pa_type is None + or pa.types.is_binary(pa_type) + or pa.types.is_string(pa_type) + or pa.types.is_large_string(pa_type) + ): + # pa_type is None: Let pa.array infer + # pa_type is string/binary: scalars already correct type + scalars = strings + elif pa.types.is_timestamp(pa_type): + from pandas.core.tools.datetimes import to_datetime + + scalars = to_datetime(strings, errors="raise") + elif pa.types.is_date(pa_type): + from pandas.core.tools.datetimes import to_datetime + + scalars = to_datetime(strings, errors="raise").date + elif pa.types.is_duration(pa_type): + from pandas.core.tools.timedeltas import to_timedelta + + scalars = to_timedelta(strings, errors="raise") + if pa_type.unit != "ns": + # GH51175: test_from_sequence_of_strings_pa_array + # attempt to parse as int64 reflecting pyarrow's + # duration to string casting behavior + mask = isna(scalars) + if not isinstance(strings, (pa.Array, pa.ChunkedArray)): + strings = pa.array(strings, type=pa.string(), from_pandas=True) + strings = pc.if_else(mask, None, strings) + try: + scalars = strings.cast(pa.int64()) + except pa.ArrowInvalid: + pass + elif pa.types.is_time(pa_type): + from pandas.core.tools.times import to_time + + # "coerce" to allow "null times" (None) to not raise + scalars = to_time(strings, errors="coerce") + elif pa.types.is_boolean(pa_type): + # pyarrow string->bool casting is case-insensitive: + # "true" or "1" -> True + # "false" or "0" -> False + # Note: BooleanArray was previously used to parse these strings + # and allows "1.0" and "0.0". Pyarrow casting does not support + # this, but we allow it here. + if isinstance(strings, (pa.Array, pa.ChunkedArray)): + scalars = strings + else: + scalars = pa.array(strings, type=pa.string(), from_pandas=True) + scalars = pc.if_else(pc.equal(scalars, "1.0"), "1", scalars) + scalars = pc.if_else(pc.equal(scalars, "0.0"), "0", scalars) + scalars = scalars.cast(pa.bool_()) + elif ( + pa.types.is_integer(pa_type) + or pa.types.is_floating(pa_type) + or pa.types.is_decimal(pa_type) + ): + from pandas.core.tools.numeric import to_numeric + + scalars = to_numeric(strings, errors="raise") + else: + raise NotImplementedError( + f"Converting strings to {pa_type} is not implemented." + ) + return cls._from_sequence(scalars, dtype=pa_type, copy=copy) + + @classmethod + def _box_pa( + cls, value, pa_type: pa.DataType | None = None + ) -> pa.Array | pa.ChunkedArray | pa.Scalar: + """ + Box value into a pyarrow Array, ChunkedArray or Scalar. + + Parameters + ---------- + value : any + pa_type : pa.DataType | None + + Returns + ------- + pa.Array or pa.ChunkedArray or pa.Scalar + """ + if isinstance(value, pa.Scalar) or not is_list_like(value): + return cls._box_pa_scalar(value, pa_type) + return cls._box_pa_array(value, pa_type) + + @classmethod + def _box_pa_scalar(cls, value, pa_type: pa.DataType | None = None) -> pa.Scalar: + """ + Box value into a pyarrow Scalar. + + Parameters + ---------- + value : any + pa_type : pa.DataType | None + + Returns + ------- + pa.Scalar + """ + if isinstance(value, pa.Scalar): + pa_scalar = value + elif isna(value): + pa_scalar = pa.scalar(None, type=pa_type) + else: + # Workaround https://github.com/apache/arrow/issues/37291 + if isinstance(value, Timedelta): + if pa_type is None: + pa_type = pa.duration(value.unit) + elif value.unit != pa_type.unit: + value = value.as_unit(pa_type.unit) + value = value._value + elif isinstance(value, Timestamp): + if pa_type is None: + pa_type = pa.timestamp(value.unit, tz=value.tz) + elif value.unit != pa_type.unit: + value = value.as_unit(pa_type.unit) + value = value._value + + pa_scalar = pa.scalar(value, type=pa_type, from_pandas=True) + + if pa_type is not None and pa_scalar.type != pa_type: + pa_scalar = pa_scalar.cast(pa_type) + + return pa_scalar + + @classmethod + def _box_pa_array( + cls, value, pa_type: pa.DataType | None = None, copy: bool = False + ) -> pa.Array | pa.ChunkedArray: + """ + Box value into a pyarrow Array or ChunkedArray. + + Parameters + ---------- + value : Sequence + pa_type : pa.DataType | None + + Returns + ------- + pa.Array or pa.ChunkedArray + """ + if isinstance(value, cls): + pa_array = value._pa_array + elif isinstance(value, (pa.Array, pa.ChunkedArray)): + pa_array = value + elif isinstance(value, BaseMaskedArray): + # GH 52625 + if copy: + value = value.copy() + pa_array = value.__arrow_array__() + else: + if ( + isinstance(value, np.ndarray) + and pa_type is not None + and ( + pa.types.is_large_binary(pa_type) + or pa.types.is_large_string(pa_type) + ) + ): + # See https://github.com/apache/arrow/issues/35289 + value = value.tolist() + elif copy and is_array_like(value): + # pa array should not get updated when numpy array is updated + value = value.copy() + + if ( + pa_type is not None + and pa.types.is_duration(pa_type) + and (not isinstance(value, np.ndarray) or value.dtype.kind not in "mi") + ): + # Workaround https://github.com/apache/arrow/issues/37291 + from pandas.core.tools.timedeltas import to_timedelta + + value = to_timedelta(value, unit=pa_type.unit).as_unit(pa_type.unit) + value = value.to_numpy() + + try: + pa_array = pa.array(value, type=pa_type, from_pandas=True) + except (pa.ArrowInvalid, pa.ArrowTypeError): + # GH50430: let pyarrow infer type, then cast + pa_array = pa.array(value, from_pandas=True) + + if pa_type is None and pa.types.is_duration(pa_array.type): + # Workaround https://github.com/apache/arrow/issues/37291 + from pandas.core.tools.timedeltas import to_timedelta + + value = to_timedelta(value) + value = value.to_numpy() + pa_array = pa.array(value, type=pa_type, from_pandas=True) + + if pa.types.is_duration(pa_array.type) and pa_array.null_count > 0: + # GH52843: upstream bug for duration types when originally + # constructed with data containing numpy NaT. + # https://github.com/apache/arrow/issues/35088 + arr = cls(pa_array) + arr = arr.fillna(arr.dtype.na_value) + pa_array = arr._pa_array + + if pa_type is not None and pa_array.type != pa_type: + if pa.types.is_dictionary(pa_type): + pa_array = pa_array.dictionary_encode() + else: + try: + pa_array = pa_array.cast(pa_type) + except ( + pa.ArrowInvalid, + pa.ArrowTypeError, + pa.ArrowNotImplementedError, + ): + if pa.types.is_string(pa_array.type) or pa.types.is_large_string( + pa_array.type + ): + # TODO: Move logic in _from_sequence_of_strings into + # _box_pa_array + return cls._from_sequence_of_strings( + value, dtype=pa_type + )._pa_array + else: + raise + + return pa_array + + def __getitem__(self, item: PositionalIndexer): + """Select a subset of self. + + Parameters + ---------- + item : int, slice, or ndarray + * int: The position in 'self' to get. + * slice: A slice object, where 'start', 'stop', and 'step' are + integers or None + * ndarray: A 1-d boolean NumPy ndarray the same length as 'self' + + Returns + ------- + item : scalar or ExtensionArray + + Notes + ----- + For scalar ``item``, return a scalar value suitable for the array's + type. This should be an instance of ``self.dtype.type``. + For slice ``key``, return an instance of ``ExtensionArray``, even + if the slice is length 0 or 1. + For a boolean mask, return an instance of ``ExtensionArray``, filtered + to the values where ``item`` is True. + """ + item = check_array_indexer(self, item) + + if isinstance(item, np.ndarray): + if not len(item): + # Removable once we migrate StringDtype[pyarrow] to ArrowDtype[string] + if self._dtype.name == "string" and self._dtype.storage in ( + "pyarrow", + "pyarrow_numpy", + ): + pa_dtype = pa.string() + else: + pa_dtype = self._dtype.pyarrow_dtype + return type(self)(pa.chunked_array([], type=pa_dtype)) + elif item.dtype.kind in "iu": + return self.take(item) + elif item.dtype.kind == "b": + return type(self)(self._pa_array.filter(item)) + else: + raise IndexError( + "Only integers, slices and integer or " + "boolean arrays are valid indices." + ) + elif isinstance(item, tuple): + item = unpack_tuple_and_ellipses(item) + + if item is Ellipsis: + # TODO: should be handled by pyarrow? + item = slice(None) + + if is_scalar(item) and not is_integer(item): + # e.g. "foo" or 2.5 + # exception message copied from numpy + raise IndexError( + r"only integers, slices (`:`), ellipsis (`...`), numpy.newaxis " + r"(`None`) and integer or boolean arrays are valid indices" + ) + # We are not an array indexer, so maybe e.g. a slice or integer + # indexer. We dispatch to pyarrow. + if isinstance(item, slice): + # Arrow bug https://github.com/apache/arrow/issues/38768 + if item.start == item.stop: + pass + elif ( + item.stop is not None + and item.stop < -len(self) + and item.step is not None + and item.step < 0 + ): + item = slice(item.start, None, item.step) + + value = self._pa_array[item] + if isinstance(value, pa.ChunkedArray): + return type(self)(value) + else: + pa_type = self._pa_array.type + scalar = value.as_py() + if scalar is None: + return self._dtype.na_value + elif pa.types.is_timestamp(pa_type) and pa_type.unit != "ns": + # GH 53326 + return Timestamp(scalar).as_unit(pa_type.unit) + elif pa.types.is_duration(pa_type) and pa_type.unit != "ns": + # GH 53326 + return Timedelta(scalar).as_unit(pa_type.unit) + else: + return scalar + + def __iter__(self) -> Iterator[Any]: + """ + Iterate over elements of the array. + """ + na_value = self._dtype.na_value + # GH 53326 + pa_type = self._pa_array.type + box_timestamp = pa.types.is_timestamp(pa_type) and pa_type.unit != "ns" + box_timedelta = pa.types.is_duration(pa_type) and pa_type.unit != "ns" + for value in self._pa_array: + val = value.as_py() + if val is None: + yield na_value + elif box_timestamp: + yield Timestamp(val).as_unit(pa_type.unit) + elif box_timedelta: + yield Timedelta(val).as_unit(pa_type.unit) + else: + yield val + + def __arrow_array__(self, type=None): + """Convert myself to a pyarrow ChunkedArray.""" + return self._pa_array + + def __array__( + self, dtype: NpDtype | None = None, copy: bool | None = None + ) -> np.ndarray: + """Correctly construct numpy arrays when passed to `np.asarray()`.""" + return self.to_numpy(dtype=dtype) + + def __invert__(self) -> Self: + # This is a bit wise op for integer types + if pa.types.is_integer(self._pa_array.type): + return type(self)(pc.bit_wise_not(self._pa_array)) + elif pa.types.is_string(self._pa_array.type) or pa.types.is_large_string( + self._pa_array.type + ): + # Raise TypeError instead of pa.ArrowNotImplementedError + raise TypeError("__invert__ is not supported for string dtypes") + else: + return type(self)(pc.invert(self._pa_array)) + + def __neg__(self) -> Self: + return type(self)(pc.negate_checked(self._pa_array)) + + def __pos__(self) -> Self: + return type(self)(self._pa_array) + + def __abs__(self) -> Self: + return type(self)(pc.abs_checked(self._pa_array)) + + # GH 42600: __getstate__/__setstate__ not necessary once + # https://issues.apache.org/jira/browse/ARROW-10739 is addressed + def __getstate__(self): + state = self.__dict__.copy() + state["_pa_array"] = self._pa_array.combine_chunks() + return state + + def __setstate__(self, state) -> None: + if "_data" in state: + data = state.pop("_data") + else: + data = state["_pa_array"] + state["_pa_array"] = pa.chunked_array(data) + self.__dict__.update(state) + + def _cmp_method(self, other, op): + pc_func = ARROW_CMP_FUNCS[op.__name__] + if isinstance( + other, (ArrowExtensionArray, np.ndarray, list, BaseMaskedArray) + ) or isinstance(getattr(other, "dtype", None), CategoricalDtype): + result = pc_func(self._pa_array, self._box_pa(other)) + elif is_scalar(other): + try: + result = pc_func(self._pa_array, self._box_pa(other)) + except (pa.lib.ArrowNotImplementedError, pa.lib.ArrowInvalid): + mask = isna(self) | isna(other) + valid = ~mask + result = np.zeros(len(self), dtype="bool") + np_array = np.array(self) + try: + result[valid] = op(np_array[valid], other) + except TypeError: + result = ops.invalid_comparison(np_array, other, op) + result = pa.array(result, type=pa.bool_()) + result = pc.if_else(valid, result, None) + else: + raise NotImplementedError( + f"{op.__name__} not implemented for {type(other)}" + ) + return ArrowExtensionArray(result) + + def _evaluate_op_method(self, other, op, arrow_funcs): + pa_type = self._pa_array.type + other = self._box_pa(other) + + if ( + pa.types.is_string(pa_type) + or pa.types.is_large_string(pa_type) + or pa.types.is_binary(pa_type) + ): + if op in [operator.add, roperator.radd]: + sep = pa.scalar("", type=pa_type) + if op is operator.add: + result = pc.binary_join_element_wise(self._pa_array, other, sep) + elif op is roperator.radd: + result = pc.binary_join_element_wise(other, self._pa_array, sep) + return type(self)(result) + elif op in [operator.mul, roperator.rmul]: + binary = self._pa_array + integral = other + if not pa.types.is_integer(integral.type): + raise TypeError("Can only string multiply by an integer.") + pa_integral = pc.if_else(pc.less(integral, 0), 0, integral) + result = pc.binary_repeat(binary, pa_integral) + return type(self)(result) + elif ( + pa.types.is_string(other.type) + or pa.types.is_binary(other.type) + or pa.types.is_large_string(other.type) + ) and op in [operator.mul, roperator.rmul]: + binary = other + integral = self._pa_array + if not pa.types.is_integer(integral.type): + raise TypeError("Can only string multiply by an integer.") + pa_integral = pc.if_else(pc.less(integral, 0), 0, integral) + result = pc.binary_repeat(binary, pa_integral) + return type(self)(result) + if ( + isinstance(other, pa.Scalar) + and pc.is_null(other).as_py() + and op.__name__ in ARROW_LOGICAL_FUNCS + ): + # pyarrow kleene ops require null to be typed + other = other.cast(pa_type) + + pc_func = arrow_funcs[op.__name__] + if pc_func is NotImplemented: + raise NotImplementedError(f"{op.__name__} not implemented.") + + result = pc_func(self._pa_array, other) + return type(self)(result) + + def _logical_method(self, other, op): + # For integer types `^`, `|`, `&` are bitwise operators and return + # integer types. Otherwise these are boolean ops. + if pa.types.is_integer(self._pa_array.type): + return self._evaluate_op_method(other, op, ARROW_BIT_WISE_FUNCS) + else: + return self._evaluate_op_method(other, op, ARROW_LOGICAL_FUNCS) + + def _arith_method(self, other, op): + return self._evaluate_op_method(other, op, ARROW_ARITHMETIC_FUNCS) + + def equals(self, other) -> bool: + if not isinstance(other, ArrowExtensionArray): + return False + # I'm told that pyarrow makes __eq__ behave like pandas' equals; + # TODO: is this documented somewhere? + return self._pa_array == other._pa_array + + @property + def dtype(self) -> ArrowDtype: + """ + An instance of 'ExtensionDtype'. + """ + return self._dtype + + @property + def nbytes(self) -> int: + """ + The number of bytes needed to store this object in memory. + """ + return self._pa_array.nbytes + + def __len__(self) -> int: + """ + Length of this array. + + Returns + ------- + length : int + """ + return len(self._pa_array) + + def __contains__(self, key) -> bool: + # https://github.com/pandas-dev/pandas/pull/51307#issuecomment-1426372604 + if isna(key) and key is not self.dtype.na_value: + if self.dtype.kind == "f" and lib.is_float(key): + return pc.any(pc.is_nan(self._pa_array)).as_py() + + # e.g. date or timestamp types we do not allow None here to match pd.NA + return False + # TODO: maybe complex? object? + + return bool(super().__contains__(key)) + + @property + def _hasna(self) -> bool: + return self._pa_array.null_count > 0 + + def isna(self) -> npt.NDArray[np.bool_]: + """ + Boolean NumPy array indicating if each value is missing. + + This should return a 1-D array the same length as 'self'. + """ + # GH51630: fast paths + null_count = self._pa_array.null_count + if null_count == 0: + return np.zeros(len(self), dtype=np.bool_) + elif null_count == len(self): + return np.ones(len(self), dtype=np.bool_) + + return self._pa_array.is_null().to_numpy() + + def any(self, *, skipna: bool = True, **kwargs): + """ + Return whether any element is truthy. + + Returns False unless there is at least one element that is truthy. + By default, NAs are skipped. If ``skipna=False`` is specified and + missing values are present, similar :ref:`Kleene logic ` + is used as for logical operations. + + Parameters + ---------- + skipna : bool, default True + Exclude NA values. If the entire array is NA and `skipna` is + True, then the result will be False, as for an empty array. + If `skipna` is False, the result will still be True if there is + at least one element that is truthy, otherwise NA will be returned + if there are NA's present. + + Returns + ------- + bool or :attr:`pandas.NA` + + See Also + -------- + ArrowExtensionArray.all : Return whether all elements are truthy. + + Examples + -------- + The result indicates whether any element is truthy (and by default + skips NAs): + + >>> pd.array([True, False, True], dtype="boolean[pyarrow]").any() + True + >>> pd.array([True, False, pd.NA], dtype="boolean[pyarrow]").any() + True + >>> pd.array([False, False, pd.NA], dtype="boolean[pyarrow]").any() + False + >>> pd.array([], dtype="boolean[pyarrow]").any() + False + >>> pd.array([pd.NA], dtype="boolean[pyarrow]").any() + False + >>> pd.array([pd.NA], dtype="float64[pyarrow]").any() + False + + With ``skipna=False``, the result can be NA if this is logically + required (whether ``pd.NA`` is True or False influences the result): + + >>> pd.array([True, False, pd.NA], dtype="boolean[pyarrow]").any(skipna=False) + True + >>> pd.array([1, 0, pd.NA], dtype="boolean[pyarrow]").any(skipna=False) + True + >>> pd.array([False, False, pd.NA], dtype="boolean[pyarrow]").any(skipna=False) + + >>> pd.array([0, 0, pd.NA], dtype="boolean[pyarrow]").any(skipna=False) + + """ + return self._reduce("any", skipna=skipna, **kwargs) + + def all(self, *, skipna: bool = True, **kwargs): + """ + Return whether all elements are truthy. + + Returns True unless there is at least one element that is falsey. + By default, NAs are skipped. If ``skipna=False`` is specified and + missing values are present, similar :ref:`Kleene logic ` + is used as for logical operations. + + Parameters + ---------- + skipna : bool, default True + Exclude NA values. If the entire array is NA and `skipna` is + True, then the result will be True, as for an empty array. + If `skipna` is False, the result will still be False if there is + at least one element that is falsey, otherwise NA will be returned + if there are NA's present. + + Returns + ------- + bool or :attr:`pandas.NA` + + See Also + -------- + ArrowExtensionArray.any : Return whether any element is truthy. + + Examples + -------- + The result indicates whether all elements are truthy (and by default + skips NAs): + + >>> pd.array([True, True, pd.NA], dtype="boolean[pyarrow]").all() + True + >>> pd.array([1, 1, pd.NA], dtype="boolean[pyarrow]").all() + True + >>> pd.array([True, False, pd.NA], dtype="boolean[pyarrow]").all() + False + >>> pd.array([], dtype="boolean[pyarrow]").all() + True + >>> pd.array([pd.NA], dtype="boolean[pyarrow]").all() + True + >>> pd.array([pd.NA], dtype="float64[pyarrow]").all() + True + + With ``skipna=False``, the result can be NA if this is logically + required (whether ``pd.NA`` is True or False influences the result): + + >>> pd.array([True, True, pd.NA], dtype="boolean[pyarrow]").all(skipna=False) + + >>> pd.array([1, 1, pd.NA], dtype="boolean[pyarrow]").all(skipna=False) + + >>> pd.array([True, False, pd.NA], dtype="boolean[pyarrow]").all(skipna=False) + False + >>> pd.array([1, 0, pd.NA], dtype="boolean[pyarrow]").all(skipna=False) + False + """ + return self._reduce("all", skipna=skipna, **kwargs) + + def argsort( + self, + *, + ascending: bool = True, + kind: SortKind = "quicksort", + na_position: str = "last", + **kwargs, + ) -> np.ndarray: + order = "ascending" if ascending else "descending" + null_placement = {"last": "at_end", "first": "at_start"}.get(na_position, None) + if null_placement is None: + raise ValueError(f"invalid na_position: {na_position}") + + result = pc.array_sort_indices( + self._pa_array, order=order, null_placement=null_placement + ) + np_result = result.to_numpy() + return np_result.astype(np.intp, copy=False) + + def _argmin_max(self, skipna: bool, method: str) -> int: + if self._pa_array.length() in (0, self._pa_array.null_count) or ( + self._hasna and not skipna + ): + # For empty or all null, pyarrow returns -1 but pandas expects TypeError + # For skipna=False and data w/ null, pandas expects NotImplementedError + # let ExtensionArray.arg{max|min} raise + return getattr(super(), f"arg{method}")(skipna=skipna) + + data = self._pa_array + if pa.types.is_duration(data.type): + data = data.cast(pa.int64()) + + value = getattr(pc, method)(data, skip_nulls=skipna) + return pc.index(data, value).as_py() + + def argmin(self, skipna: bool = True) -> int: + return self._argmin_max(skipna, "min") + + def argmax(self, skipna: bool = True) -> int: + return self._argmin_max(skipna, "max") + + def copy(self) -> Self: + """ + Return a shallow copy of the array. + + Underlying ChunkedArray is immutable, so a deep copy is unnecessary. + + Returns + ------- + type(self) + """ + return type(self)(self._pa_array) + + def dropna(self) -> Self: + """ + Return ArrowExtensionArray without NA values. + + Returns + ------- + ArrowExtensionArray + """ + return type(self)(pc.drop_null(self._pa_array)) + + def _pad_or_backfill( + self, + *, + method: FillnaOptions, + limit: int | None = None, + limit_area: Literal["inside", "outside"] | None = None, + copy: bool = True, + ) -> Self: + if not self._hasna: + # TODO(CoW): Not necessary anymore when CoW is the default + return self.copy() + + if limit is None and limit_area is None: + method = missing.clean_fill_method(method) + try: + if method == "pad": + return type(self)(pc.fill_null_forward(self._pa_array)) + elif method == "backfill": + return type(self)(pc.fill_null_backward(self._pa_array)) + except pa.ArrowNotImplementedError: + # ArrowNotImplementedError: Function 'coalesce' has no kernel + # matching input types (duration[ns], duration[ns]) + # TODO: remove try/except wrapper if/when pyarrow implements + # a kernel for duration types. + pass + + # TODO(3.0): after EA.fillna 'method' deprecation is enforced, we can remove + # this method entirely. + return super()._pad_or_backfill( + method=method, limit=limit, limit_area=limit_area, copy=copy + ) + + @doc(ExtensionArray.fillna) + def fillna( + self, + value: object | ArrayLike | None = None, + method: FillnaOptions | None = None, + limit: int | None = None, + copy: bool = True, + ) -> Self: + value, method = validate_fillna_kwargs(value, method) + + if not self._hasna: + # TODO(CoW): Not necessary anymore when CoW is the default + return self.copy() + + if limit is not None: + return super().fillna(value=value, method=method, limit=limit, copy=copy) + + if method is not None: + return super().fillna(method=method, limit=limit, copy=copy) + + if isinstance(value, (np.ndarray, ExtensionArray)): + # Similar to check_value_size, but we do not mask here since we may + # end up passing it to the super() method. + if len(value) != len(self): + raise ValueError( + f"Length of 'value' does not match. Got ({len(value)}) " + f" expected {len(self)}" + ) + + try: + fill_value = self._box_pa(value, pa_type=self._pa_array.type) + except pa.ArrowTypeError as err: + msg = f"Invalid value '{str(value)}' for dtype {self.dtype}" + raise TypeError(msg) from err + + try: + return type(self)(pc.fill_null(self._pa_array, fill_value=fill_value)) + except pa.ArrowNotImplementedError: + # ArrowNotImplementedError: Function 'coalesce' has no kernel + # matching input types (duration[ns], duration[ns]) + # TODO: remove try/except wrapper if/when pyarrow implements + # a kernel for duration types. + pass + + return super().fillna(value=value, method=method, limit=limit, copy=copy) + + def isin(self, values: ArrayLike) -> npt.NDArray[np.bool_]: + # short-circuit to return all False array. + if not len(values): + return np.zeros(len(self), dtype=bool) + + result = pc.is_in(self._pa_array, value_set=pa.array(values, from_pandas=True)) + # pyarrow 2.0.0 returned nulls, so we explicitly specify dtype to convert nulls + # to False + return np.array(result, dtype=np.bool_) + + def _values_for_factorize(self) -> tuple[np.ndarray, Any]: + """ + Return an array and missing value suitable for factorization. + + Returns + ------- + values : ndarray + na_value : pd.NA + + Notes + ----- + The values returned by this method are also used in + :func:`pandas.util.hash_pandas_object`. + """ + values = self._pa_array.to_numpy() + return values, self.dtype.na_value + + @doc(ExtensionArray.factorize) + def factorize( + self, + use_na_sentinel: bool = True, + ) -> tuple[np.ndarray, ExtensionArray]: + null_encoding = "mask" if use_na_sentinel else "encode" + + data = self._pa_array + pa_type = data.type + if pa_version_under11p0 and pa.types.is_duration(pa_type): + # https://github.com/apache/arrow/issues/15226#issuecomment-1376578323 + data = data.cast(pa.int64()) + + if pa.types.is_dictionary(data.type): + encoded = data + else: + encoded = data.dictionary_encode(null_encoding=null_encoding) + if encoded.length() == 0: + indices = np.array([], dtype=np.intp) + uniques = type(self)(pa.chunked_array([], type=encoded.type.value_type)) + else: + # GH 54844 + combined = encoded.combine_chunks() + pa_indices = combined.indices + if pa_indices.null_count > 0: + pa_indices = pc.fill_null(pa_indices, -1) + indices = pa_indices.to_numpy(zero_copy_only=False, writable=True).astype( + np.intp, copy=False + ) + uniques = type(self)(combined.dictionary) + + if pa_version_under11p0 and pa.types.is_duration(pa_type): + uniques = cast(ArrowExtensionArray, uniques.astype(self.dtype)) + return indices, uniques + + def reshape(self, *args, **kwargs): + raise NotImplementedError( + f"{type(self)} does not support reshape " + f"as backed by a 1D pyarrow.ChunkedArray." + ) + + def round(self, decimals: int = 0, *args, **kwargs) -> Self: + """ + Round each value in the array a to the given number of decimals. + + Parameters + ---------- + decimals : int, default 0 + Number of decimal places to round to. If decimals is negative, + it specifies the number of positions to the left of the decimal point. + *args, **kwargs + Additional arguments and keywords have no effect. + + Returns + ------- + ArrowExtensionArray + Rounded values of the ArrowExtensionArray. + + See Also + -------- + DataFrame.round : Round values of a DataFrame. + Series.round : Round values of a Series. + """ + return type(self)(pc.round(self._pa_array, ndigits=decimals)) + + @doc(ExtensionArray.searchsorted) + def searchsorted( + self, + value: NumpyValueArrayLike | ExtensionArray, + side: Literal["left", "right"] = "left", + sorter: NumpySorter | None = None, + ) -> npt.NDArray[np.intp] | np.intp: + if self._hasna: + raise ValueError( + "searchsorted requires array to be sorted, which is impossible " + "with NAs present." + ) + if isinstance(value, ExtensionArray): + value = value.astype(object) + # Base class searchsorted would cast to object, which is *much* slower. + dtype = None + if isinstance(self.dtype, ArrowDtype): + pa_dtype = self.dtype.pyarrow_dtype + if ( + pa.types.is_timestamp(pa_dtype) or pa.types.is_duration(pa_dtype) + ) and pa_dtype.unit == "ns": + # np.array[datetime/timedelta].searchsorted(datetime/timedelta) + # erroneously fails when numpy type resolution is nanoseconds + dtype = object + return self.to_numpy(dtype=dtype).searchsorted(value, side=side, sorter=sorter) + + def take( + self, + indices: TakeIndexer, + allow_fill: bool = False, + fill_value: Any = None, + ) -> ArrowExtensionArray: + """ + Take elements from an array. + + Parameters + ---------- + indices : sequence of int or one-dimensional np.ndarray of int + Indices to be taken. + allow_fill : bool, default False + How to handle negative values in `indices`. + + * False: negative values in `indices` indicate positional indices + from the right (the default). This is similar to + :func:`numpy.take`. + + * True: negative values in `indices` indicate + missing values. These values are set to `fill_value`. Any other + other negative values raise a ``ValueError``. + + fill_value : any, optional + Fill value to use for NA-indices when `allow_fill` is True. + This may be ``None``, in which case the default NA value for + the type, ``self.dtype.na_value``, is used. + + For many ExtensionArrays, there will be two representations of + `fill_value`: a user-facing "boxed" scalar, and a low-level + physical NA value. `fill_value` should be the user-facing version, + and the implementation should handle translating that to the + physical version for processing the take if necessary. + + Returns + ------- + ExtensionArray + + Raises + ------ + IndexError + When the indices are out of bounds for the array. + ValueError + When `indices` contains negative values other than ``-1`` + and `allow_fill` is True. + + See Also + -------- + numpy.take + api.extensions.take + + Notes + ----- + ExtensionArray.take is called by ``Series.__getitem__``, ``.loc``, + ``iloc``, when `indices` is a sequence of values. Additionally, + it's called by :meth:`Series.reindex`, or any other method + that causes realignment, with a `fill_value`. + """ + indices_array = np.asanyarray(indices) + + if len(self._pa_array) == 0 and (indices_array >= 0).any(): + raise IndexError("cannot do a non-empty take") + if indices_array.size > 0 and indices_array.max() >= len(self._pa_array): + raise IndexError("out of bounds value in 'indices'.") + + if allow_fill: + fill_mask = indices_array < 0 + if fill_mask.any(): + validate_indices(indices_array, len(self._pa_array)) + # TODO(ARROW-9433): Treat negative indices as NULL + indices_array = pa.array(indices_array, mask=fill_mask) + result = self._pa_array.take(indices_array) + if isna(fill_value): + return type(self)(result) + # TODO: ArrowNotImplementedError: Function fill_null has no + # kernel matching input types (array[string], scalar[string]) + result = type(self)(result) + result[fill_mask] = fill_value + return result + # return type(self)(pc.fill_null(result, pa.scalar(fill_value))) + else: + # Nothing to fill + return type(self)(self._pa_array.take(indices)) + else: # allow_fill=False + # TODO(ARROW-9432): Treat negative indices as indices from the right. + if (indices_array < 0).any(): + # Don't modify in-place + indices_array = np.copy(indices_array) + indices_array[indices_array < 0] += len(self._pa_array) + return type(self)(self._pa_array.take(indices_array)) + + def _maybe_convert_datelike_array(self): + """Maybe convert to a datelike array.""" + pa_type = self._pa_array.type + if pa.types.is_timestamp(pa_type): + return self._to_datetimearray() + elif pa.types.is_duration(pa_type): + return self._to_timedeltaarray() + return self + + def _to_datetimearray(self) -> DatetimeArray: + """Convert a pyarrow timestamp typed array to a DatetimeArray.""" + from pandas.core.arrays.datetimes import ( + DatetimeArray, + tz_to_dtype, + ) + + pa_type = self._pa_array.type + assert pa.types.is_timestamp(pa_type) + np_dtype = np.dtype(f"M8[{pa_type.unit}]") + dtype = tz_to_dtype(pa_type.tz, pa_type.unit) + np_array = self._pa_array.to_numpy() + np_array = np_array.astype(np_dtype) + return DatetimeArray._simple_new(np_array, dtype=dtype) + + def _to_timedeltaarray(self) -> TimedeltaArray: + """Convert a pyarrow duration typed array to a TimedeltaArray.""" + from pandas.core.arrays.timedeltas import TimedeltaArray + + pa_type = self._pa_array.type + assert pa.types.is_duration(pa_type) + np_dtype = np.dtype(f"m8[{pa_type.unit}]") + np_array = self._pa_array.to_numpy() + np_array = np_array.astype(np_dtype) + return TimedeltaArray._simple_new(np_array, dtype=np_dtype) + + def _values_for_json(self) -> np.ndarray: + if is_numeric_dtype(self.dtype): + return np.asarray(self, dtype=object) + return super()._values_for_json() + + @doc(ExtensionArray.to_numpy) + def to_numpy( + self, + dtype: npt.DTypeLike | None = None, + copy: bool = False, + na_value: object = lib.no_default, + ) -> np.ndarray: + original_na_value = na_value + dtype, na_value = to_numpy_dtype_inference(self, dtype, na_value, self._hasna) + pa_type = self._pa_array.type + if not self._hasna or isna(na_value) or pa.types.is_null(pa_type): + data = self + else: + data = self.fillna(na_value) + copy = False + + if pa.types.is_timestamp(pa_type) or pa.types.is_duration(pa_type): + # GH 55997 + if dtype != object and na_value is self.dtype.na_value: + na_value = lib.no_default + result = data._maybe_convert_datelike_array().to_numpy( + dtype=dtype, na_value=na_value + ) + elif pa.types.is_time(pa_type) or pa.types.is_date(pa_type): + # convert to list of python datetime.time objects before + # wrapping in ndarray + result = np.array(list(data), dtype=dtype) + if data._hasna: + result[data.isna()] = na_value + elif pa.types.is_null(pa_type): + if dtype is not None and isna(na_value): + na_value = None + result = np.full(len(data), fill_value=na_value, dtype=dtype) + elif not data._hasna or ( + pa.types.is_floating(pa_type) + and ( + na_value is np.nan + or original_na_value is lib.no_default + and is_float_dtype(dtype) + ) + ): + result = data._pa_array.to_numpy() + if dtype is not None: + result = result.astype(dtype, copy=False) + if copy: + result = result.copy() + else: + if dtype is None: + empty = pa.array([], type=pa_type).to_numpy(zero_copy_only=False) + if can_hold_element(empty, na_value): + dtype = empty.dtype + else: + dtype = np.object_ + result = np.empty(len(data), dtype=dtype) + mask = data.isna() + result[mask] = na_value + result[~mask] = data[~mask]._pa_array.to_numpy() + return result + + def map(self, mapper, na_action=None): + if is_numeric_dtype(self.dtype): + return map_array(self.to_numpy(), mapper, na_action=na_action) + else: + return super().map(mapper, na_action) + + @doc(ExtensionArray.duplicated) + def duplicated( + self, keep: Literal["first", "last", False] = "first" + ) -> npt.NDArray[np.bool_]: + pa_type = self._pa_array.type + if pa.types.is_floating(pa_type) or pa.types.is_integer(pa_type): + values = self.to_numpy(na_value=0) + elif pa.types.is_boolean(pa_type): + values = self.to_numpy(na_value=False) + elif pa.types.is_temporal(pa_type): + if pa_type.bit_width == 32: + pa_type = pa.int32() + else: + pa_type = pa.int64() + arr = self.astype(ArrowDtype(pa_type)) + values = arr.to_numpy(na_value=0) + else: + # factorize the values to avoid the performance penalty of + # converting to object dtype + values = self.factorize()[0] + + mask = self.isna() if self._hasna else None + return algos.duplicated(values, keep=keep, mask=mask) + + def unique(self) -> Self: + """ + Compute the ArrowExtensionArray of unique values. + + Returns + ------- + ArrowExtensionArray + """ + pa_type = self._pa_array.type + + if pa_version_under11p0 and pa.types.is_duration(pa_type): + # https://github.com/apache/arrow/issues/15226#issuecomment-1376578323 + data = self._pa_array.cast(pa.int64()) + else: + data = self._pa_array + + pa_result = pc.unique(data) + + if pa_version_under11p0 and pa.types.is_duration(pa_type): + pa_result = pa_result.cast(pa_type) + + return type(self)(pa_result) + + def value_counts(self, dropna: bool = True) -> Series: + """ + Return a Series containing counts of each unique value. + + Parameters + ---------- + dropna : bool, default True + Don't include counts of missing values. + + Returns + ------- + counts : Series + + See Also + -------- + Series.value_counts + """ + pa_type = self._pa_array.type + if pa_version_under11p0 and pa.types.is_duration(pa_type): + # https://github.com/apache/arrow/issues/15226#issuecomment-1376578323 + data = self._pa_array.cast(pa.int64()) + else: + data = self._pa_array + + from pandas import ( + Index, + Series, + ) + + vc = data.value_counts() + + values = vc.field(0) + counts = vc.field(1) + if dropna and data.null_count > 0: + mask = values.is_valid() + values = values.filter(mask) + counts = counts.filter(mask) + + if pa_version_under11p0 and pa.types.is_duration(pa_type): + values = values.cast(pa_type) + + counts = ArrowExtensionArray(counts) + + index = Index(type(self)(values)) + + return Series(counts, index=index, name="count", copy=False) + + @classmethod + def _concat_same_type(cls, to_concat) -> Self: + """ + Concatenate multiple ArrowExtensionArrays. + + Parameters + ---------- + to_concat : sequence of ArrowExtensionArrays + + Returns + ------- + ArrowExtensionArray + """ + chunks = [array for ea in to_concat for array in ea._pa_array.iterchunks()] + if to_concat[0].dtype == "string": + # StringDtype has no attribute pyarrow_dtype + pa_dtype = pa.large_string() + else: + pa_dtype = to_concat[0].dtype.pyarrow_dtype + arr = pa.chunked_array(chunks, type=pa_dtype) + return cls(arr) + + def _accumulate( + self, name: str, *, skipna: bool = True, **kwargs + ) -> ArrowExtensionArray | ExtensionArray: + """ + Return an ExtensionArray performing an accumulation operation. + + The underlying data type might change. + + Parameters + ---------- + name : str + Name of the function, supported values are: + - cummin + - cummax + - cumsum + - cumprod + skipna : bool, default True + If True, skip NA values. + **kwargs + Additional keyword arguments passed to the accumulation function. + Currently, there is no supported kwarg. + + Returns + ------- + array + + Raises + ------ + NotImplementedError : subclass does not define accumulations + """ + pyarrow_name = { + "cummax": "cumulative_max", + "cummin": "cumulative_min", + "cumprod": "cumulative_prod_checked", + "cumsum": "cumulative_sum_checked", + }.get(name, name) + pyarrow_meth = getattr(pc, pyarrow_name, None) + if pyarrow_meth is None: + return super()._accumulate(name, skipna=skipna, **kwargs) + + data_to_accum = self._pa_array + + pa_dtype = data_to_accum.type + + convert_to_int = ( + pa.types.is_temporal(pa_dtype) and name in ["cummax", "cummin"] + ) or (pa.types.is_duration(pa_dtype) and name == "cumsum") + + if convert_to_int: + if pa_dtype.bit_width == 32: + data_to_accum = data_to_accum.cast(pa.int32()) + else: + data_to_accum = data_to_accum.cast(pa.int64()) + + result = pyarrow_meth(data_to_accum, skip_nulls=skipna, **kwargs) + + if convert_to_int: + result = result.cast(pa_dtype) + + return type(self)(result) + + def _reduce_pyarrow(self, name: str, *, skipna: bool = True, **kwargs) -> pa.Scalar: + """ + Return a pyarrow scalar result of performing the reduction operation. + + Parameters + ---------- + name : str + Name of the function, supported values are: + { any, all, min, max, sum, mean, median, prod, + std, var, sem, kurt, skew }. + skipna : bool, default True + If True, skip NaN values. + **kwargs + Additional keyword arguments passed to the reduction function. + Currently, `ddof` is the only supported kwarg. + + Returns + ------- + pyarrow scalar + + Raises + ------ + TypeError : subclass does not define reductions + """ + pa_type = self._pa_array.type + + data_to_reduce = self._pa_array + + cast_kwargs = {} if pa_version_under13p0 else {"safe": False} + + if name in ["any", "all"] and ( + pa.types.is_integer(pa_type) + or pa.types.is_floating(pa_type) + or pa.types.is_duration(pa_type) + or pa.types.is_decimal(pa_type) + ): + # pyarrow only supports any/all for boolean dtype, we allow + # for other dtypes, matching our non-pyarrow behavior + + if pa.types.is_duration(pa_type): + data_to_cmp = self._pa_array.cast(pa.int64()) + else: + data_to_cmp = self._pa_array + + not_eq = pc.not_equal(data_to_cmp, 0) + data_to_reduce = not_eq + + elif name in ["min", "max", "sum"] and pa.types.is_duration(pa_type): + data_to_reduce = self._pa_array.cast(pa.int64()) + + elif name in ["median", "mean", "std", "sem"] and pa.types.is_temporal(pa_type): + nbits = pa_type.bit_width + if nbits == 32: + data_to_reduce = self._pa_array.cast(pa.int32()) + else: + data_to_reduce = self._pa_array.cast(pa.int64()) + + if name == "sem": + + def pyarrow_meth(data, skip_nulls, **kwargs): + numerator = pc.stddev(data, skip_nulls=skip_nulls, **kwargs) + denominator = pc.sqrt_checked(pc.count(self._pa_array)) + return pc.divide_checked(numerator, denominator) + + else: + pyarrow_name = { + "median": "quantile", + "prod": "product", + "std": "stddev", + "var": "variance", + }.get(name, name) + # error: Incompatible types in assignment + # (expression has type "Optional[Any]", variable has type + # "Callable[[Any, Any, KwArg(Any)], Any]") + pyarrow_meth = getattr(pc, pyarrow_name, None) # type: ignore[assignment] + if pyarrow_meth is None: + # Let ExtensionArray._reduce raise the TypeError + return super()._reduce(name, skipna=skipna, **kwargs) + + # GH51624: pyarrow defaults to min_count=1, pandas behavior is min_count=0 + if name in ["any", "all"] and "min_count" not in kwargs: + kwargs["min_count"] = 0 + elif name == "median": + # GH 52679: Use quantile instead of approximate_median + kwargs["q"] = 0.5 + + try: + result = pyarrow_meth(data_to_reduce, skip_nulls=skipna, **kwargs) + except (AttributeError, NotImplementedError, TypeError) as err: + msg = ( + f"'{type(self).__name__}' with dtype {self.dtype} " + f"does not support reduction '{name}' with pyarrow " + f"version {pa.__version__}. '{name}' may be supported by " + f"upgrading pyarrow." + ) + raise TypeError(msg) from err + if name == "median": + # GH 52679: Use quantile instead of approximate_median; returns array + result = result[0] + if pc.is_null(result).as_py(): + return result + + if name in ["min", "max", "sum"] and pa.types.is_duration(pa_type): + result = result.cast(pa_type) + if name in ["median", "mean"] and pa.types.is_temporal(pa_type): + if not pa_version_under13p0: + nbits = pa_type.bit_width + if nbits == 32: + result = result.cast(pa.int32(), **cast_kwargs) + else: + result = result.cast(pa.int64(), **cast_kwargs) + result = result.cast(pa_type) + if name in ["std", "sem"] and pa.types.is_temporal(pa_type): + result = result.cast(pa.int64(), **cast_kwargs) + if pa.types.is_duration(pa_type): + result = result.cast(pa_type) + elif pa.types.is_time(pa_type): + unit = get_unit_from_pa_dtype(pa_type) + result = result.cast(pa.duration(unit)) + elif pa.types.is_date(pa_type): + # go with closest available unit, i.e. "s" + result = result.cast(pa.duration("s")) + else: + # i.e. timestamp + result = result.cast(pa.duration(pa_type.unit)) + + return result + + def _reduce( + self, name: str, *, skipna: bool = True, keepdims: bool = False, **kwargs + ): + """ + Return a scalar result of performing the reduction operation. + + Parameters + ---------- + name : str + Name of the function, supported values are: + { any, all, min, max, sum, mean, median, prod, + std, var, sem, kurt, skew }. + skipna : bool, default True + If True, skip NaN values. + **kwargs + Additional keyword arguments passed to the reduction function. + Currently, `ddof` is the only supported kwarg. + + Returns + ------- + scalar + + Raises + ------ + TypeError : subclass does not define reductions + """ + result = self._reduce_calc(name, skipna=skipna, keepdims=keepdims, **kwargs) + if isinstance(result, pa.Array): + return type(self)(result) + else: + return result + + def _reduce_calc( + self, name: str, *, skipna: bool = True, keepdims: bool = False, **kwargs + ): + pa_result = self._reduce_pyarrow(name, skipna=skipna, **kwargs) + + if keepdims: + if isinstance(pa_result, pa.Scalar): + result = pa.array([pa_result.as_py()], type=pa_result.type) + else: + result = pa.array( + [pa_result], + type=to_pyarrow_type(infer_dtype_from_scalar(pa_result)[0]), + ) + return result + + if pc.is_null(pa_result).as_py(): + return self.dtype.na_value + elif isinstance(pa_result, pa.Scalar): + return pa_result.as_py() + else: + return pa_result + + def _explode(self): + """ + See Series.explode.__doc__. + """ + # child class explode method supports only list types; return + # default implementation for non list types. + if not pa.types.is_list(self.dtype.pyarrow_dtype): + return super()._explode() + values = self + counts = pa.compute.list_value_length(values._pa_array) + counts = counts.fill_null(1).to_numpy() + fill_value = pa.scalar([None], type=self._pa_array.type) + mask = counts == 0 + if mask.any(): + values = values.copy() + values[mask] = fill_value + counts = counts.copy() + counts[mask] = 1 + values = values.fillna(fill_value) + values = type(self)(pa.compute.list_flatten(values._pa_array)) + return values, counts + + def __setitem__(self, key, value) -> None: + """Set one or more values inplace. + + Parameters + ---------- + key : int, ndarray, or slice + When called from, e.g. ``Series.__setitem__``, ``key`` will be + one of + + * scalar int + * ndarray of integers. + * boolean ndarray + * slice object + + value : ExtensionDtype.type, Sequence[ExtensionDtype.type], or object + value or values to be set of ``key``. + + Returns + ------- + None + """ + # GH50085: unwrap 1D indexers + if isinstance(key, tuple) and len(key) == 1: + key = key[0] + + key = check_array_indexer(self, key) + value = self._maybe_convert_setitem_value(value) + + if com.is_null_slice(key): + # fast path (GH50248) + data = self._if_else(True, value, self._pa_array) + + elif is_integer(key): + # fast path + key = cast(int, key) + n = len(self) + if key < 0: + key += n + if not 0 <= key < n: + raise IndexError( + f"index {key} is out of bounds for axis 0 with size {n}" + ) + if isinstance(value, pa.Scalar): + value = value.as_py() + elif is_list_like(value): + raise ValueError("Length of indexer and values mismatch") + chunks = [ + *self._pa_array[:key].chunks, + pa.array([value], type=self._pa_array.type, from_pandas=True), + *self._pa_array[key + 1 :].chunks, + ] + data = pa.chunked_array(chunks).combine_chunks() + + elif is_bool_dtype(key): + key = np.asarray(key, dtype=np.bool_) + data = self._replace_with_mask(self._pa_array, key, value) + + elif is_scalar(value) or isinstance(value, pa.Scalar): + mask = np.zeros(len(self), dtype=np.bool_) + mask[key] = True + data = self._if_else(mask, value, self._pa_array) + + else: + indices = np.arange(len(self))[key] + if len(indices) != len(value): + raise ValueError("Length of indexer and values mismatch") + if len(indices) == 0: + return + argsort = np.argsort(indices) + indices = indices[argsort] + value = value.take(argsort) + mask = np.zeros(len(self), dtype=np.bool_) + mask[indices] = True + data = self._replace_with_mask(self._pa_array, mask, value) + + if isinstance(data, pa.Array): + data = pa.chunked_array([data]) + self._pa_array = data + + def _rank_calc( + self, + *, + axis: AxisInt = 0, + method: str = "average", + na_option: str = "keep", + ascending: bool = True, + pct: bool = False, + ): + if axis != 0: + ranked = super()._rank( + axis=axis, + method=method, + na_option=na_option, + ascending=ascending, + pct=pct, + ) + # keep dtypes consistent with the implementation below + if method == "average" or pct: + pa_type = pa.float64() + else: + pa_type = pa.uint64() + result = pa.array(ranked, type=pa_type, from_pandas=True) + return result + + data = self._pa_array.combine_chunks() + sort_keys = "ascending" if ascending else "descending" + null_placement = "at_start" if na_option == "top" else "at_end" + tiebreaker = "min" if method == "average" else method + + result = pc.rank( + data, + sort_keys=sort_keys, + null_placement=null_placement, + tiebreaker=tiebreaker, + ) + + if na_option == "keep": + mask = pc.is_null(self._pa_array) + null = pa.scalar(None, type=result.type) + result = pc.if_else(mask, null, result) + + if method == "average": + result_max = pc.rank( + data, + sort_keys=sort_keys, + null_placement=null_placement, + tiebreaker="max", + ) + result_max = result_max.cast(pa.float64()) + result_min = result.cast(pa.float64()) + result = pc.divide(pc.add(result_min, result_max), 2) + + if pct: + if not pa.types.is_floating(result.type): + result = result.cast(pa.float64()) + if method == "dense": + divisor = pc.max(result) + else: + divisor = pc.count(result) + result = pc.divide(result, divisor) + + return result + + def _rank( + self, + *, + axis: AxisInt = 0, + method: str = "average", + na_option: str = "keep", + ascending: bool = True, + pct: bool = False, + ): + """ + See Series.rank.__doc__. + """ + return type(self)( + self._rank_calc( + axis=axis, + method=method, + na_option=na_option, + ascending=ascending, + pct=pct, + ) + ) + + def _quantile(self, qs: npt.NDArray[np.float64], interpolation: str) -> Self: + """ + Compute the quantiles of self for each quantile in `qs`. + + Parameters + ---------- + qs : np.ndarray[float64] + interpolation: str + + Returns + ------- + same type as self + """ + pa_dtype = self._pa_array.type + + data = self._pa_array + if pa.types.is_temporal(pa_dtype): + # https://github.com/apache/arrow/issues/33769 in these cases + # we can cast to ints and back + nbits = pa_dtype.bit_width + if nbits == 32: + data = data.cast(pa.int32()) + else: + data = data.cast(pa.int64()) + + result = pc.quantile(data, q=qs, interpolation=interpolation) + + if pa.types.is_temporal(pa_dtype): + if pa.types.is_floating(result.type): + result = pc.floor(result) + nbits = pa_dtype.bit_width + if nbits == 32: + result = result.cast(pa.int32()) + else: + result = result.cast(pa.int64()) + result = result.cast(pa_dtype) + + return type(self)(result) + + def _mode(self, dropna: bool = True) -> Self: + """ + Returns the mode(s) of the ExtensionArray. + + Always returns `ExtensionArray` even if only one value. + + Parameters + ---------- + dropna : bool, default True + Don't consider counts of NA values. + + Returns + ------- + same type as self + Sorted, if possible. + """ + pa_type = self._pa_array.type + if pa.types.is_temporal(pa_type): + nbits = pa_type.bit_width + if nbits == 32: + data = self._pa_array.cast(pa.int32()) + elif nbits == 64: + data = self._pa_array.cast(pa.int64()) + else: + raise NotImplementedError(pa_type) + else: + data = self._pa_array + + if dropna: + data = data.drop_null() + + res = pc.value_counts(data) + most_common = res.field("values").filter( + pc.equal(res.field("counts"), pc.max(res.field("counts"))) + ) + + if pa.types.is_temporal(pa_type): + most_common = most_common.cast(pa_type) + + most_common = most_common.take(pc.array_sort_indices(most_common)) + return type(self)(most_common) + + def _maybe_convert_setitem_value(self, value): + """Maybe convert value to be pyarrow compatible.""" + try: + value = self._box_pa(value, self._pa_array.type) + except pa.ArrowTypeError as err: + msg = f"Invalid value '{str(value)}' for dtype {self.dtype}" + raise TypeError(msg) from err + return value + + def interpolate( + self, + *, + method: InterpolateOptions, + axis: int, + index, + limit, + limit_direction, + limit_area, + copy: bool, + **kwargs, + ) -> Self: + """ + See NDFrame.interpolate.__doc__. + """ + # NB: we return type(self) even if copy=False + mask = self.isna() + if self.dtype.kind == "f": + data = self._pa_array.to_numpy() + elif self.dtype.kind in "iu": + data = self.to_numpy(dtype="f8", na_value=0.0) + else: + raise NotImplementedError( + f"interpolate is not implemented for dtype={self.dtype}" + ) + + missing.interpolate_2d_inplace( + data, + method=method, + axis=0, + index=index, + limit=limit, + limit_direction=limit_direction, + limit_area=limit_area, + mask=mask, + **kwargs, + ) + return type(self)(self._box_pa_array(pa.array(data, mask=mask))) + + @classmethod + def _if_else( + cls, + cond: npt.NDArray[np.bool_] | bool, + left: ArrayLike | Scalar, + right: ArrayLike | Scalar, + ): + """ + Choose values based on a condition. + + Analogous to pyarrow.compute.if_else, with logic + to fallback to numpy for unsupported types. + + Parameters + ---------- + cond : npt.NDArray[np.bool_] or bool + left : ArrayLike | Scalar + right : ArrayLike | Scalar + + Returns + ------- + pa.Array + """ + try: + return pc.if_else(cond, left, right) + except pa.ArrowNotImplementedError: + pass + + def _to_numpy_and_type(value) -> tuple[np.ndarray, pa.DataType | None]: + if isinstance(value, (pa.Array, pa.ChunkedArray)): + pa_type = value.type + elif isinstance(value, pa.Scalar): + pa_type = value.type + value = value.as_py() + else: + pa_type = None + return np.array(value, dtype=object), pa_type + + left, left_type = _to_numpy_and_type(left) + right, right_type = _to_numpy_and_type(right) + pa_type = left_type or right_type + result = np.where(cond, left, right) + return pa.array(result, type=pa_type, from_pandas=True) + + @classmethod + def _replace_with_mask( + cls, + values: pa.Array | pa.ChunkedArray, + mask: npt.NDArray[np.bool_] | bool, + replacements: ArrayLike | Scalar, + ): + """ + Replace items selected with a mask. + + Analogous to pyarrow.compute.replace_with_mask, with logic + to fallback to numpy for unsupported types. + + Parameters + ---------- + values : pa.Array or pa.ChunkedArray + mask : npt.NDArray[np.bool_] or bool + replacements : ArrayLike or Scalar + Replacement value(s) + + Returns + ------- + pa.Array or pa.ChunkedArray + """ + if isinstance(replacements, pa.ChunkedArray): + # replacements must be array or scalar, not ChunkedArray + replacements = replacements.combine_chunks() + if isinstance(values, pa.ChunkedArray) and pa.types.is_boolean(values.type): + # GH#52059 replace_with_mask segfaults for chunked array + # https://github.com/apache/arrow/issues/34634 + values = values.combine_chunks() + try: + return pc.replace_with_mask(values, mask, replacements) + except pa.ArrowNotImplementedError: + pass + if isinstance(replacements, pa.Array): + replacements = np.array(replacements, dtype=object) + elif isinstance(replacements, pa.Scalar): + replacements = replacements.as_py() + result = np.array(values, dtype=object) + result[mask] = replacements + return pa.array(result, type=values.type, from_pandas=True) + + # ------------------------------------------------------------------ + # GroupBy Methods + + def _to_masked(self): + pa_dtype = self._pa_array.type + + if pa.types.is_floating(pa_dtype) or pa.types.is_integer(pa_dtype): + na_value = 1 + elif pa.types.is_boolean(pa_dtype): + na_value = True + else: + raise NotImplementedError + + dtype = _arrow_dtype_mapping()[pa_dtype] + mask = self.isna() + arr = self.to_numpy(dtype=dtype.numpy_dtype, na_value=na_value) + return dtype.construct_array_type()(arr, mask) + + def _groupby_op( + self, + *, + how: str, + has_dropped_na: bool, + min_count: int, + ngroups: int, + ids: npt.NDArray[np.intp], + **kwargs, + ): + if isinstance(self.dtype, StringDtype): + return super()._groupby_op( + how=how, + has_dropped_na=has_dropped_na, + min_count=min_count, + ngroups=ngroups, + ids=ids, + **kwargs, + ) + + # maybe convert to a compatible dtype optimized for groupby + values: ExtensionArray + pa_type = self._pa_array.type + if pa.types.is_timestamp(pa_type): + values = self._to_datetimearray() + elif pa.types.is_duration(pa_type): + values = self._to_timedeltaarray() + else: + values = self._to_masked() + + result = values._groupby_op( + how=how, + has_dropped_na=has_dropped_na, + min_count=min_count, + ngroups=ngroups, + ids=ids, + **kwargs, + ) + if isinstance(result, np.ndarray): + return result + return type(self)._from_sequence(result, copy=False) + + def _apply_elementwise(self, func: Callable) -> list[list[Any]]: + """Apply a callable to each element while maintaining the chunking structure.""" + return [ + [ + None if val is None else func(val) + for val in chunk.to_numpy(zero_copy_only=False) + ] + for chunk in self._pa_array.iterchunks() + ] + + def _str_count(self, pat: str, flags: int = 0): + if flags: + raise NotImplementedError(f"count not implemented with {flags=}") + return type(self)(pc.count_substring_regex(self._pa_array, pat)) + + def _str_contains( + self, pat, case: bool = True, flags: int = 0, na=None, regex: bool = True + ): + if flags: + raise NotImplementedError(f"contains not implemented with {flags=}") + + if regex: + pa_contains = pc.match_substring_regex + else: + pa_contains = pc.match_substring + result = pa_contains(self._pa_array, pat, ignore_case=not case) + if not isna(na): + result = result.fill_null(na) + return type(self)(result) + + def _str_startswith(self, pat: str | tuple[str, ...], na=None): + if isinstance(pat, str): + result = pc.starts_with(self._pa_array, pattern=pat) + else: + if len(pat) == 0: + # For empty tuple, pd.StringDtype() returns null for missing values + # and false for valid values. + result = pc.if_else(pc.is_null(self._pa_array), None, False) + else: + result = pc.starts_with(self._pa_array, pattern=pat[0]) + + for p in pat[1:]: + result = pc.or_(result, pc.starts_with(self._pa_array, pattern=p)) + if not isna(na): + result = result.fill_null(na) + return type(self)(result) + + def _str_endswith(self, pat: str | tuple[str, ...], na=None): + if isinstance(pat, str): + result = pc.ends_with(self._pa_array, pattern=pat) + else: + if len(pat) == 0: + # For empty tuple, pd.StringDtype() returns null for missing values + # and false for valid values. + result = pc.if_else(pc.is_null(self._pa_array), None, False) + else: + result = pc.ends_with(self._pa_array, pattern=pat[0]) + + for p in pat[1:]: + result = pc.or_(result, pc.ends_with(self._pa_array, pattern=p)) + if not isna(na): + result = result.fill_null(na) + return type(self)(result) + + def _str_replace( + self, + pat: str | re.Pattern, + repl: str | Callable, + n: int = -1, + case: bool = True, + flags: int = 0, + regex: bool = True, + ): + if isinstance(pat, re.Pattern) or callable(repl) or not case or flags: + raise NotImplementedError( + "replace is not supported with a re.Pattern, callable repl, " + "case=False, or flags!=0" + ) + + func = pc.replace_substring_regex if regex else pc.replace_substring + # https://github.com/apache/arrow/issues/39149 + # GH 56404, unexpected behavior with negative max_replacements with pyarrow. + pa_max_replacements = None if n < 0 else n + result = func( + self._pa_array, + pattern=pat, + replacement=repl, + max_replacements=pa_max_replacements, + ) + return type(self)(result) + + def _str_repeat(self, repeats: int | Sequence[int]): + if not isinstance(repeats, int): + raise NotImplementedError( + f"repeat is not implemented when repeats is {type(repeats).__name__}" + ) + else: + return type(self)(pc.binary_repeat(self._pa_array, repeats)) + + def _str_match( + self, pat: str, case: bool = True, flags: int = 0, na: Scalar | None = None + ): + if not pat.startswith("^"): + pat = f"^{pat}" + return self._str_contains(pat, case, flags, na, regex=True) + + def _str_fullmatch( + self, pat, case: bool = True, flags: int = 0, na: Scalar | None = None + ): + if not pat.endswith("$") or pat.endswith("\\$"): + pat = f"{pat}$" + return self._str_match(pat, case, flags, na) + + def _str_find(self, sub: str, start: int = 0, end: int | None = None): + if start != 0 and end is not None: + slices = pc.utf8_slice_codeunits(self._pa_array, start, stop=end) + result = pc.find_substring(slices, sub) + not_found = pc.equal(result, -1) + start_offset = max(0, start) + offset_result = pc.add(result, start_offset) + result = pc.if_else(not_found, result, offset_result) + elif start == 0 and end is None: + slices = self._pa_array + result = pc.find_substring(slices, sub) + else: + raise NotImplementedError( + f"find not implemented with {sub=}, {start=}, {end=}" + ) + return type(self)(result) + + def _str_join(self, sep: str): + if pa.types.is_string(self._pa_array.type) or pa.types.is_large_string( + self._pa_array.type + ): + result = self._apply_elementwise(list) + result = pa.chunked_array(result, type=pa.list_(pa.string())) + else: + result = self._pa_array + return type(self)(pc.binary_join(result, sep)) + + def _str_partition(self, sep: str, expand: bool): + predicate = lambda val: val.partition(sep) + result = self._apply_elementwise(predicate) + return type(self)(pa.chunked_array(result)) + + def _str_rpartition(self, sep: str, expand: bool): + predicate = lambda val: val.rpartition(sep) + result = self._apply_elementwise(predicate) + return type(self)(pa.chunked_array(result)) + + def _str_slice( + self, start: int | None = None, stop: int | None = None, step: int | None = None + ): + if start is None: + start = 0 + if step is None: + step = 1 + return type(self)( + pc.utf8_slice_codeunits(self._pa_array, start=start, stop=stop, step=step) + ) + + def _str_isalnum(self): + return type(self)(pc.utf8_is_alnum(self._pa_array)) + + def _str_isalpha(self): + return type(self)(pc.utf8_is_alpha(self._pa_array)) + + def _str_isdecimal(self): + return type(self)(pc.utf8_is_decimal(self._pa_array)) + + def _str_isdigit(self): + return type(self)(pc.utf8_is_digit(self._pa_array)) + + def _str_islower(self): + return type(self)(pc.utf8_is_lower(self._pa_array)) + + def _str_isnumeric(self): + return type(self)(pc.utf8_is_numeric(self._pa_array)) + + def _str_isspace(self): + return type(self)(pc.utf8_is_space(self._pa_array)) + + def _str_istitle(self): + return type(self)(pc.utf8_is_title(self._pa_array)) + + def _str_isupper(self): + return type(self)(pc.utf8_is_upper(self._pa_array)) + + def _str_len(self): + return type(self)(pc.utf8_length(self._pa_array)) + + def _str_lower(self): + return type(self)(pc.utf8_lower(self._pa_array)) + + def _str_upper(self): + return type(self)(pc.utf8_upper(self._pa_array)) + + def _str_strip(self, to_strip=None): + if to_strip is None: + result = pc.utf8_trim_whitespace(self._pa_array) + else: + result = pc.utf8_trim(self._pa_array, characters=to_strip) + return type(self)(result) + + def _str_lstrip(self, to_strip=None): + if to_strip is None: + result = pc.utf8_ltrim_whitespace(self._pa_array) + else: + result = pc.utf8_ltrim(self._pa_array, characters=to_strip) + return type(self)(result) + + def _str_rstrip(self, to_strip=None): + if to_strip is None: + result = pc.utf8_rtrim_whitespace(self._pa_array) + else: + result = pc.utf8_rtrim(self._pa_array, characters=to_strip) + return type(self)(result) + + def _str_removeprefix(self, prefix: str): + if not pa_version_under13p0: + starts_with = pc.starts_with(self._pa_array, pattern=prefix) + removed = pc.utf8_slice_codeunits(self._pa_array, len(prefix)) + result = pc.if_else(starts_with, removed, self._pa_array) + return type(self)(result) + predicate = lambda val: val.removeprefix(prefix) + result = self._apply_elementwise(predicate) + return type(self)(pa.chunked_array(result)) + + def _str_casefold(self): + predicate = lambda val: val.casefold() + result = self._apply_elementwise(predicate) + return type(self)(pa.chunked_array(result)) + + def _str_encode(self, encoding: str, errors: str = "strict"): + predicate = lambda val: val.encode(encoding, errors) + result = self._apply_elementwise(predicate) + return type(self)(pa.chunked_array(result)) + + def _str_extract(self, pat: str, flags: int = 0, expand: bool = True): + if flags: + raise NotImplementedError("Only flags=0 is implemented.") + groups = re.compile(pat).groupindex.keys() + if len(groups) == 0: + raise ValueError(f"{pat=} must contain a symbolic group name.") + result = pc.extract_regex(self._pa_array, pat) + if expand: + return { + col: type(self)(pc.struct_field(result, [i])) + for col, i in zip(groups, range(result.type.num_fields)) + } + else: + return type(self)(pc.struct_field(result, [0])) + + def _str_findall(self, pat: str, flags: int = 0): + regex = re.compile(pat, flags=flags) + predicate = lambda val: regex.findall(val) + result = self._apply_elementwise(predicate) + return type(self)(pa.chunked_array(result)) + + def _str_get_dummies(self, sep: str = "|"): + split = pc.split_pattern(self._pa_array, sep) + flattened_values = pc.list_flatten(split) + uniques = flattened_values.unique() + uniques_sorted = uniques.take(pa.compute.array_sort_indices(uniques)) + lengths = pc.list_value_length(split).fill_null(0).to_numpy() + n_rows = len(self) + n_cols = len(uniques) + indices = pc.index_in(flattened_values, uniques_sorted).to_numpy() + indices = indices + np.arange(n_rows).repeat(lengths) * n_cols + dummies = np.zeros(n_rows * n_cols, dtype=np.bool_) + dummies[indices] = True + dummies = dummies.reshape((n_rows, n_cols)) + result = type(self)(pa.array(list(dummies))) + return result, uniques_sorted.to_pylist() + + def _str_index(self, sub: str, start: int = 0, end: int | None = None): + predicate = lambda val: val.index(sub, start, end) + result = self._apply_elementwise(predicate) + return type(self)(pa.chunked_array(result)) + + def _str_rindex(self, sub: str, start: int = 0, end: int | None = None): + predicate = lambda val: val.rindex(sub, start, end) + result = self._apply_elementwise(predicate) + return type(self)(pa.chunked_array(result)) + + def _str_normalize(self, form: str): + predicate = lambda val: unicodedata.normalize(form, val) + result = self._apply_elementwise(predicate) + return type(self)(pa.chunked_array(result)) + + def _str_rfind(self, sub: str, start: int = 0, end=None): + predicate = lambda val: val.rfind(sub, start, end) + result = self._apply_elementwise(predicate) + return type(self)(pa.chunked_array(result)) + + def _str_split( + self, + pat: str | None = None, + n: int | None = -1, + expand: bool = False, + regex: bool | None = None, + ): + if n in {-1, 0}: + n = None + if pat is None: + split_func = pc.utf8_split_whitespace + elif regex: + split_func = functools.partial(pc.split_pattern_regex, pattern=pat) + else: + split_func = functools.partial(pc.split_pattern, pattern=pat) + return type(self)(split_func(self._pa_array, max_splits=n)) + + def _str_rsplit(self, pat: str | None = None, n: int | None = -1): + if n in {-1, 0}: + n = None + if pat is None: + return type(self)( + pc.utf8_split_whitespace(self._pa_array, max_splits=n, reverse=True) + ) + else: + return type(self)( + pc.split_pattern(self._pa_array, pat, max_splits=n, reverse=True) + ) + + def _str_translate(self, table: dict[int, str]): + predicate = lambda val: val.translate(table) + result = self._apply_elementwise(predicate) + return type(self)(pa.chunked_array(result)) + + def _str_wrap(self, width: int, **kwargs): + kwargs["width"] = width + tw = textwrap.TextWrapper(**kwargs) + predicate = lambda val: "\n".join(tw.wrap(val)) + result = self._apply_elementwise(predicate) + return type(self)(pa.chunked_array(result)) + + @property + def _dt_days(self): + return type(self)( + pa.array(self._to_timedeltaarray().days, from_pandas=True, type=pa.int32()) + ) + + @property + def _dt_hours(self): + return type(self)( + pa.array( + [ + td.components.hours if td is not NaT else None + for td in self._to_timedeltaarray() + ], + type=pa.int32(), + ) + ) + + @property + def _dt_minutes(self): + return type(self)( + pa.array( + [ + td.components.minutes if td is not NaT else None + for td in self._to_timedeltaarray() + ], + type=pa.int32(), + ) + ) + + @property + def _dt_seconds(self): + return type(self)( + pa.array( + self._to_timedeltaarray().seconds, from_pandas=True, type=pa.int32() + ) + ) + + @property + def _dt_milliseconds(self): + return type(self)( + pa.array( + [ + td.components.milliseconds if td is not NaT else None + for td in self._to_timedeltaarray() + ], + type=pa.int32(), + ) + ) + + @property + def _dt_microseconds(self): + return type(self)( + pa.array( + self._to_timedeltaarray().microseconds, + from_pandas=True, + type=pa.int32(), + ) + ) + + @property + def _dt_nanoseconds(self): + return type(self)( + pa.array( + self._to_timedeltaarray().nanoseconds, from_pandas=True, type=pa.int32() + ) + ) + + def _dt_to_pytimedelta(self): + data = self._pa_array.to_pylist() + if self._dtype.pyarrow_dtype.unit == "ns": + data = [None if ts is None else ts.to_pytimedelta() for ts in data] + return np.array(data, dtype=object) + + def _dt_total_seconds(self): + return type(self)( + pa.array(self._to_timedeltaarray().total_seconds(), from_pandas=True) + ) + + def _dt_as_unit(self, unit: str): + if pa.types.is_date(self.dtype.pyarrow_dtype): + raise NotImplementedError("as_unit not implemented for date types") + pd_array = self._maybe_convert_datelike_array() + # Don't just cast _pa_array in order to follow pandas unit conversion rules + return type(self)(pa.array(pd_array.as_unit(unit), from_pandas=True)) + + @property + def _dt_year(self): + return type(self)(pc.year(self._pa_array)) + + @property + def _dt_day(self): + return type(self)(pc.day(self._pa_array)) + + @property + def _dt_day_of_week(self): + return type(self)(pc.day_of_week(self._pa_array)) + + _dt_dayofweek = _dt_day_of_week + _dt_weekday = _dt_day_of_week + + @property + def _dt_day_of_year(self): + return type(self)(pc.day_of_year(self._pa_array)) + + _dt_dayofyear = _dt_day_of_year + + @property + def _dt_hour(self): + return type(self)(pc.hour(self._pa_array)) + + def _dt_isocalendar(self): + return type(self)(pc.iso_calendar(self._pa_array)) + + @property + def _dt_is_leap_year(self): + return type(self)(pc.is_leap_year(self._pa_array)) + + @property + def _dt_is_month_start(self): + return type(self)(pc.equal(pc.day(self._pa_array), 1)) + + @property + def _dt_is_month_end(self): + result = pc.equal( + pc.days_between( + pc.floor_temporal(self._pa_array, unit="day"), + pc.ceil_temporal(self._pa_array, unit="month"), + ), + 1, + ) + return type(self)(result) + + @property + def _dt_is_year_start(self): + return type(self)( + pc.and_( + pc.equal(pc.month(self._pa_array), 1), + pc.equal(pc.day(self._pa_array), 1), + ) + ) + + @property + def _dt_is_year_end(self): + return type(self)( + pc.and_( + pc.equal(pc.month(self._pa_array), 12), + pc.equal(pc.day(self._pa_array), 31), + ) + ) + + @property + def _dt_is_quarter_start(self): + result = pc.equal( + pc.floor_temporal(self._pa_array, unit="quarter"), + pc.floor_temporal(self._pa_array, unit="day"), + ) + return type(self)(result) + + @property + def _dt_is_quarter_end(self): + result = pc.equal( + pc.days_between( + pc.floor_temporal(self._pa_array, unit="day"), + pc.ceil_temporal(self._pa_array, unit="quarter"), + ), + 1, + ) + return type(self)(result) + + @property + def _dt_days_in_month(self): + result = pc.days_between( + pc.floor_temporal(self._pa_array, unit="month"), + pc.ceil_temporal(self._pa_array, unit="month"), + ) + return type(self)(result) + + _dt_daysinmonth = _dt_days_in_month + + @property + def _dt_microsecond(self): + return type(self)(pc.microsecond(self._pa_array)) + + @property + def _dt_minute(self): + return type(self)(pc.minute(self._pa_array)) + + @property + def _dt_month(self): + return type(self)(pc.month(self._pa_array)) + + @property + def _dt_nanosecond(self): + return type(self)(pc.nanosecond(self._pa_array)) + + @property + def _dt_quarter(self): + return type(self)(pc.quarter(self._pa_array)) + + @property + def _dt_second(self): + return type(self)(pc.second(self._pa_array)) + + @property + def _dt_date(self): + return type(self)(self._pa_array.cast(pa.date32())) + + @property + def _dt_time(self): + unit = ( + self.dtype.pyarrow_dtype.unit + if self.dtype.pyarrow_dtype.unit in {"us", "ns"} + else "ns" + ) + return type(self)(self._pa_array.cast(pa.time64(unit))) + + @property + def _dt_tz(self): + return timezones.maybe_get_tz(self.dtype.pyarrow_dtype.tz) + + @property + def _dt_unit(self): + return self.dtype.pyarrow_dtype.unit + + def _dt_normalize(self): + return type(self)(pc.floor_temporal(self._pa_array, 1, "day")) + + def _dt_strftime(self, format: str): + return type(self)(pc.strftime(self._pa_array, format=format)) + + def _round_temporally( + self, + method: Literal["ceil", "floor", "round"], + freq, + ambiguous: TimeAmbiguous = "raise", + nonexistent: TimeNonexistent = "raise", + ): + if ambiguous != "raise": + raise NotImplementedError("ambiguous is not supported.") + if nonexistent != "raise": + raise NotImplementedError("nonexistent is not supported.") + offset = to_offset(freq) + if offset is None: + raise ValueError(f"Must specify a valid frequency: {freq}") + pa_supported_unit = { + "Y": "year", + "YS": "year", + "Q": "quarter", + "QS": "quarter", + "M": "month", + "MS": "month", + "W": "week", + "D": "day", + "h": "hour", + "min": "minute", + "s": "second", + "ms": "millisecond", + "us": "microsecond", + "ns": "nanosecond", + } + unit = pa_supported_unit.get(offset._prefix, None) + if unit is None: + raise ValueError(f"{freq=} is not supported") + multiple = offset.n + rounding_method = getattr(pc, f"{method}_temporal") + return type(self)(rounding_method(self._pa_array, multiple=multiple, unit=unit)) + + def _dt_ceil( + self, + freq, + ambiguous: TimeAmbiguous = "raise", + nonexistent: TimeNonexistent = "raise", + ): + return self._round_temporally("ceil", freq, ambiguous, nonexistent) + + def _dt_floor( + self, + freq, + ambiguous: TimeAmbiguous = "raise", + nonexistent: TimeNonexistent = "raise", + ): + return self._round_temporally("floor", freq, ambiguous, nonexistent) + + def _dt_round( + self, + freq, + ambiguous: TimeAmbiguous = "raise", + nonexistent: TimeNonexistent = "raise", + ): + return self._round_temporally("round", freq, ambiguous, nonexistent) + + def _dt_day_name(self, locale: str | None = None): + if locale is None: + locale = "C" + return type(self)(pc.strftime(self._pa_array, format="%A", locale=locale)) + + def _dt_month_name(self, locale: str | None = None): + if locale is None: + locale = "C" + return type(self)(pc.strftime(self._pa_array, format="%B", locale=locale)) + + def _dt_to_pydatetime(self): + if pa.types.is_date(self.dtype.pyarrow_dtype): + raise ValueError( + f"to_pydatetime cannot be called with {self.dtype.pyarrow_dtype} type. " + "Convert to pyarrow timestamp type." + ) + data = self._pa_array.to_pylist() + if self._dtype.pyarrow_dtype.unit == "ns": + data = [None if ts is None else ts.to_pydatetime(warn=False) for ts in data] + return np.array(data, dtype=object) + + def _dt_tz_localize( + self, + tz, + ambiguous: TimeAmbiguous = "raise", + nonexistent: TimeNonexistent = "raise", + ): + if ambiguous != "raise": + raise NotImplementedError(f"{ambiguous=} is not supported") + nonexistent_pa = { + "raise": "raise", + "shift_backward": "earliest", + "shift_forward": "latest", + }.get( + nonexistent, None # type: ignore[arg-type] + ) + if nonexistent_pa is None: + raise NotImplementedError(f"{nonexistent=} is not supported") + if tz is None: + result = self._pa_array.cast(pa.timestamp(self.dtype.pyarrow_dtype.unit)) + else: + result = pc.assume_timezone( + self._pa_array, str(tz), ambiguous=ambiguous, nonexistent=nonexistent_pa + ) + return type(self)(result) + + def _dt_tz_convert(self, tz): + if self.dtype.pyarrow_dtype.tz is None: + raise TypeError( + "Cannot convert tz-naive timestamps, use tz_localize to localize" + ) + current_unit = self.dtype.pyarrow_dtype.unit + result = self._pa_array.cast(pa.timestamp(current_unit, tz)) + return type(self)(result) + + +def transpose_homogeneous_pyarrow( + arrays: Sequence[ArrowExtensionArray], +) -> list[ArrowExtensionArray]: + """Transpose arrow extension arrays in a list, but faster. + + Input should be a list of arrays of equal length and all have the same + dtype. The caller is responsible for ensuring validity of input data. + """ + arrays = list(arrays) + nrows, ncols = len(arrays[0]), len(arrays) + indices = np.arange(nrows * ncols).reshape(ncols, nrows).T.flatten() + arr = pa.chunked_array([chunk for arr in arrays for chunk in arr._pa_array.chunks]) + arr = arr.take(indices) + return [ArrowExtensionArray(arr.slice(i * ncols, ncols)) for i in range(nrows)] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/arrays/arrow/extension_types.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/arrays/arrow/extension_types.py new file mode 100644 index 0000000000000000000000000000000000000000..72bfd6f2212f8fae6ea7786599de44beaeb3f902 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/arrays/arrow/extension_types.py @@ -0,0 +1,174 @@ +from __future__ import annotations + +import json +from typing import TYPE_CHECKING + +import pyarrow + +from pandas.compat import pa_version_under14p1 + +from pandas.core.dtypes.dtypes import ( + IntervalDtype, + PeriodDtype, +) + +from pandas.core.arrays.interval import VALID_CLOSED + +if TYPE_CHECKING: + from pandas._typing import IntervalClosedType + + +class ArrowPeriodType(pyarrow.ExtensionType): + def __init__(self, freq) -> None: + # attributes need to be set first before calling + # super init (as that calls serialize) + self._freq = freq + pyarrow.ExtensionType.__init__(self, pyarrow.int64(), "pandas.period") + + @property + def freq(self): + return self._freq + + def __arrow_ext_serialize__(self) -> bytes: + metadata = {"freq": self.freq} + return json.dumps(metadata).encode() + + @classmethod + def __arrow_ext_deserialize__(cls, storage_type, serialized) -> ArrowPeriodType: + metadata = json.loads(serialized.decode()) + return ArrowPeriodType(metadata["freq"]) + + def __eq__(self, other): + if isinstance(other, pyarrow.BaseExtensionType): + return type(self) == type(other) and self.freq == other.freq + else: + return NotImplemented + + def __ne__(self, other) -> bool: + return not self == other + + def __hash__(self) -> int: + return hash((str(self), self.freq)) + + def to_pandas_dtype(self) -> PeriodDtype: + return PeriodDtype(freq=self.freq) + + +# register the type with a dummy instance +_period_type = ArrowPeriodType("D") +pyarrow.register_extension_type(_period_type) + + +class ArrowIntervalType(pyarrow.ExtensionType): + def __init__(self, subtype, closed: IntervalClosedType) -> None: + # attributes need to be set first before calling + # super init (as that calls serialize) + assert closed in VALID_CLOSED + self._closed: IntervalClosedType = closed + if not isinstance(subtype, pyarrow.DataType): + subtype = pyarrow.type_for_alias(str(subtype)) + self._subtype = subtype + + storage_type = pyarrow.struct([("left", subtype), ("right", subtype)]) + pyarrow.ExtensionType.__init__(self, storage_type, "pandas.interval") + + @property + def subtype(self): + return self._subtype + + @property + def closed(self) -> IntervalClosedType: + return self._closed + + def __arrow_ext_serialize__(self) -> bytes: + metadata = {"subtype": str(self.subtype), "closed": self.closed} + return json.dumps(metadata).encode() + + @classmethod + def __arrow_ext_deserialize__(cls, storage_type, serialized) -> ArrowIntervalType: + metadata = json.loads(serialized.decode()) + subtype = pyarrow.type_for_alias(metadata["subtype"]) + closed = metadata["closed"] + return ArrowIntervalType(subtype, closed) + + def __eq__(self, other): + if isinstance(other, pyarrow.BaseExtensionType): + return ( + type(self) == type(other) + and self.subtype == other.subtype + and self.closed == other.closed + ) + else: + return NotImplemented + + def __ne__(self, other) -> bool: + return not self == other + + def __hash__(self) -> int: + return hash((str(self), str(self.subtype), self.closed)) + + def to_pandas_dtype(self) -> IntervalDtype: + return IntervalDtype(self.subtype.to_pandas_dtype(), self.closed) + + +# register the type with a dummy instance +_interval_type = ArrowIntervalType(pyarrow.int64(), "left") +pyarrow.register_extension_type(_interval_type) + + +_ERROR_MSG = """\ +Disallowed deserialization of 'arrow.py_extension_type': +storage_type = {storage_type} +serialized = {serialized} +pickle disassembly:\n{pickle_disassembly} + +Reading of untrusted Parquet or Feather files with a PyExtensionType column +allows arbitrary code execution. +If you trust this file, you can enable reading the extension type by one of: + +- upgrading to pyarrow >= 14.0.1, and call `pa.PyExtensionType.set_auto_load(True)` +- install pyarrow-hotfix (`pip install pyarrow-hotfix`) and disable it by running + `import pyarrow_hotfix; pyarrow_hotfix.uninstall()` + +We strongly recommend updating your Parquet/Feather files to use extension types +derived from `pyarrow.ExtensionType` instead, and register this type explicitly. +""" + + +def patch_pyarrow(): + # starting from pyarrow 14.0.1, it has its own mechanism + if not pa_version_under14p1: + return + + # if https://github.com/pitrou/pyarrow-hotfix was installed and enabled + if getattr(pyarrow, "_hotfix_installed", False): + return + + class ForbiddenExtensionType(pyarrow.ExtensionType): + def __arrow_ext_serialize__(self): + return b"" + + @classmethod + def __arrow_ext_deserialize__(cls, storage_type, serialized): + import io + import pickletools + + out = io.StringIO() + pickletools.dis(serialized, out) + raise RuntimeError( + _ERROR_MSG.format( + storage_type=storage_type, + serialized=serialized, + pickle_disassembly=out.getvalue(), + ) + ) + + pyarrow.unregister_extension_type("arrow.py_extension_type") + pyarrow.register_extension_type( + ForbiddenExtensionType(pyarrow.null(), "arrow.py_extension_type") + ) + + pyarrow._hotfix_installed = True + + +patch_pyarrow() diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/arrays/floating.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/arrays/floating.py new file mode 100644 index 0000000000000000000000000000000000000000..74b8cfb65cbc7887b7d2a164121c90eda0833121 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/arrays/floating.py @@ -0,0 +1,173 @@ +from __future__ import annotations + +from typing import ClassVar + +import numpy as np + +from pandas.core.dtypes.base import register_extension_dtype +from pandas.core.dtypes.common import is_float_dtype + +from pandas.core.arrays.numeric import ( + NumericArray, + NumericDtype, +) + + +class FloatingDtype(NumericDtype): + """ + An ExtensionDtype to hold a single size of floating dtype. + + These specific implementations are subclasses of the non-public + FloatingDtype. For example we have Float32Dtype to represent float32. + + The attributes name & type are set when these subclasses are created. + """ + + _default_np_dtype = np.dtype(np.float64) + _checker = is_float_dtype + + @classmethod + def construct_array_type(cls) -> type[FloatingArray]: + """ + Return the array type associated with this dtype. + + Returns + ------- + type + """ + return FloatingArray + + @classmethod + def _get_dtype_mapping(cls) -> dict[np.dtype, FloatingDtype]: + return NUMPY_FLOAT_TO_DTYPE + + @classmethod + def _safe_cast(cls, values: np.ndarray, dtype: np.dtype, copy: bool) -> np.ndarray: + """ + Safely cast the values to the given dtype. + + "safe" in this context means the casting is lossless. + """ + # This is really only here for compatibility with IntegerDtype + # Here for compat with IntegerDtype + return values.astype(dtype, copy=copy) + + +class FloatingArray(NumericArray): + """ + Array of floating (optional missing) values. + + .. warning:: + + FloatingArray is currently experimental, and its API or internal + implementation may change without warning. Especially the behaviour + regarding NaN (distinct from NA missing values) is subject to change. + + We represent a FloatingArray with 2 numpy arrays: + + - data: contains a numpy float array of the appropriate dtype + - mask: a boolean array holding a mask on the data, True is missing + + To construct an FloatingArray from generic array-like input, use + :func:`pandas.array` with one of the float dtypes (see examples). + + See :ref:`integer_na` for more. + + Parameters + ---------- + values : numpy.ndarray + A 1-d float-dtype array. + mask : numpy.ndarray + A 1-d boolean-dtype array indicating missing values. + copy : bool, default False + Whether to copy the `values` and `mask`. + + Attributes + ---------- + None + + Methods + ------- + None + + Returns + ------- + FloatingArray + + Examples + -------- + Create an FloatingArray with :func:`pandas.array`: + + >>> pd.array([0.1, None, 0.3], dtype=pd.Float32Dtype()) + + [0.1, , 0.3] + Length: 3, dtype: Float32 + + String aliases for the dtypes are also available. They are capitalized. + + >>> pd.array([0.1, None, 0.3], dtype="Float32") + + [0.1, , 0.3] + Length: 3, dtype: Float32 + """ + + _dtype_cls = FloatingDtype + + # The value used to fill '_data' to avoid upcasting + _internal_fill_value = np.nan + # Fill values used for any/all + # Incompatible types in assignment (expression has type "float", base class + # "BaseMaskedArray" defined the type as "") + _truthy_value = 1.0 # type: ignore[assignment] + _falsey_value = 0.0 # type: ignore[assignment] + + +_dtype_docstring = """ +An ExtensionDtype for {dtype} data. + +This dtype uses ``pd.NA`` as missing value indicator. + +Attributes +---------- +None + +Methods +------- +None + +Examples +-------- +For Float32Dtype: + +>>> ser = pd.Series([2.25, pd.NA], dtype=pd.Float32Dtype()) +>>> ser.dtype +Float32Dtype() + +For Float64Dtype: + +>>> ser = pd.Series([2.25, pd.NA], dtype=pd.Float64Dtype()) +>>> ser.dtype +Float64Dtype() +""" + +# create the Dtype + + +@register_extension_dtype +class Float32Dtype(FloatingDtype): + type = np.float32 + name: ClassVar[str] = "Float32" + __doc__ = _dtype_docstring.format(dtype="float32") + + +@register_extension_dtype +class Float64Dtype(FloatingDtype): + type = np.float64 + name: ClassVar[str] = "Float64" + __doc__ = _dtype_docstring.format(dtype="float64") + + +NUMPY_FLOAT_TO_DTYPE: dict[np.dtype, FloatingDtype] = { + np.dtype(np.float32): Float32Dtype(), + np.dtype(np.float64): Float64Dtype(), +} diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/arrays/integer.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/arrays/integer.py new file mode 100644 index 0000000000000000000000000000000000000000..f9384e25ba9d9f32caf826efc01b4eb58a454d65 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/arrays/integer.py @@ -0,0 +1,272 @@ +from __future__ import annotations + +from typing import ClassVar + +import numpy as np + +from pandas.core.dtypes.base import register_extension_dtype +from pandas.core.dtypes.common import is_integer_dtype + +from pandas.core.arrays.numeric import ( + NumericArray, + NumericDtype, +) + + +class IntegerDtype(NumericDtype): + """ + An ExtensionDtype to hold a single size & kind of integer dtype. + + These specific implementations are subclasses of the non-public + IntegerDtype. For example, we have Int8Dtype to represent signed int 8s. + + The attributes name & type are set when these subclasses are created. + """ + + _default_np_dtype = np.dtype(np.int64) + _checker = is_integer_dtype + + @classmethod + def construct_array_type(cls) -> type[IntegerArray]: + """ + Return the array type associated with this dtype. + + Returns + ------- + type + """ + return IntegerArray + + @classmethod + def _get_dtype_mapping(cls) -> dict[np.dtype, IntegerDtype]: + return NUMPY_INT_TO_DTYPE + + @classmethod + def _safe_cast(cls, values: np.ndarray, dtype: np.dtype, copy: bool) -> np.ndarray: + """ + Safely cast the values to the given dtype. + + "safe" in this context means the casting is lossless. e.g. if 'values' + has a floating dtype, each value must be an integer. + """ + try: + return values.astype(dtype, casting="safe", copy=copy) + except TypeError as err: + casted = values.astype(dtype, copy=copy) + if (casted == values).all(): + return casted + + raise TypeError( + f"cannot safely cast non-equivalent {values.dtype} to {np.dtype(dtype)}" + ) from err + + +class IntegerArray(NumericArray): + """ + Array of integer (optional missing) values. + + Uses :attr:`pandas.NA` as the missing value. + + .. warning:: + + IntegerArray is currently experimental, and its API or internal + implementation may change without warning. + + We represent an IntegerArray with 2 numpy arrays: + + - data: contains a numpy integer array of the appropriate dtype + - mask: a boolean array holding a mask on the data, True is missing + + To construct an IntegerArray from generic array-like input, use + :func:`pandas.array` with one of the integer dtypes (see examples). + + See :ref:`integer_na` for more. + + Parameters + ---------- + values : numpy.ndarray + A 1-d integer-dtype array. + mask : numpy.ndarray + A 1-d boolean-dtype array indicating missing values. + copy : bool, default False + Whether to copy the `values` and `mask`. + + Attributes + ---------- + None + + Methods + ------- + None + + Returns + ------- + IntegerArray + + Examples + -------- + Create an IntegerArray with :func:`pandas.array`. + + >>> int_array = pd.array([1, None, 3], dtype=pd.Int32Dtype()) + >>> int_array + + [1, , 3] + Length: 3, dtype: Int32 + + String aliases for the dtypes are also available. They are capitalized. + + >>> pd.array([1, None, 3], dtype='Int32') + + [1, , 3] + Length: 3, dtype: Int32 + + >>> pd.array([1, None, 3], dtype='UInt16') + + [1, , 3] + Length: 3, dtype: UInt16 + """ + + _dtype_cls = IntegerDtype + + # The value used to fill '_data' to avoid upcasting + _internal_fill_value = 1 + # Fill values used for any/all + # Incompatible types in assignment (expression has type "int", base class + # "BaseMaskedArray" defined the type as "") + _truthy_value = 1 # type: ignore[assignment] + _falsey_value = 0 # type: ignore[assignment] + + +_dtype_docstring = """ +An ExtensionDtype for {dtype} integer data. + +Uses :attr:`pandas.NA` as its missing value, rather than :attr:`numpy.nan`. + +Attributes +---------- +None + +Methods +------- +None + +Examples +-------- +For Int8Dtype: + +>>> ser = pd.Series([2, pd.NA], dtype=pd.Int8Dtype()) +>>> ser.dtype +Int8Dtype() + +For Int16Dtype: + +>>> ser = pd.Series([2, pd.NA], dtype=pd.Int16Dtype()) +>>> ser.dtype +Int16Dtype() + +For Int32Dtype: + +>>> ser = pd.Series([2, pd.NA], dtype=pd.Int32Dtype()) +>>> ser.dtype +Int32Dtype() + +For Int64Dtype: + +>>> ser = pd.Series([2, pd.NA], dtype=pd.Int64Dtype()) +>>> ser.dtype +Int64Dtype() + +For UInt8Dtype: + +>>> ser = pd.Series([2, pd.NA], dtype=pd.UInt8Dtype()) +>>> ser.dtype +UInt8Dtype() + +For UInt16Dtype: + +>>> ser = pd.Series([2, pd.NA], dtype=pd.UInt16Dtype()) +>>> ser.dtype +UInt16Dtype() + +For UInt32Dtype: + +>>> ser = pd.Series([2, pd.NA], dtype=pd.UInt32Dtype()) +>>> ser.dtype +UInt32Dtype() + +For UInt64Dtype: + +>>> ser = pd.Series([2, pd.NA], dtype=pd.UInt64Dtype()) +>>> ser.dtype +UInt64Dtype() +""" + +# create the Dtype + + +@register_extension_dtype +class Int8Dtype(IntegerDtype): + type = np.int8 + name: ClassVar[str] = "Int8" + __doc__ = _dtype_docstring.format(dtype="int8") + + +@register_extension_dtype +class Int16Dtype(IntegerDtype): + type = np.int16 + name: ClassVar[str] = "Int16" + __doc__ = _dtype_docstring.format(dtype="int16") + + +@register_extension_dtype +class Int32Dtype(IntegerDtype): + type = np.int32 + name: ClassVar[str] = "Int32" + __doc__ = _dtype_docstring.format(dtype="int32") + + +@register_extension_dtype +class Int64Dtype(IntegerDtype): + type = np.int64 + name: ClassVar[str] = "Int64" + __doc__ = _dtype_docstring.format(dtype="int64") + + +@register_extension_dtype +class UInt8Dtype(IntegerDtype): + type = np.uint8 + name: ClassVar[str] = "UInt8" + __doc__ = _dtype_docstring.format(dtype="uint8") + + +@register_extension_dtype +class UInt16Dtype(IntegerDtype): + type = np.uint16 + name: ClassVar[str] = "UInt16" + __doc__ = _dtype_docstring.format(dtype="uint16") + + +@register_extension_dtype +class UInt32Dtype(IntegerDtype): + type = np.uint32 + name: ClassVar[str] = "UInt32" + __doc__ = _dtype_docstring.format(dtype="uint32") + + +@register_extension_dtype +class UInt64Dtype(IntegerDtype): + type = np.uint64 + name: ClassVar[str] = "UInt64" + __doc__ = _dtype_docstring.format(dtype="uint64") + + +NUMPY_INT_TO_DTYPE: dict[np.dtype, IntegerDtype] = { + np.dtype(np.int8): Int8Dtype(), + np.dtype(np.int16): Int16Dtype(), + np.dtype(np.int32): Int32Dtype(), + np.dtype(np.int64): Int64Dtype(), + np.dtype(np.uint8): UInt8Dtype(), + np.dtype(np.uint16): UInt16Dtype(), + np.dtype(np.uint32): UInt32Dtype(), + np.dtype(np.uint64): UInt64Dtype(), +} diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/arrays/interval.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/arrays/interval.py new file mode 100644 index 0000000000000000000000000000000000000000..91db7f11bcbe025045318c70451f1c9e75fd779a --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/arrays/interval.py @@ -0,0 +1,1917 @@ +from __future__ import annotations + +import operator +from operator import ( + le, + lt, +) +import textwrap +from typing import ( + TYPE_CHECKING, + Literal, + Union, + overload, +) +import warnings + +import numpy as np + +from pandas._libs import lib +from pandas._libs.interval import ( + VALID_CLOSED, + Interval, + IntervalMixin, + intervals_to_interval_bounds, +) +from pandas._libs.missing import NA +from pandas._typing import ( + ArrayLike, + AxisInt, + Dtype, + FillnaOptions, + IntervalClosedType, + NpDtype, + PositionalIndexer, + ScalarIndexer, + Self, + SequenceIndexer, + SortKind, + TimeArrayLike, + npt, +) +from pandas.compat.numpy import function as nv +from pandas.errors import IntCastingNaNError +from pandas.util._decorators import Appender + +from pandas.core.dtypes.cast import ( + LossySetitemError, + maybe_upcast_numeric_to_64bit, +) +from pandas.core.dtypes.common import ( + is_float_dtype, + is_integer_dtype, + is_list_like, + is_object_dtype, + is_scalar, + is_string_dtype, + needs_i8_conversion, + pandas_dtype, +) +from pandas.core.dtypes.dtypes import ( + CategoricalDtype, + IntervalDtype, +) +from pandas.core.dtypes.generic import ( + ABCDataFrame, + ABCDatetimeIndex, + ABCIntervalIndex, + ABCPeriodIndex, +) +from pandas.core.dtypes.missing import ( + is_valid_na_for_dtype, + isna, + notna, +) + +from pandas.core.algorithms import ( + isin, + take, + unique, + value_counts_internal as value_counts, +) +from pandas.core.arrays import ArrowExtensionArray +from pandas.core.arrays.base import ( + ExtensionArray, + _extension_array_shared_docs, +) +from pandas.core.arrays.datetimes import DatetimeArray +from pandas.core.arrays.timedeltas import TimedeltaArray +import pandas.core.common as com +from pandas.core.construction import ( + array as pd_array, + ensure_wrapped_if_datetimelike, + extract_array, +) +from pandas.core.indexers import check_array_indexer +from pandas.core.ops import ( + invalid_comparison, + unpack_zerodim_and_defer, +) + +if TYPE_CHECKING: + from collections.abc import ( + Iterator, + Sequence, + ) + + from pandas import ( + Index, + Series, + ) + + +IntervalSide = Union[TimeArrayLike, np.ndarray] +IntervalOrNA = Union[Interval, float] + +_interval_shared_docs: dict[str, str] = {} + +_shared_docs_kwargs = { + "klass": "IntervalArray", + "qualname": "arrays.IntervalArray", + "name": "", +} + + +_interval_shared_docs[ + "class" +] = """ +%(summary)s + +Parameters +---------- +data : array-like (1-dimensional) + Array-like (ndarray, :class:`DateTimeArray`, :class:`TimeDeltaArray`) containing + Interval objects from which to build the %(klass)s. +closed : {'left', 'right', 'both', 'neither'}, default 'right' + Whether the intervals are closed on the left-side, right-side, both or + neither. +dtype : dtype or None, default None + If None, dtype will be inferred. +copy : bool, default False + Copy the input data. +%(name)s\ +verify_integrity : bool, default True + Verify that the %(klass)s is valid. + +Attributes +---------- +left +right +closed +mid +length +is_empty +is_non_overlapping_monotonic +%(extra_attributes)s\ + +Methods +------- +from_arrays +from_tuples +from_breaks +contains +overlaps +set_closed +to_tuples +%(extra_methods)s\ + +See Also +-------- +Index : The base pandas Index type. +Interval : A bounded slice-like interval; the elements of an %(klass)s. +interval_range : Function to create a fixed frequency IntervalIndex. +cut : Bin values into discrete Intervals. +qcut : Bin values into equal-sized Intervals based on rank or sample quantiles. + +Notes +----- +See the `user guide +`__ +for more. + +%(examples)s\ +""" + + +@Appender( + _interval_shared_docs["class"] + % { + "klass": "IntervalArray", + "summary": "Pandas array for interval data that are closed on the same side.", + "name": "", + "extra_attributes": "", + "extra_methods": "", + "examples": textwrap.dedent( + """\ + Examples + -------- + A new ``IntervalArray`` can be constructed directly from an array-like of + ``Interval`` objects: + + >>> pd.arrays.IntervalArray([pd.Interval(0, 1), pd.Interval(1, 5)]) + + [(0, 1], (1, 5]] + Length: 2, dtype: interval[int64, right] + + It may also be constructed using one of the constructor + methods: :meth:`IntervalArray.from_arrays`, + :meth:`IntervalArray.from_breaks`, and :meth:`IntervalArray.from_tuples`. + """ + ), + } +) +class IntervalArray(IntervalMixin, ExtensionArray): + can_hold_na = True + _na_value = _fill_value = np.nan + + @property + def ndim(self) -> Literal[1]: + return 1 + + # To make mypy recognize the fields + _left: IntervalSide + _right: IntervalSide + _dtype: IntervalDtype + + # --------------------------------------------------------------------- + # Constructors + + def __new__( + cls, + data, + closed: IntervalClosedType | None = None, + dtype: Dtype | None = None, + copy: bool = False, + verify_integrity: bool = True, + ): + data = extract_array(data, extract_numpy=True) + + if isinstance(data, cls): + left: IntervalSide = data._left + right: IntervalSide = data._right + closed = closed or data.closed + dtype = IntervalDtype(left.dtype, closed=closed) + else: + # don't allow scalars + if is_scalar(data): + msg = ( + f"{cls.__name__}(...) must be called with a collection " + f"of some kind, {data} was passed" + ) + raise TypeError(msg) + + # might need to convert empty or purely na data + data = _maybe_convert_platform_interval(data) + left, right, infer_closed = intervals_to_interval_bounds( + data, validate_closed=closed is None + ) + if left.dtype == object: + left = lib.maybe_convert_objects(left) + right = lib.maybe_convert_objects(right) + closed = closed or infer_closed + + left, right, dtype = cls._ensure_simple_new_inputs( + left, + right, + closed=closed, + copy=copy, + dtype=dtype, + ) + + if verify_integrity: + cls._validate(left, right, dtype=dtype) + + return cls._simple_new( + left, + right, + dtype=dtype, + ) + + @classmethod + def _simple_new( + cls, + left: IntervalSide, + right: IntervalSide, + dtype: IntervalDtype, + ) -> Self: + result = IntervalMixin.__new__(cls) + result._left = left + result._right = right + result._dtype = dtype + + return result + + @classmethod + def _ensure_simple_new_inputs( + cls, + left, + right, + closed: IntervalClosedType | None = None, + copy: bool = False, + dtype: Dtype | None = None, + ) -> tuple[IntervalSide, IntervalSide, IntervalDtype]: + """Ensure correctness of input parameters for cls._simple_new.""" + from pandas.core.indexes.base import ensure_index + + left = ensure_index(left, copy=copy) + left = maybe_upcast_numeric_to_64bit(left) + + right = ensure_index(right, copy=copy) + right = maybe_upcast_numeric_to_64bit(right) + + if closed is None and isinstance(dtype, IntervalDtype): + closed = dtype.closed + + closed = closed or "right" + + if dtype is not None: + # GH 19262: dtype must be an IntervalDtype to override inferred + dtype = pandas_dtype(dtype) + if isinstance(dtype, IntervalDtype): + if dtype.subtype is not None: + left = left.astype(dtype.subtype) + right = right.astype(dtype.subtype) + else: + msg = f"dtype must be an IntervalDtype, got {dtype}" + raise TypeError(msg) + + if dtype.closed is None: + # possibly loading an old pickle + dtype = IntervalDtype(dtype.subtype, closed) + elif closed != dtype.closed: + raise ValueError("closed keyword does not match dtype.closed") + + # coerce dtypes to match if needed + if is_float_dtype(left.dtype) and is_integer_dtype(right.dtype): + right = right.astype(left.dtype) + elif is_float_dtype(right.dtype) and is_integer_dtype(left.dtype): + left = left.astype(right.dtype) + + if type(left) != type(right): + msg = ( + f"must not have differing left [{type(left).__name__}] and " + f"right [{type(right).__name__}] types" + ) + raise ValueError(msg) + if isinstance(left.dtype, CategoricalDtype) or is_string_dtype(left.dtype): + # GH 19016 + msg = ( + "category, object, and string subtypes are not supported " + "for IntervalArray" + ) + raise TypeError(msg) + if isinstance(left, ABCPeriodIndex): + msg = "Period dtypes are not supported, use a PeriodIndex instead" + raise ValueError(msg) + if isinstance(left, ABCDatetimeIndex) and str(left.tz) != str(right.tz): + msg = ( + "left and right must have the same time zone, got " + f"'{left.tz}' and '{right.tz}'" + ) + raise ValueError(msg) + elif needs_i8_conversion(left.dtype) and left.unit != right.unit: + # e.g. m8[s] vs m8[ms], try to cast to a common dtype GH#55714 + left_arr, right_arr = left._data._ensure_matching_resos(right._data) + left = ensure_index(left_arr) + right = ensure_index(right_arr) + + # For dt64/td64 we want DatetimeArray/TimedeltaArray instead of ndarray + left = ensure_wrapped_if_datetimelike(left) + left = extract_array(left, extract_numpy=True) + right = ensure_wrapped_if_datetimelike(right) + right = extract_array(right, extract_numpy=True) + + if isinstance(left, ArrowExtensionArray) or isinstance( + right, ArrowExtensionArray + ): + pass + else: + lbase = getattr(left, "_ndarray", left) + lbase = getattr(lbase, "_data", lbase).base + rbase = getattr(right, "_ndarray", right) + rbase = getattr(rbase, "_data", rbase).base + if lbase is not None and lbase is rbase: + # If these share data, then setitem could corrupt our IA + right = right.copy() + + dtype = IntervalDtype(left.dtype, closed=closed) + + return left, right, dtype + + @classmethod + def _from_sequence( + cls, + scalars, + *, + dtype: Dtype | None = None, + copy: bool = False, + ) -> Self: + return cls(scalars, dtype=dtype, copy=copy) + + @classmethod + def _from_factorized(cls, values: np.ndarray, original: IntervalArray) -> Self: + return cls._from_sequence(values, dtype=original.dtype) + + _interval_shared_docs["from_breaks"] = textwrap.dedent( + """ + Construct an %(klass)s from an array of splits. + + Parameters + ---------- + breaks : array-like (1-dimensional) + Left and right bounds for each interval. + closed : {'left', 'right', 'both', 'neither'}, default 'right' + Whether the intervals are closed on the left-side, right-side, both + or neither.\ + %(name)s + copy : bool, default False + Copy the data. + dtype : dtype or None, default None + If None, dtype will be inferred. + + Returns + ------- + %(klass)s + + See Also + -------- + interval_range : Function to create a fixed frequency IntervalIndex. + %(klass)s.from_arrays : Construct from a left and right array. + %(klass)s.from_tuples : Construct from a sequence of tuples. + + %(examples)s\ + """ + ) + + @classmethod + @Appender( + _interval_shared_docs["from_breaks"] + % { + "klass": "IntervalArray", + "name": "", + "examples": textwrap.dedent( + """\ + Examples + -------- + >>> pd.arrays.IntervalArray.from_breaks([0, 1, 2, 3]) + + [(0, 1], (1, 2], (2, 3]] + Length: 3, dtype: interval[int64, right] + """ + ), + } + ) + def from_breaks( + cls, + breaks, + closed: IntervalClosedType | None = "right", + copy: bool = False, + dtype: Dtype | None = None, + ) -> Self: + breaks = _maybe_convert_platform_interval(breaks) + + return cls.from_arrays(breaks[:-1], breaks[1:], closed, copy=copy, dtype=dtype) + + _interval_shared_docs["from_arrays"] = textwrap.dedent( + """ + Construct from two arrays defining the left and right bounds. + + Parameters + ---------- + left : array-like (1-dimensional) + Left bounds for each interval. + right : array-like (1-dimensional) + Right bounds for each interval. + closed : {'left', 'right', 'both', 'neither'}, default 'right' + Whether the intervals are closed on the left-side, right-side, both + or neither.\ + %(name)s + copy : bool, default False + Copy the data. + dtype : dtype, optional + If None, dtype will be inferred. + + Returns + ------- + %(klass)s + + Raises + ------ + ValueError + When a value is missing in only one of `left` or `right`. + When a value in `left` is greater than the corresponding value + in `right`. + + See Also + -------- + interval_range : Function to create a fixed frequency IntervalIndex. + %(klass)s.from_breaks : Construct an %(klass)s from an array of + splits. + %(klass)s.from_tuples : Construct an %(klass)s from an + array-like of tuples. + + Notes + ----- + Each element of `left` must be less than or equal to the `right` + element at the same position. If an element is missing, it must be + missing in both `left` and `right`. A TypeError is raised when + using an unsupported type for `left` or `right`. At the moment, + 'category', 'object', and 'string' subtypes are not supported. + + %(examples)s\ + """ + ) + + @classmethod + @Appender( + _interval_shared_docs["from_arrays"] + % { + "klass": "IntervalArray", + "name": "", + "examples": textwrap.dedent( + """\ + Examples + -------- + >>> pd.arrays.IntervalArray.from_arrays([0, 1, 2], [1, 2, 3]) + + [(0, 1], (1, 2], (2, 3]] + Length: 3, dtype: interval[int64, right] + """ + ), + } + ) + def from_arrays( + cls, + left, + right, + closed: IntervalClosedType | None = "right", + copy: bool = False, + dtype: Dtype | None = None, + ) -> Self: + left = _maybe_convert_platform_interval(left) + right = _maybe_convert_platform_interval(right) + + left, right, dtype = cls._ensure_simple_new_inputs( + left, + right, + closed=closed, + copy=copy, + dtype=dtype, + ) + cls._validate(left, right, dtype=dtype) + + return cls._simple_new(left, right, dtype=dtype) + + _interval_shared_docs["from_tuples"] = textwrap.dedent( + """ + Construct an %(klass)s from an array-like of tuples. + + Parameters + ---------- + data : array-like (1-dimensional) + Array of tuples. + closed : {'left', 'right', 'both', 'neither'}, default 'right' + Whether the intervals are closed on the left-side, right-side, both + or neither.\ + %(name)s + copy : bool, default False + By-default copy the data, this is compat only and ignored. + dtype : dtype or None, default None + If None, dtype will be inferred. + + Returns + ------- + %(klass)s + + See Also + -------- + interval_range : Function to create a fixed frequency IntervalIndex. + %(klass)s.from_arrays : Construct an %(klass)s from a left and + right array. + %(klass)s.from_breaks : Construct an %(klass)s from an array of + splits. + + %(examples)s\ + """ + ) + + @classmethod + @Appender( + _interval_shared_docs["from_tuples"] + % { + "klass": "IntervalArray", + "name": "", + "examples": textwrap.dedent( + """\ + Examples + -------- + >>> pd.arrays.IntervalArray.from_tuples([(0, 1), (1, 2)]) + + [(0, 1], (1, 2]] + Length: 2, dtype: interval[int64, right] + """ + ), + } + ) + def from_tuples( + cls, + data, + closed: IntervalClosedType | None = "right", + copy: bool = False, + dtype: Dtype | None = None, + ) -> Self: + if len(data): + left, right = [], [] + else: + # ensure that empty data keeps input dtype + left = right = data + + for d in data: + if not isinstance(d, tuple) and isna(d): + lhs = rhs = np.nan + else: + name = cls.__name__ + try: + # need list of length 2 tuples, e.g. [(0, 1), (1, 2), ...] + lhs, rhs = d + except ValueError as err: + msg = f"{name}.from_tuples requires tuples of length 2, got {d}" + raise ValueError(msg) from err + except TypeError as err: + msg = f"{name}.from_tuples received an invalid item, {d}" + raise TypeError(msg) from err + left.append(lhs) + right.append(rhs) + + return cls.from_arrays(left, right, closed, copy=False, dtype=dtype) + + @classmethod + def _validate(cls, left, right, dtype: IntervalDtype) -> None: + """ + Verify that the IntervalArray is valid. + + Checks that + + * dtype is correct + * left and right match lengths + * left and right have the same missing values + * left is always below right + """ + if not isinstance(dtype, IntervalDtype): + msg = f"invalid dtype: {dtype}" + raise ValueError(msg) + if len(left) != len(right): + msg = "left and right must have the same length" + raise ValueError(msg) + left_mask = notna(left) + right_mask = notna(right) + if not (left_mask == right_mask).all(): + msg = ( + "missing values must be missing in the same " + "location both left and right sides" + ) + raise ValueError(msg) + if not (left[left_mask] <= right[left_mask]).all(): + msg = "left side of interval must be <= right side" + raise ValueError(msg) + + def _shallow_copy(self, left, right) -> Self: + """ + Return a new IntervalArray with the replacement attributes + + Parameters + ---------- + left : Index + Values to be used for the left-side of the intervals. + right : Index + Values to be used for the right-side of the intervals. + """ + dtype = IntervalDtype(left.dtype, closed=self.closed) + left, right, dtype = self._ensure_simple_new_inputs(left, right, dtype=dtype) + + return self._simple_new(left, right, dtype=dtype) + + # --------------------------------------------------------------------- + # Descriptive + + @property + def dtype(self) -> IntervalDtype: + return self._dtype + + @property + def nbytes(self) -> int: + return self.left.nbytes + self.right.nbytes + + @property + def size(self) -> int: + # Avoid materializing self.values + return self.left.size + + # --------------------------------------------------------------------- + # EA Interface + + def __iter__(self) -> Iterator: + return iter(np.asarray(self)) + + def __len__(self) -> int: + return len(self._left) + + @overload + def __getitem__(self, key: ScalarIndexer) -> IntervalOrNA: + ... + + @overload + def __getitem__(self, key: SequenceIndexer) -> Self: + ... + + def __getitem__(self, key: PositionalIndexer) -> Self | IntervalOrNA: + key = check_array_indexer(self, key) + left = self._left[key] + right = self._right[key] + + if not isinstance(left, (np.ndarray, ExtensionArray)): + # scalar + if is_scalar(left) and isna(left): + return self._fill_value + return Interval(left, right, self.closed) + if np.ndim(left) > 1: + # GH#30588 multi-dimensional indexer disallowed + raise ValueError("multi-dimensional indexing not allowed") + # Argument 2 to "_simple_new" of "IntervalArray" has incompatible type + # "Union[Period, Timestamp, Timedelta, NaTType, DatetimeArray, TimedeltaArray, + # ndarray[Any, Any]]"; expected "Union[Union[DatetimeArray, TimedeltaArray], + # ndarray[Any, Any]]" + return self._simple_new(left, right, dtype=self.dtype) # type: ignore[arg-type] + + def __setitem__(self, key, value) -> None: + value_left, value_right = self._validate_setitem_value(value) + key = check_array_indexer(self, key) + + self._left[key] = value_left + self._right[key] = value_right + + def _cmp_method(self, other, op): + # ensure pandas array for list-like and eliminate non-interval scalars + if is_list_like(other): + if len(self) != len(other): + raise ValueError("Lengths must match to compare") + other = pd_array(other) + elif not isinstance(other, Interval): + # non-interval scalar -> no matches + if other is NA: + # GH#31882 + from pandas.core.arrays import BooleanArray + + arr = np.empty(self.shape, dtype=bool) + mask = np.ones(self.shape, dtype=bool) + return BooleanArray(arr, mask) + return invalid_comparison(self, other, op) + + # determine the dtype of the elements we want to compare + if isinstance(other, Interval): + other_dtype = pandas_dtype("interval") + elif not isinstance(other.dtype, CategoricalDtype): + other_dtype = other.dtype + else: + # for categorical defer to categories for dtype + other_dtype = other.categories.dtype + + # extract intervals if we have interval categories with matching closed + if isinstance(other_dtype, IntervalDtype): + if self.closed != other.categories.closed: + return invalid_comparison(self, other, op) + + other = other.categories._values.take( + other.codes, allow_fill=True, fill_value=other.categories._na_value + ) + + # interval-like -> need same closed and matching endpoints + if isinstance(other_dtype, IntervalDtype): + if self.closed != other.closed: + return invalid_comparison(self, other, op) + elif not isinstance(other, Interval): + other = type(self)(other) + + if op is operator.eq: + return (self._left == other.left) & (self._right == other.right) + elif op is operator.ne: + return (self._left != other.left) | (self._right != other.right) + elif op is operator.gt: + return (self._left > other.left) | ( + (self._left == other.left) & (self._right > other.right) + ) + elif op is operator.ge: + return (self == other) | (self > other) + elif op is operator.lt: + return (self._left < other.left) | ( + (self._left == other.left) & (self._right < other.right) + ) + else: + # operator.lt + return (self == other) | (self < other) + + # non-interval/non-object dtype -> no matches + if not is_object_dtype(other_dtype): + return invalid_comparison(self, other, op) + + # object dtype -> iteratively check for intervals + result = np.zeros(len(self), dtype=bool) + for i, obj in enumerate(other): + try: + result[i] = op(self[i], obj) + except TypeError: + if obj is NA: + # comparison with np.nan returns NA + # github.com/pandas-dev/pandas/pull/37124#discussion_r509095092 + result = result.astype(object) + result[i] = NA + else: + raise + return result + + @unpack_zerodim_and_defer("__eq__") + def __eq__(self, other): + return self._cmp_method(other, operator.eq) + + @unpack_zerodim_and_defer("__ne__") + def __ne__(self, other): + return self._cmp_method(other, operator.ne) + + @unpack_zerodim_and_defer("__gt__") + def __gt__(self, other): + return self._cmp_method(other, operator.gt) + + @unpack_zerodim_and_defer("__ge__") + def __ge__(self, other): + return self._cmp_method(other, operator.ge) + + @unpack_zerodim_and_defer("__lt__") + def __lt__(self, other): + return self._cmp_method(other, operator.lt) + + @unpack_zerodim_and_defer("__le__") + def __le__(self, other): + return self._cmp_method(other, operator.le) + + def argsort( + self, + *, + ascending: bool = True, + kind: SortKind = "quicksort", + na_position: str = "last", + **kwargs, + ) -> np.ndarray: + ascending = nv.validate_argsort_with_ascending(ascending, (), kwargs) + + if ascending and kind == "quicksort" and na_position == "last": + # TODO: in an IntervalIndex we can reuse the cached + # IntervalTree.left_sorter + return np.lexsort((self.right, self.left)) + + # TODO: other cases we can use lexsort for? much more performant. + return super().argsort( + ascending=ascending, kind=kind, na_position=na_position, **kwargs + ) + + def min(self, *, axis: AxisInt | None = None, skipna: bool = True) -> IntervalOrNA: + nv.validate_minmax_axis(axis, self.ndim) + + if not len(self): + return self._na_value + + mask = self.isna() + if mask.any(): + if not skipna: + return self._na_value + obj = self[~mask] + else: + obj = self + + indexer = obj.argsort()[0] + return obj[indexer] + + def max(self, *, axis: AxisInt | None = None, skipna: bool = True) -> IntervalOrNA: + nv.validate_minmax_axis(axis, self.ndim) + + if not len(self): + return self._na_value + + mask = self.isna() + if mask.any(): + if not skipna: + return self._na_value + obj = self[~mask] + else: + obj = self + + indexer = obj.argsort()[-1] + return obj[indexer] + + def _pad_or_backfill( # pylint: disable=useless-parent-delegation + self, + *, + method: FillnaOptions, + limit: int | None = None, + limit_area: Literal["inside", "outside"] | None = None, + copy: bool = True, + ) -> Self: + # TODO(3.0): after EA.fillna 'method' deprecation is enforced, we can remove + # this method entirely. + return super()._pad_or_backfill( + method=method, limit=limit, limit_area=limit_area, copy=copy + ) + + def fillna( + self, value=None, method=None, limit: int | None = None, copy: bool = True + ) -> Self: + """ + Fill NA/NaN values using the specified method. + + Parameters + ---------- + value : scalar, dict, Series + If a scalar value is passed it is used to fill all missing values. + Alternatively, a Series or dict can be used to fill in different + values for each index. The value should not be a list. The + value(s) passed should be either Interval objects or NA/NaN. + method : {'backfill', 'bfill', 'pad', 'ffill', None}, default None + (Not implemented yet for IntervalArray) + Method to use for filling holes in reindexed Series + limit : int, default None + (Not implemented yet for IntervalArray) + If method is specified, this is the maximum number of consecutive + NaN values to forward/backward fill. In other words, if there is + a gap with more than this number of consecutive NaNs, it will only + be partially filled. If method is not specified, this is the + maximum number of entries along the entire axis where NaNs will be + filled. + copy : bool, default True + Whether to make a copy of the data before filling. If False, then + the original should be modified and no new memory should be allocated. + For ExtensionArray subclasses that cannot do this, it is at the + author's discretion whether to ignore "copy=False" or to raise. + + Returns + ------- + filled : IntervalArray with NA/NaN filled + """ + if copy is False: + raise NotImplementedError + if method is not None: + return super().fillna(value=value, method=method, limit=limit) + + value_left, value_right = self._validate_scalar(value) + + left = self.left.fillna(value=value_left) + right = self.right.fillna(value=value_right) + return self._shallow_copy(left, right) + + def astype(self, dtype, copy: bool = True): + """ + Cast to an ExtensionArray or NumPy array with dtype 'dtype'. + + Parameters + ---------- + dtype : str or dtype + Typecode or data-type to which the array is cast. + + copy : bool, default True + Whether to copy the data, even if not necessary. If False, + a copy is made only if the old dtype does not match the + new dtype. + + Returns + ------- + array : ExtensionArray or ndarray + ExtensionArray or NumPy ndarray with 'dtype' for its dtype. + """ + from pandas import Index + + if dtype is not None: + dtype = pandas_dtype(dtype) + + if isinstance(dtype, IntervalDtype): + if dtype == self.dtype: + return self.copy() if copy else self + + if is_float_dtype(self.dtype.subtype) and needs_i8_conversion( + dtype.subtype + ): + # This is allowed on the Index.astype but we disallow it here + msg = ( + f"Cannot convert {self.dtype} to {dtype}; subtypes are incompatible" + ) + raise TypeError(msg) + + # need to cast to different subtype + try: + # We need to use Index rules for astype to prevent casting + # np.nan entries to int subtypes + new_left = Index(self._left, copy=False).astype(dtype.subtype) + new_right = Index(self._right, copy=False).astype(dtype.subtype) + except IntCastingNaNError: + # e.g test_subtype_integer + raise + except (TypeError, ValueError) as err: + # e.g. test_subtype_integer_errors f8->u8 can be lossy + # and raises ValueError + msg = ( + f"Cannot convert {self.dtype} to {dtype}; subtypes are incompatible" + ) + raise TypeError(msg) from err + return self._shallow_copy(new_left, new_right) + else: + try: + return super().astype(dtype, copy=copy) + except (TypeError, ValueError) as err: + msg = f"Cannot cast {type(self).__name__} to dtype {dtype}" + raise TypeError(msg) from err + + def equals(self, other) -> bool: + if type(self) != type(other): + return False + + return bool( + self.closed == other.closed + and self.left.equals(other.left) + and self.right.equals(other.right) + ) + + @classmethod + def _concat_same_type(cls, to_concat: Sequence[IntervalArray]) -> Self: + """ + Concatenate multiple IntervalArray + + Parameters + ---------- + to_concat : sequence of IntervalArray + + Returns + ------- + IntervalArray + """ + closed_set = {interval.closed for interval in to_concat} + if len(closed_set) != 1: + raise ValueError("Intervals must all be closed on the same side.") + closed = closed_set.pop() + + left: IntervalSide = np.concatenate([interval.left for interval in to_concat]) + right: IntervalSide = np.concatenate([interval.right for interval in to_concat]) + + left, right, dtype = cls._ensure_simple_new_inputs(left, right, closed=closed) + + return cls._simple_new(left, right, dtype=dtype) + + def copy(self) -> Self: + """ + Return a copy of the array. + + Returns + ------- + IntervalArray + """ + left = self._left.copy() + right = self._right.copy() + dtype = self.dtype + return self._simple_new(left, right, dtype=dtype) + + def isna(self) -> np.ndarray: + return isna(self._left) + + def shift(self, periods: int = 1, fill_value: object = None) -> IntervalArray: + if not len(self) or periods == 0: + return self.copy() + + self._validate_scalar(fill_value) + + # ExtensionArray.shift doesn't work for two reasons + # 1. IntervalArray.dtype.na_value may not be correct for the dtype. + # 2. IntervalArray._from_sequence only accepts NaN for missing values, + # not other values like NaT + + empty_len = min(abs(periods), len(self)) + if isna(fill_value): + from pandas import Index + + fill_value = Index(self._left, copy=False)._na_value + empty = IntervalArray.from_breaks([fill_value] * (empty_len + 1)) + else: + empty = self._from_sequence([fill_value] * empty_len, dtype=self.dtype) + + if periods > 0: + a = empty + b = self[:-periods] + else: + a = self[abs(periods) :] + b = empty + return self._concat_same_type([a, b]) + + def take( + self, + indices, + *, + allow_fill: bool = False, + fill_value=None, + axis=None, + **kwargs, + ) -> Self: + """ + Take elements from the IntervalArray. + + Parameters + ---------- + indices : sequence of integers + Indices to be taken. + + allow_fill : bool, default False + How to handle negative values in `indices`. + + * False: negative values in `indices` indicate positional indices + from the right (the default). This is similar to + :func:`numpy.take`. + + * True: negative values in `indices` indicate + missing values. These values are set to `fill_value`. Any other + other negative values raise a ``ValueError``. + + fill_value : Interval or NA, optional + Fill value to use for NA-indices when `allow_fill` is True. + This may be ``None``, in which case the default NA value for + the type, ``self.dtype.na_value``, is used. + + For many ExtensionArrays, there will be two representations of + `fill_value`: a user-facing "boxed" scalar, and a low-level + physical NA value. `fill_value` should be the user-facing version, + and the implementation should handle translating that to the + physical version for processing the take if necessary. + + axis : any, default None + Present for compat with IntervalIndex; does nothing. + + Returns + ------- + IntervalArray + + Raises + ------ + IndexError + When the indices are out of bounds for the array. + ValueError + When `indices` contains negative values other than ``-1`` + and `allow_fill` is True. + """ + nv.validate_take((), kwargs) + + fill_left = fill_right = fill_value + if allow_fill: + fill_left, fill_right = self._validate_scalar(fill_value) + + left_take = take( + self._left, indices, allow_fill=allow_fill, fill_value=fill_left + ) + right_take = take( + self._right, indices, allow_fill=allow_fill, fill_value=fill_right + ) + + return self._shallow_copy(left_take, right_take) + + def _validate_listlike(self, value): + # list-like of intervals + try: + array = IntervalArray(value) + self._check_closed_matches(array, name="value") + value_left, value_right = array.left, array.right + except TypeError as err: + # wrong type: not interval or NA + msg = f"'value' should be an interval type, got {type(value)} instead." + raise TypeError(msg) from err + + try: + self.left._validate_fill_value(value_left) + except (LossySetitemError, TypeError) as err: + msg = ( + "'value' should be a compatible interval type, " + f"got {type(value)} instead." + ) + raise TypeError(msg) from err + + return value_left, value_right + + def _validate_scalar(self, value): + if isinstance(value, Interval): + self._check_closed_matches(value, name="value") + left, right = value.left, value.right + # TODO: check subdtype match like _validate_setitem_value? + elif is_valid_na_for_dtype(value, self.left.dtype): + # GH#18295 + left = right = self.left._na_value + else: + raise TypeError( + "can only insert Interval objects and NA into an IntervalArray" + ) + return left, right + + def _validate_setitem_value(self, value): + if is_valid_na_for_dtype(value, self.left.dtype): + # na value: need special casing to set directly on numpy arrays + value = self.left._na_value + if is_integer_dtype(self.dtype.subtype): + # can't set NaN on a numpy integer array + # GH#45484 TypeError, not ValueError, matches what we get with + # non-NA un-holdable value. + raise TypeError("Cannot set float NaN to integer-backed IntervalArray") + value_left, value_right = value, value + + elif isinstance(value, Interval): + # scalar interval + self._check_closed_matches(value, name="value") + value_left, value_right = value.left, value.right + self.left._validate_fill_value(value_left) + self.left._validate_fill_value(value_right) + + else: + return self._validate_listlike(value) + + return value_left, value_right + + def value_counts(self, dropna: bool = True) -> Series: + """ + Returns a Series containing counts of each interval. + + Parameters + ---------- + dropna : bool, default True + Don't include counts of NaN. + + Returns + ------- + counts : Series + + See Also + -------- + Series.value_counts + """ + # TODO: implement this is a non-naive way! + with warnings.catch_warnings(): + warnings.filterwarnings( + "ignore", + "The behavior of value_counts with object-dtype is deprecated", + category=FutureWarning, + ) + result = value_counts(np.asarray(self), dropna=dropna) + # Once the deprecation is enforced, we will need to do + # `result.index = result.index.astype(self.dtype)` + return result + + # --------------------------------------------------------------------- + # Rendering Methods + + def _formatter(self, boxed: bool = False): + # returning 'str' here causes us to render as e.g. "(0, 1]" instead of + # "Interval(0, 1, closed='right')" + return str + + # --------------------------------------------------------------------- + # Vectorized Interval Properties/Attributes + + @property + def left(self) -> Index: + """ + Return the left endpoints of each Interval in the IntervalArray as an Index. + + Examples + -------- + + >>> interv_arr = pd.arrays.IntervalArray([pd.Interval(0, 1), pd.Interval(2, 5)]) + >>> interv_arr + + [(0, 1], (2, 5]] + Length: 2, dtype: interval[int64, right] + >>> interv_arr.left + Index([0, 2], dtype='int64') + """ + from pandas import Index + + return Index(self._left, copy=False) + + @property + def right(self) -> Index: + """ + Return the right endpoints of each Interval in the IntervalArray as an Index. + + Examples + -------- + + >>> interv_arr = pd.arrays.IntervalArray([pd.Interval(0, 1), pd.Interval(2, 5)]) + >>> interv_arr + + [(0, 1], (2, 5]] + Length: 2, dtype: interval[int64, right] + >>> interv_arr.right + Index([1, 5], dtype='int64') + """ + from pandas import Index + + return Index(self._right, copy=False) + + @property + def length(self) -> Index: + """ + Return an Index with entries denoting the length of each Interval. + + Examples + -------- + + >>> interv_arr = pd.arrays.IntervalArray([pd.Interval(0, 1), pd.Interval(1, 5)]) + >>> interv_arr + + [(0, 1], (1, 5]] + Length: 2, dtype: interval[int64, right] + >>> interv_arr.length + Index([1, 4], dtype='int64') + """ + return self.right - self.left + + @property + def mid(self) -> Index: + """ + Return the midpoint of each Interval in the IntervalArray as an Index. + + Examples + -------- + + >>> interv_arr = pd.arrays.IntervalArray([pd.Interval(0, 1), pd.Interval(1, 5)]) + >>> interv_arr + + [(0, 1], (1, 5]] + Length: 2, dtype: interval[int64, right] + >>> interv_arr.mid + Index([0.5, 3.0], dtype='float64') + """ + try: + return 0.5 * (self.left + self.right) + except TypeError: + # datetime safe version + return self.left + 0.5 * self.length + + _interval_shared_docs["overlaps"] = textwrap.dedent( + """ + Check elementwise if an Interval overlaps the values in the %(klass)s. + + Two intervals overlap if they share a common point, including closed + endpoints. Intervals that only have an open endpoint in common do not + overlap. + + Parameters + ---------- + other : %(klass)s + Interval to check against for an overlap. + + Returns + ------- + ndarray + Boolean array positionally indicating where an overlap occurs. + + See Also + -------- + Interval.overlaps : Check whether two Interval objects overlap. + + Examples + -------- + %(examples)s + >>> intervals.overlaps(pd.Interval(0.5, 1.5)) + array([ True, True, False]) + + Intervals that share closed endpoints overlap: + + >>> intervals.overlaps(pd.Interval(1, 3, closed='left')) + array([ True, True, True]) + + Intervals that only have an open endpoint in common do not overlap: + + >>> intervals.overlaps(pd.Interval(1, 2, closed='right')) + array([False, True, False]) + """ + ) + + @Appender( + _interval_shared_docs["overlaps"] + % { + "klass": "IntervalArray", + "examples": textwrap.dedent( + """\ + >>> data = [(0, 1), (1, 3), (2, 4)] + >>> intervals = pd.arrays.IntervalArray.from_tuples(data) + >>> intervals + + [(0, 1], (1, 3], (2, 4]] + Length: 3, dtype: interval[int64, right] + """ + ), + } + ) + def overlaps(self, other): + if isinstance(other, (IntervalArray, ABCIntervalIndex)): + raise NotImplementedError + if not isinstance(other, Interval): + msg = f"`other` must be Interval-like, got {type(other).__name__}" + raise TypeError(msg) + + # equality is okay if both endpoints are closed (overlap at a point) + op1 = le if (self.closed_left and other.closed_right) else lt + op2 = le if (other.closed_left and self.closed_right) else lt + + # overlaps is equivalent negation of two interval being disjoint: + # disjoint = (A.left > B.right) or (B.left > A.right) + # (simplifying the negation allows this to be done in less operations) + return op1(self.left, other.right) & op2(other.left, self.right) + + # --------------------------------------------------------------------- + + @property + def closed(self) -> IntervalClosedType: + """ + String describing the inclusive side the intervals. + + Either ``left``, ``right``, ``both`` or ``neither``. + + Examples + -------- + + For arrays: + + >>> interv_arr = pd.arrays.IntervalArray([pd.Interval(0, 1), pd.Interval(1, 5)]) + >>> interv_arr + + [(0, 1], (1, 5]] + Length: 2, dtype: interval[int64, right] + >>> interv_arr.closed + 'right' + + For Interval Index: + + >>> interv_idx = pd.interval_range(start=0, end=2) + >>> interv_idx + IntervalIndex([(0, 1], (1, 2]], dtype='interval[int64, right]') + >>> interv_idx.closed + 'right' + """ + return self.dtype.closed + + _interval_shared_docs["set_closed"] = textwrap.dedent( + """ + Return an identical %(klass)s closed on the specified side. + + Parameters + ---------- + closed : {'left', 'right', 'both', 'neither'} + Whether the intervals are closed on the left-side, right-side, both + or neither. + + Returns + ------- + %(klass)s + + %(examples)s\ + """ + ) + + @Appender( + _interval_shared_docs["set_closed"] + % { + "klass": "IntervalArray", + "examples": textwrap.dedent( + """\ + Examples + -------- + >>> index = pd.arrays.IntervalArray.from_breaks(range(4)) + >>> index + + [(0, 1], (1, 2], (2, 3]] + Length: 3, dtype: interval[int64, right] + >>> index.set_closed('both') + + [[0, 1], [1, 2], [2, 3]] + Length: 3, dtype: interval[int64, both] + """ + ), + } + ) + def set_closed(self, closed: IntervalClosedType) -> Self: + if closed not in VALID_CLOSED: + msg = f"invalid option for 'closed': {closed}" + raise ValueError(msg) + + left, right = self._left, self._right + dtype = IntervalDtype(left.dtype, closed=closed) + return self._simple_new(left, right, dtype=dtype) + + _interval_shared_docs[ + "is_non_overlapping_monotonic" + ] = """ + Return a boolean whether the %(klass)s is non-overlapping and monotonic. + + Non-overlapping means (no Intervals share points), and monotonic means + either monotonic increasing or monotonic decreasing. + + Examples + -------- + For arrays: + + >>> interv_arr = pd.arrays.IntervalArray([pd.Interval(0, 1), pd.Interval(1, 5)]) + >>> interv_arr + + [(0, 1], (1, 5]] + Length: 2, dtype: interval[int64, right] + >>> interv_arr.is_non_overlapping_monotonic + True + + >>> interv_arr = pd.arrays.IntervalArray([pd.Interval(0, 1), + ... pd.Interval(-1, 0.1)]) + >>> interv_arr + + [(0.0, 1.0], (-1.0, 0.1]] + Length: 2, dtype: interval[float64, right] + >>> interv_arr.is_non_overlapping_monotonic + False + + For Interval Index: + + >>> interv_idx = pd.interval_range(start=0, end=2) + >>> interv_idx + IntervalIndex([(0, 1], (1, 2]], dtype='interval[int64, right]') + >>> interv_idx.is_non_overlapping_monotonic + True + + >>> interv_idx = pd.interval_range(start=0, end=2, closed='both') + >>> interv_idx + IntervalIndex([[0, 1], [1, 2]], dtype='interval[int64, both]') + >>> interv_idx.is_non_overlapping_monotonic + False + """ + + @property + @Appender( + _interval_shared_docs["is_non_overlapping_monotonic"] % _shared_docs_kwargs + ) + def is_non_overlapping_monotonic(self) -> bool: + # must be increasing (e.g., [0, 1), [1, 2), [2, 3), ... ) + # or decreasing (e.g., [-1, 0), [-2, -1), [-3, -2), ...) + # we already require left <= right + + # strict inequality for closed == 'both'; equality implies overlapping + # at a point when both sides of intervals are included + if self.closed == "both": + return bool( + (self._right[:-1] < self._left[1:]).all() + or (self._left[:-1] > self._right[1:]).all() + ) + + # non-strict inequality when closed != 'both'; at least one side is + # not included in the intervals, so equality does not imply overlapping + return bool( + (self._right[:-1] <= self._left[1:]).all() + or (self._left[:-1] >= self._right[1:]).all() + ) + + # --------------------------------------------------------------------- + # Conversion + + def __array__( + self, dtype: NpDtype | None = None, copy: bool | None = None + ) -> np.ndarray: + """ + Return the IntervalArray's data as a numpy array of Interval + objects (with dtype='object') + """ + left = self._left + right = self._right + mask = self.isna() + closed = self.closed + + result = np.empty(len(left), dtype=object) + for i, left_value in enumerate(left): + if mask[i]: + result[i] = np.nan + else: + result[i] = Interval(left_value, right[i], closed) + return result + + def __arrow_array__(self, type=None): + """ + Convert myself into a pyarrow Array. + """ + import pyarrow + + from pandas.core.arrays.arrow.extension_types import ArrowIntervalType + + try: + subtype = pyarrow.from_numpy_dtype(self.dtype.subtype) + except TypeError as err: + raise TypeError( + f"Conversion to arrow with subtype '{self.dtype.subtype}' " + "is not supported" + ) from err + interval_type = ArrowIntervalType(subtype, self.closed) + storage_array = pyarrow.StructArray.from_arrays( + [ + pyarrow.array(self._left, type=subtype, from_pandas=True), + pyarrow.array(self._right, type=subtype, from_pandas=True), + ], + names=["left", "right"], + ) + mask = self.isna() + if mask.any(): + # if there are missing values, set validity bitmap also on the array level + null_bitmap = pyarrow.array(~mask).buffers()[1] + storage_array = pyarrow.StructArray.from_buffers( + storage_array.type, + len(storage_array), + [null_bitmap], + children=[storage_array.field(0), storage_array.field(1)], + ) + + if type is not None: + if type.equals(interval_type.storage_type): + return storage_array + elif isinstance(type, ArrowIntervalType): + # ensure we have the same subtype and closed attributes + if not type.equals(interval_type): + raise TypeError( + "Not supported to convert IntervalArray to type with " + f"different 'subtype' ({self.dtype.subtype} vs {type.subtype}) " + f"and 'closed' ({self.closed} vs {type.closed}) attributes" + ) + else: + raise TypeError( + f"Not supported to convert IntervalArray to '{type}' type" + ) + + return pyarrow.ExtensionArray.from_storage(interval_type, storage_array) + + _interval_shared_docs["to_tuples"] = textwrap.dedent( + """ + Return an %(return_type)s of tuples of the form (left, right). + + Parameters + ---------- + na_tuple : bool, default True + If ``True``, return ``NA`` as a tuple ``(nan, nan)``. If ``False``, + just return ``NA`` as ``nan``. + + Returns + ------- + tuples: %(return_type)s + %(examples)s\ + """ + ) + + @Appender( + _interval_shared_docs["to_tuples"] + % { + "return_type": ( + "ndarray (if self is IntervalArray) or Index (if self is IntervalIndex)" + ), + "examples": textwrap.dedent( + """\ + + Examples + -------- + For :class:`pandas.IntervalArray`: + + >>> idx = pd.arrays.IntervalArray.from_tuples([(0, 1), (1, 2)]) + >>> idx + + [(0, 1], (1, 2]] + Length: 2, dtype: interval[int64, right] + >>> idx.to_tuples() + array([(0, 1), (1, 2)], dtype=object) + + For :class:`pandas.IntervalIndex`: + + >>> idx = pd.interval_range(start=0, end=2) + >>> idx + IntervalIndex([(0, 1], (1, 2]], dtype='interval[int64, right]') + >>> idx.to_tuples() + Index([(0, 1), (1, 2)], dtype='object') + """ + ), + } + ) + def to_tuples(self, na_tuple: bool = True) -> np.ndarray: + tuples = com.asarray_tuplesafe(zip(self._left, self._right)) + if not na_tuple: + # GH 18756 + tuples = np.where(~self.isna(), tuples, np.nan) + return tuples + + # --------------------------------------------------------------------- + + def _putmask(self, mask: npt.NDArray[np.bool_], value) -> None: + value_left, value_right = self._validate_setitem_value(value) + + if isinstance(self._left, np.ndarray): + np.putmask(self._left, mask, value_left) + assert isinstance(self._right, np.ndarray) + np.putmask(self._right, mask, value_right) + else: + self._left._putmask(mask, value_left) + assert not isinstance(self._right, np.ndarray) + self._right._putmask(mask, value_right) + + def insert(self, loc: int, item: Interval) -> Self: + """ + Return a new IntervalArray inserting new item at location. Follows + Python numpy.insert semantics for negative values. Only Interval + objects and NA can be inserted into an IntervalIndex + + Parameters + ---------- + loc : int + item : Interval + + Returns + ------- + IntervalArray + """ + left_insert, right_insert = self._validate_scalar(item) + + new_left = self.left.insert(loc, left_insert) + new_right = self.right.insert(loc, right_insert) + + return self._shallow_copy(new_left, new_right) + + def delete(self, loc) -> Self: + if isinstance(self._left, np.ndarray): + new_left = np.delete(self._left, loc) + assert isinstance(self._right, np.ndarray) + new_right = np.delete(self._right, loc) + else: + new_left = self._left.delete(loc) + assert not isinstance(self._right, np.ndarray) + new_right = self._right.delete(loc) + return self._shallow_copy(left=new_left, right=new_right) + + @Appender(_extension_array_shared_docs["repeat"] % _shared_docs_kwargs) + def repeat( + self, + repeats: int | Sequence[int], + axis: AxisInt | None = None, + ) -> Self: + nv.validate_repeat((), {"axis": axis}) + left_repeat = self.left.repeat(repeats) + right_repeat = self.right.repeat(repeats) + return self._shallow_copy(left=left_repeat, right=right_repeat) + + _interval_shared_docs["contains"] = textwrap.dedent( + """ + Check elementwise if the Intervals contain the value. + + Return a boolean mask whether the value is contained in the Intervals + of the %(klass)s. + + Parameters + ---------- + other : scalar + The value to check whether it is contained in the Intervals. + + Returns + ------- + boolean array + + See Also + -------- + Interval.contains : Check whether Interval object contains value. + %(klass)s.overlaps : Check if an Interval overlaps the values in the + %(klass)s. + + Examples + -------- + %(examples)s + >>> intervals.contains(0.5) + array([ True, False, False]) + """ + ) + + @Appender( + _interval_shared_docs["contains"] + % { + "klass": "IntervalArray", + "examples": textwrap.dedent( + """\ + >>> intervals = pd.arrays.IntervalArray.from_tuples([(0, 1), (1, 3), (2, 4)]) + >>> intervals + + [(0, 1], (1, 3], (2, 4]] + Length: 3, dtype: interval[int64, right] + """ + ), + } + ) + def contains(self, other): + if isinstance(other, Interval): + raise NotImplementedError("contains not implemented for two intervals") + + return (self._left < other if self.open_left else self._left <= other) & ( + other < self._right if self.open_right else other <= self._right + ) + + def isin(self, values: ArrayLike) -> npt.NDArray[np.bool_]: + if isinstance(values, IntervalArray): + if self.closed != values.closed: + # not comparable -> no overlap + return np.zeros(self.shape, dtype=bool) + + if self.dtype == values.dtype: + # GH#38353 instead of casting to object, operating on a + # complex128 ndarray is much more performant. + left = self._combined.view("complex128") + right = values._combined.view("complex128") + # error: Argument 1 to "isin" has incompatible type + # "Union[ExtensionArray, ndarray[Any, Any], + # ndarray[Any, dtype[Any]]]"; expected + # "Union[_SupportsArray[dtype[Any]], + # _NestedSequence[_SupportsArray[dtype[Any]]], bool, + # int, float, complex, str, bytes, _NestedSequence[ + # Union[bool, int, float, complex, str, bytes]]]" + return np.isin(left, right).ravel() # type: ignore[arg-type] + + elif needs_i8_conversion(self.left.dtype) ^ needs_i8_conversion( + values.left.dtype + ): + # not comparable -> no overlap + return np.zeros(self.shape, dtype=bool) + + return isin(self.astype(object), values.astype(object)) + + @property + def _combined(self) -> IntervalSide: + # error: Item "ExtensionArray" of "ExtensionArray | ndarray[Any, Any]" + # has no attribute "reshape" [union-attr] + left = self.left._values.reshape(-1, 1) # type: ignore[union-attr] + right = self.right._values.reshape(-1, 1) # type: ignore[union-attr] + if needs_i8_conversion(left.dtype): + # error: Item "ndarray[Any, Any]" of "Any | ndarray[Any, Any]" has + # no attribute "_concat_same_type" + comb = left._concat_same_type( # type: ignore[union-attr] + [left, right], axis=1 + ) + else: + comb = np.concatenate([left, right], axis=1) + return comb + + def _from_combined(self, combined: np.ndarray) -> IntervalArray: + """ + Create a new IntervalArray with our dtype from a 1D complex128 ndarray. + """ + nc = combined.view("i8").reshape(-1, 2) + + dtype = self._left.dtype + if needs_i8_conversion(dtype): + assert isinstance(self._left, (DatetimeArray, TimedeltaArray)) + new_left = type(self._left)._from_sequence(nc[:, 0], dtype=dtype) + assert isinstance(self._right, (DatetimeArray, TimedeltaArray)) + new_right = type(self._right)._from_sequence(nc[:, 1], dtype=dtype) + else: + assert isinstance(dtype, np.dtype) + new_left = nc[:, 0].view(dtype) + new_right = nc[:, 1].view(dtype) + return self._shallow_copy(left=new_left, right=new_right) + + def unique(self) -> IntervalArray: + # No overload variant of "__getitem__" of "ExtensionArray" matches argument + # type "Tuple[slice, int]" + nc = unique( + self._combined.view("complex128")[:, 0] # type: ignore[call-overload] + ) + nc = nc[:, None] + return self._from_combined(nc) + + +def _maybe_convert_platform_interval(values) -> ArrayLike: + """ + Try to do platform conversion, with special casing for IntervalArray. + Wrapper around maybe_convert_platform that alters the default return + dtype in certain cases to be compatible with IntervalArray. For example, + empty lists return with integer dtype instead of object dtype, which is + prohibited for IntervalArray. + + Parameters + ---------- + values : array-like + + Returns + ------- + array + """ + if isinstance(values, (list, tuple)) and len(values) == 0: + # GH 19016 + # empty lists/tuples get object dtype by default, but this is + # prohibited for IntervalArray, so coerce to integer instead + return np.array([], dtype=np.int64) + elif not is_list_like(values) or isinstance(values, ABCDataFrame): + # This will raise later, but we avoid passing to maybe_convert_platform + return values + elif isinstance(getattr(values, "dtype", None), CategoricalDtype): + values = np.asarray(values) + elif not hasattr(values, "dtype") and not isinstance(values, (list, tuple, range)): + # TODO: should we just cast these to list? + return values + else: + values = extract_array(values, extract_numpy=True) + + if not hasattr(values, "dtype"): + values = np.asarray(values) + if values.dtype.kind in "iu" and values.dtype != np.int64: + values = values.astype(np.int64) + return values diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/arrays/masked.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/arrays/masked.py new file mode 100644 index 0000000000000000000000000000000000000000..d7e816b9d37814e40019d94305a1732ebc8b691c --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/arrays/masked.py @@ -0,0 +1,1650 @@ +from __future__ import annotations + +from typing import ( + TYPE_CHECKING, + Any, + Callable, + Literal, + overload, +) +import warnings + +import numpy as np + +from pandas._libs import ( + lib, + missing as libmissing, +) +from pandas._libs.tslibs import is_supported_dtype +from pandas._typing import ( + ArrayLike, + AstypeArg, + AxisInt, + DtypeObj, + FillnaOptions, + InterpolateOptions, + NpDtype, + PositionalIndexer, + Scalar, + ScalarIndexer, + Self, + SequenceIndexer, + Shape, + npt, +) +from pandas.compat import ( + IS64, + is_platform_windows, +) +from pandas.errors import AbstractMethodError +from pandas.util._decorators import doc +from pandas.util._validators import validate_fillna_kwargs + +from pandas.core.dtypes.base import ExtensionDtype +from pandas.core.dtypes.common import ( + is_bool, + is_integer_dtype, + is_list_like, + is_scalar, + is_string_dtype, + pandas_dtype, +) +from pandas.core.dtypes.dtypes import BaseMaskedDtype +from pandas.core.dtypes.missing import ( + array_equivalent, + is_valid_na_for_dtype, + isna, + notna, +) + +from pandas.core import ( + algorithms as algos, + arraylike, + missing, + nanops, + ops, +) +from pandas.core.algorithms import ( + factorize_array, + isin, + map_array, + mode, + take, +) +from pandas.core.array_algos import ( + masked_accumulations, + masked_reductions, +) +from pandas.core.array_algos.quantile import quantile_with_mask +from pandas.core.arraylike import OpsMixin +from pandas.core.arrays._utils import to_numpy_dtype_inference +from pandas.core.arrays.base import ExtensionArray +from pandas.core.construction import ( + array as pd_array, + ensure_wrapped_if_datetimelike, + extract_array, +) +from pandas.core.indexers import check_array_indexer +from pandas.core.ops import invalid_comparison +from pandas.core.util.hashing import hash_array + +if TYPE_CHECKING: + from collections.abc import ( + Iterator, + Sequence, + ) + from pandas import Series + from pandas.core.arrays import BooleanArray + from pandas._typing import ( + NumpySorter, + NumpyValueArrayLike, + ) + from pandas.core.arrays import FloatingArray + +from pandas.compat.numpy import function as nv + + +class BaseMaskedArray(OpsMixin, ExtensionArray): + """ + Base class for masked arrays (which use _data and _mask to store the data). + + numpy based + """ + + # The value used to fill '_data' to avoid upcasting + _internal_fill_value: Scalar + # our underlying data and mask are each ndarrays + _data: np.ndarray + _mask: npt.NDArray[np.bool_] + + # Fill values used for any/all + _truthy_value = Scalar # bool(_truthy_value) = True + _falsey_value = Scalar # bool(_falsey_value) = False + + @classmethod + def _simple_new(cls, values: np.ndarray, mask: npt.NDArray[np.bool_]) -> Self: + result = BaseMaskedArray.__new__(cls) + result._data = values + result._mask = mask + return result + + def __init__( + self, values: np.ndarray, mask: npt.NDArray[np.bool_], copy: bool = False + ) -> None: + # values is supposed to already be validated in the subclass + if not (isinstance(mask, np.ndarray) and mask.dtype == np.bool_): + raise TypeError( + "mask should be boolean numpy array. Use " + "the 'pd.array' function instead" + ) + if values.shape != mask.shape: + raise ValueError("values.shape must match mask.shape") + + if copy: + values = values.copy() + mask = mask.copy() + + self._data = values + self._mask = mask + + @classmethod + def _from_sequence(cls, scalars, *, dtype=None, copy: bool = False) -> Self: + values, mask = cls._coerce_to_array(scalars, dtype=dtype, copy=copy) + return cls(values, mask) + + @classmethod + @doc(ExtensionArray._empty) + def _empty(cls, shape: Shape, dtype: ExtensionDtype): + values = np.empty(shape, dtype=dtype.type) + values.fill(cls._internal_fill_value) + mask = np.ones(shape, dtype=bool) + result = cls(values, mask) + if not isinstance(result, cls) or dtype != result.dtype: + raise NotImplementedError( + f"Default 'empty' implementation is invalid for dtype='{dtype}'" + ) + return result + + def _formatter(self, boxed: bool = False) -> Callable[[Any], str | None]: + # NEP 51: https://github.com/numpy/numpy/pull/22449 + return str + + @property + def dtype(self) -> BaseMaskedDtype: + raise AbstractMethodError(self) + + @overload + def __getitem__(self, item: ScalarIndexer) -> Any: + ... + + @overload + def __getitem__(self, item: SequenceIndexer) -> Self: + ... + + def __getitem__(self, item: PositionalIndexer) -> Self | Any: + item = check_array_indexer(self, item) + + newmask = self._mask[item] + if is_bool(newmask): + # This is a scalar indexing + if newmask: + return self.dtype.na_value + return self._data[item] + + return self._simple_new(self._data[item], newmask) + + def _pad_or_backfill( + self, + *, + method: FillnaOptions, + limit: int | None = None, + limit_area: Literal["inside", "outside"] | None = None, + copy: bool = True, + ) -> Self: + mask = self._mask + + if mask.any(): + func = missing.get_fill_func(method, ndim=self.ndim) + + npvalues = self._data.T + new_mask = mask.T + if copy: + npvalues = npvalues.copy() + new_mask = new_mask.copy() + elif limit_area is not None: + mask = mask.copy() + func(npvalues, limit=limit, mask=new_mask) + + if limit_area is not None and not mask.all(): + mask = mask.T + neg_mask = ~mask + first = neg_mask.argmax() + last = len(neg_mask) - neg_mask[::-1].argmax() - 1 + if limit_area == "inside": + new_mask[:first] |= mask[:first] + new_mask[last + 1 :] |= mask[last + 1 :] + elif limit_area == "outside": + new_mask[first + 1 : last] |= mask[first + 1 : last] + + if copy: + return self._simple_new(npvalues.T, new_mask.T) + else: + return self + else: + if copy: + new_values = self.copy() + else: + new_values = self + return new_values + + @doc(ExtensionArray.fillna) + def fillna( + self, value=None, method=None, limit: int | None = None, copy: bool = True + ) -> Self: + value, method = validate_fillna_kwargs(value, method) + + mask = self._mask + + value = missing.check_value_size(value, mask, len(self)) + + if mask.any(): + if method is not None: + func = missing.get_fill_func(method, ndim=self.ndim) + npvalues = self._data.T + new_mask = mask.T + if copy: + npvalues = npvalues.copy() + new_mask = new_mask.copy() + func(npvalues, limit=limit, mask=new_mask) + return self._simple_new(npvalues.T, new_mask.T) + else: + # fill with value + if copy: + new_values = self.copy() + else: + new_values = self[:] + new_values[mask] = value + else: + if copy: + new_values = self.copy() + else: + new_values = self[:] + return new_values + + @classmethod + def _coerce_to_array( + cls, values, *, dtype: DtypeObj, copy: bool = False + ) -> tuple[np.ndarray, np.ndarray]: + raise AbstractMethodError(cls) + + def _validate_setitem_value(self, value): + """ + Check if we have a scalar that we can cast losslessly. + + Raises + ------ + TypeError + """ + kind = self.dtype.kind + # TODO: get this all from np_can_hold_element? + if kind == "b": + if lib.is_bool(value): + return value + + elif kind == "f": + if lib.is_integer(value) or lib.is_float(value): + return value + + else: + if lib.is_integer(value) or (lib.is_float(value) and value.is_integer()): + return value + # TODO: unsigned checks + + # Note: without the "str" here, the f-string rendering raises in + # py38 builds. + raise TypeError(f"Invalid value '{str(value)}' for dtype {self.dtype}") + + def __setitem__(self, key, value) -> None: + key = check_array_indexer(self, key) + + if is_scalar(value): + if is_valid_na_for_dtype(value, self.dtype): + self._mask[key] = True + else: + value = self._validate_setitem_value(value) + self._data[key] = value + self._mask[key] = False + return + + value, mask = self._coerce_to_array(value, dtype=self.dtype) + + self._data[key] = value + self._mask[key] = mask + + def __contains__(self, key) -> bool: + if isna(key) and key is not self.dtype.na_value: + # GH#52840 + if self._data.dtype.kind == "f" and lib.is_float(key): + return bool((np.isnan(self._data) & ~self._mask).any()) + + return bool(super().__contains__(key)) + + def __iter__(self) -> Iterator: + if self.ndim == 1: + if not self._hasna: + for val in self._data: + yield val + else: + na_value = self.dtype.na_value + for isna_, val in zip(self._mask, self._data): + if isna_: + yield na_value + else: + yield val + else: + for i in range(len(self)): + yield self[i] + + def __len__(self) -> int: + return len(self._data) + + @property + def shape(self) -> Shape: + return self._data.shape + + @property + def ndim(self) -> int: + return self._data.ndim + + def swapaxes(self, axis1, axis2) -> Self: + data = self._data.swapaxes(axis1, axis2) + mask = self._mask.swapaxes(axis1, axis2) + return self._simple_new(data, mask) + + def delete(self, loc, axis: AxisInt = 0) -> Self: + data = np.delete(self._data, loc, axis=axis) + mask = np.delete(self._mask, loc, axis=axis) + return self._simple_new(data, mask) + + def reshape(self, *args, **kwargs) -> Self: + data = self._data.reshape(*args, **kwargs) + mask = self._mask.reshape(*args, **kwargs) + return self._simple_new(data, mask) + + def ravel(self, *args, **kwargs) -> Self: + # TODO: need to make sure we have the same order for data/mask + data = self._data.ravel(*args, **kwargs) + mask = self._mask.ravel(*args, **kwargs) + return type(self)(data, mask) + + @property + def T(self) -> Self: + return self._simple_new(self._data.T, self._mask.T) + + def round(self, decimals: int = 0, *args, **kwargs): + """ + Round each value in the array a to the given number of decimals. + + Parameters + ---------- + decimals : int, default 0 + Number of decimal places to round to. If decimals is negative, + it specifies the number of positions to the left of the decimal point. + *args, **kwargs + Additional arguments and keywords have no effect but might be + accepted for compatibility with NumPy. + + Returns + ------- + NumericArray + Rounded values of the NumericArray. + + See Also + -------- + numpy.around : Round values of an np.array. + DataFrame.round : Round values of a DataFrame. + Series.round : Round values of a Series. + """ + if self.dtype.kind == "b": + return self + nv.validate_round(args, kwargs) + values = np.round(self._data, decimals=decimals, **kwargs) + + # Usually we'll get same type as self, but ndarray[bool] casts to float + return self._maybe_mask_result(values, self._mask.copy()) + + # ------------------------------------------------------------------ + # Unary Methods + + def __invert__(self) -> Self: + return self._simple_new(~self._data, self._mask.copy()) + + def __neg__(self) -> Self: + return self._simple_new(-self._data, self._mask.copy()) + + def __pos__(self) -> Self: + return self.copy() + + def __abs__(self) -> Self: + return self._simple_new(abs(self._data), self._mask.copy()) + + # ------------------------------------------------------------------ + + def _values_for_json(self) -> np.ndarray: + return np.asarray(self, dtype=object) + + def to_numpy( + self, + dtype: npt.DTypeLike | None = None, + copy: bool = False, + na_value: object = lib.no_default, + ) -> np.ndarray: + """ + Convert to a NumPy Array. + + By default converts to an object-dtype NumPy array. Specify the `dtype` and + `na_value` keywords to customize the conversion. + + Parameters + ---------- + dtype : dtype, default object + The numpy dtype to convert to. + copy : bool, default False + Whether to ensure that the returned value is a not a view on + the array. Note that ``copy=False`` does not *ensure* that + ``to_numpy()`` is no-copy. Rather, ``copy=True`` ensure that + a copy is made, even if not strictly necessary. This is typically + only possible when no missing values are present and `dtype` + is the equivalent numpy dtype. + na_value : scalar, optional + Scalar missing value indicator to use in numpy array. Defaults + to the native missing value indicator of this array (pd.NA). + + Returns + ------- + numpy.ndarray + + Examples + -------- + An object-dtype is the default result + + >>> a = pd.array([True, False, pd.NA], dtype="boolean") + >>> a.to_numpy() + array([True, False, ], dtype=object) + + When no missing values are present, an equivalent dtype can be used. + + >>> pd.array([True, False], dtype="boolean").to_numpy(dtype="bool") + array([ True, False]) + >>> pd.array([1, 2], dtype="Int64").to_numpy("int64") + array([1, 2]) + + However, requesting such dtype will raise a ValueError if + missing values are present and the default missing value :attr:`NA` + is used. + + >>> a = pd.array([True, False, pd.NA], dtype="boolean") + >>> a + + [True, False, ] + Length: 3, dtype: boolean + + >>> a.to_numpy(dtype="bool") + Traceback (most recent call last): + ... + ValueError: cannot convert to bool numpy array in presence of missing values + + Specify a valid `na_value` instead + + >>> a.to_numpy(dtype="bool", na_value=False) + array([ True, False, False]) + """ + hasna = self._hasna + dtype, na_value = to_numpy_dtype_inference(self, dtype, na_value, hasna) + if dtype is None: + dtype = object + + if hasna: + if ( + dtype != object + and not is_string_dtype(dtype) + and na_value is libmissing.NA + ): + raise ValueError( + f"cannot convert to '{dtype}'-dtype NumPy array " + "with missing values. Specify an appropriate 'na_value' " + "for this dtype." + ) + # don't pass copy to astype -> always need a copy since we are mutating + with warnings.catch_warnings(): + warnings.filterwarnings("ignore", category=RuntimeWarning) + data = self._data.astype(dtype) + data[self._mask] = na_value + else: + with warnings.catch_warnings(): + warnings.filterwarnings("ignore", category=RuntimeWarning) + data = self._data.astype(dtype, copy=copy) + return data + + @doc(ExtensionArray.tolist) + def tolist(self): + if self.ndim > 1: + return [x.tolist() for x in self] + dtype = None if self._hasna else self._data.dtype + return self.to_numpy(dtype=dtype, na_value=libmissing.NA).tolist() + + @overload + def astype(self, dtype: npt.DTypeLike, copy: bool = ...) -> np.ndarray: + ... + + @overload + def astype(self, dtype: ExtensionDtype, copy: bool = ...) -> ExtensionArray: + ... + + @overload + def astype(self, dtype: AstypeArg, copy: bool = ...) -> ArrayLike: + ... + + def astype(self, dtype: AstypeArg, copy: bool = True) -> ArrayLike: + dtype = pandas_dtype(dtype) + + if dtype == self.dtype: + if copy: + return self.copy() + return self + + # if we are astyping to another nullable masked dtype, we can fastpath + if isinstance(dtype, BaseMaskedDtype): + # TODO deal with NaNs for FloatingArray case + with warnings.catch_warnings(): + warnings.filterwarnings("ignore", category=RuntimeWarning) + # TODO: Is rounding what we want long term? + data = self._data.astype(dtype.numpy_dtype, copy=copy) + # mask is copied depending on whether the data was copied, and + # not directly depending on the `copy` keyword + mask = self._mask if data is self._data else self._mask.copy() + cls = dtype.construct_array_type() + return cls(data, mask, copy=False) + + if isinstance(dtype, ExtensionDtype): + eacls = dtype.construct_array_type() + return eacls._from_sequence(self, dtype=dtype, copy=copy) + + na_value: float | np.datetime64 | lib.NoDefault + + # coerce + if dtype.kind == "f": + # In astype, we consider dtype=float to also mean na_value=np.nan + na_value = np.nan + elif dtype.kind == "M": + na_value = np.datetime64("NaT") + else: + na_value = lib.no_default + + # to_numpy will also raise, but we get somewhat nicer exception messages here + if dtype.kind in "iu" and self._hasna: + raise ValueError("cannot convert NA to integer") + if dtype.kind == "b" and self._hasna: + # careful: astype_nansafe converts np.nan to True + raise ValueError("cannot convert float NaN to bool") + + data = self.to_numpy(dtype=dtype, na_value=na_value, copy=copy) + return data + + __array_priority__ = 1000 # higher than ndarray so ops dispatch to us + + def __array__( + self, dtype: NpDtype | None = None, copy: bool | None = None + ) -> np.ndarray: + """ + the array interface, return my values + We return an object array here to preserve our scalar values + """ + return self.to_numpy(dtype=dtype) + + _HANDLED_TYPES: tuple[type, ...] + + def __array_ufunc__(self, ufunc: np.ufunc, method: str, *inputs, **kwargs): + # For MaskedArray inputs, we apply the ufunc to ._data + # and mask the result. + + out = kwargs.get("out", ()) + + for x in inputs + out: + if not isinstance(x, self._HANDLED_TYPES + (BaseMaskedArray,)): + return NotImplemented + + # for binary ops, use our custom dunder methods + result = arraylike.maybe_dispatch_ufunc_to_dunder_op( + self, ufunc, method, *inputs, **kwargs + ) + if result is not NotImplemented: + return result + + if "out" in kwargs: + # e.g. test_ufunc_with_out + return arraylike.dispatch_ufunc_with_out( + self, ufunc, method, *inputs, **kwargs + ) + + if method == "reduce": + result = arraylike.dispatch_reduction_ufunc( + self, ufunc, method, *inputs, **kwargs + ) + if result is not NotImplemented: + return result + + mask = np.zeros(len(self), dtype=bool) + inputs2 = [] + for x in inputs: + if isinstance(x, BaseMaskedArray): + mask |= x._mask + inputs2.append(x._data) + else: + inputs2.append(x) + + def reconstruct(x: np.ndarray): + # we don't worry about scalar `x` here, since we + # raise for reduce up above. + from pandas.core.arrays import ( + BooleanArray, + FloatingArray, + IntegerArray, + ) + + if x.dtype.kind == "b": + m = mask.copy() + return BooleanArray(x, m) + elif x.dtype.kind in "iu": + m = mask.copy() + return IntegerArray(x, m) + elif x.dtype.kind == "f": + m = mask.copy() + if x.dtype == np.float16: + # reached in e.g. np.sqrt on BooleanArray + # we don't support float16 + x = x.astype(np.float32) + return FloatingArray(x, m) + else: + x[mask] = np.nan + return x + + result = getattr(ufunc, method)(*inputs2, **kwargs) + if ufunc.nout > 1: + # e.g. np.divmod + return tuple(reconstruct(x) for x in result) + elif method == "reduce": + # e.g. np.add.reduce; test_ufunc_reduce_raises + if self._mask.any(): + return self._na_value + return result + else: + return reconstruct(result) + + def __arrow_array__(self, type=None): + """ + Convert myself into a pyarrow Array. + """ + import pyarrow as pa + + return pa.array(self._data, mask=self._mask, type=type) + + @property + def _hasna(self) -> bool: + # Note: this is expensive right now! The hope is that we can + # make this faster by having an optional mask, but not have to change + # source code using it.. + + # error: Incompatible return value type (got "bool_", expected "bool") + return self._mask.any() # type: ignore[return-value] + + def _propagate_mask( + self, mask: npt.NDArray[np.bool_] | None, other + ) -> npt.NDArray[np.bool_]: + if mask is None: + mask = self._mask.copy() # TODO: need test for BooleanArray needing a copy + if other is libmissing.NA: + # GH#45421 don't alter inplace + mask = mask | True + elif is_list_like(other) and len(other) == len(mask): + mask = mask | isna(other) + else: + mask = self._mask | mask + # Incompatible return value type (got "Optional[ndarray[Any, dtype[bool_]]]", + # expected "ndarray[Any, dtype[bool_]]") + return mask # type: ignore[return-value] + + def _arith_method(self, other, op): + op_name = op.__name__ + omask = None + + if ( + not hasattr(other, "dtype") + and is_list_like(other) + and len(other) == len(self) + ): + # Try inferring masked dtype instead of casting to object + other = pd_array(other) + other = extract_array(other, extract_numpy=True) + + if isinstance(other, BaseMaskedArray): + other, omask = other._data, other._mask + + elif is_list_like(other): + if not isinstance(other, ExtensionArray): + other = np.asarray(other) + if other.ndim > 1: + raise NotImplementedError("can only perform ops with 1-d structures") + + # We wrap the non-masked arithmetic logic used for numpy dtypes + # in Series/Index arithmetic ops. + other = ops.maybe_prepare_scalar_for_op(other, (len(self),)) + pd_op = ops.get_array_op(op) + other = ensure_wrapped_if_datetimelike(other) + + if op_name in {"pow", "rpow"} and isinstance(other, np.bool_): + # Avoid DeprecationWarning: In future, it will be an error + # for 'np.bool_' scalars to be interpreted as an index + # e.g. test_array_scalar_like_equivalence + other = bool(other) + + mask = self._propagate_mask(omask, other) + + if other is libmissing.NA: + result = np.ones_like(self._data) + if self.dtype.kind == "b": + if op_name in { + "floordiv", + "rfloordiv", + "pow", + "rpow", + "truediv", + "rtruediv", + }: + # GH#41165 Try to match non-masked Series behavior + # This is still imperfect GH#46043 + raise NotImplementedError( + f"operator '{op_name}' not implemented for bool dtypes" + ) + if op_name in {"mod", "rmod"}: + dtype = "int8" + else: + dtype = "bool" + result = result.astype(dtype) + elif "truediv" in op_name and self.dtype.kind != "f": + # The actual data here doesn't matter since the mask + # will be all-True, but since this is division, we want + # to end up with floating dtype. + result = result.astype(np.float64) + else: + # Make sure we do this before the "pow" mask checks + # to get an expected exception message on shape mismatch. + if self.dtype.kind in "iu" and op_name in ["floordiv", "mod"]: + # TODO(GH#30188) ATM we don't match the behavior of non-masked + # types with respect to floordiv-by-zero + pd_op = op + + with np.errstate(all="ignore"): + result = pd_op(self._data, other) + + if op_name == "pow": + # 1 ** x is 1. + mask = np.where((self._data == 1) & ~self._mask, False, mask) + # x ** 0 is 1. + if omask is not None: + mask = np.where((other == 0) & ~omask, False, mask) + elif other is not libmissing.NA: + mask = np.where(other == 0, False, mask) + + elif op_name == "rpow": + # 1 ** x is 1. + if omask is not None: + mask = np.where((other == 1) & ~omask, False, mask) + elif other is not libmissing.NA: + mask = np.where(other == 1, False, mask) + # x ** 0 is 1. + mask = np.where((self._data == 0) & ~self._mask, False, mask) + + return self._maybe_mask_result(result, mask) + + _logical_method = _arith_method + + def _cmp_method(self, other, op) -> BooleanArray: + from pandas.core.arrays import BooleanArray + + mask = None + + if isinstance(other, BaseMaskedArray): + other, mask = other._data, other._mask + + elif is_list_like(other): + other = np.asarray(other) + if other.ndim > 1: + raise NotImplementedError("can only perform ops with 1-d structures") + if len(self) != len(other): + raise ValueError("Lengths must match to compare") + + if other is libmissing.NA: + # numpy does not handle pd.NA well as "other" scalar (it returns + # a scalar False instead of an array) + # This may be fixed by NA.__array_ufunc__. Revisit this check + # once that's implemented. + result = np.zeros(self._data.shape, dtype="bool") + mask = np.ones(self._data.shape, dtype="bool") + else: + with warnings.catch_warnings(): + # numpy may show a FutureWarning or DeprecationWarning: + # elementwise comparison failed; returning scalar instead, + # but in the future will perform elementwise comparison + # before returning NotImplemented. We fall back to the correct + # behavior today, so that should be fine to ignore. + warnings.filterwarnings("ignore", "elementwise", FutureWarning) + warnings.filterwarnings("ignore", "elementwise", DeprecationWarning) + method = getattr(self._data, f"__{op.__name__}__") + result = method(other) + + if result is NotImplemented: + result = invalid_comparison(self._data, other, op) + + mask = self._propagate_mask(mask, other) + return BooleanArray(result, mask, copy=False) + + def _maybe_mask_result( + self, result: np.ndarray | tuple[np.ndarray, np.ndarray], mask: np.ndarray + ): + """ + Parameters + ---------- + result : array-like or tuple[array-like] + mask : array-like bool + """ + if isinstance(result, tuple): + # i.e. divmod + div, mod = result + return ( + self._maybe_mask_result(div, mask), + self._maybe_mask_result(mod, mask), + ) + + if result.dtype.kind == "f": + from pandas.core.arrays import FloatingArray + + return FloatingArray(result, mask, copy=False) + + elif result.dtype.kind == "b": + from pandas.core.arrays import BooleanArray + + return BooleanArray(result, mask, copy=False) + + elif lib.is_np_dtype(result.dtype, "m") and is_supported_dtype(result.dtype): + # e.g. test_numeric_arr_mul_tdscalar_numexpr_path + from pandas.core.arrays import TimedeltaArray + + result[mask] = result.dtype.type("NaT") + + if not isinstance(result, TimedeltaArray): + return TimedeltaArray._simple_new(result, dtype=result.dtype) + + return result + + elif result.dtype.kind in "iu": + from pandas.core.arrays import IntegerArray + + return IntegerArray(result, mask, copy=False) + + else: + result[mask] = np.nan + return result + + def isna(self) -> np.ndarray: + return self._mask.copy() + + @property + def _na_value(self): + return self.dtype.na_value + + @property + def nbytes(self) -> int: + return self._data.nbytes + self._mask.nbytes + + @classmethod + def _concat_same_type( + cls, + to_concat: Sequence[Self], + axis: AxisInt = 0, + ) -> Self: + data = np.concatenate([x._data for x in to_concat], axis=axis) + mask = np.concatenate([x._mask for x in to_concat], axis=axis) + return cls(data, mask) + + def _hash_pandas_object( + self, *, encoding: str, hash_key: str, categorize: bool + ) -> npt.NDArray[np.uint64]: + hashed_array = hash_array( + self._data, encoding=encoding, hash_key=hash_key, categorize=categorize + ) + hashed_array[self.isna()] = hash(self.dtype.na_value) + return hashed_array + + def take( + self, + indexer, + *, + allow_fill: bool = False, + fill_value: Scalar | None = None, + axis: AxisInt = 0, + ) -> Self: + # we always fill with 1 internally + # to avoid upcasting + data_fill_value = self._internal_fill_value if isna(fill_value) else fill_value + result = take( + self._data, + indexer, + fill_value=data_fill_value, + allow_fill=allow_fill, + axis=axis, + ) + + mask = take( + self._mask, indexer, fill_value=True, allow_fill=allow_fill, axis=axis + ) + + # if we are filling + # we only fill where the indexer is null + # not existing missing values + # TODO(jreback) what if we have a non-na float as a fill value? + if allow_fill and notna(fill_value): + fill_mask = np.asarray(indexer) == -1 + result[fill_mask] = fill_value + mask = mask ^ fill_mask + + return self._simple_new(result, mask) + + # error: Return type "BooleanArray" of "isin" incompatible with return type + # "ndarray" in supertype "ExtensionArray" + def isin(self, values: ArrayLike) -> BooleanArray: # type: ignore[override] + from pandas.core.arrays import BooleanArray + + # algorithms.isin will eventually convert values to an ndarray, so no extra + # cost to doing it here first + values_arr = np.asarray(values) + result = isin(self._data, values_arr) + + if self._hasna: + values_have_NA = values_arr.dtype == object and any( + val is self.dtype.na_value for val in values_arr + ) + + # For now, NA does not propagate so set result according to presence of NA, + # see https://github.com/pandas-dev/pandas/pull/38379 for some discussion + result[self._mask] = values_have_NA + + mask = np.zeros(self._data.shape, dtype=bool) + return BooleanArray(result, mask, copy=False) + + def copy(self) -> Self: + data = self._data.copy() + mask = self._mask.copy() + return self._simple_new(data, mask) + + @doc(ExtensionArray.duplicated) + def duplicated( + self, keep: Literal["first", "last", False] = "first" + ) -> npt.NDArray[np.bool_]: + values = self._data + mask = self._mask + return algos.duplicated(values, keep=keep, mask=mask) + + def unique(self) -> Self: + """ + Compute the BaseMaskedArray of unique values. + + Returns + ------- + uniques : BaseMaskedArray + """ + uniques, mask = algos.unique_with_mask(self._data, self._mask) + return self._simple_new(uniques, mask) + + @doc(ExtensionArray.searchsorted) + def searchsorted( + self, + value: NumpyValueArrayLike | ExtensionArray, + side: Literal["left", "right"] = "left", + sorter: NumpySorter | None = None, + ) -> npt.NDArray[np.intp] | np.intp: + if self._hasna: + raise ValueError( + "searchsorted requires array to be sorted, which is impossible " + "with NAs present." + ) + if isinstance(value, ExtensionArray): + value = value.astype(object) + # Base class searchsorted would cast to object, which is *much* slower. + return self._data.searchsorted(value, side=side, sorter=sorter) + + @doc(ExtensionArray.factorize) + def factorize( + self, + use_na_sentinel: bool = True, + ) -> tuple[np.ndarray, ExtensionArray]: + arr = self._data + mask = self._mask + + # Use a sentinel for na; recode and add NA to uniques if necessary below + codes, uniques = factorize_array(arr, use_na_sentinel=True, mask=mask) + + # check that factorize_array correctly preserves dtype. + assert uniques.dtype == self.dtype.numpy_dtype, (uniques.dtype, self.dtype) + + has_na = mask.any() + if use_na_sentinel or not has_na: + size = len(uniques) + else: + # Make room for an NA value + size = len(uniques) + 1 + uniques_mask = np.zeros(size, dtype=bool) + if not use_na_sentinel and has_na: + na_index = mask.argmax() + # Insert na with the proper code + if na_index == 0: + na_code = np.intp(0) + else: + na_code = codes[:na_index].max() + 1 + codes[codes >= na_code] += 1 + codes[codes == -1] = na_code + # dummy value for uniques; not used since uniques_mask will be True + uniques = np.insert(uniques, na_code, 0) + uniques_mask[na_code] = True + uniques_ea = self._simple_new(uniques, uniques_mask) + + return codes, uniques_ea + + @doc(ExtensionArray._values_for_argsort) + def _values_for_argsort(self) -> np.ndarray: + return self._data + + def value_counts(self, dropna: bool = True) -> Series: + """ + Returns a Series containing counts of each unique value. + + Parameters + ---------- + dropna : bool, default True + Don't include counts of missing values. + + Returns + ------- + counts : Series + + See Also + -------- + Series.value_counts + """ + from pandas import ( + Index, + Series, + ) + from pandas.arrays import IntegerArray + + keys, value_counts, na_counter = algos.value_counts_arraylike( + self._data, dropna=dropna, mask=self._mask + ) + mask_index = np.zeros((len(value_counts),), dtype=np.bool_) + mask = mask_index.copy() + + if na_counter > 0: + mask_index[-1] = True + + arr = IntegerArray(value_counts, mask) + index = Index( + self.dtype.construct_array_type()( + keys, mask_index # type: ignore[arg-type] + ) + ) + return Series(arr, index=index, name="count", copy=False) + + def _mode(self, dropna: bool = True) -> Self: + if dropna: + result = mode(self._data, dropna=dropna, mask=self._mask) + res_mask = np.zeros(result.shape, dtype=np.bool_) + else: + result, res_mask = mode(self._data, dropna=dropna, mask=self._mask) + result = type(self)(result, res_mask) # type: ignore[arg-type] + return result[result.argsort()] + + @doc(ExtensionArray.equals) + def equals(self, other) -> bool: + if type(self) != type(other): + return False + if other.dtype != self.dtype: + return False + + # GH#44382 if e.g. self[1] is np.nan and other[1] is pd.NA, we are NOT + # equal. + if not np.array_equal(self._mask, other._mask): + return False + + left = self._data[~self._mask] + right = other._data[~other._mask] + return array_equivalent(left, right, strict_nan=True, dtype_equal=True) + + def _quantile( + self, qs: npt.NDArray[np.float64], interpolation: str + ) -> BaseMaskedArray: + """ + Dispatch to quantile_with_mask, needed because we do not have + _from_factorized. + + Notes + ----- + We assume that all impacted cases are 1D-only. + """ + res = quantile_with_mask( + self._data, + mask=self._mask, + # TODO(GH#40932): na_value_for_dtype(self.dtype.numpy_dtype) + # instead of np.nan + fill_value=np.nan, + qs=qs, + interpolation=interpolation, + ) + + if self._hasna: + # Our result mask is all-False unless we are all-NA, in which + # case it is all-True. + if self.ndim == 2: + # I think this should be out_mask=self.isna().all(axis=1) + # but am holding off until we have tests + raise NotImplementedError + if self.isna().all(): + out_mask = np.ones(res.shape, dtype=bool) + + if is_integer_dtype(self.dtype): + # We try to maintain int dtype if possible for not all-na case + # as well + res = np.zeros(res.shape, dtype=self.dtype.numpy_dtype) + else: + out_mask = np.zeros(res.shape, dtype=bool) + else: + out_mask = np.zeros(res.shape, dtype=bool) + return self._maybe_mask_result(res, mask=out_mask) + + # ------------------------------------------------------------------ + # Reductions + + def _reduce( + self, name: str, *, skipna: bool = True, keepdims: bool = False, **kwargs + ): + if name in {"any", "all", "min", "max", "sum", "prod", "mean", "var", "std"}: + result = getattr(self, name)(skipna=skipna, **kwargs) + else: + # median, skew, kurt, sem + data = self._data + mask = self._mask + op = getattr(nanops, f"nan{name}") + axis = kwargs.pop("axis", None) + result = op(data, axis=axis, skipna=skipna, mask=mask, **kwargs) + + if keepdims: + if isna(result): + return self._wrap_na_result(name=name, axis=0, mask_size=(1,)) + else: + result = result.reshape(1) + mask = np.zeros(1, dtype=bool) + return self._maybe_mask_result(result, mask) + + if isna(result): + return libmissing.NA + else: + return result + + def _wrap_reduction_result(self, name: str, result, *, skipna, axis): + if isinstance(result, np.ndarray): + if skipna: + # we only retain mask for all-NA rows/columns + mask = self._mask.all(axis=axis) + else: + mask = self._mask.any(axis=axis) + + return self._maybe_mask_result(result, mask) + return result + + def _wrap_na_result(self, *, name, axis, mask_size): + mask = np.ones(mask_size, dtype=bool) + + float_dtyp = "float32" if self.dtype == "Float32" else "float64" + if name in ["mean", "median", "var", "std", "skew", "kurt"]: + np_dtype = float_dtyp + elif name in ["min", "max"] or self.dtype.itemsize == 8: + np_dtype = self.dtype.numpy_dtype.name + else: + is_windows_or_32bit = is_platform_windows() or not IS64 + int_dtyp = "int32" if is_windows_or_32bit else "int64" + uint_dtyp = "uint32" if is_windows_or_32bit else "uint64" + np_dtype = {"b": int_dtyp, "i": int_dtyp, "u": uint_dtyp, "f": float_dtyp}[ + self.dtype.kind + ] + + value = np.array([1], dtype=np_dtype) + return self._maybe_mask_result(value, mask=mask) + + def _wrap_min_count_reduction_result( + self, name: str, result, *, skipna, min_count, axis + ): + if min_count == 0 and isinstance(result, np.ndarray): + return self._maybe_mask_result(result, np.zeros(result.shape, dtype=bool)) + return self._wrap_reduction_result(name, result, skipna=skipna, axis=axis) + + def sum( + self, + *, + skipna: bool = True, + min_count: int = 0, + axis: AxisInt | None = 0, + **kwargs, + ): + nv.validate_sum((), kwargs) + + result = masked_reductions.sum( + self._data, + self._mask, + skipna=skipna, + min_count=min_count, + axis=axis, + ) + return self._wrap_min_count_reduction_result( + "sum", result, skipna=skipna, min_count=min_count, axis=axis + ) + + def prod( + self, + *, + skipna: bool = True, + min_count: int = 0, + axis: AxisInt | None = 0, + **kwargs, + ): + nv.validate_prod((), kwargs) + + result = masked_reductions.prod( + self._data, + self._mask, + skipna=skipna, + min_count=min_count, + axis=axis, + ) + return self._wrap_min_count_reduction_result( + "prod", result, skipna=skipna, min_count=min_count, axis=axis + ) + + def mean(self, *, skipna: bool = True, axis: AxisInt | None = 0, **kwargs): + nv.validate_mean((), kwargs) + result = masked_reductions.mean( + self._data, + self._mask, + skipna=skipna, + axis=axis, + ) + return self._wrap_reduction_result("mean", result, skipna=skipna, axis=axis) + + def var( + self, *, skipna: bool = True, axis: AxisInt | None = 0, ddof: int = 1, **kwargs + ): + nv.validate_stat_ddof_func((), kwargs, fname="var") + result = masked_reductions.var( + self._data, + self._mask, + skipna=skipna, + axis=axis, + ddof=ddof, + ) + return self._wrap_reduction_result("var", result, skipna=skipna, axis=axis) + + def std( + self, *, skipna: bool = True, axis: AxisInt | None = 0, ddof: int = 1, **kwargs + ): + nv.validate_stat_ddof_func((), kwargs, fname="std") + result = masked_reductions.std( + self._data, + self._mask, + skipna=skipna, + axis=axis, + ddof=ddof, + ) + return self._wrap_reduction_result("std", result, skipna=skipna, axis=axis) + + def min(self, *, skipna: bool = True, axis: AxisInt | None = 0, **kwargs): + nv.validate_min((), kwargs) + result = masked_reductions.min( + self._data, + self._mask, + skipna=skipna, + axis=axis, + ) + return self._wrap_reduction_result("min", result, skipna=skipna, axis=axis) + + def max(self, *, skipna: bool = True, axis: AxisInt | None = 0, **kwargs): + nv.validate_max((), kwargs) + result = masked_reductions.max( + self._data, + self._mask, + skipna=skipna, + axis=axis, + ) + return self._wrap_reduction_result("max", result, skipna=skipna, axis=axis) + + def map(self, mapper, na_action=None): + return map_array(self.to_numpy(), mapper, na_action=na_action) + + def any(self, *, skipna: bool = True, axis: AxisInt | None = 0, **kwargs): + """ + Return whether any element is truthy. + + Returns False unless there is at least one element that is truthy. + By default, NAs are skipped. If ``skipna=False`` is specified and + missing values are present, similar :ref:`Kleene logic ` + is used as for logical operations. + + .. versionchanged:: 1.4.0 + + Parameters + ---------- + skipna : bool, default True + Exclude NA values. If the entire array is NA and `skipna` is + True, then the result will be False, as for an empty array. + If `skipna` is False, the result will still be True if there is + at least one element that is truthy, otherwise NA will be returned + if there are NA's present. + axis : int, optional, default 0 + **kwargs : any, default None + Additional keywords have no effect but might be accepted for + compatibility with NumPy. + + Returns + ------- + bool or :attr:`pandas.NA` + + See Also + -------- + numpy.any : Numpy version of this method. + BaseMaskedArray.all : Return whether all elements are truthy. + + Examples + -------- + The result indicates whether any element is truthy (and by default + skips NAs): + + >>> pd.array([True, False, True]).any() + True + >>> pd.array([True, False, pd.NA]).any() + True + >>> pd.array([False, False, pd.NA]).any() + False + >>> pd.array([], dtype="boolean").any() + False + >>> pd.array([pd.NA], dtype="boolean").any() + False + >>> pd.array([pd.NA], dtype="Float64").any() + False + + With ``skipna=False``, the result can be NA if this is logically + required (whether ``pd.NA`` is True or False influences the result): + + >>> pd.array([True, False, pd.NA]).any(skipna=False) + True + >>> pd.array([1, 0, pd.NA]).any(skipna=False) + True + >>> pd.array([False, False, pd.NA]).any(skipna=False) + + >>> pd.array([0, 0, pd.NA]).any(skipna=False) + + """ + nv.validate_any((), kwargs) + + values = self._data.copy() + # error: Argument 3 to "putmask" has incompatible type "object"; + # expected "Union[_SupportsArray[dtype[Any]], + # _NestedSequence[_SupportsArray[dtype[Any]]], + # bool, int, float, complex, str, bytes, + # _NestedSequence[Union[bool, int, float, complex, str, bytes]]]" + np.putmask(values, self._mask, self._falsey_value) # type: ignore[arg-type] + result = values.any() + if skipna: + return result + else: + if result or len(self) == 0 or not self._mask.any(): + return result + else: + return self.dtype.na_value + + def all(self, *, skipna: bool = True, axis: AxisInt | None = 0, **kwargs): + """ + Return whether all elements are truthy. + + Returns True unless there is at least one element that is falsey. + By default, NAs are skipped. If ``skipna=False`` is specified and + missing values are present, similar :ref:`Kleene logic ` + is used as for logical operations. + + .. versionchanged:: 1.4.0 + + Parameters + ---------- + skipna : bool, default True + Exclude NA values. If the entire array is NA and `skipna` is + True, then the result will be True, as for an empty array. + If `skipna` is False, the result will still be False if there is + at least one element that is falsey, otherwise NA will be returned + if there are NA's present. + axis : int, optional, default 0 + **kwargs : any, default None + Additional keywords have no effect but might be accepted for + compatibility with NumPy. + + Returns + ------- + bool or :attr:`pandas.NA` + + See Also + -------- + numpy.all : Numpy version of this method. + BooleanArray.any : Return whether any element is truthy. + + Examples + -------- + The result indicates whether all elements are truthy (and by default + skips NAs): + + >>> pd.array([True, True, pd.NA]).all() + True + >>> pd.array([1, 1, pd.NA]).all() + True + >>> pd.array([True, False, pd.NA]).all() + False + >>> pd.array([], dtype="boolean").all() + True + >>> pd.array([pd.NA], dtype="boolean").all() + True + >>> pd.array([pd.NA], dtype="Float64").all() + True + + With ``skipna=False``, the result can be NA if this is logically + required (whether ``pd.NA`` is True or False influences the result): + + >>> pd.array([True, True, pd.NA]).all(skipna=False) + + >>> pd.array([1, 1, pd.NA]).all(skipna=False) + + >>> pd.array([True, False, pd.NA]).all(skipna=False) + False + >>> pd.array([1, 0, pd.NA]).all(skipna=False) + False + """ + nv.validate_all((), kwargs) + + values = self._data.copy() + # error: Argument 3 to "putmask" has incompatible type "object"; + # expected "Union[_SupportsArray[dtype[Any]], + # _NestedSequence[_SupportsArray[dtype[Any]]], + # bool, int, float, complex, str, bytes, + # _NestedSequence[Union[bool, int, float, complex, str, bytes]]]" + np.putmask(values, self._mask, self._truthy_value) # type: ignore[arg-type] + result = values.all(axis=axis) + + if skipna: + return result + else: + if not result or len(self) == 0 or not self._mask.any(): + return result + else: + return self.dtype.na_value + + def interpolate( + self, + *, + method: InterpolateOptions, + axis: int, + index, + limit, + limit_direction, + limit_area, + copy: bool, + **kwargs, + ) -> FloatingArray: + """ + See NDFrame.interpolate.__doc__. + """ + # NB: we return type(self) even if copy=False + if self.dtype.kind == "f": + if copy: + data = self._data.copy() + mask = self._mask.copy() + else: + data = self._data + mask = self._mask + elif self.dtype.kind in "iu": + copy = True + data = self._data.astype("f8") + mask = self._mask.copy() + else: + raise NotImplementedError( + f"interpolate is not implemented for dtype={self.dtype}" + ) + + missing.interpolate_2d_inplace( + data, + method=method, + axis=0, + index=index, + limit=limit, + limit_direction=limit_direction, + limit_area=limit_area, + mask=mask, + **kwargs, + ) + if not copy: + return self # type: ignore[return-value] + if self.dtype.kind == "f": + return type(self)._simple_new(data, mask) # type: ignore[return-value] + else: + from pandas.core.arrays import FloatingArray + + return FloatingArray._simple_new(data, mask) + + def _accumulate( + self, name: str, *, skipna: bool = True, **kwargs + ) -> BaseMaskedArray: + data = self._data + mask = self._mask + + op = getattr(masked_accumulations, name) + data, mask = op(data, mask, skipna=skipna, **kwargs) + + return self._simple_new(data, mask) + + # ------------------------------------------------------------------ + # GroupBy Methods + + def _groupby_op( + self, + *, + how: str, + has_dropped_na: bool, + min_count: int, + ngroups: int, + ids: npt.NDArray[np.intp], + **kwargs, + ): + from pandas.core.groupby.ops import WrappedCythonOp + + kind = WrappedCythonOp.get_kind_from_how(how) + op = WrappedCythonOp(how=how, kind=kind, has_dropped_na=has_dropped_na) + + # libgroupby functions are responsible for NOT altering mask + mask = self._mask + if op.kind != "aggregate": + result_mask = mask.copy() + else: + result_mask = np.zeros(ngroups, dtype=bool) + + if how == "rank" and kwargs.get("na_option") in ["top", "bottom"]: + result_mask[:] = False + + res_values = op._cython_op_ndim_compat( + self._data, + min_count=min_count, + ngroups=ngroups, + comp_ids=ids, + mask=mask, + result_mask=result_mask, + **kwargs, + ) + + if op.how == "ohlc": + arity = op._cython_arity.get(op.how, 1) + result_mask = np.tile(result_mask, (arity, 1)).T + + if op.how in ["idxmin", "idxmax"]: + # Result values are indexes to take, keep as ndarray + return res_values + else: + # res_values should already have the correct dtype, we just need to + # wrap in a MaskedArray + return self._maybe_mask_result(res_values, result_mask) + + +def transpose_homogeneous_masked_arrays( + masked_arrays: Sequence[BaseMaskedArray], +) -> list[BaseMaskedArray]: + """Transpose masked arrays in a list, but faster. + + Input should be a list of 1-dim masked arrays of equal length and all have the + same dtype. The caller is responsible for ensuring validity of input data. + """ + masked_arrays = list(masked_arrays) + dtype = masked_arrays[0].dtype + + values = [arr._data.reshape(1, -1) for arr in masked_arrays] + transposed_values = np.concatenate( + values, + axis=0, + out=np.empty( + (len(masked_arrays), len(masked_arrays[0])), + order="F", + dtype=dtype.numpy_dtype, + ), + ) + + masks = [arr._mask.reshape(1, -1) for arr in masked_arrays] + transposed_masks = np.concatenate( + masks, axis=0, out=np.empty_like(transposed_values, dtype=bool) + ) + + arr_type = dtype.construct_array_type() + transposed_arrays: list[BaseMaskedArray] = [] + for i in range(transposed_values.shape[1]): + transposed_arr = arr_type(transposed_values[:, i], mask=transposed_masks[:, i]) + transposed_arrays.append(transposed_arr) + + return transposed_arrays diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/arrays/sparse/__init__.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/arrays/sparse/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..adf83963aca39e7d2ec2da55d21fc69aaca48977 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/arrays/sparse/__init__.py @@ -0,0 +1,19 @@ +from pandas.core.arrays.sparse.accessor import ( + SparseAccessor, + SparseFrameAccessor, +) +from pandas.core.arrays.sparse.array import ( + BlockIndex, + IntIndex, + SparseArray, + make_sparse_index, +) + +__all__ = [ + "BlockIndex", + "IntIndex", + "make_sparse_index", + "SparseAccessor", + "SparseArray", + "SparseFrameAccessor", +] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/arrays/sparse/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/arrays/sparse/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a5c3ab9d0f8c941eb3d3247c5e452b3091f21eaf Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/arrays/sparse/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/arrays/sparse/__pycache__/accessor.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/arrays/sparse/__pycache__/accessor.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ddfaa6fd96f311fb480ab84e8338b31fc4dcfece Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/arrays/sparse/__pycache__/accessor.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/arrays/sparse/__pycache__/array.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/arrays/sparse/__pycache__/array.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..028c0e893971c277cb062b9d21385184a2605abd Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/arrays/sparse/__pycache__/array.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/arrays/sparse/__pycache__/scipy_sparse.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/arrays/sparse/__pycache__/scipy_sparse.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a7c9909839712a3e144113d3fe7b7041f33aa89e Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/arrays/sparse/__pycache__/scipy_sparse.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/arrays/sparse/accessor.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/arrays/sparse/accessor.py new file mode 100644 index 0000000000000000000000000000000000000000..67bb41786547501c0cd7187e5ca6be8393b85ced --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/arrays/sparse/accessor.py @@ -0,0 +1,414 @@ +"""Sparse accessor""" +from __future__ import annotations + +from typing import TYPE_CHECKING + +import numpy as np + +from pandas.compat._optional import import_optional_dependency + +from pandas.core.dtypes.cast import find_common_type +from pandas.core.dtypes.dtypes import SparseDtype + +from pandas.core.accessor import ( + PandasDelegate, + delegate_names, +) +from pandas.core.arrays.sparse.array import SparseArray + +if TYPE_CHECKING: + from pandas import ( + DataFrame, + Series, + ) + + +class BaseAccessor: + _validation_msg = "Can only use the '.sparse' accessor with Sparse data." + + def __init__(self, data=None) -> None: + self._parent = data + self._validate(data) + + def _validate(self, data): + raise NotImplementedError + + +@delegate_names( + SparseArray, ["npoints", "density", "fill_value", "sp_values"], typ="property" +) +class SparseAccessor(BaseAccessor, PandasDelegate): + """ + Accessor for SparseSparse from other sparse matrix data types. + + Examples + -------- + >>> ser = pd.Series([0, 0, 2, 2, 2], dtype="Sparse[int]") + >>> ser.sparse.density + 0.6 + >>> ser.sparse.sp_values + array([2, 2, 2]) + """ + + def _validate(self, data): + if not isinstance(data.dtype, SparseDtype): + raise AttributeError(self._validation_msg) + + def _delegate_property_get(self, name: str, *args, **kwargs): + return getattr(self._parent.array, name) + + def _delegate_method(self, name: str, *args, **kwargs): + if name == "from_coo": + return self.from_coo(*args, **kwargs) + elif name == "to_coo": + return self.to_coo(*args, **kwargs) + else: + raise ValueError + + @classmethod + def from_coo(cls, A, dense_index: bool = False) -> Series: + """ + Create a Series with sparse values from a scipy.sparse.coo_matrix. + + Parameters + ---------- + A : scipy.sparse.coo_matrix + dense_index : bool, default False + If False (default), the index consists of only the + coords of the non-null entries of the original coo_matrix. + If True, the index consists of the full sorted + (row, col) coordinates of the coo_matrix. + + Returns + ------- + s : Series + A Series with sparse values. + + Examples + -------- + >>> from scipy import sparse + + >>> A = sparse.coo_matrix( + ... ([3.0, 1.0, 2.0], ([1, 0, 0], [0, 2, 3])), shape=(3, 4) + ... ) + >>> A + + + >>> A.todense() + matrix([[0., 0., 1., 2.], + [3., 0., 0., 0.], + [0., 0., 0., 0.]]) + + >>> ss = pd.Series.sparse.from_coo(A) + >>> ss + 0 2 1.0 + 3 2.0 + 1 0 3.0 + dtype: Sparse[float64, nan] + """ + from pandas import Series + from pandas.core.arrays.sparse.scipy_sparse import coo_to_sparse_series + + result = coo_to_sparse_series(A, dense_index=dense_index) + result = Series(result.array, index=result.index, copy=False) + + return result + + def to_coo(self, row_levels=(0,), column_levels=(1,), sort_labels: bool = False): + """ + Create a scipy.sparse.coo_matrix from a Series with MultiIndex. + + Use row_levels and column_levels to determine the row and column + coordinates respectively. row_levels and column_levels are the names + (labels) or numbers of the levels. {row_levels, column_levels} must be + a partition of the MultiIndex level names (or numbers). + + Parameters + ---------- + row_levels : tuple/list + column_levels : tuple/list + sort_labels : bool, default False + Sort the row and column labels before forming the sparse matrix. + When `row_levels` and/or `column_levels` refer to a single level, + set to `True` for a faster execution. + + Returns + ------- + y : scipy.sparse.coo_matrix + rows : list (row labels) + columns : list (column labels) + + Examples + -------- + >>> s = pd.Series([3.0, np.nan, 1.0, 3.0, np.nan, np.nan]) + >>> s.index = pd.MultiIndex.from_tuples( + ... [ + ... (1, 2, "a", 0), + ... (1, 2, "a", 1), + ... (1, 1, "b", 0), + ... (1, 1, "b", 1), + ... (2, 1, "b", 0), + ... (2, 1, "b", 1) + ... ], + ... names=["A", "B", "C", "D"], + ... ) + >>> s + A B C D + 1 2 a 0 3.0 + 1 NaN + 1 b 0 1.0 + 1 3.0 + 2 1 b 0 NaN + 1 NaN + dtype: float64 + + >>> ss = s.astype("Sparse") + >>> ss + A B C D + 1 2 a 0 3.0 + 1 NaN + 1 b 0 1.0 + 1 3.0 + 2 1 b 0 NaN + 1 NaN + dtype: Sparse[float64, nan] + + >>> A, rows, columns = ss.sparse.to_coo( + ... row_levels=["A", "B"], column_levels=["C", "D"], sort_labels=True + ... ) + >>> A + + >>> A.todense() + matrix([[0., 0., 1., 3.], + [3., 0., 0., 0.], + [0., 0., 0., 0.]]) + + >>> rows + [(1, 1), (1, 2), (2, 1)] + >>> columns + [('a', 0), ('a', 1), ('b', 0), ('b', 1)] + """ + from pandas.core.arrays.sparse.scipy_sparse import sparse_series_to_coo + + A, rows, columns = sparse_series_to_coo( + self._parent, row_levels, column_levels, sort_labels=sort_labels + ) + return A, rows, columns + + def to_dense(self) -> Series: + """ + Convert a Series from sparse values to dense. + + Returns + ------- + Series: + A Series with the same values, stored as a dense array. + + Examples + -------- + >>> series = pd.Series(pd.arrays.SparseArray([0, 1, 0])) + >>> series + 0 0 + 1 1 + 2 0 + dtype: Sparse[int64, 0] + + >>> series.sparse.to_dense() + 0 0 + 1 1 + 2 0 + dtype: int64 + """ + from pandas import Series + + return Series( + self._parent.array.to_dense(), + index=self._parent.index, + name=self._parent.name, + copy=False, + ) + + +class SparseFrameAccessor(BaseAccessor, PandasDelegate): + """ + DataFrame accessor for sparse data. + + Examples + -------- + >>> df = pd.DataFrame({"a": [1, 2, 0, 0], + ... "b": [3, 0, 0, 4]}, dtype="Sparse[int]") + >>> df.sparse.density + 0.5 + """ + + def _validate(self, data): + dtypes = data.dtypes + if not all(isinstance(t, SparseDtype) for t in dtypes): + raise AttributeError(self._validation_msg) + + @classmethod + def from_spmatrix(cls, data, index=None, columns=None) -> DataFrame: + """ + Create a new DataFrame from a scipy sparse matrix. + + Parameters + ---------- + data : scipy.sparse.spmatrix + Must be convertible to csc format. + index, columns : Index, optional + Row and column labels to use for the resulting DataFrame. + Defaults to a RangeIndex. + + Returns + ------- + DataFrame + Each column of the DataFrame is stored as a + :class:`arrays.SparseArray`. + + Examples + -------- + >>> import scipy.sparse + >>> mat = scipy.sparse.eye(3, dtype=float) + >>> pd.DataFrame.sparse.from_spmatrix(mat) + 0 1 2 + 0 1.0 0 0 + 1 0 1.0 0 + 2 0 0 1.0 + """ + from pandas._libs.sparse import IntIndex + + from pandas import DataFrame + + data = data.tocsc() + index, columns = cls._prep_index(data, index, columns) + n_rows, n_columns = data.shape + # We need to make sure indices are sorted, as we create + # IntIndex with no input validation (i.e. check_integrity=False ). + # Indices may already be sorted in scipy in which case this adds + # a small overhead. + data.sort_indices() + indices = data.indices + indptr = data.indptr + array_data = data.data + dtype = SparseDtype(array_data.dtype, 0) + arrays = [] + for i in range(n_columns): + sl = slice(indptr[i], indptr[i + 1]) + idx = IntIndex(n_rows, indices[sl], check_integrity=False) + arr = SparseArray._simple_new(array_data[sl], idx, dtype) + arrays.append(arr) + return DataFrame._from_arrays( + arrays, columns=columns, index=index, verify_integrity=False + ) + + def to_dense(self) -> DataFrame: + """ + Convert a DataFrame with sparse values to dense. + + Returns + ------- + DataFrame + A DataFrame with the same values stored as dense arrays. + + Examples + -------- + >>> df = pd.DataFrame({"A": pd.arrays.SparseArray([0, 1, 0])}) + >>> df.sparse.to_dense() + A + 0 0 + 1 1 + 2 0 + """ + from pandas import DataFrame + + data = {k: v.array.to_dense() for k, v in self._parent.items()} + return DataFrame(data, index=self._parent.index, columns=self._parent.columns) + + def to_coo(self): + """ + Return the contents of the frame as a sparse SciPy COO matrix. + + Returns + ------- + scipy.sparse.spmatrix + If the caller is heterogeneous and contains booleans or objects, + the result will be of dtype=object. See Notes. + + Notes + ----- + The dtype will be the lowest-common-denominator type (implicit + upcasting); that is to say if the dtypes (even of numeric types) + are mixed, the one that accommodates all will be chosen. + + e.g. If the dtypes are float16 and float32, dtype will be upcast to + float32. By numpy.find_common_type convention, mixing int64 and + and uint64 will result in a float64 dtype. + + Examples + -------- + >>> df = pd.DataFrame({"A": pd.arrays.SparseArray([0, 1, 0, 1])}) + >>> df.sparse.to_coo() + + """ + import_optional_dependency("scipy") + from scipy.sparse import coo_matrix + + dtype = find_common_type(self._parent.dtypes.to_list()) + if isinstance(dtype, SparseDtype): + dtype = dtype.subtype + + cols, rows, data = [], [], [] + for col, (_, ser) in enumerate(self._parent.items()): + sp_arr = ser.array + if sp_arr.fill_value != 0: + raise ValueError("fill value must be 0 when converting to COO matrix") + + row = sp_arr.sp_index.indices + cols.append(np.repeat(col, len(row))) + rows.append(row) + data.append(sp_arr.sp_values.astype(dtype, copy=False)) + + cols = np.concatenate(cols) + rows = np.concatenate(rows) + data = np.concatenate(data) + return coo_matrix((data, (rows, cols)), shape=self._parent.shape) + + @property + def density(self) -> float: + """ + Ratio of non-sparse points to total (dense) data points. + + Examples + -------- + >>> df = pd.DataFrame({"A": pd.arrays.SparseArray([0, 1, 0, 1])}) + >>> df.sparse.density + 0.5 + """ + tmp = np.mean([column.array.density for _, column in self._parent.items()]) + return tmp + + @staticmethod + def _prep_index(data, index, columns): + from pandas.core.indexes.api import ( + default_index, + ensure_index, + ) + + N, K = data.shape + if index is None: + index = default_index(N) + else: + index = ensure_index(index) + if columns is None: + columns = default_index(K) + else: + columns = ensure_index(columns) + + if len(columns) != K: + raise ValueError(f"Column length mismatch: {len(columns)} vs. {K}") + if len(index) != N: + raise ValueError(f"Index length mismatch: {len(index)} vs. {N}") + return index, columns diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/arrays/sparse/array.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/arrays/sparse/array.py new file mode 100644 index 0000000000000000000000000000000000000000..82fcfa74ec7d229faaa7922c7a7dc860da3bc471 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/arrays/sparse/array.py @@ -0,0 +1,1929 @@ +""" +SparseArray data structure +""" +from __future__ import annotations + +from collections import abc +import numbers +import operator +from typing import ( + TYPE_CHECKING, + Any, + Callable, + Literal, + cast, + overload, +) +import warnings + +import numpy as np + +from pandas._libs import lib +import pandas._libs.sparse as splib +from pandas._libs.sparse import ( + BlockIndex, + IntIndex, + SparseIndex, +) +from pandas._libs.tslibs import NaT +from pandas.compat.numpy import function as nv +from pandas.errors import PerformanceWarning +from pandas.util._decorators import doc +from pandas.util._exceptions import find_stack_level +from pandas.util._validators import ( + validate_bool_kwarg, + validate_insert_loc, +) + +from pandas.core.dtypes.astype import astype_array +from pandas.core.dtypes.cast import ( + construct_1d_arraylike_from_scalar, + find_common_type, + maybe_box_datetimelike, +) +from pandas.core.dtypes.common import ( + is_bool_dtype, + is_integer, + is_list_like, + is_object_dtype, + is_scalar, + is_string_dtype, + pandas_dtype, +) +from pandas.core.dtypes.dtypes import ( + DatetimeTZDtype, + SparseDtype, +) +from pandas.core.dtypes.generic import ( + ABCIndex, + ABCSeries, +) +from pandas.core.dtypes.missing import ( + isna, + na_value_for_dtype, + notna, +) + +from pandas.core import arraylike +import pandas.core.algorithms as algos +from pandas.core.arraylike import OpsMixin +from pandas.core.arrays import ExtensionArray +from pandas.core.base import PandasObject +import pandas.core.common as com +from pandas.core.construction import ( + ensure_wrapped_if_datetimelike, + extract_array, + sanitize_array, +) +from pandas.core.indexers import ( + check_array_indexer, + unpack_tuple_and_ellipses, +) +from pandas.core.nanops import check_below_min_count + +from pandas.io.formats import printing + +# See https://github.com/python/typing/issues/684 +if TYPE_CHECKING: + from collections.abc import Sequence + from enum import Enum + + class ellipsis(Enum): + Ellipsis = "..." + + Ellipsis = ellipsis.Ellipsis + + from scipy.sparse import spmatrix + + from pandas._typing import ( + FillnaOptions, + NumpySorter, + ) + + SparseIndexKind = Literal["integer", "block"] + + from pandas._typing import ( + ArrayLike, + AstypeArg, + Axis, + AxisInt, + Dtype, + NpDtype, + PositionalIndexer, + Scalar, + ScalarIndexer, + Self, + SequenceIndexer, + npt, + ) + + from pandas import Series + +else: + ellipsis = type(Ellipsis) + + +# ---------------------------------------------------------------------------- +# Array + +_sparray_doc_kwargs = {"klass": "SparseArray"} + + +def _get_fill(arr: SparseArray) -> np.ndarray: + """ + Create a 0-dim ndarray containing the fill value + + Parameters + ---------- + arr : SparseArray + + Returns + ------- + fill_value : ndarray + 0-dim ndarray with just the fill value. + + Notes + ----- + coerce fill_value to arr dtype if possible + int64 SparseArray can have NaN as fill_value if there is no missing + """ + try: + return np.asarray(arr.fill_value, dtype=arr.dtype.subtype) + except ValueError: + return np.asarray(arr.fill_value) + + +def _sparse_array_op( + left: SparseArray, right: SparseArray, op: Callable, name: str +) -> SparseArray: + """ + Perform a binary operation between two arrays. + + Parameters + ---------- + left : Union[SparseArray, ndarray] + right : Union[SparseArray, ndarray] + op : Callable + The binary operation to perform + name str + Name of the callable. + + Returns + ------- + SparseArray + """ + if name.startswith("__"): + # For lookups in _libs.sparse we need non-dunder op name + name = name[2:-2] + + # dtype used to find corresponding sparse method + ltype = left.dtype.subtype + rtype = right.dtype.subtype + + if ltype != rtype: + subtype = find_common_type([ltype, rtype]) + ltype = SparseDtype(subtype, left.fill_value) + rtype = SparseDtype(subtype, right.fill_value) + + left = left.astype(ltype, copy=False) + right = right.astype(rtype, copy=False) + dtype = ltype.subtype + else: + dtype = ltype + + # dtype the result must have + result_dtype = None + + if left.sp_index.ngaps == 0 or right.sp_index.ngaps == 0: + with np.errstate(all="ignore"): + result = op(left.to_dense(), right.to_dense()) + fill = op(_get_fill(left), _get_fill(right)) + + if left.sp_index.ngaps == 0: + index = left.sp_index + else: + index = right.sp_index + elif left.sp_index.equals(right.sp_index): + with np.errstate(all="ignore"): + result = op(left.sp_values, right.sp_values) + fill = op(_get_fill(left), _get_fill(right)) + index = left.sp_index + else: + if name[0] == "r": + left, right = right, left + name = name[1:] + + if name in ("and", "or", "xor") and dtype == "bool": + opname = f"sparse_{name}_uint8" + # to make template simple, cast here + left_sp_values = left.sp_values.view(np.uint8) + right_sp_values = right.sp_values.view(np.uint8) + result_dtype = bool + else: + opname = f"sparse_{name}_{dtype}" + left_sp_values = left.sp_values + right_sp_values = right.sp_values + + if ( + name in ["floordiv", "mod"] + and (right == 0).any() + and left.dtype.kind in "iu" + ): + # Match the non-Sparse Series behavior + opname = f"sparse_{name}_float64" + left_sp_values = left_sp_values.astype("float64") + right_sp_values = right_sp_values.astype("float64") + + sparse_op = getattr(splib, opname) + + with np.errstate(all="ignore"): + result, index, fill = sparse_op( + left_sp_values, + left.sp_index, + left.fill_value, + right_sp_values, + right.sp_index, + right.fill_value, + ) + + if name == "divmod": + # result is a 2-tuple + # error: Incompatible return value type (got "Tuple[SparseArray, + # SparseArray]", expected "SparseArray") + return ( # type: ignore[return-value] + _wrap_result(name, result[0], index, fill[0], dtype=result_dtype), + _wrap_result(name, result[1], index, fill[1], dtype=result_dtype), + ) + + if result_dtype is None: + result_dtype = result.dtype + + return _wrap_result(name, result, index, fill, dtype=result_dtype) + + +def _wrap_result( + name: str, data, sparse_index, fill_value, dtype: Dtype | None = None +) -> SparseArray: + """ + wrap op result to have correct dtype + """ + if name.startswith("__"): + # e.g. __eq__ --> eq + name = name[2:-2] + + if name in ("eq", "ne", "lt", "gt", "le", "ge"): + dtype = bool + + fill_value = lib.item_from_zerodim(fill_value) + + if is_bool_dtype(dtype): + # fill_value may be np.bool_ + fill_value = bool(fill_value) + return SparseArray( + data, sparse_index=sparse_index, fill_value=fill_value, dtype=dtype + ) + + +class SparseArray(OpsMixin, PandasObject, ExtensionArray): + """ + An ExtensionArray for storing sparse data. + + Parameters + ---------- + data : array-like or scalar + A dense array of values to store in the SparseArray. This may contain + `fill_value`. + sparse_index : SparseIndex, optional + fill_value : scalar, optional + Elements in data that are ``fill_value`` are not stored in the + SparseArray. For memory savings, this should be the most common value + in `data`. By default, `fill_value` depends on the dtype of `data`: + + =========== ========== + data.dtype na_value + =========== ========== + float ``np.nan`` + int ``0`` + bool False + datetime64 ``pd.NaT`` + timedelta64 ``pd.NaT`` + =========== ========== + + The fill value is potentially specified in three ways. In order of + precedence, these are + + 1. The `fill_value` argument + 2. ``dtype.fill_value`` if `fill_value` is None and `dtype` is + a ``SparseDtype`` + 3. ``data.dtype.fill_value`` if `fill_value` is None and `dtype` + is not a ``SparseDtype`` and `data` is a ``SparseArray``. + + kind : str + Can be 'integer' or 'block', default is 'integer'. + The type of storage for sparse locations. + + * 'block': Stores a `block` and `block_length` for each + contiguous *span* of sparse values. This is best when + sparse data tends to be clumped together, with large + regions of ``fill-value`` values between sparse values. + * 'integer': uses an integer to store the location of + each sparse value. + + dtype : np.dtype or SparseDtype, optional + The dtype to use for the SparseArray. For numpy dtypes, this + determines the dtype of ``self.sp_values``. For SparseDtype, + this determines ``self.sp_values`` and ``self.fill_value``. + copy : bool, default False + Whether to explicitly copy the incoming `data` array. + + Attributes + ---------- + None + + Methods + ------- + None + + Examples + -------- + >>> from pandas.arrays import SparseArray + >>> arr = SparseArray([0, 0, 1, 2]) + >>> arr + [0, 0, 1, 2] + Fill: 0 + IntIndex + Indices: array([2, 3], dtype=int32) + """ + + _subtyp = "sparse_array" # register ABCSparseArray + _hidden_attrs = PandasObject._hidden_attrs | frozenset([]) + _sparse_index: SparseIndex + _sparse_values: np.ndarray + _dtype: SparseDtype + + def __init__( + self, + data, + sparse_index=None, + fill_value=None, + kind: SparseIndexKind = "integer", + dtype: Dtype | None = None, + copy: bool = False, + ) -> None: + if fill_value is None and isinstance(dtype, SparseDtype): + fill_value = dtype.fill_value + + if isinstance(data, type(self)): + # disable normal inference on dtype, sparse_index, & fill_value + if sparse_index is None: + sparse_index = data.sp_index + if fill_value is None: + fill_value = data.fill_value + if dtype is None: + dtype = data.dtype + # TODO: make kind=None, and use data.kind? + data = data.sp_values + + # Handle use-provided dtype + if isinstance(dtype, str): + # Two options: dtype='int', regular numpy dtype + # or dtype='Sparse[int]', a sparse dtype + try: + dtype = SparseDtype.construct_from_string(dtype) + except TypeError: + dtype = pandas_dtype(dtype) + + if isinstance(dtype, SparseDtype): + if fill_value is None: + fill_value = dtype.fill_value + dtype = dtype.subtype + + if is_scalar(data): + warnings.warn( + f"Constructing {type(self).__name__} with scalar data is deprecated " + "and will raise in a future version. Pass a sequence instead.", + FutureWarning, + stacklevel=find_stack_level(), + ) + if sparse_index is None: + npoints = 1 + else: + npoints = sparse_index.length + + data = construct_1d_arraylike_from_scalar(data, npoints, dtype=None) + dtype = data.dtype + + if dtype is not None: + dtype = pandas_dtype(dtype) + + # TODO: disentangle the fill_value dtype inference from + # dtype inference + if data is None: + # TODO: What should the empty dtype be? Object or float? + + # error: Argument "dtype" to "array" has incompatible type + # "Union[ExtensionDtype, dtype[Any], None]"; expected "Union[dtype[Any], + # None, type, _SupportsDType, str, Union[Tuple[Any, int], Tuple[Any, + # Union[int, Sequence[int]]], List[Any], _DTypeDict, Tuple[Any, Any]]]" + data = np.array([], dtype=dtype) # type: ignore[arg-type] + + try: + data = sanitize_array(data, index=None) + except ValueError: + # NumPy may raise a ValueError on data like [1, []] + # we retry with object dtype here. + if dtype is None: + dtype = np.dtype(object) + data = np.atleast_1d(np.asarray(data, dtype=dtype)) + else: + raise + + if copy: + # TODO: avoid double copy when dtype forces cast. + data = data.copy() + + if fill_value is None: + fill_value_dtype = data.dtype if dtype is None else dtype + if fill_value_dtype is None: + fill_value = np.nan + else: + fill_value = na_value_for_dtype(fill_value_dtype) + + if isinstance(data, type(self)) and sparse_index is None: + sparse_index = data._sparse_index + # error: Argument "dtype" to "asarray" has incompatible type + # "Union[ExtensionDtype, dtype[Any], None]"; expected "None" + sparse_values = np.asarray( + data.sp_values, dtype=dtype # type: ignore[arg-type] + ) + elif sparse_index is None: + data = extract_array(data, extract_numpy=True) + if not isinstance(data, np.ndarray): + # EA + if isinstance(data.dtype, DatetimeTZDtype): + warnings.warn( + f"Creating SparseArray from {data.dtype} data " + "loses timezone information. Cast to object before " + "sparse to retain timezone information.", + UserWarning, + stacklevel=find_stack_level(), + ) + data = np.asarray(data, dtype="datetime64[ns]") + if fill_value is NaT: + fill_value = np.datetime64("NaT", "ns") + data = np.asarray(data) + sparse_values, sparse_index, fill_value = _make_sparse( + # error: Argument "dtype" to "_make_sparse" has incompatible type + # "Union[ExtensionDtype, dtype[Any], None]"; expected + # "Optional[dtype[Any]]" + data, + kind=kind, + fill_value=fill_value, + dtype=dtype, # type: ignore[arg-type] + ) + else: + # error: Argument "dtype" to "asarray" has incompatible type + # "Union[ExtensionDtype, dtype[Any], None]"; expected "None" + sparse_values = np.asarray(data, dtype=dtype) # type: ignore[arg-type] + if len(sparse_values) != sparse_index.npoints: + raise AssertionError( + f"Non array-like type {type(sparse_values)} must " + "have the same length as the index" + ) + self._sparse_index = sparse_index + self._sparse_values = sparse_values + self._dtype = SparseDtype(sparse_values.dtype, fill_value) + + @classmethod + def _simple_new( + cls, + sparse_array: np.ndarray, + sparse_index: SparseIndex, + dtype: SparseDtype, + ) -> Self: + new = object.__new__(cls) + new._sparse_index = sparse_index + new._sparse_values = sparse_array + new._dtype = dtype + return new + + @classmethod + def from_spmatrix(cls, data: spmatrix) -> Self: + """ + Create a SparseArray from a scipy.sparse matrix. + + Parameters + ---------- + data : scipy.sparse.sp_matrix + This should be a SciPy sparse matrix where the size + of the second dimension is 1. In other words, a + sparse matrix with a single column. + + Returns + ------- + SparseArray + + Examples + -------- + >>> import scipy.sparse + >>> mat = scipy.sparse.coo_matrix((4, 1)) + >>> pd.arrays.SparseArray.from_spmatrix(mat) + [0.0, 0.0, 0.0, 0.0] + Fill: 0.0 + IntIndex + Indices: array([], dtype=int32) + """ + length, ncol = data.shape + + if ncol != 1: + raise ValueError(f"'data' must have a single column, not '{ncol}'") + + # our sparse index classes require that the positions be strictly + # increasing. So we need to sort loc, and arr accordingly. + data = data.tocsc() + data.sort_indices() + arr = data.data + idx = data.indices + + zero = np.array(0, dtype=arr.dtype).item() + dtype = SparseDtype(arr.dtype, zero) + index = IntIndex(length, idx) + + return cls._simple_new(arr, index, dtype) + + def __array__( + self, dtype: NpDtype | None = None, copy: bool | None = None + ) -> np.ndarray: + fill_value = self.fill_value + + if self.sp_index.ngaps == 0: + # Compat for na dtype and int values. + return self.sp_values + if dtype is None: + # Can NumPy represent this type? + # If not, `np.result_type` will raise. We catch that + # and return object. + if self.sp_values.dtype.kind == "M": + # However, we *do* special-case the common case of + # a datetime64 with pandas NaT. + if fill_value is NaT: + # Can't put pd.NaT in a datetime64[ns] + fill_value = np.datetime64("NaT") + try: + dtype = np.result_type(self.sp_values.dtype, type(fill_value)) + except TypeError: + dtype = object + + out = np.full(self.shape, fill_value, dtype=dtype) + out[self.sp_index.indices] = self.sp_values + return out + + def __setitem__(self, key, value) -> None: + # I suppose we could allow setting of non-fill_value elements. + # TODO(SparseArray.__setitem__): remove special cases in + # ExtensionBlock.where + msg = "SparseArray does not support item assignment via setitem" + raise TypeError(msg) + + @classmethod + def _from_sequence(cls, scalars, *, dtype: Dtype | None = None, copy: bool = False): + return cls(scalars, dtype=dtype) + + @classmethod + def _from_factorized(cls, values, original): + return cls(values, dtype=original.dtype) + + # ------------------------------------------------------------------------ + # Data + # ------------------------------------------------------------------------ + @property + def sp_index(self) -> SparseIndex: + """ + The SparseIndex containing the location of non- ``fill_value`` points. + """ + return self._sparse_index + + @property + def sp_values(self) -> np.ndarray: + """ + An ndarray containing the non- ``fill_value`` values. + + Examples + -------- + >>> from pandas.arrays import SparseArray + >>> s = SparseArray([0, 0, 1, 0, 2], fill_value=0) + >>> s.sp_values + array([1, 2]) + """ + return self._sparse_values + + @property + def dtype(self) -> SparseDtype: + return self._dtype + + @property + def fill_value(self): + """ + Elements in `data` that are `fill_value` are not stored. + + For memory savings, this should be the most common value in the array. + + Examples + -------- + >>> ser = pd.Series([0, 0, 2, 2, 2], dtype="Sparse[int]") + >>> ser.sparse.fill_value + 0 + >>> spa_dtype = pd.SparseDtype(dtype=np.int32, fill_value=2) + >>> ser = pd.Series([0, 0, 2, 2, 2], dtype=spa_dtype) + >>> ser.sparse.fill_value + 2 + """ + return self.dtype.fill_value + + @fill_value.setter + def fill_value(self, value) -> None: + self._dtype = SparseDtype(self.dtype.subtype, value) + + @property + def kind(self) -> SparseIndexKind: + """ + The kind of sparse index for this array. One of {'integer', 'block'}. + """ + if isinstance(self.sp_index, IntIndex): + return "integer" + else: + return "block" + + @property + def _valid_sp_values(self) -> np.ndarray: + sp_vals = self.sp_values + mask = notna(sp_vals) + return sp_vals[mask] + + def __len__(self) -> int: + return self.sp_index.length + + @property + def _null_fill_value(self) -> bool: + return self._dtype._is_na_fill_value + + def _fill_value_matches(self, fill_value) -> bool: + if self._null_fill_value: + return isna(fill_value) + else: + return self.fill_value == fill_value + + @property + def nbytes(self) -> int: + return self.sp_values.nbytes + self.sp_index.nbytes + + @property + def density(self) -> float: + """ + The percent of non- ``fill_value`` points, as decimal. + + Examples + -------- + >>> from pandas.arrays import SparseArray + >>> s = SparseArray([0, 0, 1, 1, 1], fill_value=0) + >>> s.density + 0.6 + """ + return self.sp_index.npoints / self.sp_index.length + + @property + def npoints(self) -> int: + """ + The number of non- ``fill_value`` points. + + Examples + -------- + >>> from pandas.arrays import SparseArray + >>> s = SparseArray([0, 0, 1, 1, 1], fill_value=0) + >>> s.npoints + 3 + """ + return self.sp_index.npoints + + # error: Return type "SparseArray" of "isna" incompatible with return type + # "ndarray[Any, Any] | ExtensionArraySupportsAnyAll" in supertype "ExtensionArray" + def isna(self) -> Self: # type: ignore[override] + # If null fill value, we want SparseDtype[bool, true] + # to preserve the same memory usage. + dtype = SparseDtype(bool, self._null_fill_value) + if self._null_fill_value: + return type(self)._simple_new(isna(self.sp_values), self.sp_index, dtype) + mask = np.full(len(self), False, dtype=np.bool_) + mask[self.sp_index.indices] = isna(self.sp_values) + return type(self)(mask, fill_value=False, dtype=dtype) + + def _pad_or_backfill( # pylint: disable=useless-parent-delegation + self, + *, + method: FillnaOptions, + limit: int | None = None, + limit_area: Literal["inside", "outside"] | None = None, + copy: bool = True, + ) -> Self: + # TODO(3.0): We can remove this method once deprecation for fillna method + # keyword is enforced. + return super()._pad_or_backfill( + method=method, limit=limit, limit_area=limit_area, copy=copy + ) + + def fillna( + self, + value=None, + method: FillnaOptions | None = None, + limit: int | None = None, + copy: bool = True, + ) -> Self: + """ + Fill missing values with `value`. + + Parameters + ---------- + value : scalar, optional + method : str, optional + + .. warning:: + + Using 'method' will result in high memory use, + as all `fill_value` methods will be converted to + an in-memory ndarray + + limit : int, optional + + copy: bool, default True + Ignored for SparseArray. + + Returns + ------- + SparseArray + + Notes + ----- + When `value` is specified, the result's ``fill_value`` depends on + ``self.fill_value``. The goal is to maintain low-memory use. + + If ``self.fill_value`` is NA, the result dtype will be + ``SparseDtype(self.dtype, fill_value=value)``. This will preserve + amount of memory used before and after filling. + + When ``self.fill_value`` is not NA, the result dtype will be + ``self.dtype``. Again, this preserves the amount of memory used. + """ + if (method is None and value is None) or ( + method is not None and value is not None + ): + raise ValueError("Must specify one of 'method' or 'value'.") + + if method is not None: + return super().fillna(method=method, limit=limit) + + else: + new_values = np.where(isna(self.sp_values), value, self.sp_values) + + if self._null_fill_value: + # This is essentially just updating the dtype. + new_dtype = SparseDtype(self.dtype.subtype, fill_value=value) + else: + new_dtype = self.dtype + + return self._simple_new(new_values, self._sparse_index, new_dtype) + + def shift(self, periods: int = 1, fill_value=None) -> Self: + if not len(self) or periods == 0: + return self.copy() + + if isna(fill_value): + fill_value = self.dtype.na_value + + subtype = np.result_type(fill_value, self.dtype.subtype) + + if subtype != self.dtype.subtype: + # just coerce up front + arr = self.astype(SparseDtype(subtype, self.fill_value)) + else: + arr = self + + empty = self._from_sequence( + [fill_value] * min(abs(periods), len(self)), dtype=arr.dtype + ) + + if periods > 0: + a = empty + b = arr[:-periods] + else: + a = arr[abs(periods) :] + b = empty + return arr._concat_same_type([a, b]) + + def _first_fill_value_loc(self): + """ + Get the location of the first fill value. + + Returns + ------- + int + """ + if len(self) == 0 or self.sp_index.npoints == len(self): + return -1 + + indices = self.sp_index.indices + if not len(indices) or indices[0] > 0: + return 0 + + # a number larger than 1 should be appended to + # the last in case of fill value only appears + # in the tail of array + diff = np.r_[np.diff(indices), 2] + return indices[(diff > 1).argmax()] + 1 + + @doc(ExtensionArray.duplicated) + def duplicated( + self, keep: Literal["first", "last", False] = "first" + ) -> npt.NDArray[np.bool_]: + values = np.asarray(self) + mask = np.asarray(self.isna()) + return algos.duplicated(values, keep=keep, mask=mask) + + def unique(self) -> Self: + uniques = algos.unique(self.sp_values) + if len(self.sp_values) != len(self): + fill_loc = self._first_fill_value_loc() + # Inorder to align the behavior of pd.unique or + # pd.Series.unique, we should keep the original + # order, here we use unique again to find the + # insertion place. Since the length of sp_values + # is not large, maybe minor performance hurt + # is worthwhile to the correctness. + insert_loc = len(algos.unique(self.sp_values[:fill_loc])) + uniques = np.insert(uniques, insert_loc, self.fill_value) + return type(self)._from_sequence(uniques, dtype=self.dtype) + + def _values_for_factorize(self): + # Still override this for hash_pandas_object + return np.asarray(self), self.fill_value + + def factorize( + self, + use_na_sentinel: bool = True, + ) -> tuple[np.ndarray, SparseArray]: + # Currently, ExtensionArray.factorize -> Tuple[ndarray, EA] + # The sparsity on this is backwards from what Sparse would want. Want + # ExtensionArray.factorize -> Tuple[EA, EA] + # Given that we have to return a dense array of codes, why bother + # implementing an efficient factorize? + codes, uniques = algos.factorize( + np.asarray(self), use_na_sentinel=use_na_sentinel + ) + uniques_sp = SparseArray(uniques, dtype=self.dtype) + return codes, uniques_sp + + def value_counts(self, dropna: bool = True) -> Series: + """ + Returns a Series containing counts of unique values. + + Parameters + ---------- + dropna : bool, default True + Don't include counts of NaN, even if NaN is in sp_values. + + Returns + ------- + counts : Series + """ + from pandas import ( + Index, + Series, + ) + + keys, counts, _ = algos.value_counts_arraylike(self.sp_values, dropna=dropna) + fcounts = self.sp_index.ngaps + if fcounts > 0 and (not self._null_fill_value or not dropna): + mask = isna(keys) if self._null_fill_value else keys == self.fill_value + if mask.any(): + counts[mask] += fcounts + else: + # error: Argument 1 to "insert" has incompatible type "Union[ + # ExtensionArray,ndarray[Any, Any]]"; expected "Union[ + # _SupportsArray[dtype[Any]], Sequence[_SupportsArray[dtype + # [Any]]], Sequence[Sequence[_SupportsArray[dtype[Any]]]], + # Sequence[Sequence[Sequence[_SupportsArray[dtype[Any]]]]], Sequence + # [Sequence[Sequence[Sequence[_SupportsArray[dtype[Any]]]]]]]" + keys = np.insert(keys, 0, self.fill_value) # type: ignore[arg-type] + counts = np.insert(counts, 0, fcounts) + + if not isinstance(keys, ABCIndex): + index = Index(keys) + else: + index = keys + return Series(counts, index=index, copy=False) + + # -------- + # Indexing + # -------- + @overload + def __getitem__(self, key: ScalarIndexer) -> Any: + ... + + @overload + def __getitem__( + self, + key: SequenceIndexer | tuple[int | ellipsis, ...], + ) -> Self: + ... + + def __getitem__( + self, + key: PositionalIndexer | tuple[int | ellipsis, ...], + ) -> Self | Any: + if isinstance(key, tuple): + key = unpack_tuple_and_ellipses(key) + if key is Ellipsis: + raise ValueError("Cannot slice with Ellipsis") + + if is_integer(key): + return self._get_val_at(key) + elif isinstance(key, tuple): + # error: Invalid index type "Tuple[Union[int, ellipsis], ...]" + # for "ndarray[Any, Any]"; expected type + # "Union[SupportsIndex, _SupportsArray[dtype[Union[bool_, + # integer[Any]]]], _NestedSequence[_SupportsArray[dtype[ + # Union[bool_, integer[Any]]]]], _NestedSequence[Union[ + # bool, int]], Tuple[Union[SupportsIndex, _SupportsArray[ + # dtype[Union[bool_, integer[Any]]]], _NestedSequence[ + # _SupportsArray[dtype[Union[bool_, integer[Any]]]]], + # _NestedSequence[Union[bool, int]]], ...]]" + data_slice = self.to_dense()[key] # type: ignore[index] + elif isinstance(key, slice): + # Avoid densifying when handling contiguous slices + if key.step is None or key.step == 1: + start = 0 if key.start is None else key.start + if start < 0: + start += len(self) + + end = len(self) if key.stop is None else key.stop + if end < 0: + end += len(self) + + indices = self.sp_index.indices + keep_inds = np.flatnonzero((indices >= start) & (indices < end)) + sp_vals = self.sp_values[keep_inds] + + sp_index = indices[keep_inds].copy() + + # If we've sliced to not include the start of the array, all our indices + # should be shifted. NB: here we are careful to also not shift by a + # negative value for a case like [0, 1][-100:] where the start index + # should be treated like 0 + if start > 0: + sp_index -= start + + # Length of our result should match applying this slice to a range + # of the length of our original array + new_len = len(range(len(self))[key]) + new_sp_index = make_sparse_index(new_len, sp_index, self.kind) + return type(self)._simple_new(sp_vals, new_sp_index, self.dtype) + else: + indices = np.arange(len(self), dtype=np.int32)[key] + return self.take(indices) + + elif not is_list_like(key): + # e.g. "foo" or 2.5 + # exception message copied from numpy + raise IndexError( + r"only integers, slices (`:`), ellipsis (`...`), numpy.newaxis " + r"(`None`) and integer or boolean arrays are valid indices" + ) + + else: + if isinstance(key, SparseArray): + # NOTE: If we guarantee that SparseDType(bool) + # has only fill_value - true, false or nan + # (see GH PR 44955) + # we can apply mask very fast: + if is_bool_dtype(key): + if isna(key.fill_value): + return self.take(key.sp_index.indices[key.sp_values]) + if not key.fill_value: + return self.take(key.sp_index.indices) + n = len(self) + mask = np.full(n, True, dtype=np.bool_) + mask[key.sp_index.indices] = False + return self.take(np.arange(n)[mask]) + else: + key = np.asarray(key) + + key = check_array_indexer(self, key) + + if com.is_bool_indexer(key): + # mypy doesn't know we have an array here + key = cast(np.ndarray, key) + return self.take(np.arange(len(key), dtype=np.int32)[key]) + elif hasattr(key, "__len__"): + return self.take(key) + else: + raise ValueError(f"Cannot slice with '{key}'") + + return type(self)(data_slice, kind=self.kind) + + def _get_val_at(self, loc): + loc = validate_insert_loc(loc, len(self)) + + sp_loc = self.sp_index.lookup(loc) + if sp_loc == -1: + return self.fill_value + else: + val = self.sp_values[sp_loc] + val = maybe_box_datetimelike(val, self.sp_values.dtype) + return val + + def take(self, indices, *, allow_fill: bool = False, fill_value=None) -> Self: + if is_scalar(indices): + raise ValueError(f"'indices' must be an array, not a scalar '{indices}'.") + indices = np.asarray(indices, dtype=np.int32) + + dtype = None + if indices.size == 0: + result = np.array([], dtype="object") + dtype = self.dtype + elif allow_fill: + result = self._take_with_fill(indices, fill_value=fill_value) + else: + return self._take_without_fill(indices) + + return type(self)( + result, fill_value=self.fill_value, kind=self.kind, dtype=dtype + ) + + def _take_with_fill(self, indices, fill_value=None) -> np.ndarray: + if fill_value is None: + fill_value = self.dtype.na_value + + if indices.min() < -1: + raise ValueError( + "Invalid value in 'indices'. Must be between -1 " + "and the length of the array." + ) + + if indices.max() >= len(self): + raise IndexError("out of bounds value in 'indices'.") + + if len(self) == 0: + # Empty... Allow taking only if all empty + if (indices == -1).all(): + dtype = np.result_type(self.sp_values, type(fill_value)) + taken = np.empty_like(indices, dtype=dtype) + taken.fill(fill_value) + return taken + else: + raise IndexError("cannot do a non-empty take from an empty axes.") + + # sp_indexer may be -1 for two reasons + # 1.) we took for an index of -1 (new) + # 2.) we took a value that was self.fill_value (old) + sp_indexer = self.sp_index.lookup_array(indices) + new_fill_indices = indices == -1 + old_fill_indices = (sp_indexer == -1) & ~new_fill_indices + + if self.sp_index.npoints == 0 and old_fill_indices.all(): + # We've looked up all valid points on an all-sparse array. + taken = np.full( + sp_indexer.shape, fill_value=self.fill_value, dtype=self.dtype.subtype + ) + + elif self.sp_index.npoints == 0: + # Use the old fill_value unless we took for an index of -1 + _dtype = np.result_type(self.dtype.subtype, type(fill_value)) + taken = np.full(sp_indexer.shape, fill_value=fill_value, dtype=_dtype) + taken[old_fill_indices] = self.fill_value + else: + taken = self.sp_values.take(sp_indexer) + + # Fill in two steps. + # Old fill values + # New fill values + # potentially coercing to a new dtype at each stage. + + m0 = sp_indexer[old_fill_indices] < 0 + m1 = sp_indexer[new_fill_indices] < 0 + + result_type = taken.dtype + + if m0.any(): + result_type = np.result_type(result_type, type(self.fill_value)) + taken = taken.astype(result_type) + taken[old_fill_indices] = self.fill_value + + if m1.any(): + result_type = np.result_type(result_type, type(fill_value)) + taken = taken.astype(result_type) + taken[new_fill_indices] = fill_value + + return taken + + def _take_without_fill(self, indices) -> Self: + to_shift = indices < 0 + + n = len(self) + + if (indices.max() >= n) or (indices.min() < -n): + if n == 0: + raise IndexError("cannot do a non-empty take from an empty axes.") + raise IndexError("out of bounds value in 'indices'.") + + if to_shift.any(): + indices = indices.copy() + indices[to_shift] += n + + sp_indexer = self.sp_index.lookup_array(indices) + value_mask = sp_indexer != -1 + new_sp_values = self.sp_values[sp_indexer[value_mask]] + + value_indices = np.flatnonzero(value_mask).astype(np.int32, copy=False) + + new_sp_index = make_sparse_index(len(indices), value_indices, kind=self.kind) + return type(self)._simple_new(new_sp_values, new_sp_index, dtype=self.dtype) + + def searchsorted( + self, + v: ArrayLike | object, + side: Literal["left", "right"] = "left", + sorter: NumpySorter | None = None, + ) -> npt.NDArray[np.intp] | np.intp: + msg = "searchsorted requires high memory usage." + warnings.warn(msg, PerformanceWarning, stacklevel=find_stack_level()) + v = np.asarray(v) + return np.asarray(self, dtype=self.dtype.subtype).searchsorted(v, side, sorter) + + def copy(self) -> Self: + values = self.sp_values.copy() + return self._simple_new(values, self.sp_index, self.dtype) + + @classmethod + def _concat_same_type(cls, to_concat: Sequence[Self]) -> Self: + fill_value = to_concat[0].fill_value + + values = [] + length = 0 + + if to_concat: + sp_kind = to_concat[0].kind + else: + sp_kind = "integer" + + sp_index: SparseIndex + if sp_kind == "integer": + indices = [] + + for arr in to_concat: + int_idx = arr.sp_index.indices.copy() + int_idx += length # TODO: wraparound + length += arr.sp_index.length + + values.append(arr.sp_values) + indices.append(int_idx) + + data = np.concatenate(values) + indices_arr = np.concatenate(indices) + # error: Argument 2 to "IntIndex" has incompatible type + # "ndarray[Any, dtype[signedinteger[_32Bit]]]"; + # expected "Sequence[int]" + sp_index = IntIndex(length, indices_arr) # type: ignore[arg-type] + + else: + # when concatenating block indices, we don't claim that you'll + # get an identical index as concatenating the values and then + # creating a new index. We don't want to spend the time trying + # to merge blocks across arrays in `to_concat`, so the resulting + # BlockIndex may have more blocks. + blengths = [] + blocs = [] + + for arr in to_concat: + block_idx = arr.sp_index.to_block_index() + + values.append(arr.sp_values) + blocs.append(block_idx.blocs.copy() + length) + blengths.append(block_idx.blengths) + length += arr.sp_index.length + + data = np.concatenate(values) + blocs_arr = np.concatenate(blocs) + blengths_arr = np.concatenate(blengths) + + sp_index = BlockIndex(length, blocs_arr, blengths_arr) + + return cls(data, sparse_index=sp_index, fill_value=fill_value) + + def astype(self, dtype: AstypeArg | None = None, copy: bool = True): + """ + Change the dtype of a SparseArray. + + The output will always be a SparseArray. To convert to a dense + ndarray with a certain dtype, use :meth:`numpy.asarray`. + + Parameters + ---------- + dtype : np.dtype or ExtensionDtype + For SparseDtype, this changes the dtype of + ``self.sp_values`` and the ``self.fill_value``. + + For other dtypes, this only changes the dtype of + ``self.sp_values``. + + copy : bool, default True + Whether to ensure a copy is made, even if not necessary. + + Returns + ------- + SparseArray + + Examples + -------- + >>> arr = pd.arrays.SparseArray([0, 0, 1, 2]) + >>> arr + [0, 0, 1, 2] + Fill: 0 + IntIndex + Indices: array([2, 3], dtype=int32) + + >>> arr.astype(SparseDtype(np.dtype('int32'))) + [0, 0, 1, 2] + Fill: 0 + IntIndex + Indices: array([2, 3], dtype=int32) + + Using a NumPy dtype with a different kind (e.g. float) will coerce + just ``self.sp_values``. + + >>> arr.astype(SparseDtype(np.dtype('float64'))) + ... # doctest: +NORMALIZE_WHITESPACE + [nan, nan, 1.0, 2.0] + Fill: nan + IntIndex + Indices: array([2, 3], dtype=int32) + + Using a SparseDtype, you can also change the fill value as well. + + >>> arr.astype(SparseDtype("float64", fill_value=0.0)) + ... # doctest: +NORMALIZE_WHITESPACE + [0.0, 0.0, 1.0, 2.0] + Fill: 0.0 + IntIndex + Indices: array([2, 3], dtype=int32) + """ + if dtype == self._dtype: + if not copy: + return self + else: + return self.copy() + + future_dtype = pandas_dtype(dtype) + if not isinstance(future_dtype, SparseDtype): + # GH#34457 + values = np.asarray(self) + values = ensure_wrapped_if_datetimelike(values) + return astype_array(values, dtype=future_dtype, copy=False) + + dtype = self.dtype.update_dtype(dtype) + subtype = pandas_dtype(dtype._subtype_with_str) + subtype = cast(np.dtype, subtype) # ensured by update_dtype + values = ensure_wrapped_if_datetimelike(self.sp_values) + sp_values = astype_array(values, subtype, copy=copy) + sp_values = np.asarray(sp_values) + + return self._simple_new(sp_values, self.sp_index, dtype) + + def map(self, mapper, na_action=None) -> Self: + """ + Map categories using an input mapping or function. + + Parameters + ---------- + mapper : dict, Series, callable + The correspondence from old values to new. + na_action : {None, 'ignore'}, default None + If 'ignore', propagate NA values, without passing them to the + mapping correspondence. + + Returns + ------- + SparseArray + The output array will have the same density as the input. + The output fill value will be the result of applying the + mapping to ``self.fill_value`` + + Examples + -------- + >>> arr = pd.arrays.SparseArray([0, 1, 2]) + >>> arr.map(lambda x: x + 10) + [10, 11, 12] + Fill: 10 + IntIndex + Indices: array([1, 2], dtype=int32) + + >>> arr.map({0: 10, 1: 11, 2: 12}) + [10, 11, 12] + Fill: 10 + IntIndex + Indices: array([1, 2], dtype=int32) + + >>> arr.map(pd.Series([10, 11, 12], index=[0, 1, 2])) + [10, 11, 12] + Fill: 10 + IntIndex + Indices: array([1, 2], dtype=int32) + """ + is_map = isinstance(mapper, (abc.Mapping, ABCSeries)) + + fill_val = self.fill_value + + if na_action is None or notna(fill_val): + fill_val = mapper.get(fill_val, fill_val) if is_map else mapper(fill_val) + + def func(sp_val): + new_sp_val = mapper.get(sp_val, None) if is_map else mapper(sp_val) + # check identity and equality because nans are not equal to each other + if new_sp_val is fill_val or new_sp_val == fill_val: + msg = "fill value in the sparse values not supported" + raise ValueError(msg) + return new_sp_val + + sp_values = [func(x) for x in self.sp_values] + + return type(self)(sp_values, sparse_index=self.sp_index, fill_value=fill_val) + + def to_dense(self) -> np.ndarray: + """ + Convert SparseArray to a NumPy array. + + Returns + ------- + arr : NumPy array + """ + return np.asarray(self, dtype=self.sp_values.dtype) + + def _where(self, mask, value): + # NB: may not preserve dtype, e.g. result may be Sparse[float64] + # while self is Sparse[int64] + naive_implementation = np.where(mask, self, value) + dtype = SparseDtype(naive_implementation.dtype, fill_value=self.fill_value) + result = type(self)._from_sequence(naive_implementation, dtype=dtype) + return result + + # ------------------------------------------------------------------------ + # IO + # ------------------------------------------------------------------------ + def __setstate__(self, state) -> None: + """Necessary for making this object picklable""" + if isinstance(state, tuple): + # Compat for pandas < 0.24.0 + nd_state, (fill_value, sp_index) = state + sparse_values = np.array([]) + sparse_values.__setstate__(nd_state) + + self._sparse_values = sparse_values + self._sparse_index = sp_index + self._dtype = SparseDtype(sparse_values.dtype, fill_value) + else: + self.__dict__.update(state) + + def nonzero(self) -> tuple[npt.NDArray[np.int32]]: + if self.fill_value == 0: + return (self.sp_index.indices,) + else: + return (self.sp_index.indices[self.sp_values != 0],) + + # ------------------------------------------------------------------------ + # Reductions + # ------------------------------------------------------------------------ + + def _reduce( + self, name: str, *, skipna: bool = True, keepdims: bool = False, **kwargs + ): + method = getattr(self, name, None) + + if method is None: + raise TypeError(f"cannot perform {name} with type {self.dtype}") + + if skipna: + arr = self + else: + arr = self.dropna() + + result = getattr(arr, name)(**kwargs) + + if keepdims: + return type(self)([result], dtype=self.dtype) + else: + return result + + def all(self, axis=None, *args, **kwargs): + """ + Tests whether all elements evaluate True + + Returns + ------- + all : bool + + See Also + -------- + numpy.all + """ + nv.validate_all(args, kwargs) + + values = self.sp_values + + if len(values) != len(self) and not np.all(self.fill_value): + return False + + return values.all() + + def any(self, axis: AxisInt = 0, *args, **kwargs) -> bool: + """ + Tests whether at least one of elements evaluate True + + Returns + ------- + any : bool + + See Also + -------- + numpy.any + """ + nv.validate_any(args, kwargs) + + values = self.sp_values + + if len(values) != len(self) and np.any(self.fill_value): + return True + + return values.any().item() + + def sum( + self, + axis: AxisInt = 0, + min_count: int = 0, + skipna: bool = True, + *args, + **kwargs, + ) -> Scalar: + """ + Sum of non-NA/null values + + Parameters + ---------- + axis : int, default 0 + Not Used. NumPy compatibility. + min_count : int, default 0 + The required number of valid values to perform the summation. If fewer + than ``min_count`` valid values are present, the result will be the missing + value indicator for subarray type. + *args, **kwargs + Not Used. NumPy compatibility. + + Returns + ------- + scalar + """ + nv.validate_sum(args, kwargs) + valid_vals = self._valid_sp_values + sp_sum = valid_vals.sum() + has_na = self.sp_index.ngaps > 0 and not self._null_fill_value + + if has_na and not skipna: + return na_value_for_dtype(self.dtype.subtype, compat=False) + + if self._null_fill_value: + if check_below_min_count(valid_vals.shape, None, min_count): + return na_value_for_dtype(self.dtype.subtype, compat=False) + return sp_sum + else: + nsparse = self.sp_index.ngaps + if check_below_min_count(valid_vals.shape, None, min_count - nsparse): + return na_value_for_dtype(self.dtype.subtype, compat=False) + return sp_sum + self.fill_value * nsparse + + def cumsum(self, axis: AxisInt = 0, *args, **kwargs) -> SparseArray: + """ + Cumulative sum of non-NA/null values. + + When performing the cumulative summation, any non-NA/null values will + be skipped. The resulting SparseArray will preserve the locations of + NaN values, but the fill value will be `np.nan` regardless. + + Parameters + ---------- + axis : int or None + Axis over which to perform the cumulative summation. If None, + perform cumulative summation over flattened array. + + Returns + ------- + cumsum : SparseArray + """ + nv.validate_cumsum(args, kwargs) + + if axis is not None and axis >= self.ndim: # Mimic ndarray behaviour. + raise ValueError(f"axis(={axis}) out of bounds") + + if not self._null_fill_value: + return SparseArray(self.to_dense()).cumsum() + + return SparseArray( + self.sp_values.cumsum(), + sparse_index=self.sp_index, + fill_value=self.fill_value, + ) + + def mean(self, axis: Axis = 0, *args, **kwargs): + """ + Mean of non-NA/null values + + Returns + ------- + mean : float + """ + nv.validate_mean(args, kwargs) + valid_vals = self._valid_sp_values + sp_sum = valid_vals.sum() + ct = len(valid_vals) + + if self._null_fill_value: + return sp_sum / ct + else: + nsparse = self.sp_index.ngaps + return (sp_sum + self.fill_value * nsparse) / (ct + nsparse) + + def max(self, *, axis: AxisInt | None = None, skipna: bool = True): + """ + Max of array values, ignoring NA values if specified. + + Parameters + ---------- + axis : int, default 0 + Not Used. NumPy compatibility. + skipna : bool, default True + Whether to ignore NA values. + + Returns + ------- + scalar + """ + nv.validate_minmax_axis(axis, self.ndim) + return self._min_max("max", skipna=skipna) + + def min(self, *, axis: AxisInt | None = None, skipna: bool = True): + """ + Min of array values, ignoring NA values if specified. + + Parameters + ---------- + axis : int, default 0 + Not Used. NumPy compatibility. + skipna : bool, default True + Whether to ignore NA values. + + Returns + ------- + scalar + """ + nv.validate_minmax_axis(axis, self.ndim) + return self._min_max("min", skipna=skipna) + + def _min_max(self, kind: Literal["min", "max"], skipna: bool) -> Scalar: + """ + Min/max of non-NA/null values + + Parameters + ---------- + kind : {"min", "max"} + skipna : bool + + Returns + ------- + scalar + """ + valid_vals = self._valid_sp_values + has_nonnull_fill_vals = not self._null_fill_value and self.sp_index.ngaps > 0 + + if len(valid_vals) > 0: + sp_min_max = getattr(valid_vals, kind)() + + # If a non-null fill value is currently present, it might be the min/max + if has_nonnull_fill_vals: + func = max if kind == "max" else min + return func(sp_min_max, self.fill_value) + elif skipna: + return sp_min_max + elif self.sp_index.ngaps == 0: + # No NAs present + return sp_min_max + else: + return na_value_for_dtype(self.dtype.subtype, compat=False) + elif has_nonnull_fill_vals: + return self.fill_value + else: + return na_value_for_dtype(self.dtype.subtype, compat=False) + + def _argmin_argmax(self, kind: Literal["argmin", "argmax"]) -> int: + values = self._sparse_values + index = self._sparse_index.indices + mask = np.asarray(isna(values)) + func = np.argmax if kind == "argmax" else np.argmin + + idx = np.arange(values.shape[0]) + non_nans = values[~mask] + non_nan_idx = idx[~mask] + + _candidate = non_nan_idx[func(non_nans)] + candidate = index[_candidate] + + if isna(self.fill_value): + return candidate + if kind == "argmin" and self[candidate] < self.fill_value: + return candidate + if kind == "argmax" and self[candidate] > self.fill_value: + return candidate + _loc = self._first_fill_value_loc() + if _loc == -1: + # fill_value doesn't exist + return candidate + else: + return _loc + + def argmax(self, skipna: bool = True) -> int: + validate_bool_kwarg(skipna, "skipna") + if not skipna and self._hasna: + raise NotImplementedError + return self._argmin_argmax("argmax") + + def argmin(self, skipna: bool = True) -> int: + validate_bool_kwarg(skipna, "skipna") + if not skipna and self._hasna: + raise NotImplementedError + return self._argmin_argmax("argmin") + + # ------------------------------------------------------------------------ + # Ufuncs + # ------------------------------------------------------------------------ + + _HANDLED_TYPES = (np.ndarray, numbers.Number) + + def __array_ufunc__(self, ufunc: np.ufunc, method: str, *inputs, **kwargs): + out = kwargs.get("out", ()) + + for x in inputs + out: + if not isinstance(x, self._HANDLED_TYPES + (SparseArray,)): + return NotImplemented + + # for binary ops, use our custom dunder methods + result = arraylike.maybe_dispatch_ufunc_to_dunder_op( + self, ufunc, method, *inputs, **kwargs + ) + if result is not NotImplemented: + return result + + if "out" in kwargs: + # e.g. tests.arrays.sparse.test_arithmetics.test_ndarray_inplace + res = arraylike.dispatch_ufunc_with_out( + self, ufunc, method, *inputs, **kwargs + ) + return res + + if method == "reduce": + result = arraylike.dispatch_reduction_ufunc( + self, ufunc, method, *inputs, **kwargs + ) + if result is not NotImplemented: + # e.g. tests.series.test_ufunc.TestNumpyReductions + return result + + if len(inputs) == 1: + # No alignment necessary. + sp_values = getattr(ufunc, method)(self.sp_values, **kwargs) + fill_value = getattr(ufunc, method)(self.fill_value, **kwargs) + + if ufunc.nout > 1: + # multiple outputs. e.g. modf + arrays = tuple( + self._simple_new( + sp_value, self.sp_index, SparseDtype(sp_value.dtype, fv) + ) + for sp_value, fv in zip(sp_values, fill_value) + ) + return arrays + elif method == "reduce": + # e.g. reductions + return sp_values + + return self._simple_new( + sp_values, self.sp_index, SparseDtype(sp_values.dtype, fill_value) + ) + + new_inputs = tuple(np.asarray(x) for x in inputs) + result = getattr(ufunc, method)(*new_inputs, **kwargs) + if out: + if len(out) == 1: + out = out[0] + return out + + if ufunc.nout > 1: + return tuple(type(self)(x) for x in result) + elif method == "at": + # no return value + return None + else: + return type(self)(result) + + # ------------------------------------------------------------------------ + # Ops + # ------------------------------------------------------------------------ + + def _arith_method(self, other, op): + op_name = op.__name__ + + if isinstance(other, SparseArray): + return _sparse_array_op(self, other, op, op_name) + + elif is_scalar(other): + with np.errstate(all="ignore"): + fill = op(_get_fill(self), np.asarray(other)) + result = op(self.sp_values, other) + + if op_name == "divmod": + left, right = result + lfill, rfill = fill + return ( + _wrap_result(op_name, left, self.sp_index, lfill), + _wrap_result(op_name, right, self.sp_index, rfill), + ) + + return _wrap_result(op_name, result, self.sp_index, fill) + + else: + other = np.asarray(other) + with np.errstate(all="ignore"): + if len(self) != len(other): + raise AssertionError( + f"length mismatch: {len(self)} vs. {len(other)}" + ) + if not isinstance(other, SparseArray): + dtype = getattr(other, "dtype", None) + other = SparseArray(other, fill_value=self.fill_value, dtype=dtype) + return _sparse_array_op(self, other, op, op_name) + + def _cmp_method(self, other, op) -> SparseArray: + if not is_scalar(other) and not isinstance(other, type(self)): + # convert list-like to ndarray + other = np.asarray(other) + + if isinstance(other, np.ndarray): + # TODO: make this more flexible than just ndarray... + other = SparseArray(other, fill_value=self.fill_value) + + if isinstance(other, SparseArray): + if len(self) != len(other): + raise ValueError( + f"operands have mismatched length {len(self)} and {len(other)}" + ) + + op_name = op.__name__.strip("_") + return _sparse_array_op(self, other, op, op_name) + else: + # scalar + fill_value = op(self.fill_value, other) + result = np.full(len(self), fill_value, dtype=np.bool_) + result[self.sp_index.indices] = op(self.sp_values, other) + + return type(self)( + result, + fill_value=fill_value, + dtype=np.bool_, + ) + + _logical_method = _cmp_method + + def _unary_method(self, op) -> SparseArray: + fill_value = op(np.array(self.fill_value)).item() + dtype = SparseDtype(self.dtype.subtype, fill_value) + # NOTE: if fill_value doesn't change + # we just have to apply op to sp_values + if isna(self.fill_value) or fill_value == self.fill_value: + values = op(self.sp_values) + return type(self)._simple_new(values, self.sp_index, self.dtype) + # In the other case we have to recalc indexes + return type(self)(op(self.to_dense()), dtype=dtype) + + def __pos__(self) -> SparseArray: + return self._unary_method(operator.pos) + + def __neg__(self) -> SparseArray: + return self._unary_method(operator.neg) + + def __invert__(self) -> SparseArray: + return self._unary_method(operator.invert) + + def __abs__(self) -> SparseArray: + return self._unary_method(operator.abs) + + # ---------- + # Formatting + # ----------- + def __repr__(self) -> str: + pp_str = printing.pprint_thing(self) + pp_fill = printing.pprint_thing(self.fill_value) + pp_index = printing.pprint_thing(self.sp_index) + return f"{pp_str}\nFill: {pp_fill}\n{pp_index}" + + def _formatter(self, boxed: bool = False): + # Defer to the formatter from the GenericArrayFormatter calling us. + # This will infer the correct formatter from the dtype of the values. + return None + + +def _make_sparse( + arr: np.ndarray, + kind: SparseIndexKind = "block", + fill_value=None, + dtype: np.dtype | None = None, +): + """ + Convert ndarray to sparse format + + Parameters + ---------- + arr : ndarray + kind : {'block', 'integer'} + fill_value : NaN or another value + dtype : np.dtype, optional + copy : bool, default False + + Returns + ------- + (sparse_values, index, fill_value) : (ndarray, SparseIndex, Scalar) + """ + assert isinstance(arr, np.ndarray) + + if arr.ndim > 1: + raise TypeError("expected dimension <= 1 data") + + if fill_value is None: + fill_value = na_value_for_dtype(arr.dtype) + + if isna(fill_value): + mask = notna(arr) + else: + # cast to object comparison to be safe + if is_string_dtype(arr.dtype): + arr = arr.astype(object) + + if is_object_dtype(arr.dtype): + # element-wise equality check method in numpy doesn't treat + # each element type, eg. 0, 0.0, and False are treated as + # same. So we have to check the both of its type and value. + mask = splib.make_mask_object_ndarray(arr, fill_value) + else: + mask = arr != fill_value + + length = len(arr) + if length != len(mask): + # the arr is a SparseArray + indices = mask.sp_index.indices + else: + indices = mask.nonzero()[0].astype(np.int32) + + index = make_sparse_index(length, indices, kind) + sparsified_values = arr[mask] + if dtype is not None: + sparsified_values = ensure_wrapped_if_datetimelike(sparsified_values) + sparsified_values = astype_array(sparsified_values, dtype=dtype) + sparsified_values = np.asarray(sparsified_values) + + # TODO: copy + return sparsified_values, index, fill_value + + +@overload +def make_sparse_index(length: int, indices, kind: Literal["block"]) -> BlockIndex: + ... + + +@overload +def make_sparse_index(length: int, indices, kind: Literal["integer"]) -> IntIndex: + ... + + +def make_sparse_index(length: int, indices, kind: SparseIndexKind) -> SparseIndex: + index: SparseIndex + if kind == "block": + locs, lens = splib.get_blocks(indices) + index = BlockIndex(length, locs, lens) + elif kind == "integer": + index = IntIndex(length, indices) + else: # pragma: no cover + raise ValueError("must be block or integer type") + return index diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/arrays/sparse/scipy_sparse.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/arrays/sparse/scipy_sparse.py new file mode 100644 index 0000000000000000000000000000000000000000..71b71a9779da5c1a584e0ef98bc8320d81bc2a35 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/arrays/sparse/scipy_sparse.py @@ -0,0 +1,207 @@ +""" +Interaction with scipy.sparse matrices. + +Currently only includes to_coo helpers. +""" +from __future__ import annotations + +from typing import TYPE_CHECKING + +from pandas._libs import lib + +from pandas.core.dtypes.missing import notna + +from pandas.core.algorithms import factorize +from pandas.core.indexes.api import MultiIndex +from pandas.core.series import Series + +if TYPE_CHECKING: + from collections.abc import Iterable + + import numpy as np + import scipy.sparse + + from pandas._typing import ( + IndexLabel, + npt, + ) + + +def _check_is_partition(parts: Iterable, whole: Iterable): + whole = set(whole) + parts = [set(x) for x in parts] + if set.intersection(*parts) != set(): + raise ValueError("Is not a partition because intersection is not null.") + if set.union(*parts) != whole: + raise ValueError("Is not a partition because union is not the whole.") + + +def _levels_to_axis( + ss, + levels: tuple[int] | list[int], + valid_ilocs: npt.NDArray[np.intp], + sort_labels: bool = False, +) -> tuple[npt.NDArray[np.intp], list[IndexLabel]]: + """ + For a MultiIndexed sparse Series `ss`, return `ax_coords` and `ax_labels`, + where `ax_coords` are the coordinates along one of the two axes of the + destination sparse matrix, and `ax_labels` are the labels from `ss`' Index + which correspond to these coordinates. + + Parameters + ---------- + ss : Series + levels : tuple/list + valid_ilocs : numpy.ndarray + Array of integer positions of valid values for the sparse matrix in ss. + sort_labels : bool, default False + Sort the axis labels before forming the sparse matrix. When `levels` + refers to a single level, set to True for a faster execution. + + Returns + ------- + ax_coords : numpy.ndarray (axis coordinates) + ax_labels : list (axis labels) + """ + # Since the labels are sorted in `Index.levels`, when we wish to sort and + # there is only one level of the MultiIndex for this axis, the desired + # output can be obtained in the following simpler, more efficient way. + if sort_labels and len(levels) == 1: + ax_coords = ss.index.codes[levels[0]][valid_ilocs] + ax_labels = ss.index.levels[levels[0]] + + else: + levels_values = lib.fast_zip( + [ss.index.get_level_values(lvl).to_numpy() for lvl in levels] + ) + codes, ax_labels = factorize(levels_values, sort=sort_labels) + ax_coords = codes[valid_ilocs] + + ax_labels = ax_labels.tolist() + return ax_coords, ax_labels + + +def _to_ijv( + ss, + row_levels: tuple[int] | list[int] = (0,), + column_levels: tuple[int] | list[int] = (1,), + sort_labels: bool = False, +) -> tuple[ + np.ndarray, + npt.NDArray[np.intp], + npt.NDArray[np.intp], + list[IndexLabel], + list[IndexLabel], +]: + """ + For an arbitrary MultiIndexed sparse Series return (v, i, j, ilabels, + jlabels) where (v, (i, j)) is suitable for passing to scipy.sparse.coo + constructor, and ilabels and jlabels are the row and column labels + respectively. + + Parameters + ---------- + ss : Series + row_levels : tuple/list + column_levels : tuple/list + sort_labels : bool, default False + Sort the row and column labels before forming the sparse matrix. + When `row_levels` and/or `column_levels` refer to a single level, + set to `True` for a faster execution. + + Returns + ------- + values : numpy.ndarray + Valid values to populate a sparse matrix, extracted from + ss. + i_coords : numpy.ndarray (row coordinates of the values) + j_coords : numpy.ndarray (column coordinates of the values) + i_labels : list (row labels) + j_labels : list (column labels) + """ + # index and column levels must be a partition of the index + _check_is_partition([row_levels, column_levels], range(ss.index.nlevels)) + # From the sparse Series, get the integer indices and data for valid sparse + # entries. + sp_vals = ss.array.sp_values + na_mask = notna(sp_vals) + values = sp_vals[na_mask] + valid_ilocs = ss.array.sp_index.indices[na_mask] + + i_coords, i_labels = _levels_to_axis( + ss, row_levels, valid_ilocs, sort_labels=sort_labels + ) + + j_coords, j_labels = _levels_to_axis( + ss, column_levels, valid_ilocs, sort_labels=sort_labels + ) + + return values, i_coords, j_coords, i_labels, j_labels + + +def sparse_series_to_coo( + ss: Series, + row_levels: Iterable[int] = (0,), + column_levels: Iterable[int] = (1,), + sort_labels: bool = False, +) -> tuple[scipy.sparse.coo_matrix, list[IndexLabel], list[IndexLabel]]: + """ + Convert a sparse Series to a scipy.sparse.coo_matrix using index + levels row_levels, column_levels as the row and column + labels respectively. Returns the sparse_matrix, row and column labels. + """ + import scipy.sparse + + if ss.index.nlevels < 2: + raise ValueError("to_coo requires MultiIndex with nlevels >= 2.") + if not ss.index.is_unique: + raise ValueError( + "Duplicate index entries are not allowed in to_coo transformation." + ) + + # to keep things simple, only rely on integer indexing (not labels) + row_levels = [ss.index._get_level_number(x) for x in row_levels] + column_levels = [ss.index._get_level_number(x) for x in column_levels] + + v, i, j, rows, columns = _to_ijv( + ss, row_levels=row_levels, column_levels=column_levels, sort_labels=sort_labels + ) + sparse_matrix = scipy.sparse.coo_matrix( + (v, (i, j)), shape=(len(rows), len(columns)) + ) + return sparse_matrix, rows, columns + + +def coo_to_sparse_series( + A: scipy.sparse.coo_matrix, dense_index: bool = False +) -> Series: + """ + Convert a scipy.sparse.coo_matrix to a Series with type sparse. + + Parameters + ---------- + A : scipy.sparse.coo_matrix + dense_index : bool, default False + + Returns + ------- + Series + + Raises + ------ + TypeError if A is not a coo_matrix + """ + from pandas import SparseDtype + + try: + ser = Series(A.data, MultiIndex.from_arrays((A.row, A.col)), copy=False) + except AttributeError as err: + raise TypeError( + f"Expected coo_matrix. Got {type(A).__name__} instead." + ) from err + ser = ser.sort_index() + ser = ser.astype(SparseDtype(ser.dtype)) + if dense_index: + ind = MultiIndex.from_product([A.row, A.col]) + ser = ser.reindex(ind) + return ser diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/computation/__init__.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/computation/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/computation/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/computation/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..235b38752b6fa167c802c92f4228a9f698588266 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/computation/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/computation/__pycache__/align.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/computation/__pycache__/align.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c70327b5ff2230e1f4ede7273c782deb3aea7da0 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/computation/__pycache__/align.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/computation/__pycache__/api.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/computation/__pycache__/api.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..df23914ca9807c0e3a24bc726e67c804cb8780c7 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/computation/__pycache__/api.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/computation/__pycache__/check.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/computation/__pycache__/check.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..dc96859b500d8a31dbe9c61891c5eb0558e76317 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/computation/__pycache__/check.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/computation/__pycache__/common.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/computation/__pycache__/common.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..521179ef782bf7f3d0688c4ba8ed25ea56604732 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/computation/__pycache__/common.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/computation/__pycache__/engines.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/computation/__pycache__/engines.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d03f43f1c630fb6d940683795ef7866fc607423c Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/computation/__pycache__/engines.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/computation/__pycache__/eval.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/computation/__pycache__/eval.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..2b42ba2745a71df51dc28a2c441e2a4e12672aba Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/computation/__pycache__/eval.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/computation/__pycache__/expr.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/computation/__pycache__/expr.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..afebef22b1b0841e6a6395a46348937b33dc3f64 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/computation/__pycache__/expr.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/computation/__pycache__/expressions.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/computation/__pycache__/expressions.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..07fb85b147d0c8cfd6221a9a3ff51161b897f797 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/computation/__pycache__/expressions.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/computation/__pycache__/ops.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/computation/__pycache__/ops.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..205b2865a52329b9f0ed40906625fbfacc5c9a9a Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/computation/__pycache__/ops.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/computation/__pycache__/parsing.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/computation/__pycache__/parsing.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..184da0e2c8718a4e1ac3cbaf85743c57c69cfbb5 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/computation/__pycache__/parsing.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/computation/__pycache__/pytables.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/computation/__pycache__/pytables.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f57c7b055a7b955bc55256ec60c59d4a2eaa71ff Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/computation/__pycache__/pytables.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/computation/__pycache__/scope.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/computation/__pycache__/scope.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e177b176e5b89c6c78947e9d255c05db40e70804 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/computation/__pycache__/scope.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/computation/align.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/computation/align.py new file mode 100644 index 0000000000000000000000000000000000000000..cd852ba9249cf26ea39b7e51bbfa754ffe9c10ce --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/computation/align.py @@ -0,0 +1,213 @@ +""" +Core eval alignment algorithms. +""" +from __future__ import annotations + +from functools import ( + partial, + wraps, +) +from typing import ( + TYPE_CHECKING, + Callable, +) +import warnings + +import numpy as np + +from pandas.errors import PerformanceWarning +from pandas.util._exceptions import find_stack_level + +from pandas.core.dtypes.generic import ( + ABCDataFrame, + ABCSeries, +) + +from pandas.core.base import PandasObject +import pandas.core.common as com +from pandas.core.computation.common import result_type_many + +if TYPE_CHECKING: + from collections.abc import Sequence + + from pandas._typing import F + + from pandas.core.generic import NDFrame + from pandas.core.indexes.api import Index + + +def _align_core_single_unary_op( + term, +) -> tuple[partial | type[NDFrame], dict[str, Index] | None]: + typ: partial | type[NDFrame] + axes: dict[str, Index] | None = None + + if isinstance(term.value, np.ndarray): + typ = partial(np.asanyarray, dtype=term.value.dtype) + else: + typ = type(term.value) + if hasattr(term.value, "axes"): + axes = _zip_axes_from_type(typ, term.value.axes) + + return typ, axes + + +def _zip_axes_from_type( + typ: type[NDFrame], new_axes: Sequence[Index] +) -> dict[str, Index]: + return {name: new_axes[i] for i, name in enumerate(typ._AXIS_ORDERS)} + + +def _any_pandas_objects(terms) -> bool: + """ + Check a sequence of terms for instances of PandasObject. + """ + return any(isinstance(term.value, PandasObject) for term in terms) + + +def _filter_special_cases(f) -> Callable[[F], F]: + @wraps(f) + def wrapper(terms): + # single unary operand + if len(terms) == 1: + return _align_core_single_unary_op(terms[0]) + + term_values = (term.value for term in terms) + + # we don't have any pandas objects + if not _any_pandas_objects(terms): + return result_type_many(*term_values), None + + return f(terms) + + return wrapper + + +@_filter_special_cases +def _align_core(terms): + term_index = [i for i, term in enumerate(terms) if hasattr(term.value, "axes")] + term_dims = [terms[i].value.ndim for i in term_index] + + from pandas import Series + + ndims = Series(dict(zip(term_index, term_dims))) + + # initial axes are the axes of the largest-axis'd term + biggest = terms[ndims.idxmax()].value + typ = biggest._constructor + axes = biggest.axes + naxes = len(axes) + gt_than_one_axis = naxes > 1 + + for value in (terms[i].value for i in term_index): + is_series = isinstance(value, ABCSeries) + is_series_and_gt_one_axis = is_series and gt_than_one_axis + + for axis, items in enumerate(value.axes): + if is_series_and_gt_one_axis: + ax, itm = naxes - 1, value.index + else: + ax, itm = axis, items + + if not axes[ax].is_(itm): + axes[ax] = axes[ax].union(itm) + + for i, ndim in ndims.items(): + for axis, items in zip(range(ndim), axes): + ti = terms[i].value + + if hasattr(ti, "reindex"): + transpose = isinstance(ti, ABCSeries) and naxes > 1 + reindexer = axes[naxes - 1] if transpose else items + + term_axis_size = len(ti.axes[axis]) + reindexer_size = len(reindexer) + + ordm = np.log10(max(1, abs(reindexer_size - term_axis_size))) + if ordm >= 1 and reindexer_size >= 10000: + w = ( + f"Alignment difference on axis {axis} is larger " + f"than an order of magnitude on term {repr(terms[i].name)}, " + f"by more than {ordm:.4g}; performance may suffer." + ) + warnings.warn( + w, category=PerformanceWarning, stacklevel=find_stack_level() + ) + + obj = ti.reindex(reindexer, axis=axis, copy=False) + terms[i].update(obj) + + terms[i].update(terms[i].value.values) + + return typ, _zip_axes_from_type(typ, axes) + + +def align_terms(terms): + """ + Align a set of terms. + """ + try: + # flatten the parse tree (a nested list, really) + terms = list(com.flatten(terms)) + except TypeError: + # can't iterate so it must just be a constant or single variable + if isinstance(terms.value, (ABCSeries, ABCDataFrame)): + typ = type(terms.value) + return typ, _zip_axes_from_type(typ, terms.value.axes) + return np.result_type(terms.type), None + + # if all resolved variables are numeric scalars + if all(term.is_scalar for term in terms): + return result_type_many(*(term.value for term in terms)).type, None + + # perform the main alignment + typ, axes = _align_core(terms) + return typ, axes + + +def reconstruct_object(typ, obj, axes, dtype): + """ + Reconstruct an object given its type, raw value, and possibly empty + (None) axes. + + Parameters + ---------- + typ : object + A type + obj : object + The value to use in the type constructor + axes : dict + The axes to use to construct the resulting pandas object + + Returns + ------- + ret : typ + An object of type ``typ`` with the value `obj` and possible axes + `axes`. + """ + try: + typ = typ.type + except AttributeError: + pass + + res_t = np.result_type(obj.dtype, dtype) + + if not isinstance(typ, partial) and issubclass(typ, PandasObject): + return typ(obj, dtype=res_t, **axes) + + # special case for pathological things like ~True/~False + if hasattr(res_t, "type") and typ == np.bool_ and res_t != np.bool_: + ret_value = res_t.type(obj) + else: + ret_value = typ(obj).astype(res_t) + # The condition is to distinguish 0-dim array (returned in case of + # scalar) and 1 element array + # e.g. np.array(0) and np.array([0]) + if ( + len(obj.shape) == 1 + and len(obj) == 1 + and not isinstance(ret_value, np.ndarray) + ): + ret_value = np.array([ret_value]).astype(res_t) + + return ret_value diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/computation/api.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/computation/api.py new file mode 100644 index 0000000000000000000000000000000000000000..bd3be5b3f8c42267c8a61421b7f0877a01b33d34 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/computation/api.py @@ -0,0 +1,2 @@ +__all__ = ["eval"] +from pandas.core.computation.eval import eval diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/computation/check.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/computation/check.py new file mode 100644 index 0000000000000000000000000000000000000000..caccf34f811112abbe04d965d6f6be1e21527e8b --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/computation/check.py @@ -0,0 +1,8 @@ +from __future__ import annotations + +from pandas.compat._optional import import_optional_dependency + +ne = import_optional_dependency("numexpr", errors="warn") +NUMEXPR_INSTALLED = ne is not None + +__all__ = ["NUMEXPR_INSTALLED"] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/computation/common.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/computation/common.py new file mode 100644 index 0000000000000000000000000000000000000000..115191829f044a7d6d7f17c279025ccc26d44d04 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/computation/common.py @@ -0,0 +1,48 @@ +from __future__ import annotations + +from functools import reduce + +import numpy as np + +from pandas._config import get_option + + +def ensure_decoded(s) -> str: + """ + If we have bytes, decode them to unicode. + """ + if isinstance(s, (np.bytes_, bytes)): + s = s.decode(get_option("display.encoding")) + return s + + +def result_type_many(*arrays_and_dtypes): + """ + Wrapper around numpy.result_type which overcomes the NPY_MAXARGS (32) + argument limit. + """ + try: + return np.result_type(*arrays_and_dtypes) + except ValueError: + # we have > NPY_MAXARGS terms in our expression + return reduce(np.result_type, arrays_and_dtypes) + except TypeError: + from pandas.core.dtypes.cast import find_common_type + from pandas.core.dtypes.common import is_extension_array_dtype + + arr_and_dtypes = list(arrays_and_dtypes) + ea_dtypes, non_ea_dtypes = [], [] + for arr_or_dtype in arr_and_dtypes: + if is_extension_array_dtype(arr_or_dtype): + ea_dtypes.append(arr_or_dtype) + else: + non_ea_dtypes.append(arr_or_dtype) + + if non_ea_dtypes: + try: + np_dtype = np.result_type(*non_ea_dtypes) + except ValueError: + np_dtype = reduce(np.result_type, arrays_and_dtypes) + return find_common_type(ea_dtypes + [np_dtype]) + + return find_common_type(ea_dtypes) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/computation/engines.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/computation/engines.py new file mode 100644 index 0000000000000000000000000000000000000000..a3a05a9d75c6ed6b80564a69ff5b6cf5a648c1b3 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/computation/engines.py @@ -0,0 +1,143 @@ +""" +Engine classes for :func:`~pandas.eval` +""" +from __future__ import annotations + +import abc +from typing import TYPE_CHECKING + +from pandas.errors import NumExprClobberingError + +from pandas.core.computation.align import ( + align_terms, + reconstruct_object, +) +from pandas.core.computation.ops import ( + MATHOPS, + REDUCTIONS, +) + +from pandas.io.formats import printing + +if TYPE_CHECKING: + from pandas.core.computation.expr import Expr + +_ne_builtins = frozenset(MATHOPS + REDUCTIONS) + + +def _check_ne_builtin_clash(expr: Expr) -> None: + """ + Attempt to prevent foot-shooting in a helpful way. + + Parameters + ---------- + expr : Expr + Terms can contain + """ + names = expr.names + overlap = names & _ne_builtins + + if overlap: + s = ", ".join([repr(x) for x in overlap]) + raise NumExprClobberingError( + f'Variables in expression "{expr}" overlap with builtins: ({s})' + ) + + +class AbstractEngine(metaclass=abc.ABCMeta): + """Object serving as a base class for all engines.""" + + has_neg_frac = False + + def __init__(self, expr) -> None: + self.expr = expr + self.aligned_axes = None + self.result_type = None + + def convert(self) -> str: + """ + Convert an expression for evaluation. + + Defaults to return the expression as a string. + """ + return printing.pprint_thing(self.expr) + + def evaluate(self) -> object: + """ + Run the engine on the expression. + + This method performs alignment which is necessary no matter what engine + is being used, thus its implementation is in the base class. + + Returns + ------- + object + The result of the passed expression. + """ + if not self._is_aligned: + self.result_type, self.aligned_axes = align_terms(self.expr.terms) + + # make sure no names in resolvers and locals/globals clash + res = self._evaluate() + return reconstruct_object( + self.result_type, res, self.aligned_axes, self.expr.terms.return_type + ) + + @property + def _is_aligned(self) -> bool: + return self.aligned_axes is not None and self.result_type is not None + + @abc.abstractmethod + def _evaluate(self): + """ + Return an evaluated expression. + + Parameters + ---------- + env : Scope + The local and global environment in which to evaluate an + expression. + + Notes + ----- + Must be implemented by subclasses. + """ + + +class NumExprEngine(AbstractEngine): + """NumExpr engine class""" + + has_neg_frac = True + + def _evaluate(self): + import numexpr as ne + + # convert the expression to a valid numexpr expression + s = self.convert() + + env = self.expr.env + scope = env.full_scope + _check_ne_builtin_clash(self.expr) + return ne.evaluate(s, local_dict=scope) + + +class PythonEngine(AbstractEngine): + """ + Evaluate an expression in Python space. + + Mostly for testing purposes. + """ + + has_neg_frac = False + + def evaluate(self): + return self.expr() + + def _evaluate(self) -> None: + pass + + +ENGINES: dict[str, type[AbstractEngine]] = { + "numexpr": NumExprEngine, + "python": PythonEngine, +} diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/computation/eval.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/computation/eval.py new file mode 100644 index 0000000000000000000000000000000000000000..f1fe528de06f8eeb5bdfb0f22339499c15ebbd9c --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/computation/eval.py @@ -0,0 +1,415 @@ +""" +Top level ``eval`` module. +""" +from __future__ import annotations + +import tokenize +from typing import TYPE_CHECKING +import warnings + +from pandas.util._exceptions import find_stack_level +from pandas.util._validators import validate_bool_kwarg + +from pandas.core.dtypes.common import is_extension_array_dtype + +from pandas.core.computation.engines import ENGINES +from pandas.core.computation.expr import ( + PARSERS, + Expr, +) +from pandas.core.computation.parsing import tokenize_string +from pandas.core.computation.scope import ensure_scope +from pandas.core.generic import NDFrame + +from pandas.io.formats.printing import pprint_thing + +if TYPE_CHECKING: + from pandas.core.computation.ops import BinOp + + +def _check_engine(engine: str | None) -> str: + """ + Make sure a valid engine is passed. + + Parameters + ---------- + engine : str + String to validate. + + Raises + ------ + KeyError + * If an invalid engine is passed. + ImportError + * If numexpr was requested but doesn't exist. + + Returns + ------- + str + Engine name. + """ + from pandas.core.computation.check import NUMEXPR_INSTALLED + from pandas.core.computation.expressions import USE_NUMEXPR + + if engine is None: + engine = "numexpr" if USE_NUMEXPR else "python" + + if engine not in ENGINES: + valid_engines = list(ENGINES.keys()) + raise KeyError( + f"Invalid engine '{engine}' passed, valid engines are {valid_engines}" + ) + + # TODO: validate this in a more general way (thinking of future engines + # that won't necessarily be import-able) + # Could potentially be done on engine instantiation + if engine == "numexpr" and not NUMEXPR_INSTALLED: + raise ImportError( + "'numexpr' is not installed or an unsupported version. Cannot use " + "engine='numexpr' for query/eval if 'numexpr' is not installed" + ) + + return engine + + +def _check_parser(parser: str): + """ + Make sure a valid parser is passed. + + Parameters + ---------- + parser : str + + Raises + ------ + KeyError + * If an invalid parser is passed + """ + if parser not in PARSERS: + raise KeyError( + f"Invalid parser '{parser}' passed, valid parsers are {PARSERS.keys()}" + ) + + +def _check_resolvers(resolvers): + if resolvers is not None: + for resolver in resolvers: + if not hasattr(resolver, "__getitem__"): + name = type(resolver).__name__ + raise TypeError( + f"Resolver of type '{name}' does not " + "implement the __getitem__ method" + ) + + +def _check_expression(expr): + """ + Make sure an expression is not an empty string + + Parameters + ---------- + expr : object + An object that can be converted to a string + + Raises + ------ + ValueError + * If expr is an empty string + """ + if not expr: + raise ValueError("expr cannot be an empty string") + + +def _convert_expression(expr) -> str: + """ + Convert an object to an expression. + + This function converts an object to an expression (a unicode string) and + checks to make sure it isn't empty after conversion. This is used to + convert operators to their string representation for recursive calls to + :func:`~pandas.eval`. + + Parameters + ---------- + expr : object + The object to be converted to a string. + + Returns + ------- + str + The string representation of an object. + + Raises + ------ + ValueError + * If the expression is empty. + """ + s = pprint_thing(expr) + _check_expression(s) + return s + + +def _check_for_locals(expr: str, stack_level: int, parser: str): + at_top_of_stack = stack_level == 0 + not_pandas_parser = parser != "pandas" + + if not_pandas_parser: + msg = "The '@' prefix is only supported by the pandas parser" + elif at_top_of_stack: + msg = ( + "The '@' prefix is not allowed in top-level eval calls.\n" + "please refer to your variables by name without the '@' prefix." + ) + + if at_top_of_stack or not_pandas_parser: + for toknum, tokval in tokenize_string(expr): + if toknum == tokenize.OP and tokval == "@": + raise SyntaxError(msg) + + +def eval( + expr: str | BinOp, # we leave BinOp out of the docstr bc it isn't for users + parser: str = "pandas", + engine: str | None = None, + local_dict=None, + global_dict=None, + resolvers=(), + level: int = 0, + target=None, + inplace: bool = False, +): + """ + Evaluate a Python expression as a string using various backends. + + The following arithmetic operations are supported: ``+``, ``-``, ``*``, + ``/``, ``**``, ``%``, ``//`` (python engine only) along with the following + boolean operations: ``|`` (or), ``&`` (and), and ``~`` (not). + Additionally, the ``'pandas'`` parser allows the use of :keyword:`and`, + :keyword:`or`, and :keyword:`not` with the same semantics as the + corresponding bitwise operators. :class:`~pandas.Series` and + :class:`~pandas.DataFrame` objects are supported and behave as they would + with plain ol' Python evaluation. + + Parameters + ---------- + expr : str + The expression to evaluate. This string cannot contain any Python + `statements + `__, + only Python `expressions + `__. + parser : {'pandas', 'python'}, default 'pandas' + The parser to use to construct the syntax tree from the expression. The + default of ``'pandas'`` parses code slightly different than standard + Python. Alternatively, you can parse an expression using the + ``'python'`` parser to retain strict Python semantics. See the + :ref:`enhancing performance ` documentation for + more details. + engine : {'python', 'numexpr'}, default 'numexpr' + + The engine used to evaluate the expression. Supported engines are + + - None : tries to use ``numexpr``, falls back to ``python`` + - ``'numexpr'`` : This default engine evaluates pandas objects using + numexpr for large speed ups in complex expressions with large frames. + - ``'python'`` : Performs operations as if you had ``eval``'d in top + level python. This engine is generally not that useful. + + More backends may be available in the future. + local_dict : dict or None, optional + A dictionary of local variables, taken from locals() by default. + global_dict : dict or None, optional + A dictionary of global variables, taken from globals() by default. + resolvers : list of dict-like or None, optional + A list of objects implementing the ``__getitem__`` special method that + you can use to inject an additional collection of namespaces to use for + variable lookup. For example, this is used in the + :meth:`~DataFrame.query` method to inject the + ``DataFrame.index`` and ``DataFrame.columns`` + variables that refer to their respective :class:`~pandas.DataFrame` + instance attributes. + level : int, optional + The number of prior stack frames to traverse and add to the current + scope. Most users will **not** need to change this parameter. + target : object, optional, default None + This is the target object for assignment. It is used when there is + variable assignment in the expression. If so, then `target` must + support item assignment with string keys, and if a copy is being + returned, it must also support `.copy()`. + inplace : bool, default False + If `target` is provided, and the expression mutates `target`, whether + to modify `target` inplace. Otherwise, return a copy of `target` with + the mutation. + + Returns + ------- + ndarray, numeric scalar, DataFrame, Series, or None + The completion value of evaluating the given code or None if ``inplace=True``. + + Raises + ------ + ValueError + There are many instances where such an error can be raised: + + - `target=None`, but the expression is multiline. + - The expression is multiline, but not all them have item assignment. + An example of such an arrangement is this: + + a = b + 1 + a + 2 + + Here, there are expressions on different lines, making it multiline, + but the last line has no variable assigned to the output of `a + 2`. + - `inplace=True`, but the expression is missing item assignment. + - Item assignment is provided, but the `target` does not support + string item assignment. + - Item assignment is provided and `inplace=False`, but the `target` + does not support the `.copy()` method + + See Also + -------- + DataFrame.query : Evaluates a boolean expression to query the columns + of a frame. + DataFrame.eval : Evaluate a string describing operations on + DataFrame columns. + + Notes + ----- + The ``dtype`` of any objects involved in an arithmetic ``%`` operation are + recursively cast to ``float64``. + + See the :ref:`enhancing performance ` documentation for + more details. + + Examples + -------- + >>> df = pd.DataFrame({"animal": ["dog", "pig"], "age": [10, 20]}) + >>> df + animal age + 0 dog 10 + 1 pig 20 + + We can add a new column using ``pd.eval``: + + >>> pd.eval("double_age = df.age * 2", target=df) + animal age double_age + 0 dog 10 20 + 1 pig 20 40 + """ + inplace = validate_bool_kwarg(inplace, "inplace") + + exprs: list[str | BinOp] + if isinstance(expr, str): + _check_expression(expr) + exprs = [e.strip() for e in expr.splitlines() if e.strip() != ""] + else: + # ops.BinOp; for internal compat, not intended to be passed by users + exprs = [expr] + multi_line = len(exprs) > 1 + + if multi_line and target is None: + raise ValueError( + "multi-line expressions are only valid in the " + "context of data, use DataFrame.eval" + ) + engine = _check_engine(engine) + _check_parser(parser) + _check_resolvers(resolvers) + + ret = None + first_expr = True + target_modified = False + + for expr in exprs: + expr = _convert_expression(expr) + _check_for_locals(expr, level, parser) + + # get our (possibly passed-in) scope + env = ensure_scope( + level + 1, + global_dict=global_dict, + local_dict=local_dict, + resolvers=resolvers, + target=target, + ) + + parsed_expr = Expr(expr, engine=engine, parser=parser, env=env) + + if engine == "numexpr" and ( + is_extension_array_dtype(parsed_expr.terms.return_type) + or getattr(parsed_expr.terms, "operand_types", None) is not None + and any( + is_extension_array_dtype(elem) + for elem in parsed_expr.terms.operand_types + ) + ): + warnings.warn( + "Engine has switched to 'python' because numexpr does not support " + "extension array dtypes. Please set your engine to python manually.", + RuntimeWarning, + stacklevel=find_stack_level(), + ) + engine = "python" + + # construct the engine and evaluate the parsed expression + eng = ENGINES[engine] + eng_inst = eng(parsed_expr) + ret = eng_inst.evaluate() + + if parsed_expr.assigner is None: + if multi_line: + raise ValueError( + "Multi-line expressions are only valid " + "if all expressions contain an assignment" + ) + if inplace: + raise ValueError("Cannot operate inplace if there is no assignment") + + # assign if needed + assigner = parsed_expr.assigner + if env.target is not None and assigner is not None: + target_modified = True + + # if returning a copy, copy only on the first assignment + if not inplace and first_expr: + try: + target = env.target + if isinstance(target, NDFrame): + target = target.copy(deep=None) + else: + target = target.copy() + except AttributeError as err: + raise ValueError("Cannot return a copy of the target") from err + else: + target = env.target + + # TypeError is most commonly raised (e.g. int, list), but you + # get IndexError if you try to do this assignment on np.ndarray. + # we will ignore numpy warnings here; e.g. if trying + # to use a non-numeric indexer + try: + if inplace and isinstance(target, NDFrame): + target.loc[:, assigner] = ret + else: + target[assigner] = ret # pyright: ignore[reportGeneralTypeIssues] + except (TypeError, IndexError) as err: + raise ValueError("Cannot assign expression output to target") from err + + if not resolvers: + resolvers = ({assigner: ret},) + else: + # existing resolver needs updated to handle + # case of mutating existing column in copy + for resolver in resolvers: + if assigner in resolver: + resolver[assigner] = ret + break + else: + resolvers += ({assigner: ret},) + + ret = None + first_expr = False + + # We want to exclude `inplace=None` as being False. + if inplace is False: + return target if target_modified else ret diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/computation/expr.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/computation/expr.py new file mode 100644 index 0000000000000000000000000000000000000000..d642c37cea1293dc7d6c7e74beda5415c2a2f37a --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/computation/expr.py @@ -0,0 +1,836 @@ +""" +:func:`~pandas.eval` parsers. +""" +from __future__ import annotations + +import ast +from functools import ( + partial, + reduce, +) +from keyword import iskeyword +import tokenize +from typing import ( + Callable, + ClassVar, + TypeVar, +) + +import numpy as np + +from pandas.errors import UndefinedVariableError + +import pandas.core.common as com +from pandas.core.computation.ops import ( + ARITH_OPS_SYMS, + BOOL_OPS_SYMS, + CMP_OPS_SYMS, + LOCAL_TAG, + MATHOPS, + REDUCTIONS, + UNARY_OPS_SYMS, + BinOp, + Constant, + FuncNode, + Op, + Term, + UnaryOp, + is_term, +) +from pandas.core.computation.parsing import ( + clean_backtick_quoted_toks, + tokenize_string, +) +from pandas.core.computation.scope import Scope + +from pandas.io.formats import printing + + +def _rewrite_assign(tok: tuple[int, str]) -> tuple[int, str]: + """ + Rewrite the assignment operator for PyTables expressions that use ``=`` + as a substitute for ``==``. + + Parameters + ---------- + tok : tuple of int, str + ints correspond to the all caps constants in the tokenize module + + Returns + ------- + tuple of int, str + Either the input or token or the replacement values + """ + toknum, tokval = tok + return toknum, "==" if tokval == "=" else tokval + + +def _replace_booleans(tok: tuple[int, str]) -> tuple[int, str]: + """ + Replace ``&`` with ``and`` and ``|`` with ``or`` so that bitwise + precedence is changed to boolean precedence. + + Parameters + ---------- + tok : tuple of int, str + ints correspond to the all caps constants in the tokenize module + + Returns + ------- + tuple of int, str + Either the input or token or the replacement values + """ + toknum, tokval = tok + if toknum == tokenize.OP: + if tokval == "&": + return tokenize.NAME, "and" + elif tokval == "|": + return tokenize.NAME, "or" + return toknum, tokval + return toknum, tokval + + +def _replace_locals(tok: tuple[int, str]) -> tuple[int, str]: + """ + Replace local variables with a syntactically valid name. + + Parameters + ---------- + tok : tuple of int, str + ints correspond to the all caps constants in the tokenize module + + Returns + ------- + tuple of int, str + Either the input or token or the replacement values + + Notes + ----- + This is somewhat of a hack in that we rewrite a string such as ``'@a'`` as + ``'__pd_eval_local_a'`` by telling the tokenizer that ``__pd_eval_local_`` + is a ``tokenize.OP`` and to replace the ``'@'`` symbol with it. + """ + toknum, tokval = tok + if toknum == tokenize.OP and tokval == "@": + return tokenize.OP, LOCAL_TAG + return toknum, tokval + + +def _compose2(f, g): + """ + Compose 2 callables. + """ + return lambda *args, **kwargs: f(g(*args, **kwargs)) + + +def _compose(*funcs): + """ + Compose 2 or more callables. + """ + assert len(funcs) > 1, "At least 2 callables must be passed to compose" + return reduce(_compose2, funcs) + + +def _preparse( + source: str, + f=_compose( + _replace_locals, _replace_booleans, _rewrite_assign, clean_backtick_quoted_toks + ), +) -> str: + """ + Compose a collection of tokenization functions. + + Parameters + ---------- + source : str + A Python source code string + f : callable + This takes a tuple of (toknum, tokval) as its argument and returns a + tuple with the same structure but possibly different elements. Defaults + to the composition of ``_rewrite_assign``, ``_replace_booleans``, and + ``_replace_locals``. + + Returns + ------- + str + Valid Python source code + + Notes + ----- + The `f` parameter can be any callable that takes *and* returns input of the + form ``(toknum, tokval)``, where ``toknum`` is one of the constants from + the ``tokenize`` module and ``tokval`` is a string. + """ + assert callable(f), "f must be callable" + return tokenize.untokenize(f(x) for x in tokenize_string(source)) + + +def _is_type(t): + """ + Factory for a type checking function of type ``t`` or tuple of types. + """ + return lambda x: isinstance(x.value, t) + + +_is_list = _is_type(list) +_is_str = _is_type(str) + + +# partition all AST nodes +_all_nodes = frozenset( + node + for node in (getattr(ast, name) for name in dir(ast)) + if isinstance(node, type) and issubclass(node, ast.AST) +) + + +def _filter_nodes(superclass, all_nodes=_all_nodes): + """ + Filter out AST nodes that are subclasses of ``superclass``. + """ + node_names = (node.__name__ for node in all_nodes if issubclass(node, superclass)) + return frozenset(node_names) + + +_all_node_names = frozenset(x.__name__ for x in _all_nodes) +_mod_nodes = _filter_nodes(ast.mod) +_stmt_nodes = _filter_nodes(ast.stmt) +_expr_nodes = _filter_nodes(ast.expr) +_expr_context_nodes = _filter_nodes(ast.expr_context) +_boolop_nodes = _filter_nodes(ast.boolop) +_operator_nodes = _filter_nodes(ast.operator) +_unary_op_nodes = _filter_nodes(ast.unaryop) +_cmp_op_nodes = _filter_nodes(ast.cmpop) +_comprehension_nodes = _filter_nodes(ast.comprehension) +_handler_nodes = _filter_nodes(ast.excepthandler) +_arguments_nodes = _filter_nodes(ast.arguments) +_keyword_nodes = _filter_nodes(ast.keyword) +_alias_nodes = _filter_nodes(ast.alias) + + +# nodes that we don't support directly but are needed for parsing +_hacked_nodes = frozenset(["Assign", "Module", "Expr"]) + + +_unsupported_expr_nodes = frozenset( + [ + "Yield", + "GeneratorExp", + "IfExp", + "DictComp", + "SetComp", + "Repr", + "Lambda", + "Set", + "AST", + "Is", + "IsNot", + ] +) + +# these nodes are low priority or won't ever be supported (e.g., AST) +_unsupported_nodes = ( + _stmt_nodes + | _mod_nodes + | _handler_nodes + | _arguments_nodes + | _keyword_nodes + | _alias_nodes + | _expr_context_nodes + | _unsupported_expr_nodes +) - _hacked_nodes + +# we're adding a different assignment in some cases to be equality comparison +# and we don't want `stmt` and friends in their so get only the class whose +# names are capitalized +_base_supported_nodes = (_all_node_names - _unsupported_nodes) | _hacked_nodes +intersection = _unsupported_nodes & _base_supported_nodes +_msg = f"cannot both support and not support {intersection}" +assert not intersection, _msg + + +def _node_not_implemented(node_name: str) -> Callable[..., None]: + """ + Return a function that raises a NotImplementedError with a passed node name. + """ + + def f(self, *args, **kwargs): + raise NotImplementedError(f"'{node_name}' nodes are not implemented") + + return f + + +# should be bound by BaseExprVisitor but that creates a circular dependency: +# _T is used in disallow, but disallow is used to define BaseExprVisitor +# https://github.com/microsoft/pyright/issues/2315 +_T = TypeVar("_T") + + +def disallow(nodes: set[str]) -> Callable[[type[_T]], type[_T]]: + """ + Decorator to disallow certain nodes from parsing. Raises a + NotImplementedError instead. + + Returns + ------- + callable + """ + + def disallowed(cls: type[_T]) -> type[_T]: + # error: "Type[_T]" has no attribute "unsupported_nodes" + cls.unsupported_nodes = () # type: ignore[attr-defined] + for node in nodes: + new_method = _node_not_implemented(node) + name = f"visit_{node}" + # error: "Type[_T]" has no attribute "unsupported_nodes" + cls.unsupported_nodes += (name,) # type: ignore[attr-defined] + setattr(cls, name, new_method) + return cls + + return disallowed + + +def _op_maker(op_class, op_symbol): + """ + Return a function to create an op class with its symbol already passed. + + Returns + ------- + callable + """ + + def f(self, node, *args, **kwargs): + """ + Return a partial function with an Op subclass with an operator already passed. + + Returns + ------- + callable + """ + return partial(op_class, op_symbol, *args, **kwargs) + + return f + + +_op_classes = {"binary": BinOp, "unary": UnaryOp} + + +def add_ops(op_classes): + """ + Decorator to add default implementation of ops. + """ + + def f(cls): + for op_attr_name, op_class in op_classes.items(): + ops = getattr(cls, f"{op_attr_name}_ops") + ops_map = getattr(cls, f"{op_attr_name}_op_nodes_map") + for op in ops: + op_node = ops_map[op] + if op_node is not None: + made_op = _op_maker(op_class, op) + setattr(cls, f"visit_{op_node}", made_op) + return cls + + return f + + +@disallow(_unsupported_nodes) +@add_ops(_op_classes) +class BaseExprVisitor(ast.NodeVisitor): + """ + Custom ast walker. Parsers of other engines should subclass this class + if necessary. + + Parameters + ---------- + env : Scope + engine : str + parser : str + preparser : callable + """ + + const_type: ClassVar[type[Term]] = Constant + term_type: ClassVar[type[Term]] = Term + + binary_ops = CMP_OPS_SYMS + BOOL_OPS_SYMS + ARITH_OPS_SYMS + binary_op_nodes = ( + "Gt", + "Lt", + "GtE", + "LtE", + "Eq", + "NotEq", + "In", + "NotIn", + "BitAnd", + "BitOr", + "And", + "Or", + "Add", + "Sub", + "Mult", + "Div", + "Pow", + "FloorDiv", + "Mod", + ) + binary_op_nodes_map = dict(zip(binary_ops, binary_op_nodes)) + + unary_ops = UNARY_OPS_SYMS + unary_op_nodes = "UAdd", "USub", "Invert", "Not" + unary_op_nodes_map = dict(zip(unary_ops, unary_op_nodes)) + + rewrite_map = { + ast.Eq: ast.In, + ast.NotEq: ast.NotIn, + ast.In: ast.In, + ast.NotIn: ast.NotIn, + } + + unsupported_nodes: tuple[str, ...] + + def __init__(self, env, engine, parser, preparser=_preparse) -> None: + self.env = env + self.engine = engine + self.parser = parser + self.preparser = preparser + self.assigner = None + + def visit(self, node, **kwargs): + if isinstance(node, str): + clean = self.preparser(node) + try: + node = ast.fix_missing_locations(ast.parse(clean)) + except SyntaxError as e: + if any(iskeyword(x) for x in clean.split()): + e.msg = "Python keyword not valid identifier in numexpr query" + raise e + + method = f"visit_{type(node).__name__}" + visitor = getattr(self, method) + return visitor(node, **kwargs) + + def visit_Module(self, node, **kwargs): + if len(node.body) != 1: + raise SyntaxError("only a single expression is allowed") + expr = node.body[0] + return self.visit(expr, **kwargs) + + def visit_Expr(self, node, **kwargs): + return self.visit(node.value, **kwargs) + + def _rewrite_membership_op(self, node, left, right): + # the kind of the operator (is actually an instance) + op_instance = node.op + op_type = type(op_instance) + + # must be two terms and the comparison operator must be ==/!=/in/not in + if is_term(left) and is_term(right) and op_type in self.rewrite_map: + left_list, right_list = map(_is_list, (left, right)) + left_str, right_str = map(_is_str, (left, right)) + + # if there are any strings or lists in the expression + if left_list or right_list or left_str or right_str: + op_instance = self.rewrite_map[op_type]() + + # pop the string variable out of locals and replace it with a list + # of one string, kind of a hack + if right_str: + name = self.env.add_tmp([right.value]) + right = self.term_type(name, self.env) + + if left_str: + name = self.env.add_tmp([left.value]) + left = self.term_type(name, self.env) + + op = self.visit(op_instance) + return op, op_instance, left, right + + def _maybe_transform_eq_ne(self, node, left=None, right=None): + if left is None: + left = self.visit(node.left, side="left") + if right is None: + right = self.visit(node.right, side="right") + op, op_class, left, right = self._rewrite_membership_op(node, left, right) + return op, op_class, left, right + + def _maybe_downcast_constants(self, left, right): + f32 = np.dtype(np.float32) + if ( + left.is_scalar + and hasattr(left, "value") + and not right.is_scalar + and right.return_type == f32 + ): + # right is a float32 array, left is a scalar + name = self.env.add_tmp(np.float32(left.value)) + left = self.term_type(name, self.env) + if ( + right.is_scalar + and hasattr(right, "value") + and not left.is_scalar + and left.return_type == f32 + ): + # left is a float32 array, right is a scalar + name = self.env.add_tmp(np.float32(right.value)) + right = self.term_type(name, self.env) + + return left, right + + def _maybe_eval(self, binop, eval_in_python): + # eval `in` and `not in` (for now) in "partial" python space + # things that can be evaluated in "eval" space will be turned into + # temporary variables. for example, + # [1,2] in a + 2 * b + # in that case a + 2 * b will be evaluated using numexpr, and the "in" + # call will be evaluated using isin (in python space) + return binop.evaluate( + self.env, self.engine, self.parser, self.term_type, eval_in_python + ) + + def _maybe_evaluate_binop( + self, + op, + op_class, + lhs, + rhs, + eval_in_python=("in", "not in"), + maybe_eval_in_python=("==", "!=", "<", ">", "<=", ">="), + ): + res = op(lhs, rhs) + + if res.has_invalid_return_type: + raise TypeError( + f"unsupported operand type(s) for {res.op}: " + f"'{lhs.type}' and '{rhs.type}'" + ) + + if self.engine != "pytables" and ( + res.op in CMP_OPS_SYMS + and getattr(lhs, "is_datetime", False) + or getattr(rhs, "is_datetime", False) + ): + # all date ops must be done in python bc numexpr doesn't work + # well with NaT + return self._maybe_eval(res, self.binary_ops) + + if res.op in eval_in_python: + # "in"/"not in" ops are always evaluated in python + return self._maybe_eval(res, eval_in_python) + elif self.engine != "pytables": + if ( + getattr(lhs, "return_type", None) == object + or getattr(rhs, "return_type", None) == object + ): + # evaluate "==" and "!=" in python if either of our operands + # has an object return type + return self._maybe_eval(res, eval_in_python + maybe_eval_in_python) + return res + + def visit_BinOp(self, node, **kwargs): + op, op_class, left, right = self._maybe_transform_eq_ne(node) + left, right = self._maybe_downcast_constants(left, right) + return self._maybe_evaluate_binop(op, op_class, left, right) + + def visit_UnaryOp(self, node, **kwargs): + op = self.visit(node.op) + operand = self.visit(node.operand) + return op(operand) + + def visit_Name(self, node, **kwargs) -> Term: + return self.term_type(node.id, self.env, **kwargs) + + # TODO(py314): deprecated since Python 3.8. Remove after Python 3.14 is min + def visit_NameConstant(self, node, **kwargs) -> Term: + return self.const_type(node.value, self.env) + + # TODO(py314): deprecated since Python 3.8. Remove after Python 3.14 is min + def visit_Num(self, node, **kwargs) -> Term: + return self.const_type(node.value, self.env) + + def visit_Constant(self, node, **kwargs) -> Term: + return self.const_type(node.value, self.env) + + # TODO(py314): deprecated since Python 3.8. Remove after Python 3.14 is min + def visit_Str(self, node, **kwargs) -> Term: + name = self.env.add_tmp(node.s) + return self.term_type(name, self.env) + + def visit_List(self, node, **kwargs) -> Term: + name = self.env.add_tmp([self.visit(e)(self.env) for e in node.elts]) + return self.term_type(name, self.env) + + visit_Tuple = visit_List + + def visit_Index(self, node, **kwargs): + """df.index[4]""" + return self.visit(node.value) + + def visit_Subscript(self, node, **kwargs) -> Term: + from pandas import eval as pd_eval + + value = self.visit(node.value) + slobj = self.visit(node.slice) + result = pd_eval( + slobj, local_dict=self.env, engine=self.engine, parser=self.parser + ) + try: + # a Term instance + v = value.value[result] + except AttributeError: + # an Op instance + lhs = pd_eval( + value, local_dict=self.env, engine=self.engine, parser=self.parser + ) + v = lhs[result] + name = self.env.add_tmp(v) + return self.term_type(name, env=self.env) + + def visit_Slice(self, node, **kwargs) -> slice: + """df.index[slice(4,6)]""" + lower = node.lower + if lower is not None: + lower = self.visit(lower).value + upper = node.upper + if upper is not None: + upper = self.visit(upper).value + step = node.step + if step is not None: + step = self.visit(step).value + + return slice(lower, upper, step) + + def visit_Assign(self, node, **kwargs): + """ + support a single assignment node, like + + c = a + b + + set the assigner at the top level, must be a Name node which + might or might not exist in the resolvers + + """ + if len(node.targets) != 1: + raise SyntaxError("can only assign a single expression") + if not isinstance(node.targets[0], ast.Name): + raise SyntaxError("left hand side of an assignment must be a single name") + if self.env.target is None: + raise ValueError("cannot assign without a target object") + + try: + assigner = self.visit(node.targets[0], **kwargs) + except UndefinedVariableError: + assigner = node.targets[0].id + + self.assigner = getattr(assigner, "name", assigner) + if self.assigner is None: + raise SyntaxError( + "left hand side of an assignment must be a single resolvable name" + ) + + return self.visit(node.value, **kwargs) + + def visit_Attribute(self, node, **kwargs): + attr = node.attr + value = node.value + + ctx = node.ctx + if isinstance(ctx, ast.Load): + # resolve the value + resolved = self.visit(value).value + try: + v = getattr(resolved, attr) + name = self.env.add_tmp(v) + return self.term_type(name, self.env) + except AttributeError: + # something like datetime.datetime where scope is overridden + if isinstance(value, ast.Name) and value.id == attr: + return resolved + raise + + raise ValueError(f"Invalid Attribute context {type(ctx).__name__}") + + def visit_Call(self, node, side=None, **kwargs): + if isinstance(node.func, ast.Attribute) and node.func.attr != "__call__": + res = self.visit_Attribute(node.func) + elif not isinstance(node.func, ast.Name): + raise TypeError("Only named functions are supported") + else: + try: + res = self.visit(node.func) + except UndefinedVariableError: + # Check if this is a supported function name + try: + res = FuncNode(node.func.id) + except ValueError: + # Raise original error + raise + + if res is None: + # error: "expr" has no attribute "id" + raise ValueError( + f"Invalid function call {node.func.id}" # type: ignore[attr-defined] + ) + if hasattr(res, "value"): + res = res.value + + if isinstance(res, FuncNode): + new_args = [self.visit(arg) for arg in node.args] + + if node.keywords: + raise TypeError( + f'Function "{res.name}" does not support keyword arguments' + ) + + return res(*new_args) + + else: + new_args = [self.visit(arg)(self.env) for arg in node.args] + + for key in node.keywords: + if not isinstance(key, ast.keyword): + # error: "expr" has no attribute "id" + raise ValueError( + "keyword error in function call " + f"'{node.func.id}'" # type: ignore[attr-defined] + ) + + if key.arg: + kwargs[key.arg] = self.visit(key.value)(self.env) + + name = self.env.add_tmp(res(*new_args, **kwargs)) + return self.term_type(name=name, env=self.env) + + def translate_In(self, op): + return op + + def visit_Compare(self, node, **kwargs): + ops = node.ops + comps = node.comparators + + # base case: we have something like a CMP b + if len(comps) == 1: + op = self.translate_In(ops[0]) + binop = ast.BinOp(op=op, left=node.left, right=comps[0]) + return self.visit(binop) + + # recursive case: we have a chained comparison, a CMP b CMP c, etc. + left = node.left + values = [] + for op, comp in zip(ops, comps): + new_node = self.visit( + ast.Compare(comparators=[comp], left=left, ops=[self.translate_In(op)]) + ) + left = comp + values.append(new_node) + return self.visit(ast.BoolOp(op=ast.And(), values=values)) + + def _try_visit_binop(self, bop): + if isinstance(bop, (Op, Term)): + return bop + return self.visit(bop) + + def visit_BoolOp(self, node, **kwargs): + def visitor(x, y): + lhs = self._try_visit_binop(x) + rhs = self._try_visit_binop(y) + + op, op_class, lhs, rhs = self._maybe_transform_eq_ne(node, lhs, rhs) + return self._maybe_evaluate_binop(op, node.op, lhs, rhs) + + operands = node.values + return reduce(visitor, operands) + + +_python_not_supported = frozenset(["Dict", "BoolOp", "In", "NotIn"]) +_numexpr_supported_calls = frozenset(REDUCTIONS + MATHOPS) + + +@disallow( + (_unsupported_nodes | _python_not_supported) + - (_boolop_nodes | frozenset(["BoolOp", "Attribute", "In", "NotIn", "Tuple"])) +) +class PandasExprVisitor(BaseExprVisitor): + def __init__( + self, + env, + engine, + parser, + preparser=partial( + _preparse, + f=_compose(_replace_locals, _replace_booleans, clean_backtick_quoted_toks), + ), + ) -> None: + super().__init__(env, engine, parser, preparser) + + +@disallow(_unsupported_nodes | _python_not_supported | frozenset(["Not"])) +class PythonExprVisitor(BaseExprVisitor): + def __init__( + self, env, engine, parser, preparser=lambda source, f=None: source + ) -> None: + super().__init__(env, engine, parser, preparser=preparser) + + +class Expr: + """ + Object encapsulating an expression. + + Parameters + ---------- + expr : str + engine : str, optional, default 'numexpr' + parser : str, optional, default 'pandas' + env : Scope, optional, default None + level : int, optional, default 2 + """ + + env: Scope + engine: str + parser: str + + def __init__( + self, + expr, + engine: str = "numexpr", + parser: str = "pandas", + env: Scope | None = None, + level: int = 0, + ) -> None: + self.expr = expr + self.env = env or Scope(level=level + 1) + self.engine = engine + self.parser = parser + self._visitor = PARSERS[parser](self.env, self.engine, self.parser) + self.terms = self.parse() + + @property + def assigner(self): + return getattr(self._visitor, "assigner", None) + + def __call__(self): + return self.terms(self.env) + + def __repr__(self) -> str: + return printing.pprint_thing(self.terms) + + def __len__(self) -> int: + return len(self.expr) + + def parse(self): + """ + Parse an expression. + """ + return self._visitor.visit(self.expr) + + @property + def names(self): + """ + Get the names in an expression. + """ + if is_term(self.terms): + return frozenset([self.terms.name]) + return frozenset(term.name for term in com.flatten(self.terms)) + + +PARSERS = {"python": PythonExprVisitor, "pandas": PandasExprVisitor} diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/computation/expressions.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/computation/expressions.py new file mode 100644 index 0000000000000000000000000000000000000000..6219cac4aeb16ee019551f95a03af59da44c9d06 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/computation/expressions.py @@ -0,0 +1,286 @@ +""" +Expressions +----------- + +Offer fast expression evaluation through numexpr + +""" +from __future__ import annotations + +import operator +from typing import TYPE_CHECKING +import warnings + +import numpy as np + +from pandas._config import get_option + +from pandas.util._exceptions import find_stack_level + +from pandas.core import roperator +from pandas.core.computation.check import NUMEXPR_INSTALLED + +if NUMEXPR_INSTALLED: + import numexpr as ne + +if TYPE_CHECKING: + from pandas._typing import FuncType + +_TEST_MODE: bool | None = None +_TEST_RESULT: list[bool] = [] +USE_NUMEXPR = NUMEXPR_INSTALLED +_evaluate: FuncType | None = None +_where: FuncType | None = None + +# the set of dtypes that we will allow pass to numexpr +_ALLOWED_DTYPES = { + "evaluate": {"int64", "int32", "float64", "float32", "bool"}, + "where": {"int64", "float64", "bool"}, +} + +# the minimum prod shape that we will use numexpr +_MIN_ELEMENTS = 1_000_000 + + +def set_use_numexpr(v: bool = True) -> None: + # set/unset to use numexpr + global USE_NUMEXPR + if NUMEXPR_INSTALLED: + USE_NUMEXPR = v + + # choose what we are going to do + global _evaluate, _where + + _evaluate = _evaluate_numexpr if USE_NUMEXPR else _evaluate_standard + _where = _where_numexpr if USE_NUMEXPR else _where_standard + + +def set_numexpr_threads(n=None) -> None: + # if we are using numexpr, set the threads to n + # otherwise reset + if NUMEXPR_INSTALLED and USE_NUMEXPR: + if n is None: + n = ne.detect_number_of_cores() + ne.set_num_threads(n) + + +def _evaluate_standard(op, op_str, a, b): + """ + Standard evaluation. + """ + if _TEST_MODE: + _store_test_result(False) + return op(a, b) + + +def _can_use_numexpr(op, op_str, a, b, dtype_check) -> bool: + """return a boolean if we WILL be using numexpr""" + if op_str is not None: + # required min elements (otherwise we are adding overhead) + if a.size > _MIN_ELEMENTS: + # check for dtype compatibility + dtypes: set[str] = set() + for o in [a, b]: + # ndarray and Series Case + if hasattr(o, "dtype"): + dtypes |= {o.dtype.name} + + # allowed are a superset + if not len(dtypes) or _ALLOWED_DTYPES[dtype_check] >= dtypes: + return True + + return False + + +def _evaluate_numexpr(op, op_str, a, b): + result = None + + if _can_use_numexpr(op, op_str, a, b, "evaluate"): + is_reversed = op.__name__.strip("_").startswith("r") + if is_reversed: + # we were originally called by a reversed op method + a, b = b, a + + a_value = a + b_value = b + + try: + result = ne.evaluate( + f"a_value {op_str} b_value", + local_dict={"a_value": a_value, "b_value": b_value}, + casting="safe", + ) + except TypeError: + # numexpr raises eg for array ** array with integers + # (https://github.com/pydata/numexpr/issues/379) + pass + except NotImplementedError: + if _bool_arith_fallback(op_str, a, b): + pass + else: + raise + + if is_reversed: + # reverse order to original for fallback + a, b = b, a + + if _TEST_MODE: + _store_test_result(result is not None) + + if result is None: + result = _evaluate_standard(op, op_str, a, b) + + return result + + +_op_str_mapping = { + operator.add: "+", + roperator.radd: "+", + operator.mul: "*", + roperator.rmul: "*", + operator.sub: "-", + roperator.rsub: "-", + operator.truediv: "/", + roperator.rtruediv: "/", + # floordiv not supported by numexpr 2.x + operator.floordiv: None, + roperator.rfloordiv: None, + # we require Python semantics for mod of negative for backwards compatibility + # see https://github.com/pydata/numexpr/issues/365 + # so sticking with unaccelerated for now GH#36552 + operator.mod: None, + roperator.rmod: None, + operator.pow: "**", + roperator.rpow: "**", + operator.eq: "==", + operator.ne: "!=", + operator.le: "<=", + operator.lt: "<", + operator.ge: ">=", + operator.gt: ">", + operator.and_: "&", + roperator.rand_: "&", + operator.or_: "|", + roperator.ror_: "|", + operator.xor: "^", + roperator.rxor: "^", + divmod: None, + roperator.rdivmod: None, +} + + +def _where_standard(cond, a, b): + # Caller is responsible for extracting ndarray if necessary + return np.where(cond, a, b) + + +def _where_numexpr(cond, a, b): + # Caller is responsible for extracting ndarray if necessary + result = None + + if _can_use_numexpr(None, "where", a, b, "where"): + result = ne.evaluate( + "where(cond_value, a_value, b_value)", + local_dict={"cond_value": cond, "a_value": a, "b_value": b}, + casting="safe", + ) + + if result is None: + result = _where_standard(cond, a, b) + + return result + + +# turn myself on +set_use_numexpr(get_option("compute.use_numexpr")) + + +def _has_bool_dtype(x): + try: + return x.dtype == bool + except AttributeError: + return isinstance(x, (bool, np.bool_)) + + +_BOOL_OP_UNSUPPORTED = {"+": "|", "*": "&", "-": "^"} + + +def _bool_arith_fallback(op_str, a, b) -> bool: + """ + Check if we should fallback to the python `_evaluate_standard` in case + of an unsupported operation by numexpr, which is the case for some + boolean ops. + """ + if _has_bool_dtype(a) and _has_bool_dtype(b): + if op_str in _BOOL_OP_UNSUPPORTED: + warnings.warn( + f"evaluating in Python space because the {repr(op_str)} " + "operator is not supported by numexpr for the bool dtype, " + f"use {repr(_BOOL_OP_UNSUPPORTED[op_str])} instead.", + stacklevel=find_stack_level(), + ) + return True + return False + + +def evaluate(op, a, b, use_numexpr: bool = True): + """ + Evaluate and return the expression of the op on a and b. + + Parameters + ---------- + op : the actual operand + a : left operand + b : right operand + use_numexpr : bool, default True + Whether to try to use numexpr. + """ + op_str = _op_str_mapping[op] + if op_str is not None: + if use_numexpr: + # error: "None" not callable + return _evaluate(op, op_str, a, b) # type: ignore[misc] + return _evaluate_standard(op, op_str, a, b) + + +def where(cond, a, b, use_numexpr: bool = True): + """ + Evaluate the where condition cond on a and b. + + Parameters + ---------- + cond : np.ndarray[bool] + a : return if cond is True + b : return if cond is False + use_numexpr : bool, default True + Whether to try to use numexpr. + """ + assert _where is not None + return _where(cond, a, b) if use_numexpr else _where_standard(cond, a, b) + + +def set_test_mode(v: bool = True) -> None: + """ + Keeps track of whether numexpr was used. + + Stores an additional ``True`` for every successful use of evaluate with + numexpr since the last ``get_test_result``. + """ + global _TEST_MODE, _TEST_RESULT + _TEST_MODE = v + _TEST_RESULT = [] + + +def _store_test_result(used_numexpr: bool) -> None: + if used_numexpr: + _TEST_RESULT.append(used_numexpr) + + +def get_test_result() -> list[bool]: + """ + Get test result and reset test_results. + """ + global _TEST_RESULT + res = _TEST_RESULT + _TEST_RESULT = [] + return res diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/computation/ops.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/computation/ops.py new file mode 100644 index 0000000000000000000000000000000000000000..d8265456dfcedb5eb398cbf0517d78a1d976cd2a --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/computation/ops.py @@ -0,0 +1,572 @@ +""" +Operator classes for eval. +""" + +from __future__ import annotations + +from datetime import datetime +from functools import partial +import operator +from typing import ( + TYPE_CHECKING, + Callable, + Literal, +) + +import numpy as np + +from pandas._libs.tslibs import Timestamp + +from pandas.core.dtypes.common import ( + is_list_like, + is_scalar, +) + +import pandas.core.common as com +from pandas.core.computation.common import ( + ensure_decoded, + result_type_many, +) +from pandas.core.computation.scope import DEFAULT_GLOBALS + +from pandas.io.formats.printing import ( + pprint_thing, + pprint_thing_encoded, +) + +if TYPE_CHECKING: + from collections.abc import ( + Iterable, + Iterator, + ) + +REDUCTIONS = ("sum", "prod", "min", "max") + +_unary_math_ops = ( + "sin", + "cos", + "exp", + "log", + "expm1", + "log1p", + "sqrt", + "sinh", + "cosh", + "tanh", + "arcsin", + "arccos", + "arctan", + "arccosh", + "arcsinh", + "arctanh", + "abs", + "log10", + "floor", + "ceil", +) +_binary_math_ops = ("arctan2",) + +MATHOPS = _unary_math_ops + _binary_math_ops + + +LOCAL_TAG = "__pd_eval_local_" + + +class Term: + def __new__(cls, name, env, side=None, encoding=None): + klass = Constant if not isinstance(name, str) else cls + # error: Argument 2 for "super" not an instance of argument 1 + supr_new = super(Term, klass).__new__ # type: ignore[misc] + return supr_new(klass) + + is_local: bool + + def __init__(self, name, env, side=None, encoding=None) -> None: + # name is a str for Term, but may be something else for subclasses + self._name = name + self.env = env + self.side = side + tname = str(name) + self.is_local = tname.startswith(LOCAL_TAG) or tname in DEFAULT_GLOBALS + self._value = self._resolve_name() + self.encoding = encoding + + @property + def local_name(self) -> str: + return self.name.replace(LOCAL_TAG, "") + + def __repr__(self) -> str: + return pprint_thing(self.name) + + def __call__(self, *args, **kwargs): + return self.value + + def evaluate(self, *args, **kwargs) -> Term: + return self + + def _resolve_name(self): + local_name = str(self.local_name) + is_local = self.is_local + if local_name in self.env.scope and isinstance( + self.env.scope[local_name], type + ): + is_local = False + + res = self.env.resolve(local_name, is_local=is_local) + self.update(res) + + if hasattr(res, "ndim") and res.ndim > 2: + raise NotImplementedError( + "N-dimensional objects, where N > 2, are not supported with eval" + ) + return res + + def update(self, value) -> None: + """ + search order for local (i.e., @variable) variables: + + scope, key_variable + [('locals', 'local_name'), + ('globals', 'local_name'), + ('locals', 'key'), + ('globals', 'key')] + """ + key = self.name + + # if it's a variable name (otherwise a constant) + if isinstance(key, str): + self.env.swapkey(self.local_name, key, new_value=value) + + self.value = value + + @property + def is_scalar(self) -> bool: + return is_scalar(self._value) + + @property + def type(self): + try: + # potentially very slow for large, mixed dtype frames + return self._value.values.dtype + except AttributeError: + try: + # ndarray + return self._value.dtype + except AttributeError: + # scalar + return type(self._value) + + return_type = type + + @property + def raw(self) -> str: + return f"{type(self).__name__}(name={repr(self.name)}, type={self.type})" + + @property + def is_datetime(self) -> bool: + try: + t = self.type.type + except AttributeError: + t = self.type + + return issubclass(t, (datetime, np.datetime64)) + + @property + def value(self): + return self._value + + @value.setter + def value(self, new_value) -> None: + self._value = new_value + + @property + def name(self): + return self._name + + @property + def ndim(self) -> int: + return self._value.ndim + + +class Constant(Term): + def _resolve_name(self): + return self._name + + @property + def name(self): + return self.value + + def __repr__(self) -> str: + # in python 2 str() of float + # can truncate shorter than repr() + return repr(self.name) + + +_bool_op_map = {"not": "~", "and": "&", "or": "|"} + + +class Op: + """ + Hold an operator of arbitrary arity. + """ + + op: str + + def __init__(self, op: str, operands: Iterable[Term | Op], encoding=None) -> None: + self.op = _bool_op_map.get(op, op) + self.operands = operands + self.encoding = encoding + + def __iter__(self) -> Iterator: + return iter(self.operands) + + def __repr__(self) -> str: + """ + Print a generic n-ary operator and its operands using infix notation. + """ + # recurse over the operands + parened = (f"({pprint_thing(opr)})" for opr in self.operands) + return pprint_thing(f" {self.op} ".join(parened)) + + @property + def return_type(self): + # clobber types to bool if the op is a boolean operator + if self.op in (CMP_OPS_SYMS + BOOL_OPS_SYMS): + return np.bool_ + return result_type_many(*(term.type for term in com.flatten(self))) + + @property + def has_invalid_return_type(self) -> bool: + types = self.operand_types + obj_dtype_set = frozenset([np.dtype("object")]) + return self.return_type == object and types - obj_dtype_set + + @property + def operand_types(self): + return frozenset(term.type for term in com.flatten(self)) + + @property + def is_scalar(self) -> bool: + return all(operand.is_scalar for operand in self.operands) + + @property + def is_datetime(self) -> bool: + try: + t = self.return_type.type + except AttributeError: + t = self.return_type + + return issubclass(t, (datetime, np.datetime64)) + + +def _in(x, y): + """ + Compute the vectorized membership of ``x in y`` if possible, otherwise + use Python. + """ + try: + return x.isin(y) + except AttributeError: + if is_list_like(x): + try: + return y.isin(x) + except AttributeError: + pass + return x in y + + +def _not_in(x, y): + """ + Compute the vectorized membership of ``x not in y`` if possible, + otherwise use Python. + """ + try: + return ~x.isin(y) + except AttributeError: + if is_list_like(x): + try: + return ~y.isin(x) + except AttributeError: + pass + return x not in y + + +CMP_OPS_SYMS = (">", "<", ">=", "<=", "==", "!=", "in", "not in") +_cmp_ops_funcs = ( + operator.gt, + operator.lt, + operator.ge, + operator.le, + operator.eq, + operator.ne, + _in, + _not_in, +) +_cmp_ops_dict = dict(zip(CMP_OPS_SYMS, _cmp_ops_funcs)) + +BOOL_OPS_SYMS = ("&", "|", "and", "or") +_bool_ops_funcs = (operator.and_, operator.or_, operator.and_, operator.or_) +_bool_ops_dict = dict(zip(BOOL_OPS_SYMS, _bool_ops_funcs)) + +ARITH_OPS_SYMS = ("+", "-", "*", "/", "**", "//", "%") +_arith_ops_funcs = ( + operator.add, + operator.sub, + operator.mul, + operator.truediv, + operator.pow, + operator.floordiv, + operator.mod, +) +_arith_ops_dict = dict(zip(ARITH_OPS_SYMS, _arith_ops_funcs)) + +SPECIAL_CASE_ARITH_OPS_SYMS = ("**", "//", "%") +_special_case_arith_ops_funcs = (operator.pow, operator.floordiv, operator.mod) +_special_case_arith_ops_dict = dict( + zip(SPECIAL_CASE_ARITH_OPS_SYMS, _special_case_arith_ops_funcs) +) + +_binary_ops_dict = {} + +for d in (_cmp_ops_dict, _bool_ops_dict, _arith_ops_dict): + _binary_ops_dict.update(d) + + +def is_term(obj) -> bool: + return isinstance(obj, Term) + + +class BinOp(Op): + """ + Hold a binary operator and its operands. + + Parameters + ---------- + op : str + lhs : Term or Op + rhs : Term or Op + """ + + def __init__(self, op: str, lhs, rhs) -> None: + super().__init__(op, (lhs, rhs)) + self.lhs = lhs + self.rhs = rhs + + self._disallow_scalar_only_bool_ops() + + self.convert_values() + + try: + self.func = _binary_ops_dict[op] + except KeyError as err: + # has to be made a list for python3 + keys = list(_binary_ops_dict.keys()) + raise ValueError( + f"Invalid binary operator {repr(op)}, valid operators are {keys}" + ) from err + + def __call__(self, env): + """ + Recursively evaluate an expression in Python space. + + Parameters + ---------- + env : Scope + + Returns + ------- + object + The result of an evaluated expression. + """ + # recurse over the left/right nodes + left = self.lhs(env) + right = self.rhs(env) + + return self.func(left, right) + + def evaluate(self, env, engine: str, parser, term_type, eval_in_python): + """ + Evaluate a binary operation *before* being passed to the engine. + + Parameters + ---------- + env : Scope + engine : str + parser : str + term_type : type + eval_in_python : list + + Returns + ------- + term_type + The "pre-evaluated" expression as an instance of ``term_type`` + """ + if engine == "python": + res = self(env) + else: + # recurse over the left/right nodes + + left = self.lhs.evaluate( + env, + engine=engine, + parser=parser, + term_type=term_type, + eval_in_python=eval_in_python, + ) + + right = self.rhs.evaluate( + env, + engine=engine, + parser=parser, + term_type=term_type, + eval_in_python=eval_in_python, + ) + + # base cases + if self.op in eval_in_python: + res = self.func(left.value, right.value) + else: + from pandas.core.computation.eval import eval + + res = eval(self, local_dict=env, engine=engine, parser=parser) + + name = env.add_tmp(res) + return term_type(name, env=env) + + def convert_values(self) -> None: + """ + Convert datetimes to a comparable value in an expression. + """ + + def stringify(value): + encoder: Callable + if self.encoding is not None: + encoder = partial(pprint_thing_encoded, encoding=self.encoding) + else: + encoder = pprint_thing + return encoder(value) + + lhs, rhs = self.lhs, self.rhs + + if is_term(lhs) and lhs.is_datetime and is_term(rhs) and rhs.is_scalar: + v = rhs.value + if isinstance(v, (int, float)): + v = stringify(v) + v = Timestamp(ensure_decoded(v)) + if v.tz is not None: + v = v.tz_convert("UTC") + self.rhs.update(v) + + if is_term(rhs) and rhs.is_datetime and is_term(lhs) and lhs.is_scalar: + v = lhs.value + if isinstance(v, (int, float)): + v = stringify(v) + v = Timestamp(ensure_decoded(v)) + if v.tz is not None: + v = v.tz_convert("UTC") + self.lhs.update(v) + + def _disallow_scalar_only_bool_ops(self): + rhs = self.rhs + lhs = self.lhs + + # GH#24883 unwrap dtype if necessary to ensure we have a type object + rhs_rt = rhs.return_type + rhs_rt = getattr(rhs_rt, "type", rhs_rt) + lhs_rt = lhs.return_type + lhs_rt = getattr(lhs_rt, "type", lhs_rt) + if ( + (lhs.is_scalar or rhs.is_scalar) + and self.op in _bool_ops_dict + and ( + not ( + issubclass(rhs_rt, (bool, np.bool_)) + and issubclass(lhs_rt, (bool, np.bool_)) + ) + ) + ): + raise NotImplementedError("cannot evaluate scalar only bool ops") + + +def isnumeric(dtype) -> bool: + return issubclass(np.dtype(dtype).type, np.number) + + +UNARY_OPS_SYMS = ("+", "-", "~", "not") +_unary_ops_funcs = (operator.pos, operator.neg, operator.invert, operator.invert) +_unary_ops_dict = dict(zip(UNARY_OPS_SYMS, _unary_ops_funcs)) + + +class UnaryOp(Op): + """ + Hold a unary operator and its operands. + + Parameters + ---------- + op : str + The token used to represent the operator. + operand : Term or Op + The Term or Op operand to the operator. + + Raises + ------ + ValueError + * If no function associated with the passed operator token is found. + """ + + def __init__(self, op: Literal["+", "-", "~", "not"], operand) -> None: + super().__init__(op, (operand,)) + self.operand = operand + + try: + self.func = _unary_ops_dict[op] + except KeyError as err: + raise ValueError( + f"Invalid unary operator {repr(op)}, " + f"valid operators are {UNARY_OPS_SYMS}" + ) from err + + def __call__(self, env) -> MathCall: + operand = self.operand(env) + # error: Cannot call function of unknown type + return self.func(operand) # type: ignore[operator] + + def __repr__(self) -> str: + return pprint_thing(f"{self.op}({self.operand})") + + @property + def return_type(self) -> np.dtype: + operand = self.operand + if operand.return_type == np.dtype("bool"): + return np.dtype("bool") + if isinstance(operand, Op) and ( + operand.op in _cmp_ops_dict or operand.op in _bool_ops_dict + ): + return np.dtype("bool") + return np.dtype("int") + + +class MathCall(Op): + def __init__(self, func, args) -> None: + super().__init__(func.name, args) + self.func = func + + def __call__(self, env): + # error: "Op" not callable + operands = [op(env) for op in self.operands] # type: ignore[operator] + return self.func.func(*operands) + + def __repr__(self) -> str: + operands = map(str, self.operands) + return pprint_thing(f"{self.op}({','.join(operands)})") + + +class FuncNode: + def __init__(self, name: str) -> None: + if name not in MATHOPS: + raise ValueError(f'"{name}" is not a supported function') + self.name = name + self.func = getattr(np, name) + + def __call__(self, *args) -> MathCall: + return MathCall(self, args) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/computation/parsing.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/computation/parsing.py new file mode 100644 index 0000000000000000000000000000000000000000..4cfa0f2baffd5ed45db19242c2afd00b6e5e23dc --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/computation/parsing.py @@ -0,0 +1,198 @@ +""" +:func:`~pandas.eval` source string parsing functions +""" +from __future__ import annotations + +from io import StringIO +from keyword import iskeyword +import token +import tokenize +from typing import TYPE_CHECKING + +if TYPE_CHECKING: + from collections.abc import ( + Hashable, + Iterator, + ) + +# A token value Python's tokenizer probably will never use. +BACKTICK_QUOTED_STRING = 100 + + +def create_valid_python_identifier(name: str) -> str: + """ + Create valid Python identifiers from any string. + + Check if name contains any special characters. If it contains any + special characters, the special characters will be replaced by + a special string and a prefix is added. + + Raises + ------ + SyntaxError + If the returned name is not a Python valid identifier, raise an exception. + This can happen if there is a hashtag in the name, as the tokenizer will + than terminate and not find the backtick. + But also for characters that fall out of the range of (U+0001..U+007F). + """ + if name.isidentifier() and not iskeyword(name): + return name + + # Create a dict with the special characters and their replacement string. + # EXACT_TOKEN_TYPES contains these special characters + # token.tok_name contains a readable description of the replacement string. + special_characters_replacements = { + char: f"_{token.tok_name[tokval]}_" + for char, tokval in (tokenize.EXACT_TOKEN_TYPES.items()) + } + special_characters_replacements.update( + { + " ": "_", + "?": "_QUESTIONMARK_", + "!": "_EXCLAMATIONMARK_", + "$": "_DOLLARSIGN_", + "€": "_EUROSIGN_", + "°": "_DEGREESIGN_", + # Including quotes works, but there are exceptions. + "'": "_SINGLEQUOTE_", + '"': "_DOUBLEQUOTE_", + # Currently not possible. Terminates parser and won't find backtick. + # "#": "_HASH_", + } + ) + + name = "".join([special_characters_replacements.get(char, char) for char in name]) + name = f"BACKTICK_QUOTED_STRING_{name}" + + if not name.isidentifier(): + raise SyntaxError(f"Could not convert '{name}' to a valid Python identifier.") + + return name + + +def clean_backtick_quoted_toks(tok: tuple[int, str]) -> tuple[int, str]: + """ + Clean up a column name if surrounded by backticks. + + Backtick quoted string are indicated by a certain tokval value. If a string + is a backtick quoted token it will processed by + :func:`_create_valid_python_identifier` so that the parser can find this + string when the query is executed. + In this case the tok will get the NAME tokval. + + Parameters + ---------- + tok : tuple of int, str + ints correspond to the all caps constants in the tokenize module + + Returns + ------- + tok : Tuple[int, str] + Either the input or token or the replacement values + """ + toknum, tokval = tok + if toknum == BACKTICK_QUOTED_STRING: + return tokenize.NAME, create_valid_python_identifier(tokval) + return toknum, tokval + + +def clean_column_name(name: Hashable) -> Hashable: + """ + Function to emulate the cleaning of a backtick quoted name. + + The purpose for this function is to see what happens to the name of + identifier if it goes to the process of being parsed a Python code + inside a backtick quoted string and than being cleaned + (removed of any special characters). + + Parameters + ---------- + name : hashable + Name to be cleaned. + + Returns + ------- + name : hashable + Returns the name after tokenizing and cleaning. + + Notes + ----- + For some cases, a name cannot be converted to a valid Python identifier. + In that case :func:`tokenize_string` raises a SyntaxError. + In that case, we just return the name unmodified. + + If this name was used in the query string (this makes the query call impossible) + an error will be raised by :func:`tokenize_backtick_quoted_string` instead, + which is not caught and propagates to the user level. + """ + try: + tokenized = tokenize_string(f"`{name}`") + tokval = next(tokenized)[1] + return create_valid_python_identifier(tokval) + except SyntaxError: + return name + + +def tokenize_backtick_quoted_string( + token_generator: Iterator[tokenize.TokenInfo], source: str, string_start: int +) -> tuple[int, str]: + """ + Creates a token from a backtick quoted string. + + Moves the token_generator forwards till right after the next backtick. + + Parameters + ---------- + token_generator : Iterator[tokenize.TokenInfo] + The generator that yields the tokens of the source string (Tuple[int, str]). + The generator is at the first token after the backtick (`) + + source : str + The Python source code string. + + string_start : int + This is the start of backtick quoted string inside the source string. + + Returns + ------- + tok: Tuple[int, str] + The token that represents the backtick quoted string. + The integer is equal to BACKTICK_QUOTED_STRING (100). + """ + for _, tokval, start, _, _ in token_generator: + if tokval == "`": + string_end = start[1] + break + + return BACKTICK_QUOTED_STRING, source[string_start:string_end] + + +def tokenize_string(source: str) -> Iterator[tuple[int, str]]: + """ + Tokenize a Python source code string. + + Parameters + ---------- + source : str + The Python source code string. + + Returns + ------- + tok_generator : Iterator[Tuple[int, str]] + An iterator yielding all tokens with only toknum and tokval (Tuple[ing, str]). + """ + line_reader = StringIO(source).readline + token_generator = tokenize.generate_tokens(line_reader) + + # Loop over all tokens till a backtick (`) is found. + # Then, take all tokens till the next backtick to form a backtick quoted string + for toknum, tokval, start, _, _ in token_generator: + if tokval == "`": + try: + yield tokenize_backtick_quoted_string( + token_generator, source, string_start=start[1] + 1 + ) + except Exception as err: + raise SyntaxError(f"Failed to parse backticks in '{source}'.") from err + else: + yield toknum, tokval diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/computation/pytables.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/computation/pytables.py new file mode 100644 index 0000000000000000000000000000000000000000..04a8ad7ef0be6b044baf65b80cbf4161d45f8cac --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/computation/pytables.py @@ -0,0 +1,666 @@ +""" manage PyTables query interface via Expressions """ +from __future__ import annotations + +import ast +from decimal import ( + Decimal, + InvalidOperation, +) +from functools import partial +from typing import ( + TYPE_CHECKING, + Any, + ClassVar, +) + +import numpy as np + +from pandas._libs.tslibs import ( + Timedelta, + Timestamp, +) +from pandas.errors import UndefinedVariableError + +from pandas.core.dtypes.common import is_list_like + +import pandas.core.common as com +from pandas.core.computation import ( + expr, + ops, + scope as _scope, +) +from pandas.core.computation.common import ensure_decoded +from pandas.core.computation.expr import BaseExprVisitor +from pandas.core.computation.ops import is_term +from pandas.core.construction import extract_array +from pandas.core.indexes.base import Index + +from pandas.io.formats.printing import ( + pprint_thing, + pprint_thing_encoded, +) + +if TYPE_CHECKING: + from pandas._typing import ( + Self, + npt, + ) + + +class PyTablesScope(_scope.Scope): + __slots__ = ("queryables",) + + queryables: dict[str, Any] + + def __init__( + self, + level: int, + global_dict=None, + local_dict=None, + queryables: dict[str, Any] | None = None, + ) -> None: + super().__init__(level + 1, global_dict=global_dict, local_dict=local_dict) + self.queryables = queryables or {} + + +class Term(ops.Term): + env: PyTablesScope + + def __new__(cls, name, env, side=None, encoding=None): + if isinstance(name, str): + klass = cls + else: + klass = Constant + return object.__new__(klass) + + def __init__(self, name, env: PyTablesScope, side=None, encoding=None) -> None: + super().__init__(name, env, side=side, encoding=encoding) + + def _resolve_name(self): + # must be a queryables + if self.side == "left": + # Note: The behavior of __new__ ensures that self.name is a str here + if self.name not in self.env.queryables: + raise NameError(f"name {repr(self.name)} is not defined") + return self.name + + # resolve the rhs (and allow it to be None) + try: + return self.env.resolve(self.name, is_local=False) + except UndefinedVariableError: + return self.name + + # read-only property overwriting read/write property + @property # type: ignore[misc] + def value(self): + return self._value + + +class Constant(Term): + def __init__(self, name, env: PyTablesScope, side=None, encoding=None) -> None: + assert isinstance(env, PyTablesScope), type(env) + super().__init__(name, env, side=side, encoding=encoding) + + def _resolve_name(self): + return self._name + + +class BinOp(ops.BinOp): + _max_selectors = 31 + + op: str + queryables: dict[str, Any] + condition: str | None + + def __init__(self, op: str, lhs, rhs, queryables: dict[str, Any], encoding) -> None: + super().__init__(op, lhs, rhs) + self.queryables = queryables + self.encoding = encoding + self.condition = None + + def _disallow_scalar_only_bool_ops(self) -> None: + pass + + def prune(self, klass): + def pr(left, right): + """create and return a new specialized BinOp from myself""" + if left is None: + return right + elif right is None: + return left + + k = klass + if isinstance(left, ConditionBinOp): + if isinstance(right, ConditionBinOp): + k = JointConditionBinOp + elif isinstance(left, k): + return left + elif isinstance(right, k): + return right + + elif isinstance(left, FilterBinOp): + if isinstance(right, FilterBinOp): + k = JointFilterBinOp + elif isinstance(left, k): + return left + elif isinstance(right, k): + return right + + return k( + self.op, left, right, queryables=self.queryables, encoding=self.encoding + ).evaluate() + + left, right = self.lhs, self.rhs + + if is_term(left) and is_term(right): + res = pr(left.value, right.value) + elif not is_term(left) and is_term(right): + res = pr(left.prune(klass), right.value) + elif is_term(left) and not is_term(right): + res = pr(left.value, right.prune(klass)) + elif not (is_term(left) or is_term(right)): + res = pr(left.prune(klass), right.prune(klass)) + + return res + + def conform(self, rhs): + """inplace conform rhs""" + if not is_list_like(rhs): + rhs = [rhs] + if isinstance(rhs, np.ndarray): + rhs = rhs.ravel() + return rhs + + @property + def is_valid(self) -> bool: + """return True if this is a valid field""" + return self.lhs in self.queryables + + @property + def is_in_table(self) -> bool: + """ + return True if this is a valid column name for generation (e.g. an + actual column in the table) + """ + return self.queryables.get(self.lhs) is not None + + @property + def kind(self): + """the kind of my field""" + return getattr(self.queryables.get(self.lhs), "kind", None) + + @property + def meta(self): + """the meta of my field""" + return getattr(self.queryables.get(self.lhs), "meta", None) + + @property + def metadata(self): + """the metadata of my field""" + return getattr(self.queryables.get(self.lhs), "metadata", None) + + def generate(self, v) -> str: + """create and return the op string for this TermValue""" + val = v.tostring(self.encoding) + return f"({self.lhs} {self.op} {val})" + + def convert_value(self, v) -> TermValue: + """ + convert the expression that is in the term to something that is + accepted by pytables + """ + + def stringify(value): + if self.encoding is not None: + return pprint_thing_encoded(value, encoding=self.encoding) + return pprint_thing(value) + + kind = ensure_decoded(self.kind) + meta = ensure_decoded(self.meta) + if kind == "datetime" or (kind and kind.startswith("datetime64")): + if isinstance(v, (int, float)): + v = stringify(v) + v = ensure_decoded(v) + v = Timestamp(v).as_unit("ns") + if v.tz is not None: + v = v.tz_convert("UTC") + return TermValue(v, v._value, kind) + elif kind in ("timedelta64", "timedelta"): + if isinstance(v, str): + v = Timedelta(v) + else: + v = Timedelta(v, unit="s") + v = v.as_unit("ns")._value + return TermValue(int(v), v, kind) + elif meta == "category": + metadata = extract_array(self.metadata, extract_numpy=True) + result: npt.NDArray[np.intp] | np.intp | int + if v not in metadata: + result = -1 + else: + result = metadata.searchsorted(v, side="left") + return TermValue(result, result, "integer") + elif kind == "integer": + try: + v_dec = Decimal(v) + except InvalidOperation: + # GH 54186 + # convert v to float to raise float's ValueError + float(v) + else: + v = int(v_dec.to_integral_exact(rounding="ROUND_HALF_EVEN")) + return TermValue(v, v, kind) + elif kind == "float": + v = float(v) + return TermValue(v, v, kind) + elif kind == "bool": + if isinstance(v, str): + v = v.strip().lower() not in [ + "false", + "f", + "no", + "n", + "none", + "0", + "[]", + "{}", + "", + ] + else: + v = bool(v) + return TermValue(v, v, kind) + elif isinstance(v, str): + # string quoting + return TermValue(v, stringify(v), "string") + else: + raise TypeError(f"Cannot compare {v} of type {type(v)} to {kind} column") + + def convert_values(self) -> None: + pass + + +class FilterBinOp(BinOp): + filter: tuple[Any, Any, Index] | None = None + + def __repr__(self) -> str: + if self.filter is None: + return "Filter: Not Initialized" + return pprint_thing(f"[Filter : [{self.filter[0]}] -> [{self.filter[1]}]") + + def invert(self) -> Self: + """invert the filter""" + if self.filter is not None: + self.filter = ( + self.filter[0], + self.generate_filter_op(invert=True), + self.filter[2], + ) + return self + + def format(self): + """return the actual filter format""" + return [self.filter] + + # error: Signature of "evaluate" incompatible with supertype "BinOp" + def evaluate(self) -> Self | None: # type: ignore[override] + if not self.is_valid: + raise ValueError(f"query term is not valid [{self}]") + + rhs = self.conform(self.rhs) + values = list(rhs) + + if self.is_in_table: + # if too many values to create the expression, use a filter instead + if self.op in ["==", "!="] and len(values) > self._max_selectors: + filter_op = self.generate_filter_op() + self.filter = (self.lhs, filter_op, Index(values)) + + return self + return None + + # equality conditions + if self.op in ["==", "!="]: + filter_op = self.generate_filter_op() + self.filter = (self.lhs, filter_op, Index(values)) + + else: + raise TypeError( + f"passing a filterable condition to a non-table indexer [{self}]" + ) + + return self + + def generate_filter_op(self, invert: bool = False): + if (self.op == "!=" and not invert) or (self.op == "==" and invert): + return lambda axis, vals: ~axis.isin(vals) + else: + return lambda axis, vals: axis.isin(vals) + + +class JointFilterBinOp(FilterBinOp): + def format(self): + raise NotImplementedError("unable to collapse Joint Filters") + + # error: Signature of "evaluate" incompatible with supertype "BinOp" + def evaluate(self) -> Self: # type: ignore[override] + return self + + +class ConditionBinOp(BinOp): + def __repr__(self) -> str: + return pprint_thing(f"[Condition : [{self.condition}]]") + + def invert(self): + """invert the condition""" + # if self.condition is not None: + # self.condition = "~(%s)" % self.condition + # return self + raise NotImplementedError( + "cannot use an invert condition when passing to numexpr" + ) + + def format(self): + """return the actual ne format""" + return self.condition + + # error: Signature of "evaluate" incompatible with supertype "BinOp" + def evaluate(self) -> Self | None: # type: ignore[override] + if not self.is_valid: + raise ValueError(f"query term is not valid [{self}]") + + # convert values if we are in the table + if not self.is_in_table: + return None + + rhs = self.conform(self.rhs) + values = [self.convert_value(v) for v in rhs] + + # equality conditions + if self.op in ["==", "!="]: + # too many values to create the expression? + if len(values) <= self._max_selectors: + vs = [self.generate(v) for v in values] + self.condition = f"({' | '.join(vs)})" + + # use a filter after reading + else: + return None + else: + self.condition = self.generate(values[0]) + + return self + + +class JointConditionBinOp(ConditionBinOp): + # error: Signature of "evaluate" incompatible with supertype "BinOp" + def evaluate(self) -> Self: # type: ignore[override] + self.condition = f"({self.lhs.condition} {self.op} {self.rhs.condition})" + return self + + +class UnaryOp(ops.UnaryOp): + def prune(self, klass): + if self.op != "~": + raise NotImplementedError("UnaryOp only support invert type ops") + + operand = self.operand + operand = operand.prune(klass) + + if operand is not None and ( + issubclass(klass, ConditionBinOp) + and operand.condition is not None + or not issubclass(klass, ConditionBinOp) + and issubclass(klass, FilterBinOp) + and operand.filter is not None + ): + return operand.invert() + return None + + +class PyTablesExprVisitor(BaseExprVisitor): + const_type: ClassVar[type[ops.Term]] = Constant + term_type: ClassVar[type[Term]] = Term + + def __init__(self, env, engine, parser, **kwargs) -> None: + super().__init__(env, engine, parser) + for bin_op in self.binary_ops: + bin_node = self.binary_op_nodes_map[bin_op] + setattr( + self, + f"visit_{bin_node}", + lambda node, bin_op=bin_op: partial(BinOp, bin_op, **kwargs), + ) + + def visit_UnaryOp(self, node, **kwargs) -> ops.Term | UnaryOp | None: + if isinstance(node.op, (ast.Not, ast.Invert)): + return UnaryOp("~", self.visit(node.operand)) + elif isinstance(node.op, ast.USub): + return self.const_type(-self.visit(node.operand).value, self.env) + elif isinstance(node.op, ast.UAdd): + raise NotImplementedError("Unary addition not supported") + # TODO: return None might never be reached + return None + + def visit_Index(self, node, **kwargs): + return self.visit(node.value).value + + def visit_Assign(self, node, **kwargs): + cmpr = ast.Compare( + ops=[ast.Eq()], left=node.targets[0], comparators=[node.value] + ) + return self.visit(cmpr) + + def visit_Subscript(self, node, **kwargs) -> ops.Term: + # only allow simple subscripts + + value = self.visit(node.value) + slobj = self.visit(node.slice) + try: + value = value.value + except AttributeError: + pass + + if isinstance(slobj, Term): + # In py39 np.ndarray lookups with Term containing int raise + slobj = slobj.value + + try: + return self.const_type(value[slobj], self.env) + except TypeError as err: + raise ValueError( + f"cannot subscript {repr(value)} with {repr(slobj)}" + ) from err + + def visit_Attribute(self, node, **kwargs): + attr = node.attr + value = node.value + + ctx = type(node.ctx) + if ctx == ast.Load: + # resolve the value + resolved = self.visit(value) + + # try to get the value to see if we are another expression + try: + resolved = resolved.value + except AttributeError: + pass + + try: + return self.term_type(getattr(resolved, attr), self.env) + except AttributeError: + # something like datetime.datetime where scope is overridden + if isinstance(value, ast.Name) and value.id == attr: + return resolved + + raise ValueError(f"Invalid Attribute context {ctx.__name__}") + + def translate_In(self, op): + return ast.Eq() if isinstance(op, ast.In) else op + + def _rewrite_membership_op(self, node, left, right): + return self.visit(node.op), node.op, left, right + + +def _validate_where(w): + """ + Validate that the where statement is of the right type. + + The type may either be String, Expr, or list-like of Exprs. + + Parameters + ---------- + w : String term expression, Expr, or list-like of Exprs. + + Returns + ------- + where : The original where clause if the check was successful. + + Raises + ------ + TypeError : An invalid data type was passed in for w (e.g. dict). + """ + if not (isinstance(w, (PyTablesExpr, str)) or is_list_like(w)): + raise TypeError( + "where must be passed as a string, PyTablesExpr, " + "or list-like of PyTablesExpr" + ) + + return w + + +class PyTablesExpr(expr.Expr): + """ + Hold a pytables-like expression, comprised of possibly multiple 'terms'. + + Parameters + ---------- + where : string term expression, PyTablesExpr, or list-like of PyTablesExprs + queryables : a "kinds" map (dict of column name -> kind), or None if column + is non-indexable + encoding : an encoding that will encode the query terms + + Returns + ------- + a PyTablesExpr object + + Examples + -------- + 'index>=date' + "columns=['A', 'D']" + 'columns=A' + 'columns==A' + "~(columns=['A','B'])" + 'index>df.index[3] & string="bar"' + '(index>df.index[3] & index<=df.index[6]) | string="bar"' + "ts>=Timestamp('2012-02-01')" + "major_axis>=20130101" + """ + + _visitor: PyTablesExprVisitor | None + env: PyTablesScope + expr: str + + def __init__( + self, + where, + queryables: dict[str, Any] | None = None, + encoding=None, + scope_level: int = 0, + ) -> None: + where = _validate_where(where) + + self.encoding = encoding + self.condition = None + self.filter = None + self.terms = None + self._visitor = None + + # capture the environment if needed + local_dict: _scope.DeepChainMap[Any, Any] | None = None + + if isinstance(where, PyTablesExpr): + local_dict = where.env.scope + _where = where.expr + + elif is_list_like(where): + where = list(where) + for idx, w in enumerate(where): + if isinstance(w, PyTablesExpr): + local_dict = w.env.scope + else: + where[idx] = _validate_where(w) + _where = " & ".join([f"({w})" for w in com.flatten(where)]) + else: + # _validate_where ensures we otherwise have a string + _where = where + + self.expr = _where + self.env = PyTablesScope(scope_level + 1, local_dict=local_dict) + + if queryables is not None and isinstance(self.expr, str): + self.env.queryables.update(queryables) + self._visitor = PyTablesExprVisitor( + self.env, + queryables=queryables, + parser="pytables", + engine="pytables", + encoding=encoding, + ) + self.terms = self.parse() + + def __repr__(self) -> str: + if self.terms is not None: + return pprint_thing(self.terms) + return pprint_thing(self.expr) + + def evaluate(self): + """create and return the numexpr condition and filter""" + try: + self.condition = self.terms.prune(ConditionBinOp) + except AttributeError as err: + raise ValueError( + f"cannot process expression [{self.expr}], [{self}] " + "is not a valid condition" + ) from err + try: + self.filter = self.terms.prune(FilterBinOp) + except AttributeError as err: + raise ValueError( + f"cannot process expression [{self.expr}], [{self}] " + "is not a valid filter" + ) from err + + return self.condition, self.filter + + +class TermValue: + """hold a term value the we use to construct a condition/filter""" + + def __init__(self, value, converted, kind: str) -> None: + assert isinstance(kind, str), kind + self.value = value + self.converted = converted + self.kind = kind + + def tostring(self, encoding) -> str: + """quote the string if not encoded else encode and return""" + if self.kind == "string": + if encoding is not None: + return str(self.converted) + return f'"{self.converted}"' + elif self.kind == "float": + # python 2 str(float) is not always + # round-trippable so use repr() + return repr(self.converted) + return str(self.converted) + + +def maybe_expression(s) -> bool: + """loose checking if s is a pytables-acceptable expression""" + if not isinstance(s, str): + return False + operations = PyTablesExprVisitor.binary_ops + PyTablesExprVisitor.unary_ops + ("=",) + + # make sure we have an op at least + return any(op in s for op in operations) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/computation/scope.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/computation/scope.py new file mode 100644 index 0000000000000000000000000000000000000000..7e553ca448218435eafe1fd7ca97dce6f739e2a3 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/computation/scope.py @@ -0,0 +1,355 @@ +""" +Module for scope operations +""" +from __future__ import annotations + +from collections import ChainMap +import datetime +import inspect +from io import StringIO +import itertools +import pprint +import struct +import sys +from typing import TypeVar + +import numpy as np + +from pandas._libs.tslibs import Timestamp +from pandas.errors import UndefinedVariableError + +_KT = TypeVar("_KT") +_VT = TypeVar("_VT") + + +# https://docs.python.org/3/library/collections.html#chainmap-examples-and-recipes +class DeepChainMap(ChainMap[_KT, _VT]): + """ + Variant of ChainMap that allows direct updates to inner scopes. + + Only works when all passed mapping are mutable. + """ + + def __setitem__(self, key: _KT, value: _VT) -> None: + for mapping in self.maps: + if key in mapping: + mapping[key] = value + return + self.maps[0][key] = value + + def __delitem__(self, key: _KT) -> None: + """ + Raises + ------ + KeyError + If `key` doesn't exist. + """ + for mapping in self.maps: + if key in mapping: + del mapping[key] + return + raise KeyError(key) + + +def ensure_scope( + level: int, global_dict=None, local_dict=None, resolvers=(), target=None +) -> Scope: + """Ensure that we are grabbing the correct scope.""" + return Scope( + level + 1, + global_dict=global_dict, + local_dict=local_dict, + resolvers=resolvers, + target=target, + ) + + +def _replacer(x) -> str: + """ + Replace a number with its hexadecimal representation. Used to tag + temporary variables with their calling scope's id. + """ + # get the hex repr of the binary char and remove 0x and pad by pad_size + # zeros + try: + hexin = ord(x) + except TypeError: + # bytes literals masquerade as ints when iterating in py3 + hexin = x + + return hex(hexin) + + +def _raw_hex_id(obj) -> str: + """Return the padded hexadecimal id of ``obj``.""" + # interpret as a pointer since that's what really what id returns + packed = struct.pack("@P", id(obj)) + return "".join([_replacer(x) for x in packed]) + + +DEFAULT_GLOBALS = { + "Timestamp": Timestamp, + "datetime": datetime.datetime, + "True": True, + "False": False, + "list": list, + "tuple": tuple, + "inf": np.inf, + "Inf": np.inf, +} + + +def _get_pretty_string(obj) -> str: + """ + Return a prettier version of obj. + + Parameters + ---------- + obj : object + Object to pretty print + + Returns + ------- + str + Pretty print object repr + """ + sio = StringIO() + pprint.pprint(obj, stream=sio) + return sio.getvalue() + + +class Scope: + """ + Object to hold scope, with a few bells to deal with some custom syntax + and contexts added by pandas. + + Parameters + ---------- + level : int + global_dict : dict or None, optional, default None + local_dict : dict or Scope or None, optional, default None + resolvers : list-like or None, optional, default None + target : object + + Attributes + ---------- + level : int + scope : DeepChainMap + target : object + temps : dict + """ + + __slots__ = ["level", "scope", "target", "resolvers", "temps"] + level: int + scope: DeepChainMap + resolvers: DeepChainMap + temps: dict + + def __init__( + self, level: int, global_dict=None, local_dict=None, resolvers=(), target=None + ) -> None: + self.level = level + 1 + + # shallow copy because we don't want to keep filling this up with what + # was there before if there are multiple calls to Scope/_ensure_scope + self.scope = DeepChainMap(DEFAULT_GLOBALS.copy()) + self.target = target + + if isinstance(local_dict, Scope): + self.scope.update(local_dict.scope) + if local_dict.target is not None: + self.target = local_dict.target + self._update(local_dict.level) + + frame = sys._getframe(self.level) + + try: + # shallow copy here because we don't want to replace what's in + # scope when we align terms (alignment accesses the underlying + # numpy array of pandas objects) + scope_global = self.scope.new_child( + (global_dict if global_dict is not None else frame.f_globals).copy() + ) + self.scope = DeepChainMap(scope_global) + if not isinstance(local_dict, Scope): + scope_local = self.scope.new_child( + (local_dict if local_dict is not None else frame.f_locals).copy() + ) + self.scope = DeepChainMap(scope_local) + finally: + del frame + + # assumes that resolvers are going from outermost scope to inner + if isinstance(local_dict, Scope): + resolvers += tuple(local_dict.resolvers.maps) + self.resolvers = DeepChainMap(*resolvers) + self.temps = {} + + def __repr__(self) -> str: + scope_keys = _get_pretty_string(list(self.scope.keys())) + res_keys = _get_pretty_string(list(self.resolvers.keys())) + return f"{type(self).__name__}(scope={scope_keys}, resolvers={res_keys})" + + @property + def has_resolvers(self) -> bool: + """ + Return whether we have any extra scope. + + For example, DataFrames pass Their columns as resolvers during calls to + ``DataFrame.eval()`` and ``DataFrame.query()``. + + Returns + ------- + hr : bool + """ + return bool(len(self.resolvers)) + + def resolve(self, key: str, is_local: bool): + """ + Resolve a variable name in a possibly local context. + + Parameters + ---------- + key : str + A variable name + is_local : bool + Flag indicating whether the variable is local or not (prefixed with + the '@' symbol) + + Returns + ------- + value : object + The value of a particular variable + """ + try: + # only look for locals in outer scope + if is_local: + return self.scope[key] + + # not a local variable so check in resolvers if we have them + if self.has_resolvers: + return self.resolvers[key] + + # if we're here that means that we have no locals and we also have + # no resolvers + assert not is_local and not self.has_resolvers + return self.scope[key] + except KeyError: + try: + # last ditch effort we look in temporaries + # these are created when parsing indexing expressions + # e.g., df[df > 0] + return self.temps[key] + except KeyError as err: + raise UndefinedVariableError(key, is_local) from err + + def swapkey(self, old_key: str, new_key: str, new_value=None) -> None: + """ + Replace a variable name, with a potentially new value. + + Parameters + ---------- + old_key : str + Current variable name to replace + new_key : str + New variable name to replace `old_key` with + new_value : object + Value to be replaced along with the possible renaming + """ + if self.has_resolvers: + maps = self.resolvers.maps + self.scope.maps + else: + maps = self.scope.maps + + maps.append(self.temps) + + for mapping in maps: + if old_key in mapping: + mapping[new_key] = new_value + return + + def _get_vars(self, stack, scopes: list[str]) -> None: + """ + Get specifically scoped variables from a list of stack frames. + + Parameters + ---------- + stack : list + A list of stack frames as returned by ``inspect.stack()`` + scopes : sequence of strings + A sequence containing valid stack frame attribute names that + evaluate to a dictionary. For example, ('locals', 'globals') + """ + variables = itertools.product(scopes, stack) + for scope, (frame, _, _, _, _, _) in variables: + try: + d = getattr(frame, f"f_{scope}") + self.scope = DeepChainMap(self.scope.new_child(d)) + finally: + # won't remove it, but DECREF it + # in Py3 this probably isn't necessary since frame won't be + # scope after the loop + del frame + + def _update(self, level: int) -> None: + """ + Update the current scope by going back `level` levels. + + Parameters + ---------- + level : int + """ + sl = level + 1 + + # add sl frames to the scope starting with the + # most distant and overwriting with more current + # makes sure that we can capture variable scope + stack = inspect.stack() + + try: + self._get_vars(stack[:sl], scopes=["locals"]) + finally: + del stack[:], stack + + def add_tmp(self, value) -> str: + """ + Add a temporary variable to the scope. + + Parameters + ---------- + value : object + An arbitrary object to be assigned to a temporary variable. + + Returns + ------- + str + The name of the temporary variable created. + """ + name = f"{type(value).__name__}_{self.ntemps}_{_raw_hex_id(self)}" + + # add to inner most scope + assert name not in self.temps + self.temps[name] = value + assert name in self.temps + + # only increment if the variable gets put in the scope + return name + + @property + def ntemps(self) -> int: + """The number of temporary variables in this scope""" + return len(self.temps) + + @property + def full_scope(self) -> DeepChainMap: + """ + Return the full scope for use with passing to engines transparently + as a mapping. + + Returns + ------- + vars : DeepChainMap + All variables in this scope. + """ + maps = [self.temps] + self.resolvers.maps + self.scope.maps + return DeepChainMap(*maps) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/dtypes/__init__.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/dtypes/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/dtypes/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/dtypes/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..82d1638b2a68f440ce227a5aab530fec73d3c433 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/dtypes/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/dtypes/__pycache__/api.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/dtypes/__pycache__/api.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..5cf8136e18a866871f18cb3bd7551f99e3f05403 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/dtypes/__pycache__/api.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/dtypes/__pycache__/astype.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/dtypes/__pycache__/astype.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..9d8d4eed48c7e8d3c307b6b6ebb9d457d83679fb Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/dtypes/__pycache__/astype.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/dtypes/__pycache__/base.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/dtypes/__pycache__/base.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..3c3b7a517495f6789a553b516c0d00d0080c8ef4 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/dtypes/__pycache__/base.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/dtypes/__pycache__/cast.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/dtypes/__pycache__/cast.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..eb0e15e72693d7e3a4d4e19fd1eafafb1d8f1bf0 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/dtypes/__pycache__/cast.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/dtypes/__pycache__/common.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/dtypes/__pycache__/common.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..6cdec8265be9e9fd74aedf8865a2a111de0cf2b3 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/dtypes/__pycache__/common.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/dtypes/__pycache__/concat.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/dtypes/__pycache__/concat.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..2b097487529bf46fe90ed4abc183e71ae13ddc2a Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/dtypes/__pycache__/concat.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/dtypes/__pycache__/dtypes.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/dtypes/__pycache__/dtypes.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ccbb1f87a4d979457c702f862abd199ec551973f Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/dtypes/__pycache__/dtypes.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/dtypes/__pycache__/generic.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/dtypes/__pycache__/generic.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..76f8dfcb9aff87a41aac1ab28f4137740e8bbce5 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/dtypes/__pycache__/generic.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/dtypes/__pycache__/inference.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/dtypes/__pycache__/inference.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..94bb79baa12625689150cfaf1ad2c0b2b50d749e Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/dtypes/__pycache__/inference.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/dtypes/__pycache__/missing.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/dtypes/__pycache__/missing.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..050a2e8151b1aa6d411f100c1d3276ce397553c0 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/dtypes/__pycache__/missing.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/dtypes/api.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/dtypes/api.py new file mode 100644 index 0000000000000000000000000000000000000000..254abe330b8e7229d0c2a27519e82efbe902c537 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/dtypes/api.py @@ -0,0 +1,85 @@ +from pandas.core.dtypes.common import ( + is_any_real_numeric_dtype, + is_array_like, + is_bool, + is_bool_dtype, + is_categorical_dtype, + is_complex, + is_complex_dtype, + is_datetime64_any_dtype, + is_datetime64_dtype, + is_datetime64_ns_dtype, + is_datetime64tz_dtype, + is_dict_like, + is_dtype_equal, + is_extension_array_dtype, + is_file_like, + is_float, + is_float_dtype, + is_hashable, + is_int64_dtype, + is_integer, + is_integer_dtype, + is_interval, + is_interval_dtype, + is_iterator, + is_list_like, + is_named_tuple, + is_number, + is_numeric_dtype, + is_object_dtype, + is_period_dtype, + is_re, + is_re_compilable, + is_scalar, + is_signed_integer_dtype, + is_sparse, + is_string_dtype, + is_timedelta64_dtype, + is_timedelta64_ns_dtype, + is_unsigned_integer_dtype, + pandas_dtype, +) + +__all__ = [ + "is_any_real_numeric_dtype", + "is_array_like", + "is_bool", + "is_bool_dtype", + "is_categorical_dtype", + "is_complex", + "is_complex_dtype", + "is_datetime64_any_dtype", + "is_datetime64_dtype", + "is_datetime64_ns_dtype", + "is_datetime64tz_dtype", + "is_dict_like", + "is_dtype_equal", + "is_extension_array_dtype", + "is_file_like", + "is_float", + "is_float_dtype", + "is_hashable", + "is_int64_dtype", + "is_integer", + "is_integer_dtype", + "is_interval", + "is_interval_dtype", + "is_iterator", + "is_list_like", + "is_named_tuple", + "is_number", + "is_numeric_dtype", + "is_object_dtype", + "is_period_dtype", + "is_re", + "is_re_compilable", + "is_scalar", + "is_signed_integer_dtype", + "is_sparse", + "is_string_dtype", + "is_timedelta64_dtype", + "is_timedelta64_ns_dtype", + "is_unsigned_integer_dtype", + "pandas_dtype", +] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/dtypes/astype.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/dtypes/astype.py new file mode 100644 index 0000000000000000000000000000000000000000..f5579082c679bf131c056f3f2029b2485e88bd0d --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/dtypes/astype.py @@ -0,0 +1,301 @@ +""" +Functions for implementing 'astype' methods according to pandas conventions, +particularly ones that differ from numpy. +""" +from __future__ import annotations + +import inspect +from typing import ( + TYPE_CHECKING, + overload, +) +import warnings + +import numpy as np + +from pandas._libs import lib +from pandas._libs.tslibs.timedeltas import array_to_timedelta64 +from pandas.errors import IntCastingNaNError + +from pandas.core.dtypes.common import ( + is_object_dtype, + is_string_dtype, + pandas_dtype, +) +from pandas.core.dtypes.dtypes import ( + ExtensionDtype, + NumpyEADtype, +) + +if TYPE_CHECKING: + from pandas._typing import ( + ArrayLike, + DtypeObj, + IgnoreRaise, + ) + + from pandas.core.arrays import ExtensionArray + +_dtype_obj = np.dtype(object) + + +@overload +def _astype_nansafe( + arr: np.ndarray, dtype: np.dtype, copy: bool = ..., skipna: bool = ... +) -> np.ndarray: + ... + + +@overload +def _astype_nansafe( + arr: np.ndarray, dtype: ExtensionDtype, copy: bool = ..., skipna: bool = ... +) -> ExtensionArray: + ... + + +def _astype_nansafe( + arr: np.ndarray, dtype: DtypeObj, copy: bool = True, skipna: bool = False +) -> ArrayLike: + """ + Cast the elements of an array to a given dtype a nan-safe manner. + + Parameters + ---------- + arr : ndarray + dtype : np.dtype or ExtensionDtype + copy : bool, default True + If False, a view will be attempted but may fail, if + e.g. the item sizes don't align. + skipna: bool, default False + Whether or not we should skip NaN when casting as a string-type. + + Raises + ------ + ValueError + The dtype was a datetime64/timedelta64 dtype, but it had no unit. + """ + + # dispatch on extension dtype if needed + if isinstance(dtype, ExtensionDtype): + return dtype.construct_array_type()._from_sequence(arr, dtype=dtype, copy=copy) + + elif not isinstance(dtype, np.dtype): # pragma: no cover + raise ValueError("dtype must be np.dtype or ExtensionDtype") + + if arr.dtype.kind in "mM": + from pandas.core.construction import ensure_wrapped_if_datetimelike + + arr = ensure_wrapped_if_datetimelike(arr) + res = arr.astype(dtype, copy=copy) + return np.asarray(res) + + if issubclass(dtype.type, str): + shape = arr.shape + if arr.ndim > 1: + arr = arr.ravel() + return lib.ensure_string_array( + arr, skipna=skipna, convert_na_value=False + ).reshape(shape) + + elif np.issubdtype(arr.dtype, np.floating) and dtype.kind in "iu": + return _astype_float_to_int_nansafe(arr, dtype, copy) + + elif arr.dtype == object: + # if we have a datetime/timedelta array of objects + # then coerce to datetime64[ns] and use DatetimeArray.astype + + if lib.is_np_dtype(dtype, "M"): + from pandas.core.arrays import DatetimeArray + + dta = DatetimeArray._from_sequence(arr, dtype=dtype) + return dta._ndarray + + elif lib.is_np_dtype(dtype, "m"): + from pandas.core.construction import ensure_wrapped_if_datetimelike + + # bc we know arr.dtype == object, this is equivalent to + # `np.asarray(to_timedelta(arr))`, but using a lower-level API that + # does not require a circular import. + tdvals = array_to_timedelta64(arr).view("m8[ns]") + + tda = ensure_wrapped_if_datetimelike(tdvals) + return tda.astype(dtype, copy=False)._ndarray + + if dtype.name in ("datetime64", "timedelta64"): + msg = ( + f"The '{dtype.name}' dtype has no unit. Please pass in " + f"'{dtype.name}[ns]' instead." + ) + raise ValueError(msg) + + if copy or arr.dtype == object or dtype == object: + # Explicit copy, or required since NumPy can't view from / to object. + return arr.astype(dtype, copy=True) + + return arr.astype(dtype, copy=copy) + + +def _astype_float_to_int_nansafe( + values: np.ndarray, dtype: np.dtype, copy: bool +) -> np.ndarray: + """ + astype with a check preventing converting NaN to an meaningless integer value. + """ + if not np.isfinite(values).all(): + raise IntCastingNaNError( + "Cannot convert non-finite values (NA or inf) to integer" + ) + if dtype.kind == "u": + # GH#45151 + if not (values >= 0).all(): + raise ValueError(f"Cannot losslessly cast from {values.dtype} to {dtype}") + with warnings.catch_warnings(): + warnings.filterwarnings("ignore", category=RuntimeWarning) + return values.astype(dtype, copy=copy) + + +def astype_array(values: ArrayLike, dtype: DtypeObj, copy: bool = False) -> ArrayLike: + """ + Cast array (ndarray or ExtensionArray) to the new dtype. + + Parameters + ---------- + values : ndarray or ExtensionArray + dtype : dtype object + copy : bool, default False + copy if indicated + + Returns + ------- + ndarray or ExtensionArray + """ + if values.dtype == dtype: + if copy: + return values.copy() + return values + + if not isinstance(values, np.ndarray): + # i.e. ExtensionArray + values = values.astype(dtype, copy=copy) + + else: + values = _astype_nansafe(values, dtype, copy=copy) + + # in pandas we don't store numpy str dtypes, so convert to object + if isinstance(dtype, np.dtype) and issubclass(values.dtype.type, str): + values = np.array(values, dtype=object) + + return values + + +def astype_array_safe( + values: ArrayLike, dtype, copy: bool = False, errors: IgnoreRaise = "raise" +) -> ArrayLike: + """ + Cast array (ndarray or ExtensionArray) to the new dtype. + + This basically is the implementation for DataFrame/Series.astype and + includes all custom logic for pandas (NaN-safety, converting str to object, + not allowing ) + + Parameters + ---------- + values : ndarray or ExtensionArray + dtype : str, dtype convertible + copy : bool, default False + copy if indicated + errors : str, {'raise', 'ignore'}, default 'raise' + - ``raise`` : allow exceptions to be raised + - ``ignore`` : suppress exceptions. On error return original object + + Returns + ------- + ndarray or ExtensionArray + """ + errors_legal_values = ("raise", "ignore") + + if errors not in errors_legal_values: + invalid_arg = ( + "Expected value of kwarg 'errors' to be one of " + f"{list(errors_legal_values)}. Supplied value is '{errors}'" + ) + raise ValueError(invalid_arg) + + if inspect.isclass(dtype) and issubclass(dtype, ExtensionDtype): + msg = ( + f"Expected an instance of {dtype.__name__}, " + "but got the class instead. Try instantiating 'dtype'." + ) + raise TypeError(msg) + + dtype = pandas_dtype(dtype) + if isinstance(dtype, NumpyEADtype): + # Ensure we don't end up with a NumpyExtensionArray + dtype = dtype.numpy_dtype + + try: + new_values = astype_array(values, dtype, copy=copy) + except (ValueError, TypeError): + # e.g. _astype_nansafe can fail on object-dtype of strings + # trying to convert to float + if errors == "ignore": + new_values = values + else: + raise + + return new_values + + +def astype_is_view(dtype: DtypeObj, new_dtype: DtypeObj) -> bool: + """Checks if astype avoided copying the data. + + Parameters + ---------- + dtype : Original dtype + new_dtype : target dtype + + Returns + ------- + True if new data is a view or not guaranteed to be a copy, False otherwise + """ + if isinstance(dtype, np.dtype) and not isinstance(new_dtype, np.dtype): + new_dtype, dtype = dtype, new_dtype + + if dtype == new_dtype: + return True + + elif isinstance(dtype, np.dtype) and isinstance(new_dtype, np.dtype): + # Only equal numpy dtypes avoid a copy + return False + + elif is_string_dtype(dtype) and is_string_dtype(new_dtype): + # Potentially! a view when converting from object to string + return True + + elif is_object_dtype(dtype) and new_dtype.kind == "O": + # When the underlying array has dtype object, we don't have to make a copy + return True + + elif dtype.kind in "mM" and new_dtype.kind in "mM": + dtype = getattr(dtype, "numpy_dtype", dtype) + new_dtype = getattr(new_dtype, "numpy_dtype", new_dtype) + return getattr(dtype, "unit", None) == getattr(new_dtype, "unit", None) + + numpy_dtype = getattr(dtype, "numpy_dtype", None) + new_numpy_dtype = getattr(new_dtype, "numpy_dtype", None) + + if numpy_dtype is None and isinstance(dtype, np.dtype): + numpy_dtype = dtype + + if new_numpy_dtype is None and isinstance(new_dtype, np.dtype): + new_numpy_dtype = new_dtype + + if numpy_dtype is not None and new_numpy_dtype is not None: + # if both have NumPy dtype or one of them is a numpy dtype + # they are only a view when the numpy dtypes are equal, e.g. + # int64 -> Int64 or int64[pyarrow] + # int64 -> Int32 copies + return numpy_dtype == new_numpy_dtype + + # Assume this is a view since we don't know for sure if a copy was made + return True diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/dtypes/base.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/dtypes/base.py new file mode 100644 index 0000000000000000000000000000000000000000..6b00a5284ec5b18809e233e9ef89e31771ac651e --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/dtypes/base.py @@ -0,0 +1,583 @@ +""" +Extend pandas with custom array types. +""" +from __future__ import annotations + +from typing import ( + TYPE_CHECKING, + Any, + TypeVar, + cast, + overload, +) + +import numpy as np + +from pandas._libs import missing as libmissing +from pandas._libs.hashtable import object_hash +from pandas._libs.properties import cache_readonly +from pandas.errors import AbstractMethodError + +from pandas.core.dtypes.generic import ( + ABCDataFrame, + ABCIndex, + ABCSeries, +) + +if TYPE_CHECKING: + from pandas._typing import ( + DtypeObj, + Self, + Shape, + npt, + type_t, + ) + + from pandas import Index + from pandas.core.arrays import ExtensionArray + + # To parameterize on same ExtensionDtype + ExtensionDtypeT = TypeVar("ExtensionDtypeT", bound="ExtensionDtype") + + +class ExtensionDtype: + """ + A custom data type, to be paired with an ExtensionArray. + + See Also + -------- + extensions.register_extension_dtype: Register an ExtensionType + with pandas as class decorator. + extensions.ExtensionArray: Abstract base class for custom 1-D array types. + + Notes + ----- + The interface includes the following abstract methods that must + be implemented by subclasses: + + * type + * name + * construct_array_type + + The following attributes and methods influence the behavior of the dtype in + pandas operations + + * _is_numeric + * _is_boolean + * _get_common_dtype + + The `na_value` class attribute can be used to set the default NA value + for this type. :attr:`numpy.nan` is used by default. + + ExtensionDtypes are required to be hashable. The base class provides + a default implementation, which relies on the ``_metadata`` class + attribute. ``_metadata`` should be a tuple containing the strings + that define your data type. For example, with ``PeriodDtype`` that's + the ``freq`` attribute. + + **If you have a parametrized dtype you should set the ``_metadata`` + class property**. + + Ideally, the attributes in ``_metadata`` will match the + parameters to your ``ExtensionDtype.__init__`` (if any). If any of + the attributes in ``_metadata`` don't implement the standard + ``__eq__`` or ``__hash__``, the default implementations here will not + work. + + Examples + -------- + + For interaction with Apache Arrow (pyarrow), a ``__from_arrow__`` method + can be implemented: this method receives a pyarrow Array or ChunkedArray + as only argument and is expected to return the appropriate pandas + ExtensionArray for this dtype and the passed values: + + >>> import pyarrow + >>> from pandas.api.extensions import ExtensionArray + >>> class ExtensionDtype: + ... def __from_arrow__( + ... self, + ... array: pyarrow.Array | pyarrow.ChunkedArray + ... ) -> ExtensionArray: + ... ... + + This class does not inherit from 'abc.ABCMeta' for performance reasons. + Methods and properties required by the interface raise + ``pandas.errors.AbstractMethodError`` and no ``register`` method is + provided for registering virtual subclasses. + """ + + _metadata: tuple[str, ...] = () + + def __str__(self) -> str: + return self.name + + def __eq__(self, other: object) -> bool: + """ + Check whether 'other' is equal to self. + + By default, 'other' is considered equal if either + + * it's a string matching 'self.name'. + * it's an instance of this type and all of the attributes + in ``self._metadata`` are equal between `self` and `other`. + + Parameters + ---------- + other : Any + + Returns + ------- + bool + """ + if isinstance(other, str): + try: + other = self.construct_from_string(other) + except TypeError: + return False + if isinstance(other, type(self)): + return all( + getattr(self, attr) == getattr(other, attr) for attr in self._metadata + ) + return False + + def __hash__(self) -> int: + # for python>=3.10, different nan objects have different hashes + # we need to avoid that and thus use hash function with old behavior + return object_hash(tuple(getattr(self, attr) for attr in self._metadata)) + + def __ne__(self, other: object) -> bool: + return not self.__eq__(other) + + @property + def na_value(self) -> object: + """ + Default NA value to use for this type. + + This is used in e.g. ExtensionArray.take. This should be the + user-facing "boxed" version of the NA value, not the physical NA value + for storage. e.g. for JSONArray, this is an empty dictionary. + """ + return np.nan + + @property + def type(self) -> type_t[Any]: + """ + The scalar type for the array, e.g. ``int`` + + It's expected ``ExtensionArray[item]`` returns an instance + of ``ExtensionDtype.type`` for scalar ``item``, assuming + that value is valid (not NA). NA values do not need to be + instances of `type`. + """ + raise AbstractMethodError(self) + + @property + def kind(self) -> str: + """ + A character code (one of 'biufcmMOSUV'), default 'O' + + This should match the NumPy dtype used when the array is + converted to an ndarray, which is probably 'O' for object if + the extension type cannot be represented as a built-in NumPy + type. + + See Also + -------- + numpy.dtype.kind + """ + return "O" + + @property + def name(self) -> str: + """ + A string identifying the data type. + + Will be used for display in, e.g. ``Series.dtype`` + """ + raise AbstractMethodError(self) + + @property + def names(self) -> list[str] | None: + """ + Ordered list of field names, or None if there are no fields. + + This is for compatibility with NumPy arrays, and may be removed in the + future. + """ + return None + + @classmethod + def construct_array_type(cls) -> type_t[ExtensionArray]: + """ + Return the array type associated with this dtype. + + Returns + ------- + type + """ + raise AbstractMethodError(cls) + + def empty(self, shape: Shape) -> ExtensionArray: + """ + Construct an ExtensionArray of this dtype with the given shape. + + Analogous to numpy.empty. + + Parameters + ---------- + shape : int or tuple[int] + + Returns + ------- + ExtensionArray + """ + cls = self.construct_array_type() + return cls._empty(shape, dtype=self) + + @classmethod + def construct_from_string(cls, string: str) -> Self: + r""" + Construct this type from a string. + + This is useful mainly for data types that accept parameters. + For example, a period dtype accepts a frequency parameter that + can be set as ``period[h]`` (where H means hourly frequency). + + By default, in the abstract class, just the name of the type is + expected. But subclasses can overwrite this method to accept + parameters. + + Parameters + ---------- + string : str + The name of the type, for example ``category``. + + Returns + ------- + ExtensionDtype + Instance of the dtype. + + Raises + ------ + TypeError + If a class cannot be constructed from this 'string'. + + Examples + -------- + For extension dtypes with arguments the following may be an + adequate implementation. + + >>> import re + >>> @classmethod + ... def construct_from_string(cls, string): + ... pattern = re.compile(r"^my_type\[(?P.+)\]$") + ... match = pattern.match(string) + ... if match: + ... return cls(**match.groupdict()) + ... else: + ... raise TypeError( + ... f"Cannot construct a '{cls.__name__}' from '{string}'" + ... ) + """ + if not isinstance(string, str): + raise TypeError( + f"'construct_from_string' expects a string, got {type(string)}" + ) + # error: Non-overlapping equality check (left operand type: "str", right + # operand type: "Callable[[ExtensionDtype], str]") [comparison-overlap] + assert isinstance(cls.name, str), (cls, type(cls.name)) + if string != cls.name: + raise TypeError(f"Cannot construct a '{cls.__name__}' from '{string}'") + return cls() + + @classmethod + def is_dtype(cls, dtype: object) -> bool: + """ + Check if we match 'dtype'. + + Parameters + ---------- + dtype : object + The object to check. + + Returns + ------- + bool + + Notes + ----- + The default implementation is True if + + 1. ``cls.construct_from_string(dtype)`` is an instance + of ``cls``. + 2. ``dtype`` is an object and is an instance of ``cls`` + 3. ``dtype`` has a ``dtype`` attribute, and any of the above + conditions is true for ``dtype.dtype``. + """ + dtype = getattr(dtype, "dtype", dtype) + + if isinstance(dtype, (ABCSeries, ABCIndex, ABCDataFrame, np.dtype)): + # https://github.com/pandas-dev/pandas/issues/22960 + # avoid passing data to `construct_from_string`. This could + # cause a FutureWarning from numpy about failing elementwise + # comparison from, e.g., comparing DataFrame == 'category'. + return False + elif dtype is None: + return False + elif isinstance(dtype, cls): + return True + if isinstance(dtype, str): + try: + return cls.construct_from_string(dtype) is not None + except TypeError: + return False + return False + + @property + def _is_numeric(self) -> bool: + """ + Whether columns with this dtype should be considered numeric. + + By default ExtensionDtypes are assumed to be non-numeric. + They'll be excluded from operations that exclude non-numeric + columns, like (groupby) reductions, plotting, etc. + """ + return False + + @property + def _is_boolean(self) -> bool: + """ + Whether this dtype should be considered boolean. + + By default, ExtensionDtypes are assumed to be non-numeric. + Setting this to True will affect the behavior of several places, + e.g. + + * is_bool + * boolean indexing + + Returns + ------- + bool + """ + return False + + def _get_common_dtype(self, dtypes: list[DtypeObj]) -> DtypeObj | None: + """ + Return the common dtype, if one exists. + + Used in `find_common_type` implementation. This is for example used + to determine the resulting dtype in a concat operation. + + If no common dtype exists, return None (which gives the other dtypes + the chance to determine a common dtype). If all dtypes in the list + return None, then the common dtype will be "object" dtype (this means + it is never needed to return "object" dtype from this method itself). + + Parameters + ---------- + dtypes : list of dtypes + The dtypes for which to determine a common dtype. This is a list + of np.dtype or ExtensionDtype instances. + + Returns + ------- + Common dtype (np.dtype or ExtensionDtype) or None + """ + if len(set(dtypes)) == 1: + # only itself + return self + else: + return None + + @property + def _can_hold_na(self) -> bool: + """ + Can arrays of this dtype hold NA values? + """ + return True + + @property + def _is_immutable(self) -> bool: + """ + Can arrays with this dtype be modified with __setitem__? If not, return + True. + + Immutable arrays are expected to raise TypeError on __setitem__ calls. + """ + return False + + @cache_readonly + def index_class(self) -> type_t[Index]: + """ + The Index subclass to return from Index.__new__ when this dtype is + encountered. + """ + from pandas import Index + + return Index + + @property + def _supports_2d(self) -> bool: + """ + Do ExtensionArrays with this dtype support 2D arrays? + + Historically ExtensionArrays were limited to 1D. By returning True here, + authors can indicate that their arrays support 2D instances. This can + improve performance in some cases, particularly operations with `axis=1`. + + Arrays that support 2D values should: + + - implement Array.reshape + - subclass the Dim2CompatTests in tests.extension.base + - _concat_same_type should support `axis` keyword + - _reduce and reductions should support `axis` keyword + """ + return False + + @property + def _can_fast_transpose(self) -> bool: + """ + Is transposing an array with this dtype zero-copy? + + Only relevant for cases where _supports_2d is True. + """ + return False + + +class StorageExtensionDtype(ExtensionDtype): + """ExtensionDtype that may be backed by more than one implementation.""" + + name: str + _metadata = ("storage",) + + def __init__(self, storage: str | None = None) -> None: + self.storage = storage + + def __repr__(self) -> str: + return f"{self.name}[{self.storage}]" + + def __str__(self) -> str: + return self.name + + def __eq__(self, other: object) -> bool: + if isinstance(other, str) and other == self.name: + return True + return super().__eq__(other) + + def __hash__(self) -> int: + # custom __eq__ so have to override __hash__ + return super().__hash__() + + @property + def na_value(self) -> libmissing.NAType: + return libmissing.NA + + +def register_extension_dtype(cls: type_t[ExtensionDtypeT]) -> type_t[ExtensionDtypeT]: + """ + Register an ExtensionType with pandas as class decorator. + + This enables operations like ``.astype(name)`` for the name + of the ExtensionDtype. + + Returns + ------- + callable + A class decorator. + + Examples + -------- + >>> from pandas.api.extensions import register_extension_dtype, ExtensionDtype + >>> @register_extension_dtype + ... class MyExtensionDtype(ExtensionDtype): + ... name = "myextension" + """ + _registry.register(cls) + return cls + + +class Registry: + """ + Registry for dtype inference. + + The registry allows one to map a string repr of a extension + dtype to an extension dtype. The string alias can be used in several + places, including + + * Series and Index constructors + * :meth:`pandas.array` + * :meth:`pandas.Series.astype` + + Multiple extension types can be registered. + These are tried in order. + """ + + def __init__(self) -> None: + self.dtypes: list[type_t[ExtensionDtype]] = [] + + def register(self, dtype: type_t[ExtensionDtype]) -> None: + """ + Parameters + ---------- + dtype : ExtensionDtype class + """ + if not issubclass(dtype, ExtensionDtype): + raise ValueError("can only register pandas extension dtypes") + + self.dtypes.append(dtype) + + @overload + def find(self, dtype: type_t[ExtensionDtypeT]) -> type_t[ExtensionDtypeT]: + ... + + @overload + def find(self, dtype: ExtensionDtypeT) -> ExtensionDtypeT: + ... + + @overload + def find(self, dtype: str) -> ExtensionDtype | None: + ... + + @overload + def find( + self, dtype: npt.DTypeLike + ) -> type_t[ExtensionDtype] | ExtensionDtype | None: + ... + + def find( + self, dtype: type_t[ExtensionDtype] | ExtensionDtype | npt.DTypeLike + ) -> type_t[ExtensionDtype] | ExtensionDtype | None: + """ + Parameters + ---------- + dtype : ExtensionDtype class or instance or str or numpy dtype or python type + + Returns + ------- + return the first matching dtype, otherwise return None + """ + if not isinstance(dtype, str): + dtype_type: type_t + if not isinstance(dtype, type): + dtype_type = type(dtype) + else: + dtype_type = dtype + if issubclass(dtype_type, ExtensionDtype): + # cast needed here as mypy doesn't know we have figured + # out it is an ExtensionDtype or type_t[ExtensionDtype] + return cast("ExtensionDtype | type_t[ExtensionDtype]", dtype) + + return None + + for dtype_type in self.dtypes: + try: + return dtype_type.construct_from_string(dtype) + except TypeError: + pass + + return None + + +_registry = Registry() diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/dtypes/cast.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/dtypes/cast.py new file mode 100644 index 0000000000000000000000000000000000000000..b72293b52df06bcca863815da997f13a22c1dad1 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/dtypes/cast.py @@ -0,0 +1,1970 @@ +""" +Routines for casting. +""" + +from __future__ import annotations + +import datetime as dt +import functools +from typing import ( + TYPE_CHECKING, + Any, + Literal, + TypeVar, + cast, + overload, +) +import warnings + +import numpy as np + +from pandas._config import using_pyarrow_string_dtype + +from pandas._libs import ( + Interval, + Period, + lib, +) +from pandas._libs.missing import ( + NA, + NAType, + checknull, +) +from pandas._libs.tslibs import ( + NaT, + OutOfBoundsDatetime, + OutOfBoundsTimedelta, + Timedelta, + Timestamp, + is_supported_dtype, +) +from pandas._libs.tslibs.timedeltas import array_to_timedelta64 +from pandas.errors import ( + IntCastingNaNError, + LossySetitemError, +) + +from pandas.core.dtypes.common import ( + ensure_int8, + ensure_int16, + ensure_int32, + ensure_int64, + ensure_object, + ensure_str, + is_bool, + is_complex, + is_float, + is_integer, + is_object_dtype, + is_scalar, + is_string_dtype, + pandas_dtype as pandas_dtype_func, +) +from pandas.core.dtypes.dtypes import ( + ArrowDtype, + BaseMaskedDtype, + CategoricalDtype, + DatetimeTZDtype, + ExtensionDtype, + IntervalDtype, + PandasExtensionDtype, + PeriodDtype, +) +from pandas.core.dtypes.generic import ( + ABCExtensionArray, + ABCIndex, + ABCSeries, +) +from pandas.core.dtypes.inference import is_list_like +from pandas.core.dtypes.missing import ( + is_valid_na_for_dtype, + isna, + na_value_for_dtype, + notna, +) + +from pandas.io._util import _arrow_dtype_mapping + +if TYPE_CHECKING: + from collections.abc import ( + Sequence, + Sized, + ) + + from pandas._typing import ( + ArrayLike, + Dtype, + DtypeObj, + NumpyIndexT, + Scalar, + npt, + ) + + from pandas import Index + from pandas.core.arrays import ( + Categorical, + DatetimeArray, + ExtensionArray, + IntervalArray, + PeriodArray, + TimedeltaArray, + ) + + +_int8_max = np.iinfo(np.int8).max +_int16_max = np.iinfo(np.int16).max +_int32_max = np.iinfo(np.int32).max + +_dtype_obj = np.dtype(object) + +NumpyArrayT = TypeVar("NumpyArrayT", bound=np.ndarray) + + +def maybe_convert_platform( + values: list | tuple | range | np.ndarray | ExtensionArray, +) -> ArrayLike: + """try to do platform conversion, allow ndarray or list here""" + arr: ArrayLike + + if isinstance(values, (list, tuple, range)): + arr = construct_1d_object_array_from_listlike(values) + else: + # The caller is responsible for ensuring that we have np.ndarray + # or ExtensionArray here. + arr = values + + if arr.dtype == _dtype_obj: + arr = cast(np.ndarray, arr) + arr = lib.maybe_convert_objects(arr) + + return arr + + +def is_nested_object(obj) -> bool: + """ + return a boolean if we have a nested object, e.g. a Series with 1 or + more Series elements + + This may not be necessarily be performant. + + """ + return bool( + isinstance(obj, ABCSeries) + and is_object_dtype(obj.dtype) + and any(isinstance(v, ABCSeries) for v in obj._values) + ) + + +def maybe_box_datetimelike(value: Scalar, dtype: Dtype | None = None) -> Scalar: + """ + Cast scalar to Timestamp or Timedelta if scalar is datetime-like + and dtype is not object. + + Parameters + ---------- + value : scalar + dtype : Dtype, optional + + Returns + ------- + scalar + """ + if dtype == _dtype_obj: + pass + elif isinstance(value, (np.datetime64, dt.datetime)): + value = Timestamp(value) + elif isinstance(value, (np.timedelta64, dt.timedelta)): + value = Timedelta(value) + + return value + + +def maybe_box_native(value: Scalar | None | NAType) -> Scalar | None | NAType: + """ + If passed a scalar cast the scalar to a python native type. + + Parameters + ---------- + value : scalar or Series + + Returns + ------- + scalar or Series + """ + if is_float(value): + value = float(value) + elif is_integer(value): + value = int(value) + elif is_bool(value): + value = bool(value) + elif isinstance(value, (np.datetime64, np.timedelta64)): + value = maybe_box_datetimelike(value) + elif value is NA: + value = None + return value + + +def _maybe_unbox_datetimelike(value: Scalar, dtype: DtypeObj) -> Scalar: + """ + Convert a Timedelta or Timestamp to timedelta64 or datetime64 for setting + into a numpy array. Failing to unbox would risk dropping nanoseconds. + + Notes + ----- + Caller is responsible for checking dtype.kind in "mM" + """ + if is_valid_na_for_dtype(value, dtype): + # GH#36541: can't fill array directly with pd.NaT + # > np.empty(10, dtype="datetime64[ns]").fill(pd.NaT) + # ValueError: cannot convert float NaN to integer + value = dtype.type("NaT", "ns") + elif isinstance(value, Timestamp): + if value.tz is None: + value = value.to_datetime64() + elif not isinstance(dtype, DatetimeTZDtype): + raise TypeError("Cannot unbox tzaware Timestamp to tznaive dtype") + elif isinstance(value, Timedelta): + value = value.to_timedelta64() + + _disallow_mismatched_datetimelike(value, dtype) + return value + + +def _disallow_mismatched_datetimelike(value, dtype: DtypeObj): + """ + numpy allows np.array(dt64values, dtype="timedelta64[ns]") and + vice-versa, but we do not want to allow this, so we need to + check explicitly + """ + vdtype = getattr(value, "dtype", None) + if vdtype is None: + return + elif (vdtype.kind == "m" and dtype.kind == "M") or ( + vdtype.kind == "M" and dtype.kind == "m" + ): + raise TypeError(f"Cannot cast {repr(value)} to {dtype}") + + +@overload +def maybe_downcast_to_dtype(result: np.ndarray, dtype: str | np.dtype) -> np.ndarray: + ... + + +@overload +def maybe_downcast_to_dtype(result: ExtensionArray, dtype: str | np.dtype) -> ArrayLike: + ... + + +def maybe_downcast_to_dtype(result: ArrayLike, dtype: str | np.dtype) -> ArrayLike: + """ + try to cast to the specified dtype (e.g. convert back to bool/int + or could be an astype of float64->float32 + """ + if isinstance(result, ABCSeries): + result = result._values + do_round = False + + if isinstance(dtype, str): + if dtype == "infer": + inferred_type = lib.infer_dtype(result, skipna=False) + if inferred_type == "boolean": + dtype = "bool" + elif inferred_type == "integer": + dtype = "int64" + elif inferred_type == "datetime64": + dtype = "datetime64[ns]" + elif inferred_type in ["timedelta", "timedelta64"]: + dtype = "timedelta64[ns]" + + # try to upcast here + elif inferred_type == "floating": + dtype = "int64" + if issubclass(result.dtype.type, np.number): + do_round = True + + else: + # TODO: complex? what if result is already non-object? + dtype = "object" + + dtype = np.dtype(dtype) + + if not isinstance(dtype, np.dtype): + # enforce our signature annotation + raise TypeError(dtype) # pragma: no cover + + converted = maybe_downcast_numeric(result, dtype, do_round) + if converted is not result: + return converted + + # a datetimelike + # GH12821, iNaT is cast to float + if dtype.kind in "mM" and result.dtype.kind in "if": + result = result.astype(dtype) + + elif dtype.kind == "m" and result.dtype == _dtype_obj: + # test_where_downcast_to_td64 + result = cast(np.ndarray, result) + result = array_to_timedelta64(result) + + elif dtype == np.dtype("M8[ns]") and result.dtype == _dtype_obj: + result = cast(np.ndarray, result) + return np.asarray(maybe_cast_to_datetime(result, dtype=dtype)) + + return result + + +@overload +def maybe_downcast_numeric( + result: np.ndarray, dtype: np.dtype, do_round: bool = False +) -> np.ndarray: + ... + + +@overload +def maybe_downcast_numeric( + result: ExtensionArray, dtype: DtypeObj, do_round: bool = False +) -> ArrayLike: + ... + + +def maybe_downcast_numeric( + result: ArrayLike, dtype: DtypeObj, do_round: bool = False +) -> ArrayLike: + """ + Subset of maybe_downcast_to_dtype restricted to numeric dtypes. + + Parameters + ---------- + result : ndarray or ExtensionArray + dtype : np.dtype or ExtensionDtype + do_round : bool + + Returns + ------- + ndarray or ExtensionArray + """ + if not isinstance(dtype, np.dtype) or not isinstance(result.dtype, np.dtype): + # e.g. SparseDtype has no itemsize attr + return result + + def trans(x): + if do_round: + return x.round() + return x + + if dtype.kind == result.dtype.kind: + # don't allow upcasts here (except if empty) + if result.dtype.itemsize <= dtype.itemsize and result.size: + return result + + if dtype.kind in "biu": + if not result.size: + # if we don't have any elements, just astype it + return trans(result).astype(dtype) + + if isinstance(result, np.ndarray): + element = result.item(0) + else: + element = result.iloc[0] + if not isinstance(element, (np.integer, np.floating, int, float, bool)): + # a comparable, e.g. a Decimal may slip in here + return result + + if ( + issubclass(result.dtype.type, (np.object_, np.number)) + and notna(result).all() + ): + new_result = trans(result).astype(dtype) + if new_result.dtype.kind == "O" or result.dtype.kind == "O": + # np.allclose may raise TypeError on object-dtype + if (new_result == result).all(): + return new_result + else: + if np.allclose(new_result, result, rtol=0): + return new_result + + elif ( + issubclass(dtype.type, np.floating) + and result.dtype.kind != "b" + and not is_string_dtype(result.dtype) + ): + with warnings.catch_warnings(): + warnings.filterwarnings( + "ignore", "overflow encountered in cast", RuntimeWarning + ) + new_result = result.astype(dtype) + + # Adjust tolerances based on floating point size + size_tols = {4: 5e-4, 8: 5e-8, 16: 5e-16} + + atol = size_tols.get(new_result.dtype.itemsize, 0.0) + + # Check downcast float values are still equal within 7 digits when + # converting from float64 to float32 + if np.allclose(new_result, result, equal_nan=True, rtol=0.0, atol=atol): + return new_result + + elif dtype.kind == result.dtype.kind == "c": + new_result = result.astype(dtype) + + if np.array_equal(new_result, result, equal_nan=True): + # TODO: use tolerance like we do for float? + return new_result + + return result + + +def maybe_upcast_numeric_to_64bit(arr: NumpyIndexT) -> NumpyIndexT: + """ + If array is a int/uint/float bit size lower than 64 bit, upcast it to 64 bit. + + Parameters + ---------- + arr : ndarray or ExtensionArray + + Returns + ------- + ndarray or ExtensionArray + """ + dtype = arr.dtype + if dtype.kind == "i" and dtype != np.int64: + return arr.astype(np.int64) + elif dtype.kind == "u" and dtype != np.uint64: + return arr.astype(np.uint64) + elif dtype.kind == "f" and dtype != np.float64: + return arr.astype(np.float64) + else: + return arr + + +def maybe_cast_pointwise_result( + result: ArrayLike, + dtype: DtypeObj, + numeric_only: bool = False, + same_dtype: bool = True, +) -> ArrayLike: + """ + Try casting result of a pointwise operation back to the original dtype if + appropriate. + + Parameters + ---------- + result : array-like + Result to cast. + dtype : np.dtype or ExtensionDtype + Input Series from which result was calculated. + numeric_only : bool, default False + Whether to cast only numerics or datetimes as well. + same_dtype : bool, default True + Specify dtype when calling _from_sequence + + Returns + ------- + result : array-like + result maybe casted to the dtype. + """ + + if isinstance(dtype, ExtensionDtype): + cls = dtype.construct_array_type() + if same_dtype: + result = _maybe_cast_to_extension_array(cls, result, dtype=dtype) + else: + result = _maybe_cast_to_extension_array(cls, result) + + elif (numeric_only and dtype.kind in "iufcb") or not numeric_only: + result = maybe_downcast_to_dtype(result, dtype) + + return result + + +def _maybe_cast_to_extension_array( + cls: type[ExtensionArray], obj: ArrayLike, dtype: ExtensionDtype | None = None +) -> ArrayLike: + """ + Call to `_from_sequence` that returns the object unchanged on Exception. + + Parameters + ---------- + cls : class, subclass of ExtensionArray + obj : arraylike + Values to pass to cls._from_sequence + dtype : ExtensionDtype, optional + + Returns + ------- + ExtensionArray or obj + """ + result: ArrayLike + + if dtype is not None: + try: + result = cls._from_scalars(obj, dtype=dtype) + except (TypeError, ValueError): + return obj + return result + + try: + result = cls._from_sequence(obj, dtype=dtype) + except Exception: + # We can't predict what downstream EA constructors may raise + result = obj + return result + + +@overload +def ensure_dtype_can_hold_na(dtype: np.dtype) -> np.dtype: + ... + + +@overload +def ensure_dtype_can_hold_na(dtype: ExtensionDtype) -> ExtensionDtype: + ... + + +def ensure_dtype_can_hold_na(dtype: DtypeObj) -> DtypeObj: + """ + If we have a dtype that cannot hold NA values, find the best match that can. + """ + if isinstance(dtype, ExtensionDtype): + if dtype._can_hold_na: + return dtype + elif isinstance(dtype, IntervalDtype): + # TODO(GH#45349): don't special-case IntervalDtype, allow + # overriding instead of returning object below. + return IntervalDtype(np.float64, closed=dtype.closed) + return _dtype_obj + elif dtype.kind == "b": + return _dtype_obj + elif dtype.kind in "iu": + return np.dtype(np.float64) + return dtype + + +_canonical_nans = { + np.datetime64: np.datetime64("NaT", "ns"), + np.timedelta64: np.timedelta64("NaT", "ns"), + type(np.nan): np.nan, +} + + +def maybe_promote(dtype: np.dtype, fill_value=np.nan): + """ + Find the minimal dtype that can hold both the given dtype and fill_value. + + Parameters + ---------- + dtype : np.dtype + fill_value : scalar, default np.nan + + Returns + ------- + dtype + Upcasted from dtype argument if necessary. + fill_value + Upcasted from fill_value argument if necessary. + + Raises + ------ + ValueError + If fill_value is a non-scalar and dtype is not object. + """ + orig = fill_value + orig_is_nat = False + if checknull(fill_value): + # https://github.com/pandas-dev/pandas/pull/39692#issuecomment-1441051740 + # avoid cache misses with NaN/NaT values that are not singletons + if fill_value is not NA: + try: + orig_is_nat = np.isnat(fill_value) + except TypeError: + pass + + fill_value = _canonical_nans.get(type(fill_value), fill_value) + + # for performance, we are using a cached version of the actual implementation + # of the function in _maybe_promote. However, this doesn't always work (in case + # of non-hashable arguments), so we fallback to the actual implementation if needed + try: + # error: Argument 3 to "__call__" of "_lru_cache_wrapper" has incompatible type + # "Type[Any]"; expected "Hashable" [arg-type] + dtype, fill_value = _maybe_promote_cached( + dtype, fill_value, type(fill_value) # type: ignore[arg-type] + ) + except TypeError: + # if fill_value is not hashable (required for caching) + dtype, fill_value = _maybe_promote(dtype, fill_value) + + if (dtype == _dtype_obj and orig is not None) or ( + orig_is_nat and np.datetime_data(orig)[0] != "ns" + ): + # GH#51592,53497 restore our potentially non-canonical fill_value + fill_value = orig + return dtype, fill_value + + +@functools.lru_cache +def _maybe_promote_cached(dtype, fill_value, fill_value_type): + # The cached version of _maybe_promote below + # This also use fill_value_type as (unused) argument to use this in the + # cache lookup -> to differentiate 1 and True + return _maybe_promote(dtype, fill_value) + + +def _maybe_promote(dtype: np.dtype, fill_value=np.nan): + # The actual implementation of the function, use `maybe_promote` above for + # a cached version. + if not is_scalar(fill_value): + # with object dtype there is nothing to promote, and the user can + # pass pretty much any weird fill_value they like + if dtype != object: + # with object dtype there is nothing to promote, and the user can + # pass pretty much any weird fill_value they like + raise ValueError("fill_value must be a scalar") + dtype = _dtype_obj + return dtype, fill_value + + if is_valid_na_for_dtype(fill_value, dtype) and dtype.kind in "iufcmM": + dtype = ensure_dtype_can_hold_na(dtype) + fv = na_value_for_dtype(dtype) + return dtype, fv + + elif isinstance(dtype, CategoricalDtype): + if fill_value in dtype.categories or isna(fill_value): + return dtype, fill_value + else: + return object, ensure_object(fill_value) + + elif isna(fill_value): + dtype = _dtype_obj + if fill_value is None: + # but we retain e.g. pd.NA + fill_value = np.nan + return dtype, fill_value + + # returns tuple of (dtype, fill_value) + if issubclass(dtype.type, np.datetime64): + inferred, fv = infer_dtype_from_scalar(fill_value) + if inferred == dtype: + return dtype, fv + + from pandas.core.arrays import DatetimeArray + + dta = DatetimeArray._from_sequence([], dtype="M8[ns]") + try: + fv = dta._validate_setitem_value(fill_value) + return dta.dtype, fv + except (ValueError, TypeError): + return _dtype_obj, fill_value + + elif issubclass(dtype.type, np.timedelta64): + inferred, fv = infer_dtype_from_scalar(fill_value) + if inferred == dtype: + return dtype, fv + + elif inferred.kind == "m": + # different unit, e.g. passed np.timedelta64(24, "h") with dtype=m8[ns] + # see if we can losslessly cast it to our dtype + unit = np.datetime_data(dtype)[0] + try: + td = Timedelta(fill_value).as_unit(unit, round_ok=False) + except OutOfBoundsTimedelta: + return _dtype_obj, fill_value + else: + return dtype, td.asm8 + + return _dtype_obj, fill_value + + elif is_float(fill_value): + if issubclass(dtype.type, np.bool_): + dtype = np.dtype(np.object_) + + elif issubclass(dtype.type, np.integer): + dtype = np.dtype(np.float64) + + elif dtype.kind == "f": + mst = np.min_scalar_type(fill_value) + if mst > dtype: + # e.g. mst is np.float64 and dtype is np.float32 + dtype = mst + + elif dtype.kind == "c": + mst = np.min_scalar_type(fill_value) + dtype = np.promote_types(dtype, mst) + + elif is_bool(fill_value): + if not issubclass(dtype.type, np.bool_): + dtype = np.dtype(np.object_) + + elif is_integer(fill_value): + if issubclass(dtype.type, np.bool_): + dtype = np.dtype(np.object_) + + elif issubclass(dtype.type, np.integer): + if not np_can_cast_scalar(fill_value, dtype): # type: ignore[arg-type] + # upcast to prevent overflow + mst = np.min_scalar_type(fill_value) + dtype = np.promote_types(dtype, mst) + if dtype.kind == "f": + # Case where we disagree with numpy + dtype = np.dtype(np.object_) + + elif is_complex(fill_value): + if issubclass(dtype.type, np.bool_): + dtype = np.dtype(np.object_) + + elif issubclass(dtype.type, (np.integer, np.floating)): + mst = np.min_scalar_type(fill_value) + dtype = np.promote_types(dtype, mst) + + elif dtype.kind == "c": + mst = np.min_scalar_type(fill_value) + if mst > dtype: + # e.g. mst is np.complex128 and dtype is np.complex64 + dtype = mst + + else: + dtype = np.dtype(np.object_) + + # in case we have a string that looked like a number + if issubclass(dtype.type, (bytes, str)): + dtype = np.dtype(np.object_) + + fill_value = _ensure_dtype_type(fill_value, dtype) + return dtype, fill_value + + +def _ensure_dtype_type(value, dtype: np.dtype): + """ + Ensure that the given value is an instance of the given dtype. + + e.g. if out dtype is np.complex64_, we should have an instance of that + as opposed to a python complex object. + + Parameters + ---------- + value : object + dtype : np.dtype + + Returns + ------- + object + """ + # Start with exceptions in which we do _not_ cast to numpy types + + if dtype == _dtype_obj: + return value + + # Note: before we get here we have already excluded isna(value) + return dtype.type(value) + + +def infer_dtype_from(val) -> tuple[DtypeObj, Any]: + """ + Interpret the dtype from a scalar or array. + + Parameters + ---------- + val : object + """ + if not is_list_like(val): + return infer_dtype_from_scalar(val) + return infer_dtype_from_array(val) + + +def infer_dtype_from_scalar(val) -> tuple[DtypeObj, Any]: + """ + Interpret the dtype from a scalar. + + Parameters + ---------- + val : object + """ + dtype: DtypeObj = _dtype_obj + + # a 1-element ndarray + if isinstance(val, np.ndarray): + if val.ndim != 0: + msg = "invalid ndarray passed to infer_dtype_from_scalar" + raise ValueError(msg) + + dtype = val.dtype + val = lib.item_from_zerodim(val) + + elif isinstance(val, str): + # If we create an empty array using a string to infer + # the dtype, NumPy will only allocate one character per entry + # so this is kind of bad. Alternately we could use np.repeat + # instead of np.empty (but then you still don't want things + # coming out as np.str_! + + dtype = _dtype_obj + if using_pyarrow_string_dtype(): + from pandas.core.arrays.string_ import StringDtype + + dtype = StringDtype(storage="pyarrow_numpy") + + elif isinstance(val, (np.datetime64, dt.datetime)): + try: + val = Timestamp(val) + except OutOfBoundsDatetime: + return _dtype_obj, val + + if val is NaT or val.tz is None: + val = val.to_datetime64() + dtype = val.dtype + # TODO: test with datetime(2920, 10, 1) based on test_replace_dtypes + else: + dtype = DatetimeTZDtype(unit=val.unit, tz=val.tz) + + elif isinstance(val, (np.timedelta64, dt.timedelta)): + try: + val = Timedelta(val) + except (OutOfBoundsTimedelta, OverflowError): + dtype = _dtype_obj + else: + if val is NaT: + val = np.timedelta64("NaT", "ns") + else: + val = val.asm8 + dtype = val.dtype + + elif is_bool(val): + dtype = np.dtype(np.bool_) + + elif is_integer(val): + if isinstance(val, np.integer): + dtype = np.dtype(type(val)) + else: + dtype = np.dtype(np.int64) + + try: + np.array(val, dtype=dtype) + except OverflowError: + dtype = np.array(val).dtype + + elif is_float(val): + if isinstance(val, np.floating): + dtype = np.dtype(type(val)) + else: + dtype = np.dtype(np.float64) + + elif is_complex(val): + dtype = np.dtype(np.complex128) + + if isinstance(val, Period): + dtype = PeriodDtype(freq=val.freq) + elif isinstance(val, Interval): + subtype = infer_dtype_from_scalar(val.left)[0] + dtype = IntervalDtype(subtype=subtype, closed=val.closed) + + return dtype, val + + +def dict_compat(d: dict[Scalar, Scalar]) -> dict[Scalar, Scalar]: + """ + Convert datetimelike-keyed dicts to a Timestamp-keyed dict. + + Parameters + ---------- + d: dict-like object + + Returns + ------- + dict + """ + return {maybe_box_datetimelike(key): value for key, value in d.items()} + + +def infer_dtype_from_array(arr) -> tuple[DtypeObj, ArrayLike]: + """ + Infer the dtype from an array. + + Parameters + ---------- + arr : array + + Returns + ------- + tuple (pandas-compat dtype, array) + + + Examples + -------- + >>> np.asarray([1, '1']) + array(['1', '1'], dtype='>> infer_dtype_from_array([1, '1']) + (dtype('O'), [1, '1']) + """ + if isinstance(arr, np.ndarray): + return arr.dtype, arr + + if not is_list_like(arr): + raise TypeError("'arr' must be list-like") + + arr_dtype = getattr(arr, "dtype", None) + if isinstance(arr_dtype, ExtensionDtype): + return arr.dtype, arr + + elif isinstance(arr, ABCSeries): + return arr.dtype, np.asarray(arr) + + # don't force numpy coerce with nan's + inferred = lib.infer_dtype(arr, skipna=False) + if inferred in ["string", "bytes", "mixed", "mixed-integer"]: + return (np.dtype(np.object_), arr) + + arr = np.asarray(arr) + return arr.dtype, arr + + +def _maybe_infer_dtype_type(element): + """ + Try to infer an object's dtype, for use in arithmetic ops. + + Uses `element.dtype` if that's available. + Objects implementing the iterator protocol are cast to a NumPy array, + and from there the array's type is used. + + Parameters + ---------- + element : object + Possibly has a `.dtype` attribute, and possibly the iterator + protocol. + + Returns + ------- + tipo : type + + Examples + -------- + >>> from collections import namedtuple + >>> Foo = namedtuple("Foo", "dtype") + >>> _maybe_infer_dtype_type(Foo(np.dtype("i8"))) + dtype('int64') + """ + tipo = None + if hasattr(element, "dtype"): + tipo = element.dtype + elif is_list_like(element): + element = np.asarray(element) + tipo = element.dtype + return tipo + + +def invalidate_string_dtypes(dtype_set: set[DtypeObj]) -> None: + """ + Change string like dtypes to object for + ``DataFrame.select_dtypes()``. + """ + # error: Argument 1 to has incompatible type "Type[generic]"; expected + # "Union[dtype[Any], ExtensionDtype, None]" + # error: Argument 2 to has incompatible type "Type[generic]"; expected + # "Union[dtype[Any], ExtensionDtype, None]" + non_string_dtypes = dtype_set - { + np.dtype("S").type, # type: ignore[arg-type] + np.dtype(" np.ndarray: + """coerce the indexer input array to the smallest dtype possible""" + length = len(categories) + if length < _int8_max: + return ensure_int8(indexer) + elif length < _int16_max: + return ensure_int16(indexer) + elif length < _int32_max: + return ensure_int32(indexer) + return ensure_int64(indexer) + + +def convert_dtypes( + input_array: ArrayLike, + convert_string: bool = True, + convert_integer: bool = True, + convert_boolean: bool = True, + convert_floating: bool = True, + infer_objects: bool = False, + dtype_backend: Literal["numpy_nullable", "pyarrow"] = "numpy_nullable", +) -> DtypeObj: + """ + Convert objects to best possible type, and optionally, + to types supporting ``pd.NA``. + + Parameters + ---------- + input_array : ExtensionArray or np.ndarray + convert_string : bool, default True + Whether object dtypes should be converted to ``StringDtype()``. + convert_integer : bool, default True + Whether, if possible, conversion can be done to integer extension types. + convert_boolean : bool, defaults True + Whether object dtypes should be converted to ``BooleanDtypes()``. + convert_floating : bool, defaults True + Whether, if possible, conversion can be done to floating extension types. + If `convert_integer` is also True, preference will be give to integer + dtypes if the floats can be faithfully casted to integers. + infer_objects : bool, defaults False + Whether to also infer objects to float/int if possible. Is only hit if the + object array contains pd.NA. + dtype_backend : {'numpy_nullable', 'pyarrow'}, default 'numpy_nullable' + Back-end data type applied to the resultant :class:`DataFrame` + (still experimental). Behaviour is as follows: + + * ``"numpy_nullable"``: returns nullable-dtype-backed :class:`DataFrame` + (default). + * ``"pyarrow"``: returns pyarrow-backed nullable :class:`ArrowDtype` + DataFrame. + + .. versionadded:: 2.0 + + Returns + ------- + np.dtype, or ExtensionDtype + """ + inferred_dtype: str | DtypeObj + + if ( + convert_string or convert_integer or convert_boolean or convert_floating + ) and isinstance(input_array, np.ndarray): + if input_array.dtype == object: + inferred_dtype = lib.infer_dtype(input_array) + else: + inferred_dtype = input_array.dtype + + if is_string_dtype(inferred_dtype): + if not convert_string or inferred_dtype == "bytes": + inferred_dtype = input_array.dtype + else: + inferred_dtype = pandas_dtype_func("string") + + if convert_integer: + target_int_dtype = pandas_dtype_func("Int64") + + if input_array.dtype.kind in "iu": + from pandas.core.arrays.integer import NUMPY_INT_TO_DTYPE + + inferred_dtype = NUMPY_INT_TO_DTYPE.get( + input_array.dtype, target_int_dtype + ) + elif input_array.dtype.kind in "fcb": + # TODO: de-dup with maybe_cast_to_integer_array? + arr = input_array[notna(input_array)] + if (arr.astype(int) == arr).all(): + inferred_dtype = target_int_dtype + else: + inferred_dtype = input_array.dtype + elif ( + infer_objects + and input_array.dtype == object + and (isinstance(inferred_dtype, str) and inferred_dtype == "integer") + ): + inferred_dtype = target_int_dtype + + if convert_floating: + if input_array.dtype.kind in "fcb": + # i.e. numeric but not integer + from pandas.core.arrays.floating import NUMPY_FLOAT_TO_DTYPE + + inferred_float_dtype: DtypeObj = NUMPY_FLOAT_TO_DTYPE.get( + input_array.dtype, pandas_dtype_func("Float64") + ) + # if we could also convert to integer, check if all floats + # are actually integers + if convert_integer: + # TODO: de-dup with maybe_cast_to_integer_array? + arr = input_array[notna(input_array)] + if (arr.astype(int) == arr).all(): + inferred_dtype = pandas_dtype_func("Int64") + else: + inferred_dtype = inferred_float_dtype + else: + inferred_dtype = inferred_float_dtype + elif ( + infer_objects + and input_array.dtype == object + and ( + isinstance(inferred_dtype, str) + and inferred_dtype == "mixed-integer-float" + ) + ): + inferred_dtype = pandas_dtype_func("Float64") + + if convert_boolean: + if input_array.dtype.kind == "b": + inferred_dtype = pandas_dtype_func("boolean") + elif isinstance(inferred_dtype, str) and inferred_dtype == "boolean": + inferred_dtype = pandas_dtype_func("boolean") + + if isinstance(inferred_dtype, str): + # If we couldn't do anything else, then we retain the dtype + inferred_dtype = input_array.dtype + + else: + inferred_dtype = input_array.dtype + + if dtype_backend == "pyarrow": + from pandas.core.arrays.arrow.array import to_pyarrow_type + from pandas.core.arrays.string_ import StringDtype + + assert not isinstance(inferred_dtype, str) + + if ( + (convert_integer and inferred_dtype.kind in "iu") + or (convert_floating and inferred_dtype.kind in "fc") + or (convert_boolean and inferred_dtype.kind == "b") + or (convert_string and isinstance(inferred_dtype, StringDtype)) + or ( + inferred_dtype.kind not in "iufcb" + and not isinstance(inferred_dtype, StringDtype) + ) + ): + if isinstance(inferred_dtype, PandasExtensionDtype) and not isinstance( + inferred_dtype, DatetimeTZDtype + ): + base_dtype = inferred_dtype.base + elif isinstance(inferred_dtype, (BaseMaskedDtype, ArrowDtype)): + base_dtype = inferred_dtype.numpy_dtype + elif isinstance(inferred_dtype, StringDtype): + base_dtype = np.dtype(str) + else: + base_dtype = inferred_dtype + if ( + base_dtype.kind == "O" # type: ignore[union-attr] + and input_array.size > 0 + and isna(input_array).all() + ): + import pyarrow as pa + + pa_type = pa.null() + else: + pa_type = to_pyarrow_type(base_dtype) + if pa_type is not None: + inferred_dtype = ArrowDtype(pa_type) + elif dtype_backend == "numpy_nullable" and isinstance(inferred_dtype, ArrowDtype): + # GH 53648 + inferred_dtype = _arrow_dtype_mapping()[inferred_dtype.pyarrow_dtype] + + # error: Incompatible return value type (got "Union[str, Union[dtype[Any], + # ExtensionDtype]]", expected "Union[dtype[Any], ExtensionDtype]") + return inferred_dtype # type: ignore[return-value] + + +def maybe_infer_to_datetimelike( + value: npt.NDArray[np.object_], +) -> np.ndarray | DatetimeArray | TimedeltaArray | PeriodArray | IntervalArray: + """ + we might have a array (or single object) that is datetime like, + and no dtype is passed don't change the value unless we find a + datetime/timedelta set + + this is pretty strict in that a datetime/timedelta is REQUIRED + in addition to possible nulls/string likes + + Parameters + ---------- + value : np.ndarray[object] + + Returns + ------- + np.ndarray, DatetimeArray, TimedeltaArray, PeriodArray, or IntervalArray + + """ + if not isinstance(value, np.ndarray) or value.dtype != object: + # Caller is responsible for passing only ndarray[object] + raise TypeError(type(value)) # pragma: no cover + if value.ndim != 1: + # Caller is responsible + raise ValueError(value.ndim) # pragma: no cover + + if not len(value): + return value + + # error: Incompatible return value type (got "Union[ExtensionArray, + # ndarray[Any, Any]]", expected "Union[ndarray[Any, Any], DatetimeArray, + # TimedeltaArray, PeriodArray, IntervalArray]") + return lib.maybe_convert_objects( # type: ignore[return-value] + value, + # Here we do not convert numeric dtypes, as if we wanted that, + # numpy would have done it for us. + convert_numeric=False, + convert_non_numeric=True, + dtype_if_all_nat=np.dtype("M8[ns]"), + ) + + +def maybe_cast_to_datetime( + value: np.ndarray | list, dtype: np.dtype +) -> ExtensionArray | np.ndarray: + """ + try to cast the array/value to a datetimelike dtype, converting float + nan to iNaT + + Caller is responsible for handling ExtensionDtype cases and non dt64/td64 + cases. + """ + from pandas.core.arrays.datetimes import DatetimeArray + from pandas.core.arrays.timedeltas import TimedeltaArray + + assert dtype.kind in "mM" + if not is_list_like(value): + raise TypeError("value must be listlike") + + # TODO: _from_sequence would raise ValueError in cases where + # _ensure_nanosecond_dtype raises TypeError + _ensure_nanosecond_dtype(dtype) + + if lib.is_np_dtype(dtype, "m"): + res = TimedeltaArray._from_sequence(value, dtype=dtype) + return res + else: + try: + dta = DatetimeArray._from_sequence(value, dtype=dtype) + except ValueError as err: + # We can give a Series-specific exception message. + if "cannot supply both a tz and a timezone-naive dtype" in str(err): + raise ValueError( + "Cannot convert timezone-aware data to " + "timezone-naive dtype. Use " + "pd.Series(values).dt.tz_localize(None) instead." + ) from err + raise + + return dta + + +def _ensure_nanosecond_dtype(dtype: DtypeObj) -> None: + """ + Convert dtypes with granularity less than nanosecond to nanosecond + + >>> _ensure_nanosecond_dtype(np.dtype("M8[us]")) + + >>> _ensure_nanosecond_dtype(np.dtype("M8[D]")) + Traceback (most recent call last): + ... + TypeError: dtype=datetime64[D] is not supported. Supported resolutions are 's', 'ms', 'us', and 'ns' + + >>> _ensure_nanosecond_dtype(np.dtype("m8[ps]")) + Traceback (most recent call last): + ... + TypeError: dtype=timedelta64[ps] is not supported. Supported resolutions are 's', 'ms', 'us', and 'ns' + """ # noqa: E501 + msg = ( + f"The '{dtype.name}' dtype has no unit. " + f"Please pass in '{dtype.name}[ns]' instead." + ) + + # unpack e.g. SparseDtype + dtype = getattr(dtype, "subtype", dtype) + + if not isinstance(dtype, np.dtype): + # i.e. datetime64tz + pass + + elif dtype.kind in "mM": + if not is_supported_dtype(dtype): + # pre-2.0 we would silently swap in nanos for lower-resolutions, + # raise for above-nano resolutions + if dtype.name in ["datetime64", "timedelta64"]: + raise ValueError(msg) + # TODO: ValueError or TypeError? existing test + # test_constructor_generic_timestamp_bad_frequency expects TypeError + raise TypeError( + f"dtype={dtype} is not supported. Supported resolutions are 's', " + "'ms', 'us', and 'ns'" + ) + + +# TODO: other value-dependent functions to standardize here include +# Index._find_common_type_compat +def find_result_type(left_dtype: DtypeObj, right: Any) -> DtypeObj: + """ + Find the type/dtype for the result of an operation between objects. + + This is similar to find_common_type, but looks at the right object instead + of just its dtype. This can be useful in particular when the right + object does not have a `dtype`. + + Parameters + ---------- + left_dtype : np.dtype or ExtensionDtype + right : Any + + Returns + ------- + np.dtype or ExtensionDtype + + See also + -------- + find_common_type + numpy.result_type + """ + new_dtype: DtypeObj + + if ( + isinstance(left_dtype, np.dtype) + and left_dtype.kind in "iuc" + and (lib.is_integer(right) or lib.is_float(right)) + ): + # e.g. with int8 dtype and right=512, we want to end up with + # np.int16, whereas infer_dtype_from(512) gives np.int64, + # which will make us upcast too far. + if lib.is_float(right) and right.is_integer() and left_dtype.kind != "f": + right = int(right) + # After NEP 50, numpy won't inspect Python scalars + # TODO: do we need to recreate numpy's inspection logic for floats too + # (this breaks some tests) + if isinstance(right, int) and not isinstance(right, np.integer): + # This gives an unsigned type by default + # (if our number is positive) + + # If our left dtype is signed, we might not want this since + # this might give us 1 dtype too big + # We should check if the corresponding int dtype (e.g. int64 for uint64) + # can hold the number + right_dtype = np.min_scalar_type(right) + if right == 0: + # Special case 0 + right = left_dtype + elif ( + not np.issubdtype(left_dtype, np.unsignedinteger) + and 0 < right <= np.iinfo(right_dtype).max + ): + # If left dtype isn't unsigned, check if it fits in the signed dtype + right = np.dtype(f"i{right_dtype.itemsize}") + else: + right = right_dtype + + new_dtype = np.result_type(left_dtype, right) + + elif is_valid_na_for_dtype(right, left_dtype): + # e.g. IntervalDtype[int] and None/np.nan + new_dtype = ensure_dtype_can_hold_na(left_dtype) + + else: + dtype, _ = infer_dtype_from(right) + new_dtype = find_common_type([left_dtype, dtype]) + + return new_dtype + + +def common_dtype_categorical_compat( + objs: Sequence[Index | ArrayLike], dtype: DtypeObj +) -> DtypeObj: + """ + Update the result of find_common_type to account for NAs in a Categorical. + + Parameters + ---------- + objs : list[np.ndarray | ExtensionArray | Index] + dtype : np.dtype or ExtensionDtype + + Returns + ------- + np.dtype or ExtensionDtype + """ + # GH#38240 + + # TODO: more generally, could do `not can_hold_na(dtype)` + if lib.is_np_dtype(dtype, "iu"): + for obj in objs: + # We don't want to accientally allow e.g. "categorical" str here + obj_dtype = getattr(obj, "dtype", None) + if isinstance(obj_dtype, CategoricalDtype): + if isinstance(obj, ABCIndex): + # This check may already be cached + hasnas = obj.hasnans + else: + # Categorical + hasnas = cast("Categorical", obj)._hasna + + if hasnas: + # see test_union_int_categorical_with_nan + dtype = np.dtype(np.float64) + break + return dtype + + +def np_find_common_type(*dtypes: np.dtype) -> np.dtype: + """ + np.find_common_type implementation pre-1.25 deprecation using np.result_type + https://github.com/pandas-dev/pandas/pull/49569#issuecomment-1308300065 + + Parameters + ---------- + dtypes : np.dtypes + + Returns + ------- + np.dtype + """ + try: + common_dtype = np.result_type(*dtypes) + if common_dtype.kind in "mMSU": + # NumPy promotion currently (1.25) misbehaves for for times and strings, + # so fall back to object (find_common_dtype did unless there + # was only one dtype) + common_dtype = np.dtype("O") + + except TypeError: + common_dtype = np.dtype("O") + return common_dtype + + +@overload +def find_common_type(types: list[np.dtype]) -> np.dtype: + ... + + +@overload +def find_common_type(types: list[ExtensionDtype]) -> DtypeObj: + ... + + +@overload +def find_common_type(types: list[DtypeObj]) -> DtypeObj: + ... + + +def find_common_type(types): + """ + Find a common data type among the given dtypes. + + Parameters + ---------- + types : list of dtypes + + Returns + ------- + pandas extension or numpy dtype + + See Also + -------- + numpy.find_common_type + + """ + if not types: + raise ValueError("no types given") + + first = types[0] + + # workaround for find_common_type([np.dtype('datetime64[ns]')] * 2) + # => object + if lib.dtypes_all_equal(list(types)): + return first + + # get unique types (dict.fromkeys is used as order-preserving set()) + types = list(dict.fromkeys(types).keys()) + + if any(isinstance(t, ExtensionDtype) for t in types): + for t in types: + if isinstance(t, ExtensionDtype): + res = t._get_common_dtype(types) + if res is not None: + return res + return np.dtype("object") + + # take lowest unit + if all(lib.is_np_dtype(t, "M") for t in types): + return np.dtype(max(types)) + if all(lib.is_np_dtype(t, "m") for t in types): + return np.dtype(max(types)) + + # don't mix bool / int or float or complex + # this is different from numpy, which casts bool with float/int as int + has_bools = any(t.kind == "b" for t in types) + if has_bools: + for t in types: + if t.kind in "iufc": + return np.dtype("object") + + return np_find_common_type(*types) + + +def construct_2d_arraylike_from_scalar( + value: Scalar, length: int, width: int, dtype: np.dtype, copy: bool +) -> np.ndarray: + shape = (length, width) + + if dtype.kind in "mM": + value = _maybe_box_and_unbox_datetimelike(value, dtype) + elif dtype == _dtype_obj: + if isinstance(value, (np.timedelta64, np.datetime64)): + # calling np.array below would cast to pytimedelta/pydatetime + out = np.empty(shape, dtype=object) + out.fill(value) + return out + + # Attempt to coerce to a numpy array + try: + if not copy: + arr = np.asarray(value, dtype=dtype) + else: + arr = np.array(value, dtype=dtype, copy=copy) + except (ValueError, TypeError) as err: + raise TypeError( + f"DataFrame constructor called with incompatible data and dtype: {err}" + ) from err + + if arr.ndim != 0: + raise ValueError("DataFrame constructor not properly called!") + + return np.full(shape, arr) + + +def construct_1d_arraylike_from_scalar( + value: Scalar, length: int, dtype: DtypeObj | None +) -> ArrayLike: + """ + create a np.ndarray / pandas type of specified shape and dtype + filled with values + + Parameters + ---------- + value : scalar value + length : int + dtype : pandas_dtype or np.dtype + + Returns + ------- + np.ndarray / pandas type of length, filled with value + + """ + + if dtype is None: + try: + dtype, value = infer_dtype_from_scalar(value) + except OutOfBoundsDatetime: + dtype = _dtype_obj + + if isinstance(dtype, ExtensionDtype): + cls = dtype.construct_array_type() + seq = [] if length == 0 else [value] + subarr = cls._from_sequence(seq, dtype=dtype).repeat(length) + + else: + if length and dtype.kind in "iu" and isna(value): + # coerce if we have nan for an integer dtype + dtype = np.dtype("float64") + elif lib.is_np_dtype(dtype, "US"): + # we need to coerce to object dtype to avoid + # to allow numpy to take our string as a scalar value + dtype = np.dtype("object") + if not isna(value): + value = ensure_str(value) + elif dtype.kind in "mM": + value = _maybe_box_and_unbox_datetimelike(value, dtype) + + subarr = np.empty(length, dtype=dtype) + if length: + # GH 47391: numpy > 1.24 will raise filling np.nan into int dtypes + subarr.fill(value) + + return subarr + + +def _maybe_box_and_unbox_datetimelike(value: Scalar, dtype: DtypeObj): + # Caller is responsible for checking dtype.kind in "mM" + + if isinstance(value, dt.datetime): + # we dont want to box dt64, in particular datetime64("NaT") + value = maybe_box_datetimelike(value, dtype) + + return _maybe_unbox_datetimelike(value, dtype) + + +def construct_1d_object_array_from_listlike(values: Sized) -> np.ndarray: + """ + Transform any list-like object in a 1-dimensional numpy array of object + dtype. + + Parameters + ---------- + values : any iterable which has a len() + + Raises + ------ + TypeError + * If `values` does not have a len() + + Returns + ------- + 1-dimensional numpy array of dtype object + """ + # numpy will try to interpret nested lists as further dimensions, hence + # making a 1D array that contains list-likes is a bit tricky: + result = np.empty(len(values), dtype="object") + result[:] = values + return result + + +def maybe_cast_to_integer_array(arr: list | np.ndarray, dtype: np.dtype) -> np.ndarray: + """ + Takes any dtype and returns the casted version, raising for when data is + incompatible with integer/unsigned integer dtypes. + + Parameters + ---------- + arr : np.ndarray or list + The array to cast. + dtype : np.dtype + The integer dtype to cast the array to. + + Returns + ------- + ndarray + Array of integer or unsigned integer dtype. + + Raises + ------ + OverflowError : the dtype is incompatible with the data + ValueError : loss of precision has occurred during casting + + Examples + -------- + If you try to coerce negative values to unsigned integers, it raises: + + >>> pd.Series([-1], dtype="uint64") + Traceback (most recent call last): + ... + OverflowError: Trying to coerce negative values to unsigned integers + + Also, if you try to coerce float values to integers, it raises: + + >>> maybe_cast_to_integer_array([1, 2, 3.5], dtype=np.dtype("int64")) + Traceback (most recent call last): + ... + ValueError: Trying to coerce float values to integers + """ + assert dtype.kind in "iu" + + try: + if not isinstance(arr, np.ndarray): + with warnings.catch_warnings(): + # We already disallow dtype=uint w/ negative numbers + # (test_constructor_coercion_signed_to_unsigned) so safe to ignore. + warnings.filterwarnings( + "ignore", + "NumPy will stop allowing conversion of out-of-bound Python int", + DeprecationWarning, + ) + casted = np.asarray(arr, dtype=dtype) + else: + with warnings.catch_warnings(): + warnings.filterwarnings("ignore", category=RuntimeWarning) + casted = arr.astype(dtype, copy=False) + except OverflowError as err: + raise OverflowError( + "The elements provided in the data cannot all be " + f"casted to the dtype {dtype}" + ) from err + + if isinstance(arr, np.ndarray) and arr.dtype == dtype: + # avoid expensive array_equal check + return casted + + with warnings.catch_warnings(): + warnings.filterwarnings("ignore", category=RuntimeWarning) + warnings.filterwarnings( + "ignore", "elementwise comparison failed", FutureWarning + ) + if np.array_equal(arr, casted): + return casted + + # We do this casting to allow for proper + # data and dtype checking. + # + # We didn't do this earlier because NumPy + # doesn't handle `uint64` correctly. + arr = np.asarray(arr) + + if np.issubdtype(arr.dtype, str): + # TODO(numpy-2.0 min): This case will raise an OverflowError above + if (casted.astype(str) == arr).all(): + return casted + raise ValueError(f"string values cannot be losslessly cast to {dtype}") + + if dtype.kind == "u" and (arr < 0).any(): + # TODO: can this be hit anymore after numpy 2.0? + raise OverflowError("Trying to coerce negative values to unsigned integers") + + if arr.dtype.kind == "f": + if not np.isfinite(arr).all(): + raise IntCastingNaNError( + "Cannot convert non-finite values (NA or inf) to integer" + ) + raise ValueError("Trying to coerce float values to integers") + if arr.dtype == object: + raise ValueError("Trying to coerce float values to integers") + + if casted.dtype < arr.dtype: + # TODO: Can this path be hit anymore with numpy > 2 + # GH#41734 e.g. [1, 200, 923442] and dtype="int8" -> overflows + raise ValueError( + f"Values are too large to be losslessly converted to {dtype}. " + f"To cast anyway, use pd.Series(values).astype({dtype})" + ) + + if arr.dtype.kind in "mM": + # test_constructor_maskedarray_nonfloat + raise TypeError( + f"Constructing a Series or DataFrame from {arr.dtype} values and " + f"dtype={dtype} is not supported. Use values.view({dtype}) instead." + ) + + # No known cases that get here, but raising explicitly to cover our bases. + raise ValueError(f"values cannot be losslessly cast to {dtype}") + + +def can_hold_element(arr: ArrayLike, element: Any) -> bool: + """ + Can we do an inplace setitem with this element in an array with this dtype? + + Parameters + ---------- + arr : np.ndarray or ExtensionArray + element : Any + + Returns + ------- + bool + """ + dtype = arr.dtype + if not isinstance(dtype, np.dtype) or dtype.kind in "mM": + if isinstance(dtype, (PeriodDtype, IntervalDtype, DatetimeTZDtype, np.dtype)): + # np.dtype here catches datetime64ns and timedelta64ns; we assume + # in this case that we have DatetimeArray/TimedeltaArray + arr = cast( + "PeriodArray | DatetimeArray | TimedeltaArray | IntervalArray", arr + ) + try: + arr._validate_setitem_value(element) + return True + except (ValueError, TypeError): + return False + + # This is technically incorrect, but maintains the behavior of + # ExtensionBlock._can_hold_element + return True + + try: + np_can_hold_element(dtype, element) + return True + except (TypeError, LossySetitemError): + return False + + +def np_can_hold_element(dtype: np.dtype, element: Any) -> Any: + """ + Raise if we cannot losslessly set this element into an ndarray with this dtype. + + Specifically about places where we disagree with numpy. i.e. there are + cases where numpy will raise in doing the setitem that we do not check + for here, e.g. setting str "X" into a numeric ndarray. + + Returns + ------- + Any + The element, potentially cast to the dtype. + + Raises + ------ + ValueError : If we cannot losslessly store this element with this dtype. + """ + if dtype == _dtype_obj: + return element + + tipo = _maybe_infer_dtype_type(element) + + if dtype.kind in "iu": + if isinstance(element, range): + if _dtype_can_hold_range(element, dtype): + return element + raise LossySetitemError + + if is_integer(element) or (is_float(element) and element.is_integer()): + # e.g. test_setitem_series_int8 if we have a python int 1 + # tipo may be np.int32, despite the fact that it will fit + # in smaller int dtypes. + info = np.iinfo(dtype) + if info.min <= element <= info.max: + return dtype.type(element) + raise LossySetitemError + + if tipo is not None: + if tipo.kind not in "iu": + if isinstance(element, np.ndarray) and element.dtype.kind == "f": + # If all can be losslessly cast to integers, then we can hold them + with np.errstate(invalid="ignore"): + # We check afterwards if cast was losslessly, so no need to show + # the warning + casted = element.astype(dtype) + comp = casted == element + if comp.all(): + # Return the casted values bc they can be passed to + # np.putmask, whereas the raw values cannot. + # see TestSetitemFloatNDarrayIntoIntegerSeries + return casted + raise LossySetitemError + + elif isinstance(element, ABCExtensionArray) and isinstance( + element.dtype, CategoricalDtype + ): + # GH#52927 setting Categorical value into non-EA frame + # TODO: general-case for EAs? + try: + casted = element.astype(dtype) + except (ValueError, TypeError): + raise LossySetitemError + # Check for cases of either + # a) lossy overflow/rounding or + # b) semantic changes like dt64->int64 + comp = casted == element + if not comp.all(): + raise LossySetitemError + return casted + + # Anything other than integer we cannot hold + raise LossySetitemError + if ( + dtype.kind == "u" + and isinstance(element, np.ndarray) + and element.dtype.kind == "i" + ): + # see test_where_uint64 + casted = element.astype(dtype) + if (casted == element).all(): + # TODO: faster to check (element >=0).all()? potential + # itemsize issues there? + return casted + raise LossySetitemError + if dtype.itemsize < tipo.itemsize: + raise LossySetitemError + if not isinstance(tipo, np.dtype): + # i.e. nullable IntegerDtype; we can put this into an ndarray + # losslessly iff it has no NAs + arr = element._values if isinstance(element, ABCSeries) else element + if arr._hasna: + raise LossySetitemError + return element + + return element + + raise LossySetitemError + + if dtype.kind == "f": + if lib.is_integer(element) or lib.is_float(element): + casted = dtype.type(element) + if np.isnan(casted) or casted == element: + return casted + # otherwise e.g. overflow see TestCoercionFloat32 + raise LossySetitemError + + if tipo is not None: + # TODO: itemsize check? + if tipo.kind not in "iuf": + # Anything other than float/integer we cannot hold + raise LossySetitemError + if not isinstance(tipo, np.dtype): + # i.e. nullable IntegerDtype or FloatingDtype; + # we can put this into an ndarray losslessly iff it has no NAs + if element._hasna: + raise LossySetitemError + return element + elif tipo.itemsize > dtype.itemsize or tipo.kind != dtype.kind: + if isinstance(element, np.ndarray): + # e.g. TestDataFrameIndexingWhere::test_where_alignment + casted = element.astype(dtype) + if np.array_equal(casted, element, equal_nan=True): + return casted + raise LossySetitemError + + return element + + raise LossySetitemError + + if dtype.kind == "c": + if lib.is_integer(element) or lib.is_complex(element) or lib.is_float(element): + if np.isnan(element): + # see test_where_complex GH#6345 + return dtype.type(element) + + with warnings.catch_warnings(): + warnings.filterwarnings("ignore") + casted = dtype.type(element) + if casted == element: + return casted + # otherwise e.g. overflow see test_32878_complex_itemsize + raise LossySetitemError + + if tipo is not None: + if tipo.kind in "iufc": + return element + raise LossySetitemError + raise LossySetitemError + + if dtype.kind == "b": + if tipo is not None: + if tipo.kind == "b": + if not isinstance(tipo, np.dtype): + # i.e. we have a BooleanArray + if element._hasna: + # i.e. there are pd.NA elements + raise LossySetitemError + return element + raise LossySetitemError + if lib.is_bool(element): + return element + raise LossySetitemError + + if dtype.kind == "S": + # TODO: test tests.frame.methods.test_replace tests get here, + # need more targeted tests. xref phofl has a PR about this + if tipo is not None: + if tipo.kind == "S" and tipo.itemsize <= dtype.itemsize: + return element + raise LossySetitemError + if isinstance(element, bytes) and len(element) <= dtype.itemsize: + return element + raise LossySetitemError + + if dtype.kind == "V": + # i.e. np.void, which cannot hold _anything_ + raise LossySetitemError + + raise NotImplementedError(dtype) + + +def _dtype_can_hold_range(rng: range, dtype: np.dtype) -> bool: + """ + _maybe_infer_dtype_type infers to int64 (and float64 for very large endpoints), + but in many cases a range can be held by a smaller integer dtype. + Check if this is one of those cases. + """ + if not len(rng): + return True + return np_can_cast_scalar(rng.start, dtype) and np_can_cast_scalar(rng.stop, dtype) + + +def np_can_cast_scalar(element: Scalar, dtype: np.dtype) -> bool: + """ + np.can_cast pandas-equivalent for pre 2-0 behavior that allowed scalar + inference + + Parameters + ---------- + element : Scalar + dtype : np.dtype + + Returns + ------- + bool + """ + try: + np_can_hold_element(dtype, element) + return True + except (LossySetitemError, NotImplementedError): + return False diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/dtypes/common.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/dtypes/common.py new file mode 100644 index 0000000000000000000000000000000000000000..df0251d141984c9223d44a178db99b2651af7e78 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/dtypes/common.py @@ -0,0 +1,1748 @@ +""" +Common type operations. +""" +from __future__ import annotations + +from typing import ( + TYPE_CHECKING, + Any, + Callable, +) +import warnings + +import numpy as np + +from pandas._libs import ( + Interval, + Period, + algos, + lib, +) +from pandas._libs.tslibs import conversion +from pandas.util._exceptions import find_stack_level + +from pandas.core.dtypes.base import _registry as registry +from pandas.core.dtypes.dtypes import ( + CategoricalDtype, + DatetimeTZDtype, + ExtensionDtype, + IntervalDtype, + PeriodDtype, + SparseDtype, +) +from pandas.core.dtypes.generic import ABCIndex +from pandas.core.dtypes.inference import ( + is_array_like, + is_bool, + is_complex, + is_dataclass, + is_decimal, + is_dict_like, + is_file_like, + is_float, + is_hashable, + is_integer, + is_interval, + is_iterator, + is_list_like, + is_named_tuple, + is_nested_list_like, + is_number, + is_re, + is_re_compilable, + is_scalar, + is_sequence, +) + +if TYPE_CHECKING: + from pandas._typing import ( + ArrayLike, + DtypeObj, + ) + +DT64NS_DTYPE = conversion.DT64NS_DTYPE +TD64NS_DTYPE = conversion.TD64NS_DTYPE +INT64_DTYPE = np.dtype(np.int64) + +# oh the troubles to reduce import time +_is_scipy_sparse = None + +ensure_float64 = algos.ensure_float64 +ensure_int64 = algos.ensure_int64 +ensure_int32 = algos.ensure_int32 +ensure_int16 = algos.ensure_int16 +ensure_int8 = algos.ensure_int8 +ensure_platform_int = algos.ensure_platform_int +ensure_object = algos.ensure_object +ensure_uint64 = algos.ensure_uint64 + + +def ensure_str(value: bytes | Any) -> str: + """ + Ensure that bytes and non-strings get converted into ``str`` objects. + """ + if isinstance(value, bytes): + value = value.decode("utf-8") + elif not isinstance(value, str): + value = str(value) + return value + + +def ensure_python_int(value: int | np.integer) -> int: + """ + Ensure that a value is a python int. + + Parameters + ---------- + value: int or numpy.integer + + Returns + ------- + int + + Raises + ------ + TypeError: if the value isn't an int or can't be converted to one. + """ + if not (is_integer(value) or is_float(value)): + if not is_scalar(value): + raise TypeError( + f"Value needs to be a scalar value, was type {type(value).__name__}" + ) + raise TypeError(f"Wrong type {type(value)} for value {value}") + try: + new_value = int(value) + assert new_value == value + except (TypeError, ValueError, AssertionError) as err: + raise TypeError(f"Wrong type {type(value)} for value {value}") from err + return new_value + + +def classes(*klasses) -> Callable: + """Evaluate if the tipo is a subclass of the klasses.""" + return lambda tipo: issubclass(tipo, klasses) + + +def _classes_and_not_datetimelike(*klasses) -> Callable: + """ + Evaluate if the tipo is a subclass of the klasses + and not a datetimelike. + """ + return lambda tipo: ( + issubclass(tipo, klasses) + and not issubclass(tipo, (np.datetime64, np.timedelta64)) + ) + + +def is_object_dtype(arr_or_dtype) -> bool: + """ + Check whether an array-like or dtype is of the object dtype. + + Parameters + ---------- + arr_or_dtype : array-like or dtype + The array-like or dtype to check. + + Returns + ------- + boolean + Whether or not the array-like or dtype is of the object dtype. + + Examples + -------- + >>> from pandas.api.types import is_object_dtype + >>> is_object_dtype(object) + True + >>> is_object_dtype(int) + False + >>> is_object_dtype(np.array([], dtype=object)) + True + >>> is_object_dtype(np.array([], dtype=int)) + False + >>> is_object_dtype([1, 2, 3]) + False + """ + return _is_dtype_type(arr_or_dtype, classes(np.object_)) + + +def is_sparse(arr) -> bool: + """ + Check whether an array-like is a 1-D pandas sparse array. + + .. deprecated:: 2.1.0 + Use isinstance(dtype, pd.SparseDtype) instead. + + Check that the one-dimensional array-like is a pandas sparse array. + Returns True if it is a pandas sparse array, not another type of + sparse array. + + Parameters + ---------- + arr : array-like + Array-like to check. + + Returns + ------- + bool + Whether or not the array-like is a pandas sparse array. + + Examples + -------- + Returns `True` if the parameter is a 1-D pandas sparse array. + + >>> from pandas.api.types import is_sparse + >>> is_sparse(pd.arrays.SparseArray([0, 0, 1, 0])) + True + >>> is_sparse(pd.Series(pd.arrays.SparseArray([0, 0, 1, 0]))) + True + + Returns `False` if the parameter is not sparse. + + >>> is_sparse(np.array([0, 0, 1, 0])) + False + >>> is_sparse(pd.Series([0, 1, 0, 0])) + False + + Returns `False` if the parameter is not a pandas sparse array. + + >>> from scipy.sparse import bsr_matrix + >>> is_sparse(bsr_matrix([0, 1, 0, 0])) + False + + Returns `False` if the parameter has more than one dimension. + """ + warnings.warn( + "is_sparse is deprecated and will be removed in a future " + "version. Check `isinstance(dtype, pd.SparseDtype)` instead.", + DeprecationWarning, + stacklevel=2, + ) + + dtype = getattr(arr, "dtype", arr) + return isinstance(dtype, SparseDtype) + + +def is_scipy_sparse(arr) -> bool: + """ + Check whether an array-like is a scipy.sparse.spmatrix instance. + + Parameters + ---------- + arr : array-like + The array-like to check. + + Returns + ------- + boolean + Whether or not the array-like is a scipy.sparse.spmatrix instance. + + Notes + ----- + If scipy is not installed, this function will always return False. + + Examples + -------- + >>> from scipy.sparse import bsr_matrix + >>> is_scipy_sparse(bsr_matrix([1, 2, 3])) + True + >>> is_scipy_sparse(pd.arrays.SparseArray([1, 2, 3])) + False + """ + global _is_scipy_sparse + + if _is_scipy_sparse is None: # pylint: disable=used-before-assignment + try: + from scipy.sparse import issparse as _is_scipy_sparse + except ImportError: + _is_scipy_sparse = lambda _: False + + assert _is_scipy_sparse is not None + return _is_scipy_sparse(arr) + + +def is_datetime64_dtype(arr_or_dtype) -> bool: + """ + Check whether an array-like or dtype is of the datetime64 dtype. + + Parameters + ---------- + arr_or_dtype : array-like or dtype + The array-like or dtype to check. + + Returns + ------- + boolean + Whether or not the array-like or dtype is of the datetime64 dtype. + + Examples + -------- + >>> from pandas.api.types import is_datetime64_dtype + >>> is_datetime64_dtype(object) + False + >>> is_datetime64_dtype(np.datetime64) + True + >>> is_datetime64_dtype(np.array([], dtype=int)) + False + >>> is_datetime64_dtype(np.array([], dtype=np.datetime64)) + True + >>> is_datetime64_dtype([1, 2, 3]) + False + """ + if isinstance(arr_or_dtype, np.dtype): + # GH#33400 fastpath for dtype object + return arr_or_dtype.kind == "M" + return _is_dtype_type(arr_or_dtype, classes(np.datetime64)) + + +def is_datetime64tz_dtype(arr_or_dtype) -> bool: + """ + Check whether an array-like or dtype is of a DatetimeTZDtype dtype. + + .. deprecated:: 2.1.0 + Use isinstance(dtype, pd.DatetimeTZDtype) instead. + + Parameters + ---------- + arr_or_dtype : array-like or dtype + The array-like or dtype to check. + + Returns + ------- + boolean + Whether or not the array-like or dtype is of a DatetimeTZDtype dtype. + + Examples + -------- + >>> from pandas.api.types import is_datetime64tz_dtype + >>> is_datetime64tz_dtype(object) + False + >>> is_datetime64tz_dtype([1, 2, 3]) + False + >>> is_datetime64tz_dtype(pd.DatetimeIndex([1, 2, 3])) # tz-naive + False + >>> is_datetime64tz_dtype(pd.DatetimeIndex([1, 2, 3], tz="US/Eastern")) + True + + >>> from pandas.core.dtypes.dtypes import DatetimeTZDtype + >>> dtype = DatetimeTZDtype("ns", tz="US/Eastern") + >>> s = pd.Series([], dtype=dtype) + >>> is_datetime64tz_dtype(dtype) + True + >>> is_datetime64tz_dtype(s) + True + """ + # GH#52607 + warnings.warn( + "is_datetime64tz_dtype is deprecated and will be removed in a future " + "version. Check `isinstance(dtype, pd.DatetimeTZDtype)` instead.", + DeprecationWarning, + stacklevel=2, + ) + if isinstance(arr_or_dtype, DatetimeTZDtype): + # GH#33400 fastpath for dtype object + # GH 34986 + return True + + if arr_or_dtype is None: + return False + return DatetimeTZDtype.is_dtype(arr_or_dtype) + + +def is_timedelta64_dtype(arr_or_dtype) -> bool: + """ + Check whether an array-like or dtype is of the timedelta64 dtype. + + Parameters + ---------- + arr_or_dtype : array-like or dtype + The array-like or dtype to check. + + Returns + ------- + boolean + Whether or not the array-like or dtype is of the timedelta64 dtype. + + Examples + -------- + >>> from pandas.core.dtypes.common import is_timedelta64_dtype + >>> is_timedelta64_dtype(object) + False + >>> is_timedelta64_dtype(np.timedelta64) + True + >>> is_timedelta64_dtype([1, 2, 3]) + False + >>> is_timedelta64_dtype(pd.Series([], dtype="timedelta64[ns]")) + True + >>> is_timedelta64_dtype('0 days') + False + """ + if isinstance(arr_or_dtype, np.dtype): + # GH#33400 fastpath for dtype object + return arr_or_dtype.kind == "m" + + return _is_dtype_type(arr_or_dtype, classes(np.timedelta64)) + + +def is_period_dtype(arr_or_dtype) -> bool: + """ + Check whether an array-like or dtype is of the Period dtype. + + .. deprecated:: 2.2.0 + Use isinstance(dtype, pd.Period) instead. + + Parameters + ---------- + arr_or_dtype : array-like or dtype + The array-like or dtype to check. + + Returns + ------- + boolean + Whether or not the array-like or dtype is of the Period dtype. + + Examples + -------- + >>> from pandas.core.dtypes.common import is_period_dtype + >>> is_period_dtype(object) + False + >>> is_period_dtype(pd.PeriodDtype(freq="D")) + True + >>> is_period_dtype([1, 2, 3]) + False + >>> is_period_dtype(pd.Period("2017-01-01")) + False + >>> is_period_dtype(pd.PeriodIndex([], freq="Y")) + True + """ + warnings.warn( + "is_period_dtype is deprecated and will be removed in a future version. " + "Use `isinstance(dtype, pd.PeriodDtype)` instead", + DeprecationWarning, + stacklevel=2, + ) + if isinstance(arr_or_dtype, ExtensionDtype): + # GH#33400 fastpath for dtype object + return arr_or_dtype.type is Period + + if arr_or_dtype is None: + return False + return PeriodDtype.is_dtype(arr_or_dtype) + + +def is_interval_dtype(arr_or_dtype) -> bool: + """ + Check whether an array-like or dtype is of the Interval dtype. + + .. deprecated:: 2.2.0 + Use isinstance(dtype, pd.IntervalDtype) instead. + + Parameters + ---------- + arr_or_dtype : array-like or dtype + The array-like or dtype to check. + + Returns + ------- + boolean + Whether or not the array-like or dtype is of the Interval dtype. + + Examples + -------- + >>> from pandas.core.dtypes.common import is_interval_dtype + >>> is_interval_dtype(object) + False + >>> is_interval_dtype(pd.IntervalDtype()) + True + >>> is_interval_dtype([1, 2, 3]) + False + >>> + >>> interval = pd.Interval(1, 2, closed="right") + >>> is_interval_dtype(interval) + False + >>> is_interval_dtype(pd.IntervalIndex([interval])) + True + """ + # GH#52607 + warnings.warn( + "is_interval_dtype is deprecated and will be removed in a future version. " + "Use `isinstance(dtype, pd.IntervalDtype)` instead", + DeprecationWarning, + stacklevel=2, + ) + if isinstance(arr_or_dtype, ExtensionDtype): + # GH#33400 fastpath for dtype object + return arr_or_dtype.type is Interval + + if arr_or_dtype is None: + return False + return IntervalDtype.is_dtype(arr_or_dtype) + + +def is_categorical_dtype(arr_or_dtype) -> bool: + """ + Check whether an array-like or dtype is of the Categorical dtype. + + .. deprecated:: 2.2.0 + Use isinstance(dtype, pd.CategoricalDtype) instead. + + Parameters + ---------- + arr_or_dtype : array-like or dtype + The array-like or dtype to check. + + Returns + ------- + boolean + Whether or not the array-like or dtype is of the Categorical dtype. + + Examples + -------- + >>> from pandas.api.types import is_categorical_dtype + >>> from pandas import CategoricalDtype + >>> is_categorical_dtype(object) + False + >>> is_categorical_dtype(CategoricalDtype()) + True + >>> is_categorical_dtype([1, 2, 3]) + False + >>> is_categorical_dtype(pd.Categorical([1, 2, 3])) + True + >>> is_categorical_dtype(pd.CategoricalIndex([1, 2, 3])) + True + """ + # GH#52527 + warnings.warn( + "is_categorical_dtype is deprecated and will be removed in a future " + "version. Use isinstance(dtype, pd.CategoricalDtype) instead", + DeprecationWarning, + stacklevel=2, + ) + if isinstance(arr_or_dtype, ExtensionDtype): + # GH#33400 fastpath for dtype object + return arr_or_dtype.name == "category" + + if arr_or_dtype is None: + return False + return CategoricalDtype.is_dtype(arr_or_dtype) + + +def is_string_or_object_np_dtype(dtype: np.dtype) -> bool: + """ + Faster alternative to is_string_dtype, assumes we have a np.dtype object. + """ + return dtype == object or dtype.kind in "SU" + + +def is_string_dtype(arr_or_dtype) -> bool: + """ + Check whether the provided array or dtype is of the string dtype. + + If an array is passed with an object dtype, the elements must be + inferred as strings. + + Parameters + ---------- + arr_or_dtype : array-like or dtype + The array or dtype to check. + + Returns + ------- + boolean + Whether or not the array or dtype is of the string dtype. + + Examples + -------- + >>> from pandas.api.types import is_string_dtype + >>> is_string_dtype(str) + True + >>> is_string_dtype(object) + True + >>> is_string_dtype(int) + False + >>> is_string_dtype(np.array(['a', 'b'])) + True + >>> is_string_dtype(pd.Series([1, 2])) + False + >>> is_string_dtype(pd.Series([1, 2], dtype=object)) + False + """ + if hasattr(arr_or_dtype, "dtype") and _get_dtype(arr_or_dtype).kind == "O": + return is_all_strings(arr_or_dtype) + + def condition(dtype) -> bool: + if is_string_or_object_np_dtype(dtype): + return True + try: + return dtype == "string" + except TypeError: + return False + + return _is_dtype(arr_or_dtype, condition) + + +def is_dtype_equal(source, target) -> bool: + """ + Check if two dtypes are equal. + + Parameters + ---------- + source : The first dtype to compare + target : The second dtype to compare + + Returns + ------- + boolean + Whether or not the two dtypes are equal. + + Examples + -------- + >>> is_dtype_equal(int, float) + False + >>> is_dtype_equal("int", int) + True + >>> is_dtype_equal(object, "category") + False + >>> is_dtype_equal(CategoricalDtype(), "category") + True + >>> is_dtype_equal(DatetimeTZDtype(tz="UTC"), "datetime64") + False + """ + if isinstance(target, str): + if not isinstance(source, str): + # GH#38516 ensure we get the same behavior from + # is_dtype_equal(CDT, "category") and CDT == "category" + try: + src = _get_dtype(source) + if isinstance(src, ExtensionDtype): + return src == target + except (TypeError, AttributeError, ImportError): + return False + elif isinstance(source, str): + return is_dtype_equal(target, source) + + try: + source = _get_dtype(source) + target = _get_dtype(target) + return source == target + except (TypeError, AttributeError, ImportError): + # invalid comparison + # object == category will hit this + return False + + +def is_integer_dtype(arr_or_dtype) -> bool: + """ + Check whether the provided array or dtype is of an integer dtype. + + Unlike in `is_any_int_dtype`, timedelta64 instances will return False. + + The nullable Integer dtypes (e.g. pandas.Int64Dtype) are also considered + as integer by this function. + + Parameters + ---------- + arr_or_dtype : array-like or dtype + The array or dtype to check. + + Returns + ------- + boolean + Whether or not the array or dtype is of an integer dtype and + not an instance of timedelta64. + + Examples + -------- + >>> from pandas.api.types import is_integer_dtype + >>> is_integer_dtype(str) + False + >>> is_integer_dtype(int) + True + >>> is_integer_dtype(float) + False + >>> is_integer_dtype(np.uint64) + True + >>> is_integer_dtype('int8') + True + >>> is_integer_dtype('Int8') + True + >>> is_integer_dtype(pd.Int8Dtype) + True + >>> is_integer_dtype(np.datetime64) + False + >>> is_integer_dtype(np.timedelta64) + False + >>> is_integer_dtype(np.array(['a', 'b'])) + False + >>> is_integer_dtype(pd.Series([1, 2])) + True + >>> is_integer_dtype(np.array([], dtype=np.timedelta64)) + False + >>> is_integer_dtype(pd.Index([1, 2.])) # float + False + """ + return _is_dtype_type( + arr_or_dtype, _classes_and_not_datetimelike(np.integer) + ) or _is_dtype( + arr_or_dtype, lambda typ: isinstance(typ, ExtensionDtype) and typ.kind in "iu" + ) + + +def is_signed_integer_dtype(arr_or_dtype) -> bool: + """ + Check whether the provided array or dtype is of a signed integer dtype. + + Unlike in `is_any_int_dtype`, timedelta64 instances will return False. + + The nullable Integer dtypes (e.g. pandas.Int64Dtype) are also considered + as integer by this function. + + Parameters + ---------- + arr_or_dtype : array-like or dtype + The array or dtype to check. + + Returns + ------- + boolean + Whether or not the array or dtype is of a signed integer dtype + and not an instance of timedelta64. + + Examples + -------- + >>> from pandas.core.dtypes.common import is_signed_integer_dtype + >>> is_signed_integer_dtype(str) + False + >>> is_signed_integer_dtype(int) + True + >>> is_signed_integer_dtype(float) + False + >>> is_signed_integer_dtype(np.uint64) # unsigned + False + >>> is_signed_integer_dtype('int8') + True + >>> is_signed_integer_dtype('Int8') + True + >>> is_signed_integer_dtype(pd.Int8Dtype) + True + >>> is_signed_integer_dtype(np.datetime64) + False + >>> is_signed_integer_dtype(np.timedelta64) + False + >>> is_signed_integer_dtype(np.array(['a', 'b'])) + False + >>> is_signed_integer_dtype(pd.Series([1, 2])) + True + >>> is_signed_integer_dtype(np.array([], dtype=np.timedelta64)) + False + >>> is_signed_integer_dtype(pd.Index([1, 2.])) # float + False + >>> is_signed_integer_dtype(np.array([1, 2], dtype=np.uint32)) # unsigned + False + """ + return _is_dtype_type( + arr_or_dtype, _classes_and_not_datetimelike(np.signedinteger) + ) or _is_dtype( + arr_or_dtype, lambda typ: isinstance(typ, ExtensionDtype) and typ.kind == "i" + ) + + +def is_unsigned_integer_dtype(arr_or_dtype) -> bool: + """ + Check whether the provided array or dtype is of an unsigned integer dtype. + + The nullable Integer dtypes (e.g. pandas.UInt64Dtype) are also + considered as integer by this function. + + Parameters + ---------- + arr_or_dtype : array-like or dtype + The array or dtype to check. + + Returns + ------- + boolean + Whether or not the array or dtype is of an unsigned integer dtype. + + Examples + -------- + >>> from pandas.api.types import is_unsigned_integer_dtype + >>> is_unsigned_integer_dtype(str) + False + >>> is_unsigned_integer_dtype(int) # signed + False + >>> is_unsigned_integer_dtype(float) + False + >>> is_unsigned_integer_dtype(np.uint64) + True + >>> is_unsigned_integer_dtype('uint8') + True + >>> is_unsigned_integer_dtype('UInt8') + True + >>> is_unsigned_integer_dtype(pd.UInt8Dtype) + True + >>> is_unsigned_integer_dtype(np.array(['a', 'b'])) + False + >>> is_unsigned_integer_dtype(pd.Series([1, 2])) # signed + False + >>> is_unsigned_integer_dtype(pd.Index([1, 2.])) # float + False + >>> is_unsigned_integer_dtype(np.array([1, 2], dtype=np.uint32)) + True + """ + return _is_dtype_type( + arr_or_dtype, _classes_and_not_datetimelike(np.unsignedinteger) + ) or _is_dtype( + arr_or_dtype, lambda typ: isinstance(typ, ExtensionDtype) and typ.kind == "u" + ) + + +def is_int64_dtype(arr_or_dtype) -> bool: + """ + Check whether the provided array or dtype is of the int64 dtype. + + .. deprecated:: 2.1.0 + + is_int64_dtype is deprecated and will be removed in a future + version. Use dtype == np.int64 instead. + + Parameters + ---------- + arr_or_dtype : array-like or dtype + The array or dtype to check. + + Returns + ------- + boolean + Whether or not the array or dtype is of the int64 dtype. + + Notes + ----- + Depending on system architecture, the return value of `is_int64_dtype( + int)` will be True if the OS uses 64-bit integers and False if the OS + uses 32-bit integers. + + Examples + -------- + >>> from pandas.api.types import is_int64_dtype + >>> is_int64_dtype(str) # doctest: +SKIP + False + >>> is_int64_dtype(np.int32) # doctest: +SKIP + False + >>> is_int64_dtype(np.int64) # doctest: +SKIP + True + >>> is_int64_dtype('int8') # doctest: +SKIP + False + >>> is_int64_dtype('Int8') # doctest: +SKIP + False + >>> is_int64_dtype(pd.Int64Dtype) # doctest: +SKIP + True + >>> is_int64_dtype(float) # doctest: +SKIP + False + >>> is_int64_dtype(np.uint64) # unsigned # doctest: +SKIP + False + >>> is_int64_dtype(np.array(['a', 'b'])) # doctest: +SKIP + False + >>> is_int64_dtype(np.array([1, 2], dtype=np.int64)) # doctest: +SKIP + True + >>> is_int64_dtype(pd.Index([1, 2.])) # float # doctest: +SKIP + False + >>> is_int64_dtype(np.array([1, 2], dtype=np.uint32)) # unsigned # doctest: +SKIP + False + """ + # GH#52564 + warnings.warn( + "is_int64_dtype is deprecated and will be removed in a future " + "version. Use dtype == np.int64 instead.", + DeprecationWarning, + stacklevel=2, + ) + return _is_dtype_type(arr_or_dtype, classes(np.int64)) + + +def is_datetime64_any_dtype(arr_or_dtype) -> bool: + """ + Check whether the provided array or dtype is of the datetime64 dtype. + + Parameters + ---------- + arr_or_dtype : array-like or dtype + The array or dtype to check. + + Returns + ------- + bool + Whether or not the array or dtype is of the datetime64 dtype. + + Examples + -------- + >>> from pandas.api.types import is_datetime64_any_dtype + >>> from pandas.core.dtypes.dtypes import DatetimeTZDtype + >>> is_datetime64_any_dtype(str) + False + >>> is_datetime64_any_dtype(int) + False + >>> is_datetime64_any_dtype(np.datetime64) # can be tz-naive + True + >>> is_datetime64_any_dtype(DatetimeTZDtype("ns", "US/Eastern")) + True + >>> is_datetime64_any_dtype(np.array(['a', 'b'])) + False + >>> is_datetime64_any_dtype(np.array([1, 2])) + False + >>> is_datetime64_any_dtype(np.array([], dtype="datetime64[ns]")) + True + >>> is_datetime64_any_dtype(pd.DatetimeIndex([1, 2, 3], dtype="datetime64[ns]")) + True + """ + if isinstance(arr_or_dtype, (np.dtype, ExtensionDtype)): + # GH#33400 fastpath for dtype object + return arr_or_dtype.kind == "M" + + if arr_or_dtype is None: + return False + + try: + tipo = _get_dtype(arr_or_dtype) + except TypeError: + return False + return lib.is_np_dtype(tipo, "M") or isinstance(tipo, DatetimeTZDtype) + + +def is_datetime64_ns_dtype(arr_or_dtype) -> bool: + """ + Check whether the provided array or dtype is of the datetime64[ns] dtype. + + Parameters + ---------- + arr_or_dtype : array-like or dtype + The array or dtype to check. + + Returns + ------- + bool + Whether or not the array or dtype is of the datetime64[ns] dtype. + + Examples + -------- + >>> from pandas.api.types import is_datetime64_ns_dtype + >>> from pandas.core.dtypes.dtypes import DatetimeTZDtype + >>> is_datetime64_ns_dtype(str) + False + >>> is_datetime64_ns_dtype(int) + False + >>> is_datetime64_ns_dtype(np.datetime64) # no unit + False + >>> is_datetime64_ns_dtype(DatetimeTZDtype("ns", "US/Eastern")) + True + >>> is_datetime64_ns_dtype(np.array(['a', 'b'])) + False + >>> is_datetime64_ns_dtype(np.array([1, 2])) + False + >>> is_datetime64_ns_dtype(np.array([], dtype="datetime64")) # no unit + False + >>> is_datetime64_ns_dtype(np.array([], dtype="datetime64[ps]")) # wrong unit + False + >>> is_datetime64_ns_dtype(pd.DatetimeIndex([1, 2, 3], dtype="datetime64[ns]")) + True + """ + if arr_or_dtype is None: + return False + try: + tipo = _get_dtype(arr_or_dtype) + except TypeError: + return False + return tipo == DT64NS_DTYPE or ( + isinstance(tipo, DatetimeTZDtype) and tipo.unit == "ns" + ) + + +def is_timedelta64_ns_dtype(arr_or_dtype) -> bool: + """ + Check whether the provided array or dtype is of the timedelta64[ns] dtype. + + This is a very specific dtype, so generic ones like `np.timedelta64` + will return False if passed into this function. + + Parameters + ---------- + arr_or_dtype : array-like or dtype + The array or dtype to check. + + Returns + ------- + boolean + Whether or not the array or dtype is of the timedelta64[ns] dtype. + + Examples + -------- + >>> from pandas.core.dtypes.common import is_timedelta64_ns_dtype + >>> is_timedelta64_ns_dtype(np.dtype('m8[ns]')) + True + >>> is_timedelta64_ns_dtype(np.dtype('m8[ps]')) # Wrong frequency + False + >>> is_timedelta64_ns_dtype(np.array([1, 2], dtype='m8[ns]')) + True + >>> is_timedelta64_ns_dtype(np.array([1, 2], dtype=np.timedelta64)) + False + """ + return _is_dtype(arr_or_dtype, lambda dtype: dtype == TD64NS_DTYPE) + + +# This exists to silence numpy deprecation warnings, see GH#29553 +def is_numeric_v_string_like(a: ArrayLike, b) -> bool: + """ + Check if we are comparing a string-like object to a numeric ndarray. + NumPy doesn't like to compare such objects, especially numeric arrays + and scalar string-likes. + + Parameters + ---------- + a : array-like, scalar + The first object to check. + b : array-like, scalar + The second object to check. + + Returns + ------- + boolean + Whether we return a comparing a string-like object to a numeric array. + + Examples + -------- + >>> is_numeric_v_string_like(np.array([1]), "foo") + True + >>> is_numeric_v_string_like(np.array([1, 2]), np.array(["foo"])) + True + >>> is_numeric_v_string_like(np.array(["foo"]), np.array([1, 2])) + True + >>> is_numeric_v_string_like(np.array([1]), np.array([2])) + False + >>> is_numeric_v_string_like(np.array(["foo"]), np.array(["foo"])) + False + """ + is_a_array = isinstance(a, np.ndarray) + is_b_array = isinstance(b, np.ndarray) + + is_a_numeric_array = is_a_array and a.dtype.kind in ("u", "i", "f", "c", "b") + is_b_numeric_array = is_b_array and b.dtype.kind in ("u", "i", "f", "c", "b") + is_a_string_array = is_a_array and a.dtype.kind in ("S", "U") + is_b_string_array = is_b_array and b.dtype.kind in ("S", "U") + + is_b_scalar_string_like = not is_b_array and isinstance(b, str) + + return ( + (is_a_numeric_array and is_b_scalar_string_like) + or (is_a_numeric_array and is_b_string_array) + or (is_b_numeric_array and is_a_string_array) + ) + + +def needs_i8_conversion(dtype: DtypeObj | None) -> bool: + """ + Check whether the dtype should be converted to int64. + + Dtype "needs" such a conversion if the dtype is of a datetime-like dtype + + Parameters + ---------- + dtype : np.dtype, ExtensionDtype, or None + + Returns + ------- + boolean + Whether or not the dtype should be converted to int64. + + Examples + -------- + >>> needs_i8_conversion(str) + False + >>> needs_i8_conversion(np.int64) + False + >>> needs_i8_conversion(np.datetime64) + False + >>> needs_i8_conversion(np.dtype(np.datetime64)) + True + >>> needs_i8_conversion(np.array(['a', 'b'])) + False + >>> needs_i8_conversion(pd.Series([1, 2])) + False + >>> needs_i8_conversion(pd.Series([], dtype="timedelta64[ns]")) + False + >>> needs_i8_conversion(pd.DatetimeIndex([1, 2, 3], tz="US/Eastern")) + False + >>> needs_i8_conversion(pd.DatetimeIndex([1, 2, 3], tz="US/Eastern").dtype) + True + """ + if isinstance(dtype, np.dtype): + return dtype.kind in "mM" + return isinstance(dtype, (PeriodDtype, DatetimeTZDtype)) + + +def is_numeric_dtype(arr_or_dtype) -> bool: + """ + Check whether the provided array or dtype is of a numeric dtype. + + Parameters + ---------- + arr_or_dtype : array-like or dtype + The array or dtype to check. + + Returns + ------- + boolean + Whether or not the array or dtype is of a numeric dtype. + + Examples + -------- + >>> from pandas.api.types import is_numeric_dtype + >>> is_numeric_dtype(str) + False + >>> is_numeric_dtype(int) + True + >>> is_numeric_dtype(float) + True + >>> is_numeric_dtype(np.uint64) + True + >>> is_numeric_dtype(np.datetime64) + False + >>> is_numeric_dtype(np.timedelta64) + False + >>> is_numeric_dtype(np.array(['a', 'b'])) + False + >>> is_numeric_dtype(pd.Series([1, 2])) + True + >>> is_numeric_dtype(pd.Index([1, 2.])) + True + >>> is_numeric_dtype(np.array([], dtype=np.timedelta64)) + False + """ + return _is_dtype_type( + arr_or_dtype, _classes_and_not_datetimelike(np.number, np.bool_) + ) or _is_dtype( + arr_or_dtype, lambda typ: isinstance(typ, ExtensionDtype) and typ._is_numeric + ) + + +def is_any_real_numeric_dtype(arr_or_dtype) -> bool: + """ + Check whether the provided array or dtype is of a real number dtype. + + Parameters + ---------- + arr_or_dtype : array-like or dtype + The array or dtype to check. + + Returns + ------- + boolean + Whether or not the array or dtype is of a real number dtype. + + Examples + -------- + >>> from pandas.api.types import is_any_real_numeric_dtype + >>> is_any_real_numeric_dtype(int) + True + >>> is_any_real_numeric_dtype(float) + True + >>> is_any_real_numeric_dtype(object) + False + >>> is_any_real_numeric_dtype(str) + False + >>> is_any_real_numeric_dtype(complex(1, 2)) + False + >>> is_any_real_numeric_dtype(bool) + False + """ + return ( + is_numeric_dtype(arr_or_dtype) + and not is_complex_dtype(arr_or_dtype) + and not is_bool_dtype(arr_or_dtype) + ) + + +def is_float_dtype(arr_or_dtype) -> bool: + """ + Check whether the provided array or dtype is of a float dtype. + + Parameters + ---------- + arr_or_dtype : array-like or dtype + The array or dtype to check. + + Returns + ------- + boolean + Whether or not the array or dtype is of a float dtype. + + Examples + -------- + >>> from pandas.api.types import is_float_dtype + >>> is_float_dtype(str) + False + >>> is_float_dtype(int) + False + >>> is_float_dtype(float) + True + >>> is_float_dtype(np.array(['a', 'b'])) + False + >>> is_float_dtype(pd.Series([1, 2])) + False + >>> is_float_dtype(pd.Index([1, 2.])) + True + """ + return _is_dtype_type(arr_or_dtype, classes(np.floating)) or _is_dtype( + arr_or_dtype, lambda typ: isinstance(typ, ExtensionDtype) and typ.kind in "f" + ) + + +def is_bool_dtype(arr_or_dtype) -> bool: + """ + Check whether the provided array or dtype is of a boolean dtype. + + Parameters + ---------- + arr_or_dtype : array-like or dtype + The array or dtype to check. + + Returns + ------- + boolean + Whether or not the array or dtype is of a boolean dtype. + + Notes + ----- + An ExtensionArray is considered boolean when the ``_is_boolean`` + attribute is set to True. + + Examples + -------- + >>> from pandas.api.types import is_bool_dtype + >>> is_bool_dtype(str) + False + >>> is_bool_dtype(int) + False + >>> is_bool_dtype(bool) + True + >>> is_bool_dtype(np.bool_) + True + >>> is_bool_dtype(np.array(['a', 'b'])) + False + >>> is_bool_dtype(pd.Series([1, 2])) + False + >>> is_bool_dtype(np.array([True, False])) + True + >>> is_bool_dtype(pd.Categorical([True, False])) + True + >>> is_bool_dtype(pd.arrays.SparseArray([True, False])) + True + """ + if arr_or_dtype is None: + return False + try: + dtype = _get_dtype(arr_or_dtype) + except (TypeError, ValueError): + return False + + if isinstance(dtype, CategoricalDtype): + arr_or_dtype = dtype.categories + # now we use the special definition for Index + + if isinstance(arr_or_dtype, ABCIndex): + # Allow Index[object] that is all-bools or Index["boolean"] + if arr_or_dtype.inferred_type == "boolean": + if not is_bool_dtype(arr_or_dtype.dtype): + # GH#52680 + warnings.warn( + "The behavior of is_bool_dtype with an object-dtype Index " + "of bool objects is deprecated. In a future version, " + "this will return False. Cast the Index to a bool dtype instead.", + DeprecationWarning, + stacklevel=2, + ) + return True + return False + elif isinstance(dtype, ExtensionDtype): + return getattr(dtype, "_is_boolean", False) + + return issubclass(dtype.type, np.bool_) + + +def is_1d_only_ea_dtype(dtype: DtypeObj | None) -> bool: + """ + Analogue to is_extension_array_dtype but excluding DatetimeTZDtype. + """ + return isinstance(dtype, ExtensionDtype) and not dtype._supports_2d + + +def is_extension_array_dtype(arr_or_dtype) -> bool: + """ + Check if an object is a pandas extension array type. + + See the :ref:`Use Guide ` for more. + + Parameters + ---------- + arr_or_dtype : object + For array-like input, the ``.dtype`` attribute will + be extracted. + + Returns + ------- + bool + Whether the `arr_or_dtype` is an extension array type. + + Notes + ----- + This checks whether an object implements the pandas extension + array interface. In pandas, this includes: + + * Categorical + * Sparse + * Interval + * Period + * DatetimeArray + * TimedeltaArray + + Third-party libraries may implement arrays or types satisfying + this interface as well. + + Examples + -------- + >>> from pandas.api.types import is_extension_array_dtype + >>> arr = pd.Categorical(['a', 'b']) + >>> is_extension_array_dtype(arr) + True + >>> is_extension_array_dtype(arr.dtype) + True + + >>> arr = np.array(['a', 'b']) + >>> is_extension_array_dtype(arr.dtype) + False + """ + dtype = getattr(arr_or_dtype, "dtype", arr_or_dtype) + if isinstance(dtype, ExtensionDtype): + return True + elif isinstance(dtype, np.dtype): + return False + else: + return registry.find(dtype) is not None + + +def is_ea_or_datetimelike_dtype(dtype: DtypeObj | None) -> bool: + """ + Check for ExtensionDtype, datetime64 dtype, or timedelta64 dtype. + + Notes + ----- + Checks only for dtype objects, not dtype-castable strings or types. + """ + return isinstance(dtype, ExtensionDtype) or (lib.is_np_dtype(dtype, "mM")) + + +def is_complex_dtype(arr_or_dtype) -> bool: + """ + Check whether the provided array or dtype is of a complex dtype. + + Parameters + ---------- + arr_or_dtype : array-like or dtype + The array or dtype to check. + + Returns + ------- + boolean + Whether or not the array or dtype is of a complex dtype. + + Examples + -------- + >>> from pandas.api.types import is_complex_dtype + >>> is_complex_dtype(str) + False + >>> is_complex_dtype(int) + False + >>> is_complex_dtype(np.complex128) + True + >>> is_complex_dtype(np.array(['a', 'b'])) + False + >>> is_complex_dtype(pd.Series([1, 2])) + False + >>> is_complex_dtype(np.array([1 + 1j, 5])) + True + """ + return _is_dtype_type(arr_or_dtype, classes(np.complexfloating)) + + +def _is_dtype(arr_or_dtype, condition) -> bool: + """ + Return true if the condition is satisfied for the arr_or_dtype. + + Parameters + ---------- + arr_or_dtype : array-like, str, np.dtype, or ExtensionArrayType + The array-like or dtype object whose dtype we want to extract. + condition : callable[Union[np.dtype, ExtensionDtype]] + + Returns + ------- + bool + + """ + if arr_or_dtype is None: + return False + try: + dtype = _get_dtype(arr_or_dtype) + except (TypeError, ValueError): + return False + return condition(dtype) + + +def _get_dtype(arr_or_dtype) -> DtypeObj: + """ + Get the dtype instance associated with an array + or dtype object. + + Parameters + ---------- + arr_or_dtype : array-like or dtype + The array-like or dtype object whose dtype we want to extract. + + Returns + ------- + obj_dtype : The extract dtype instance from the + passed in array or dtype object. + + Raises + ------ + TypeError : The passed in object is None. + """ + if arr_or_dtype is None: + raise TypeError("Cannot deduce dtype from null object") + + # fastpath + if isinstance(arr_or_dtype, np.dtype): + return arr_or_dtype + elif isinstance(arr_or_dtype, type): + return np.dtype(arr_or_dtype) + + # if we have an array-like + elif hasattr(arr_or_dtype, "dtype"): + arr_or_dtype = arr_or_dtype.dtype + + return pandas_dtype(arr_or_dtype) + + +def _is_dtype_type(arr_or_dtype, condition) -> bool: + """ + Return true if the condition is satisfied for the arr_or_dtype. + + Parameters + ---------- + arr_or_dtype : array-like or dtype + The array-like or dtype object whose dtype we want to extract. + condition : callable[Union[np.dtype, ExtensionDtypeType]] + + Returns + ------- + bool : if the condition is satisfied for the arr_or_dtype + """ + if arr_or_dtype is None: + return condition(type(None)) + + # fastpath + if isinstance(arr_or_dtype, np.dtype): + return condition(arr_or_dtype.type) + elif isinstance(arr_or_dtype, type): + if issubclass(arr_or_dtype, ExtensionDtype): + arr_or_dtype = arr_or_dtype.type + return condition(np.dtype(arr_or_dtype).type) + + # if we have an array-like + if hasattr(arr_or_dtype, "dtype"): + arr_or_dtype = arr_or_dtype.dtype + + # we are not possibly a dtype + elif is_list_like(arr_or_dtype): + return condition(type(None)) + + try: + tipo = pandas_dtype(arr_or_dtype).type + except (TypeError, ValueError): + if is_scalar(arr_or_dtype): + return condition(type(None)) + + return False + + return condition(tipo) + + +def infer_dtype_from_object(dtype) -> type: + """ + Get a numpy dtype.type-style object for a dtype object. + + This methods also includes handling of the datetime64[ns] and + datetime64[ns, TZ] objects. + + If no dtype can be found, we return ``object``. + + Parameters + ---------- + dtype : dtype, type + The dtype object whose numpy dtype.type-style + object we want to extract. + + Returns + ------- + type + """ + if isinstance(dtype, type) and issubclass(dtype, np.generic): + # Type object from a dtype + + return dtype + elif isinstance(dtype, (np.dtype, ExtensionDtype)): + # dtype object + try: + _validate_date_like_dtype(dtype) + except TypeError: + # Should still pass if we don't have a date-like + pass + if hasattr(dtype, "numpy_dtype"): + # TODO: Implement this properly + # https://github.com/pandas-dev/pandas/issues/52576 + return dtype.numpy_dtype.type + return dtype.type + + try: + dtype = pandas_dtype(dtype) + except TypeError: + pass + + if isinstance(dtype, ExtensionDtype): + return dtype.type + elif isinstance(dtype, str): + # TODO(jreback) + # should deprecate these + if dtype in ["datetimetz", "datetime64tz"]: + return DatetimeTZDtype.type + elif dtype in ["period"]: + raise NotImplementedError + + if dtype in ["datetime", "timedelta"]: + dtype += "64" + try: + return infer_dtype_from_object(getattr(np, dtype)) + except (AttributeError, TypeError): + # Handles cases like _get_dtype(int) i.e., + # Python objects that are valid dtypes + # (unlike user-defined types, in general) + # + # TypeError handles the float16 type code of 'e' + # further handle internal types + pass + + return infer_dtype_from_object(np.dtype(dtype)) + + +def _validate_date_like_dtype(dtype) -> None: + """ + Check whether the dtype is a date-like dtype. Raises an error if invalid. + + Parameters + ---------- + dtype : dtype, type + The dtype to check. + + Raises + ------ + TypeError : The dtype could not be casted to a date-like dtype. + ValueError : The dtype is an illegal date-like dtype (e.g. the + frequency provided is too specific) + """ + try: + typ = np.datetime_data(dtype)[0] + except ValueError as e: + raise TypeError(e) from e + if typ not in ["generic", "ns"]: + raise ValueError( + f"{repr(dtype.name)} is too specific of a frequency, " + f"try passing {repr(dtype.type.__name__)}" + ) + + +def validate_all_hashable(*args, error_name: str | None = None) -> None: + """ + Return None if all args are hashable, else raise a TypeError. + + Parameters + ---------- + *args + Arguments to validate. + error_name : str, optional + The name to use if error + + Raises + ------ + TypeError : If an argument is not hashable + + Returns + ------- + None + """ + if not all(is_hashable(arg) for arg in args): + if error_name: + raise TypeError(f"{error_name} must be a hashable type") + raise TypeError("All elements must be hashable") + + +def pandas_dtype(dtype) -> DtypeObj: + """ + Convert input into a pandas only dtype object or a numpy dtype object. + + Parameters + ---------- + dtype : object to be converted + + Returns + ------- + np.dtype or a pandas dtype + + Raises + ------ + TypeError if not a dtype + + Examples + -------- + >>> pd.api.types.pandas_dtype(int) + dtype('int64') + """ + # short-circuit + if isinstance(dtype, np.ndarray): + return dtype.dtype + elif isinstance(dtype, (np.dtype, ExtensionDtype)): + return dtype + + # registered extension types + result = registry.find(dtype) + if result is not None: + if isinstance(result, type): + # GH 31356, GH 54592 + warnings.warn( + f"Instantiating {result.__name__} without any arguments." + f"Pass a {result.__name__} instance to silence this warning.", + UserWarning, + stacklevel=find_stack_level(), + ) + result = result() + return result + + # try a numpy dtype + # raise a consistent TypeError if failed + try: + with warnings.catch_warnings(): + # GH#51523 - Series.astype(np.integer) doesn't show + # numpy deprecation warning of np.integer + # Hence enabling DeprecationWarning + warnings.simplefilter("always", DeprecationWarning) + npdtype = np.dtype(dtype) + except SyntaxError as err: + # np.dtype uses `eval` which can raise SyntaxError + raise TypeError(f"data type '{dtype}' not understood") from err + + # Any invalid dtype (such as pd.Timestamp) should raise an error. + # np.dtype(invalid_type).kind = 0 for such objects. However, this will + # also catch some valid dtypes such as object, np.object_ and 'object' + # which we safeguard against by catching them earlier and returning + # np.dtype(valid_dtype) before this condition is evaluated. + if is_hashable(dtype) and dtype in [ + object, + np.object_, + "object", + "O", + "object_", + ]: + # check hashability to avoid errors/DeprecationWarning when we get + # here and `dtype` is an array + return npdtype + elif npdtype.kind == "O": + raise TypeError(f"dtype '{dtype}' not understood") + + return npdtype + + +def is_all_strings(value: ArrayLike) -> bool: + """ + Check if this is an array of strings that we should try parsing. + + Includes object-dtype ndarray containing all-strings, StringArray, + and Categorical with all-string categories. + Does not include numpy string dtypes. + """ + dtype = value.dtype + + if isinstance(dtype, np.dtype): + if len(value) == 0: + return dtype == np.dtype("object") + else: + return dtype == np.dtype("object") and lib.is_string_array( + np.asarray(value), skipna=False + ) + elif isinstance(dtype, CategoricalDtype): + return dtype.categories.inferred_type == "string" + return dtype == "string" + + +__all__ = [ + "classes", + "DT64NS_DTYPE", + "ensure_float64", + "ensure_python_int", + "ensure_str", + "infer_dtype_from_object", + "INT64_DTYPE", + "is_1d_only_ea_dtype", + "is_all_strings", + "is_any_real_numeric_dtype", + "is_array_like", + "is_bool", + "is_bool_dtype", + "is_categorical_dtype", + "is_complex", + "is_complex_dtype", + "is_dataclass", + "is_datetime64_any_dtype", + "is_datetime64_dtype", + "is_datetime64_ns_dtype", + "is_datetime64tz_dtype", + "is_decimal", + "is_dict_like", + "is_dtype_equal", + "is_ea_or_datetimelike_dtype", + "is_extension_array_dtype", + "is_file_like", + "is_float_dtype", + "is_int64_dtype", + "is_integer_dtype", + "is_interval", + "is_interval_dtype", + "is_iterator", + "is_named_tuple", + "is_nested_list_like", + "is_number", + "is_numeric_dtype", + "is_object_dtype", + "is_period_dtype", + "is_re", + "is_re_compilable", + "is_scipy_sparse", + "is_sequence", + "is_signed_integer_dtype", + "is_sparse", + "is_string_dtype", + "is_string_or_object_np_dtype", + "is_timedelta64_dtype", + "is_timedelta64_ns_dtype", + "is_unsigned_integer_dtype", + "needs_i8_conversion", + "pandas_dtype", + "TD64NS_DTYPE", + "validate_all_hashable", +] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/dtypes/concat.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/dtypes/concat.py new file mode 100644 index 0000000000000000000000000000000000000000..9ec662a6cd3520aaa49fdc96142ad1b02bb518d8 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/dtypes/concat.py @@ -0,0 +1,348 @@ +""" +Utility functions related to concat. +""" +from __future__ import annotations + +from typing import ( + TYPE_CHECKING, + cast, +) +import warnings + +import numpy as np + +from pandas._libs import lib +from pandas.util._exceptions import find_stack_level + +from pandas.core.dtypes.astype import astype_array +from pandas.core.dtypes.cast import ( + common_dtype_categorical_compat, + find_common_type, + np_find_common_type, +) +from pandas.core.dtypes.dtypes import CategoricalDtype +from pandas.core.dtypes.generic import ( + ABCCategoricalIndex, + ABCSeries, +) + +if TYPE_CHECKING: + from collections.abc import Sequence + + from pandas._typing import ( + ArrayLike, + AxisInt, + DtypeObj, + ) + + from pandas.core.arrays import ( + Categorical, + ExtensionArray, + ) + + +def _is_nonempty(x, axis) -> bool: + # filter empty arrays + # 1-d dtypes always are included here + if x.ndim <= axis: + return True + return x.shape[axis] > 0 + + +def concat_compat( + to_concat: Sequence[ArrayLike], axis: AxisInt = 0, ea_compat_axis: bool = False +) -> ArrayLike: + """ + provide concatenation of an array of arrays each of which is a single + 'normalized' dtypes (in that for example, if it's object, then it is a + non-datetimelike and provide a combined dtype for the resulting array that + preserves the overall dtype if possible) + + Parameters + ---------- + to_concat : sequence of arrays + axis : axis to provide concatenation + ea_compat_axis : bool, default False + For ExtensionArray compat, behave as if axis == 1 when determining + whether to drop empty arrays. + + Returns + ------- + a single array, preserving the combined dtypes + """ + if len(to_concat) and lib.dtypes_all_equal([obj.dtype for obj in to_concat]): + # fastpath! + obj = to_concat[0] + if isinstance(obj, np.ndarray): + to_concat_arrs = cast("Sequence[np.ndarray]", to_concat) + return np.concatenate(to_concat_arrs, axis=axis) + + to_concat_eas = cast("Sequence[ExtensionArray]", to_concat) + if ea_compat_axis: + # We have 1D objects, that don't support axis keyword + return obj._concat_same_type(to_concat_eas) + elif axis == 0: + return obj._concat_same_type(to_concat_eas) + else: + # e.g. DatetimeArray + # NB: We are assuming here that ensure_wrapped_if_arraylike has + # been called where relevant. + return obj._concat_same_type( + # error: Unexpected keyword argument "axis" for "_concat_same_type" + # of "ExtensionArray" + to_concat_eas, + axis=axis, # type: ignore[call-arg] + ) + + # If all arrays are empty, there's nothing to convert, just short-cut to + # the concatenation, #3121. + # + # Creating an empty array directly is tempting, but the winnings would be + # marginal given that it would still require shape & dtype calculation and + # np.concatenate which has them both implemented is compiled. + orig = to_concat + non_empties = [x for x in to_concat if _is_nonempty(x, axis)] + if non_empties and axis == 0 and not ea_compat_axis: + # ea_compat_axis see GH#39574 + to_concat = non_empties + + any_ea, kinds, target_dtype = _get_result_dtype(to_concat, non_empties) + + if len(to_concat) < len(orig): + _, _, alt_dtype = _get_result_dtype(orig, non_empties) + if alt_dtype != target_dtype: + # GH#39122 + warnings.warn( + "The behavior of array concatenation with empty entries is " + "deprecated. In a future version, this will no longer exclude " + "empty items when determining the result dtype. " + "To retain the old behavior, exclude the empty entries before " + "the concat operation.", + FutureWarning, + stacklevel=find_stack_level(), + ) + + if target_dtype is not None: + to_concat = [astype_array(arr, target_dtype, copy=False) for arr in to_concat] + + if not isinstance(to_concat[0], np.ndarray): + # i.e. isinstance(to_concat[0], ExtensionArray) + to_concat_eas = cast("Sequence[ExtensionArray]", to_concat) + cls = type(to_concat[0]) + # GH#53640: eg. for datetime array, axis=1 but 0 is default + # However, class method `_concat_same_type()` for some classes + # may not support the `axis` keyword + if ea_compat_axis or axis == 0: + return cls._concat_same_type(to_concat_eas) + else: + return cls._concat_same_type( + to_concat_eas, + axis=axis, # type: ignore[call-arg] + ) + else: + to_concat_arrs = cast("Sequence[np.ndarray]", to_concat) + result = np.concatenate(to_concat_arrs, axis=axis) + + if not any_ea and "b" in kinds and result.dtype.kind in "iuf": + # GH#39817 cast to object instead of casting bools to numeric + result = result.astype(object, copy=False) + return result + + +def _get_result_dtype( + to_concat: Sequence[ArrayLike], non_empties: Sequence[ArrayLike] +) -> tuple[bool, set[str], DtypeObj | None]: + target_dtype = None + + dtypes = {obj.dtype for obj in to_concat} + kinds = {obj.dtype.kind for obj in to_concat} + + any_ea = any(not isinstance(x, np.ndarray) for x in to_concat) + if any_ea: + # i.e. any ExtensionArrays + + # we ignore axis here, as internally concatting with EAs is always + # for axis=0 + if len(dtypes) != 1: + target_dtype = find_common_type([x.dtype for x in to_concat]) + target_dtype = common_dtype_categorical_compat(to_concat, target_dtype) + + elif not len(non_empties): + # we have all empties, but may need to coerce the result dtype to + # object if we have non-numeric type operands (numpy would otherwise + # cast this to float) + if len(kinds) != 1: + if not len(kinds - {"i", "u", "f"}) or not len(kinds - {"b", "i", "u"}): + # let numpy coerce + pass + else: + # coerce to object + target_dtype = np.dtype(object) + kinds = {"o"} + else: + # error: Argument 1 to "np_find_common_type" has incompatible type + # "*Set[Union[ExtensionDtype, Any]]"; expected "dtype[Any]" + target_dtype = np_find_common_type(*dtypes) # type: ignore[arg-type] + + return any_ea, kinds, target_dtype + + +def union_categoricals( + to_union, sort_categories: bool = False, ignore_order: bool = False +) -> Categorical: + """ + Combine list-like of Categorical-like, unioning categories. + + All categories must have the same dtype. + + Parameters + ---------- + to_union : list-like + Categorical, CategoricalIndex, or Series with dtype='category'. + sort_categories : bool, default False + If true, resulting categories will be lexsorted, otherwise + they will be ordered as they appear in the data. + ignore_order : bool, default False + If true, the ordered attribute of the Categoricals will be ignored. + Results in an unordered categorical. + + Returns + ------- + Categorical + + Raises + ------ + TypeError + - all inputs do not have the same dtype + - all inputs do not have the same ordered property + - all inputs are ordered and their categories are not identical + - sort_categories=True and Categoricals are ordered + ValueError + Empty list of categoricals passed + + Notes + ----- + To learn more about categories, see `link + `__ + + Examples + -------- + If you want to combine categoricals that do not necessarily have + the same categories, `union_categoricals` will combine a list-like + of categoricals. The new categories will be the union of the + categories being combined. + + >>> a = pd.Categorical(["b", "c"]) + >>> b = pd.Categorical(["a", "b"]) + >>> pd.api.types.union_categoricals([a, b]) + ['b', 'c', 'a', 'b'] + Categories (3, object): ['b', 'c', 'a'] + + By default, the resulting categories will be ordered as they appear + in the `categories` of the data. If you want the categories to be + lexsorted, use `sort_categories=True` argument. + + >>> pd.api.types.union_categoricals([a, b], sort_categories=True) + ['b', 'c', 'a', 'b'] + Categories (3, object): ['a', 'b', 'c'] + + `union_categoricals` also works with the case of combining two + categoricals of the same categories and order information (e.g. what + you could also `append` for). + + >>> a = pd.Categorical(["a", "b"], ordered=True) + >>> b = pd.Categorical(["a", "b", "a"], ordered=True) + >>> pd.api.types.union_categoricals([a, b]) + ['a', 'b', 'a', 'b', 'a'] + Categories (2, object): ['a' < 'b'] + + Raises `TypeError` because the categories are ordered and not identical. + + >>> a = pd.Categorical(["a", "b"], ordered=True) + >>> b = pd.Categorical(["a", "b", "c"], ordered=True) + >>> pd.api.types.union_categoricals([a, b]) + Traceback (most recent call last): + ... + TypeError: to union ordered Categoricals, all categories must be the same + + Ordered categoricals with different categories or orderings can be + combined by using the `ignore_ordered=True` argument. + + >>> a = pd.Categorical(["a", "b", "c"], ordered=True) + >>> b = pd.Categorical(["c", "b", "a"], ordered=True) + >>> pd.api.types.union_categoricals([a, b], ignore_order=True) + ['a', 'b', 'c', 'c', 'b', 'a'] + Categories (3, object): ['a', 'b', 'c'] + + `union_categoricals` also works with a `CategoricalIndex`, or `Series` + containing categorical data, but note that the resulting array will + always be a plain `Categorical` + + >>> a = pd.Series(["b", "c"], dtype='category') + >>> b = pd.Series(["a", "b"], dtype='category') + >>> pd.api.types.union_categoricals([a, b]) + ['b', 'c', 'a', 'b'] + Categories (3, object): ['b', 'c', 'a'] + """ + from pandas import Categorical + from pandas.core.arrays.categorical import recode_for_categories + + if len(to_union) == 0: + raise ValueError("No Categoricals to union") + + def _maybe_unwrap(x): + if isinstance(x, (ABCCategoricalIndex, ABCSeries)): + return x._values + elif isinstance(x, Categorical): + return x + else: + raise TypeError("all components to combine must be Categorical") + + to_union = [_maybe_unwrap(x) for x in to_union] + first = to_union[0] + + if not lib.dtypes_all_equal([obj.categories.dtype for obj in to_union]): + raise TypeError("dtype of categories must be the same") + + ordered = False + if all(first._categories_match_up_to_permutation(other) for other in to_union[1:]): + # identical categories - fastpath + categories = first.categories + ordered = first.ordered + + all_codes = [first._encode_with_my_categories(x)._codes for x in to_union] + new_codes = np.concatenate(all_codes) + + if sort_categories and not ignore_order and ordered: + raise TypeError("Cannot use sort_categories=True with ordered Categoricals") + + if sort_categories and not categories.is_monotonic_increasing: + categories = categories.sort_values() + indexer = categories.get_indexer(first.categories) + + from pandas.core.algorithms import take_nd + + new_codes = take_nd(indexer, new_codes, fill_value=-1) + elif ignore_order or all(not c.ordered for c in to_union): + # different categories - union and recode + cats = first.categories.append([c.categories for c in to_union[1:]]) + categories = cats.unique() + if sort_categories: + categories = categories.sort_values() + + new_codes = [ + recode_for_categories(c.codes, c.categories, categories) for c in to_union + ] + new_codes = np.concatenate(new_codes) + else: + # ordered - to show a proper error message + if all(c.ordered for c in to_union): + msg = "to union ordered Categoricals, all categories must be the same" + raise TypeError(msg) + raise TypeError("Categorical.ordered must be the same") + + if ignore_order: + ordered = False + + dtype = CategoricalDtype(categories=categories, ordered=ordered) + return Categorical._simple_new(new_codes, dtype=dtype) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/dtypes/dtypes.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/dtypes/dtypes.py new file mode 100644 index 0000000000000000000000000000000000000000..1c43ef55c11d792df6cbfc4b249caf8889c7af58 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/dtypes/dtypes.py @@ -0,0 +1,2348 @@ +""" +Define extension dtypes. +""" +from __future__ import annotations + +from datetime import ( + date, + datetime, + time, + timedelta, +) +from decimal import Decimal +import re +from typing import ( + TYPE_CHECKING, + Any, + cast, +) +import warnings + +import numpy as np +import pytz + +from pandas._libs import ( + lib, + missing as libmissing, +) +from pandas._libs.interval import Interval +from pandas._libs.properties import cache_readonly +from pandas._libs.tslibs import ( + BaseOffset, + NaT, + NaTType, + Period, + Timedelta, + Timestamp, + timezones, + to_offset, + tz_compare, +) +from pandas._libs.tslibs.dtypes import ( + PeriodDtypeBase, + abbrev_to_npy_unit, +) +from pandas._libs.tslibs.offsets import BDay +from pandas.compat import pa_version_under10p1 +from pandas.errors import PerformanceWarning +from pandas.util._exceptions import find_stack_level + +from pandas.core.dtypes.base import ( + ExtensionDtype, + StorageExtensionDtype, + register_extension_dtype, +) +from pandas.core.dtypes.generic import ( + ABCCategoricalIndex, + ABCIndex, + ABCRangeIndex, +) +from pandas.core.dtypes.inference import ( + is_bool, + is_list_like, +) + +from pandas.util import capitalize_first_letter + +if not pa_version_under10p1: + import pyarrow as pa + +if TYPE_CHECKING: + from collections.abc import MutableMapping + from datetime import tzinfo + + import pyarrow as pa # noqa: TCH004 + + from pandas._typing import ( + Dtype, + DtypeObj, + IntervalClosedType, + Ordered, + Self, + npt, + type_t, + ) + + from pandas import ( + Categorical, + CategoricalIndex, + DatetimeIndex, + Index, + IntervalIndex, + PeriodIndex, + ) + from pandas.core.arrays import ( + BaseMaskedArray, + DatetimeArray, + IntervalArray, + NumpyExtensionArray, + PeriodArray, + SparseArray, + ) + from pandas.core.arrays.arrow import ArrowExtensionArray + +str_type = str + + +class PandasExtensionDtype(ExtensionDtype): + """ + A np.dtype duck-typed class, suitable for holding a custom dtype. + + THIS IS NOT A REAL NUMPY DTYPE + """ + + type: Any + kind: Any + # The Any type annotations above are here only because mypy seems to have a + # problem dealing with multiple inheritance from PandasExtensionDtype + # and ExtensionDtype's @properties in the subclasses below. The kind and + # type variables in those subclasses are explicitly typed below. + subdtype = None + str: str_type + num = 100 + shape: tuple[int, ...] = () + itemsize = 8 + base: DtypeObj | None = None + isbuiltin = 0 + isnative = 0 + _cache_dtypes: dict[str_type, PandasExtensionDtype] = {} + + def __repr__(self) -> str_type: + """ + Return a string representation for a particular object. + """ + return str(self) + + def __hash__(self) -> int: + raise NotImplementedError("sub-classes should implement an __hash__ method") + + def __getstate__(self) -> dict[str_type, Any]: + # pickle support; we don't want to pickle the cache + return {k: getattr(self, k, None) for k in self._metadata} + + @classmethod + def reset_cache(cls) -> None: + """clear the cache""" + cls._cache_dtypes = {} + + +class CategoricalDtypeType(type): + """ + the type of CategoricalDtype, this metaclass determines subclass ability + """ + + +@register_extension_dtype +class CategoricalDtype(PandasExtensionDtype, ExtensionDtype): + """ + Type for categorical data with the categories and orderedness. + + Parameters + ---------- + categories : sequence, optional + Must be unique, and must not contain any nulls. + The categories are stored in an Index, + and if an index is provided the dtype of that index will be used. + ordered : bool or None, default False + Whether or not this categorical is treated as a ordered categorical. + None can be used to maintain the ordered value of existing categoricals when + used in operations that combine categoricals, e.g. astype, and will resolve to + False if there is no existing ordered to maintain. + + Attributes + ---------- + categories + ordered + + Methods + ------- + None + + See Also + -------- + Categorical : Represent a categorical variable in classic R / S-plus fashion. + + Notes + ----- + This class is useful for specifying the type of a ``Categorical`` + independent of the values. See :ref:`categorical.categoricaldtype` + for more. + + Examples + -------- + >>> t = pd.CategoricalDtype(categories=['b', 'a'], ordered=True) + >>> pd.Series(['a', 'b', 'a', 'c'], dtype=t) + 0 a + 1 b + 2 a + 3 NaN + dtype: category + Categories (2, object): ['b' < 'a'] + + An empty CategoricalDtype with a specific dtype can be created + by providing an empty index. As follows, + + >>> pd.CategoricalDtype(pd.DatetimeIndex([])).categories.dtype + dtype(' None: + self._finalize(categories, ordered, fastpath=False) + + @classmethod + def _from_fastpath( + cls, categories=None, ordered: bool | None = None + ) -> CategoricalDtype: + self = cls.__new__(cls) + self._finalize(categories, ordered, fastpath=True) + return self + + @classmethod + def _from_categorical_dtype( + cls, dtype: CategoricalDtype, categories=None, ordered: Ordered | None = None + ) -> CategoricalDtype: + if categories is ordered is None: + return dtype + if categories is None: + categories = dtype.categories + if ordered is None: + ordered = dtype.ordered + return cls(categories, ordered) + + @classmethod + def _from_values_or_dtype( + cls, + values=None, + categories=None, + ordered: bool | None = None, + dtype: Dtype | None = None, + ) -> CategoricalDtype: + """ + Construct dtype from the input parameters used in :class:`Categorical`. + + This constructor method specifically does not do the factorization + step, if that is needed to find the categories. This constructor may + therefore return ``CategoricalDtype(categories=None, ordered=None)``, + which may not be useful. Additional steps may therefore have to be + taken to create the final dtype. + + The return dtype is specified from the inputs in this prioritized + order: + 1. if dtype is a CategoricalDtype, return dtype + 2. if dtype is the string 'category', create a CategoricalDtype from + the supplied categories and ordered parameters, and return that. + 3. if values is a categorical, use value.dtype, but override it with + categories and ordered if either/both of those are not None. + 4. if dtype is None and values is not a categorical, construct the + dtype from categories and ordered, even if either of those is None. + + Parameters + ---------- + values : list-like, optional + The list-like must be 1-dimensional. + categories : list-like, optional + Categories for the CategoricalDtype. + ordered : bool, optional + Designating if the categories are ordered. + dtype : CategoricalDtype or the string "category", optional + If ``CategoricalDtype``, cannot be used together with + `categories` or `ordered`. + + Returns + ------- + CategoricalDtype + + Examples + -------- + >>> pd.CategoricalDtype._from_values_or_dtype() + CategoricalDtype(categories=None, ordered=None, categories_dtype=None) + >>> pd.CategoricalDtype._from_values_or_dtype( + ... categories=['a', 'b'], ordered=True + ... ) + CategoricalDtype(categories=['a', 'b'], ordered=True, categories_dtype=object) + >>> dtype1 = pd.CategoricalDtype(['a', 'b'], ordered=True) + >>> dtype2 = pd.CategoricalDtype(['x', 'y'], ordered=False) + >>> c = pd.Categorical([0, 1], dtype=dtype1) + >>> pd.CategoricalDtype._from_values_or_dtype( + ... c, ['x', 'y'], ordered=True, dtype=dtype2 + ... ) + Traceback (most recent call last): + ... + ValueError: Cannot specify `categories` or `ordered` together with + `dtype`. + + The supplied dtype takes precedence over values' dtype: + + >>> pd.CategoricalDtype._from_values_or_dtype(c, dtype=dtype2) + CategoricalDtype(categories=['x', 'y'], ordered=False, categories_dtype=object) + """ + + if dtype is not None: + # The dtype argument takes precedence over values.dtype (if any) + if isinstance(dtype, str): + if dtype == "category": + if ordered is None and cls.is_dtype(values): + # GH#49309 preserve orderedness + ordered = values.dtype.ordered + + dtype = CategoricalDtype(categories, ordered) + else: + raise ValueError(f"Unknown dtype {repr(dtype)}") + elif categories is not None or ordered is not None: + raise ValueError( + "Cannot specify `categories` or `ordered` together with `dtype`." + ) + elif not isinstance(dtype, CategoricalDtype): + raise ValueError(f"Cannot not construct CategoricalDtype from {dtype}") + elif cls.is_dtype(values): + # If no "dtype" was passed, use the one from "values", but honor + # the "ordered" and "categories" arguments + dtype = values.dtype._from_categorical_dtype( + values.dtype, categories, ordered + ) + else: + # If dtype=None and values is not categorical, create a new dtype. + # Note: This could potentially have categories=None and + # ordered=None. + dtype = CategoricalDtype(categories, ordered) + + return cast(CategoricalDtype, dtype) + + @classmethod + def construct_from_string(cls, string: str_type) -> CategoricalDtype: + """ + Construct a CategoricalDtype from a string. + + Parameters + ---------- + string : str + Must be the string "category" in order to be successfully constructed. + + Returns + ------- + CategoricalDtype + Instance of the dtype. + + Raises + ------ + TypeError + If a CategoricalDtype cannot be constructed from the input. + """ + if not isinstance(string, str): + raise TypeError( + f"'construct_from_string' expects a string, got {type(string)}" + ) + if string != cls.name: + raise TypeError(f"Cannot construct a 'CategoricalDtype' from '{string}'") + + # need ordered=None to ensure that operations specifying dtype="category" don't + # override the ordered value for existing categoricals + return cls(ordered=None) + + def _finalize(self, categories, ordered: Ordered, fastpath: bool = False) -> None: + if ordered is not None: + self.validate_ordered(ordered) + + if categories is not None: + categories = self.validate_categories(categories, fastpath=fastpath) + + self._categories = categories + self._ordered = ordered + + def __setstate__(self, state: MutableMapping[str_type, Any]) -> None: + # for pickle compat. __get_state__ is defined in the + # PandasExtensionDtype superclass and uses the public properties to + # pickle -> need to set the settable private ones here (see GH26067) + self._categories = state.pop("categories", None) + self._ordered = state.pop("ordered", False) + + def __hash__(self) -> int: + # _hash_categories returns a uint64, so use the negative + # space for when we have unknown categories to avoid a conflict + if self.categories is None: + if self.ordered: + return -1 + else: + return -2 + # We *do* want to include the real self.ordered here + return int(self._hash_categories) + + def __eq__(self, other: object) -> bool: + """ + Rules for CDT equality: + 1) Any CDT is equal to the string 'category' + 2) Any CDT is equal to itself + 3) Any CDT is equal to a CDT with categories=None regardless of ordered + 4) A CDT with ordered=True is only equal to another CDT with + ordered=True and identical categories in the same order + 5) A CDT with ordered={False, None} is only equal to another CDT with + ordered={False, None} and identical categories, but same order is + not required. There is no distinction between False/None. + 6) Any other comparison returns False + """ + if isinstance(other, str): + return other == self.name + elif other is self: + return True + elif not (hasattr(other, "ordered") and hasattr(other, "categories")): + return False + elif self.categories is None or other.categories is None: + # For non-fully-initialized dtypes, these are only equal to + # - the string "category" (handled above) + # - other CategoricalDtype with categories=None + return self.categories is other.categories + elif self.ordered or other.ordered: + # At least one has ordered=True; equal if both have ordered=True + # and the same values for categories in the same order. + return (self.ordered == other.ordered) and self.categories.equals( + other.categories + ) + else: + # Neither has ordered=True; equal if both have the same categories, + # but same order is not necessary. There is no distinction between + # ordered=False and ordered=None: CDT(., False) and CDT(., None) + # will be equal if they have the same categories. + left = self.categories + right = other.categories + + # GH#36280 the ordering of checks here is for performance + if not left.dtype == right.dtype: + return False + + if len(left) != len(right): + return False + + if self.categories.equals(other.categories): + # Check and see if they happen to be identical categories + return True + + if left.dtype != object: + # Faster than calculating hash + indexer = left.get_indexer(right) + # Because left and right have the same length and are unique, + # `indexer` not having any -1s implies that there is a + # bijection between `left` and `right`. + return (indexer != -1).all() + + # With object-dtype we need a comparison that identifies + # e.g. int(2) as distinct from float(2) + return set(left) == set(right) + + def __repr__(self) -> str_type: + if self.categories is None: + data = "None" + dtype = "None" + else: + data = self.categories._format_data(name=type(self).__name__) + if isinstance(self.categories, ABCRangeIndex): + data = str(self.categories._range) + data = data.rstrip(", ") + dtype = self.categories.dtype + + return ( + f"CategoricalDtype(categories={data}, ordered={self.ordered}, " + f"categories_dtype={dtype})" + ) + + @cache_readonly + def _hash_categories(self) -> int: + from pandas.core.util.hashing import ( + combine_hash_arrays, + hash_array, + hash_tuples, + ) + + categories = self.categories + ordered = self.ordered + + if len(categories) and isinstance(categories[0], tuple): + # assumes if any individual category is a tuple, then all our. ATM + # I don't really want to support just some of the categories being + # tuples. + cat_list = list(categories) # breaks if a np.array of categories + cat_array = hash_tuples(cat_list) + else: + if categories.dtype == "O" and len({type(x) for x in categories}) != 1: + # TODO: hash_array doesn't handle mixed types. It casts + # everything to a str first, which means we treat + # {'1', '2'} the same as {'1', 2} + # find a better solution + hashed = hash((tuple(categories), ordered)) + return hashed + + if DatetimeTZDtype.is_dtype(categories.dtype): + # Avoid future warning. + categories = categories.view("datetime64[ns]") + + cat_array = hash_array(np.asarray(categories), categorize=False) + if ordered: + cat_array = np.vstack( + [cat_array, np.arange(len(cat_array), dtype=cat_array.dtype)] + ) + else: + cat_array = np.array([cat_array]) + combined_hashed = combine_hash_arrays(iter(cat_array), num_items=len(cat_array)) + return np.bitwise_xor.reduce(combined_hashed) + + @classmethod + def construct_array_type(cls) -> type_t[Categorical]: + """ + Return the array type associated with this dtype. + + Returns + ------- + type + """ + from pandas import Categorical + + return Categorical + + @staticmethod + def validate_ordered(ordered: Ordered) -> None: + """ + Validates that we have a valid ordered parameter. If + it is not a boolean, a TypeError will be raised. + + Parameters + ---------- + ordered : object + The parameter to be verified. + + Raises + ------ + TypeError + If 'ordered' is not a boolean. + """ + if not is_bool(ordered): + raise TypeError("'ordered' must either be 'True' or 'False'") + + @staticmethod + def validate_categories(categories, fastpath: bool = False) -> Index: + """ + Validates that we have good categories + + Parameters + ---------- + categories : array-like + fastpath : bool + Whether to skip nan and uniqueness checks + + Returns + ------- + categories : Index + """ + from pandas.core.indexes.base import Index + + if not fastpath and not is_list_like(categories): + raise TypeError( + f"Parameter 'categories' must be list-like, was {repr(categories)}" + ) + if not isinstance(categories, ABCIndex): + categories = Index._with_infer(categories, tupleize_cols=False) + + if not fastpath: + if categories.hasnans: + raise ValueError("Categorical categories cannot be null") + + if not categories.is_unique: + raise ValueError("Categorical categories must be unique") + + if isinstance(categories, ABCCategoricalIndex): + categories = categories.categories + + return categories + + def update_dtype(self, dtype: str_type | CategoricalDtype) -> CategoricalDtype: + """ + Returns a CategoricalDtype with categories and ordered taken from dtype + if specified, otherwise falling back to self if unspecified + + Parameters + ---------- + dtype : CategoricalDtype + + Returns + ------- + new_dtype : CategoricalDtype + """ + if isinstance(dtype, str) and dtype == "category": + # dtype='category' should not change anything + return self + elif not self.is_dtype(dtype): + raise ValueError( + f"a CategoricalDtype must be passed to perform an update, " + f"got {repr(dtype)}" + ) + else: + # from here on, dtype is a CategoricalDtype + dtype = cast(CategoricalDtype, dtype) + + # update categories/ordered unless they've been explicitly passed as None + new_categories = ( + dtype.categories if dtype.categories is not None else self.categories + ) + new_ordered = dtype.ordered if dtype.ordered is not None else self.ordered + + return CategoricalDtype(new_categories, new_ordered) + + @property + def categories(self) -> Index: + """ + An ``Index`` containing the unique categories allowed. + + Examples + -------- + >>> cat_type = pd.CategoricalDtype(categories=['a', 'b'], ordered=True) + >>> cat_type.categories + Index(['a', 'b'], dtype='object') + """ + return self._categories + + @property + def ordered(self) -> Ordered: + """ + Whether the categories have an ordered relationship. + + Examples + -------- + >>> cat_type = pd.CategoricalDtype(categories=['a', 'b'], ordered=True) + >>> cat_type.ordered + True + + >>> cat_type = pd.CategoricalDtype(categories=['a', 'b'], ordered=False) + >>> cat_type.ordered + False + """ + return self._ordered + + @property + def _is_boolean(self) -> bool: + from pandas.core.dtypes.common import is_bool_dtype + + return is_bool_dtype(self.categories) + + def _get_common_dtype(self, dtypes: list[DtypeObj]) -> DtypeObj | None: + # check if we have all categorical dtype with identical categories + if all(isinstance(x, CategoricalDtype) for x in dtypes): + first = dtypes[0] + if all(first == other for other in dtypes[1:]): + return first + + # special case non-initialized categorical + # TODO we should figure out the expected return value in general + non_init_cats = [ + isinstance(x, CategoricalDtype) and x.categories is None for x in dtypes + ] + if all(non_init_cats): + return self + elif any(non_init_cats): + return None + + # categorical is aware of Sparse -> extract sparse subdtypes + dtypes = [x.subtype if isinstance(x, SparseDtype) else x for x in dtypes] + # extract the categories' dtype + non_cat_dtypes = [ + x.categories.dtype if isinstance(x, CategoricalDtype) else x for x in dtypes + ] + # TODO should categorical always give an answer? + from pandas.core.dtypes.cast import find_common_type + + return find_common_type(non_cat_dtypes) + + @cache_readonly + def index_class(self) -> type_t[CategoricalIndex]: + from pandas import CategoricalIndex + + return CategoricalIndex + + +@register_extension_dtype +class DatetimeTZDtype(PandasExtensionDtype): + """ + An ExtensionDtype for timezone-aware datetime data. + + **This is not an actual numpy dtype**, but a duck type. + + Parameters + ---------- + unit : str, default "ns" + The precision of the datetime data. Currently limited + to ``"ns"``. + tz : str, int, or datetime.tzinfo + The timezone. + + Attributes + ---------- + unit + tz + + Methods + ------- + None + + Raises + ------ + ZoneInfoNotFoundError + When the requested timezone cannot be found. + + Examples + -------- + >>> from zoneinfo import ZoneInfo + >>> pd.DatetimeTZDtype(tz=ZoneInfo('UTC')) + datetime64[ns, UTC] + + >>> pd.DatetimeTZDtype(tz=ZoneInfo('Europe/Paris')) + datetime64[ns, Europe/Paris] + """ + + type: type[Timestamp] = Timestamp + kind: str_type = "M" + num = 101 + _metadata = ("unit", "tz") + _match = re.compile(r"(datetime64|M8)\[(?P.+), (?P.+)\]") + _cache_dtypes: dict[str_type, PandasExtensionDtype] = {} + _supports_2d = True + _can_fast_transpose = True + + @property + def na_value(self) -> NaTType: + return NaT + + @cache_readonly + def base(self) -> DtypeObj: # type: ignore[override] + return np.dtype(f"M8[{self.unit}]") + + # error: Signature of "str" incompatible with supertype "PandasExtensionDtype" + @cache_readonly + def str(self) -> str: # type: ignore[override] + return f"|M8[{self.unit}]" + + def __init__(self, unit: str_type | DatetimeTZDtype = "ns", tz=None) -> None: + if isinstance(unit, DatetimeTZDtype): + # error: "str" has no attribute "tz" + unit, tz = unit.unit, unit.tz # type: ignore[attr-defined] + + if unit != "ns": + if isinstance(unit, str) and tz is None: + # maybe a string like datetime64[ns, tz], which we support for + # now. + result = type(self).construct_from_string(unit) + unit = result.unit + tz = result.tz + msg = ( + f"Passing a dtype alias like 'datetime64[ns, {tz}]' " + "to DatetimeTZDtype is no longer supported. Use " + "'DatetimeTZDtype.construct_from_string()' instead." + ) + raise ValueError(msg) + if unit not in ["s", "ms", "us", "ns"]: + raise ValueError("DatetimeTZDtype only supports s, ms, us, ns units") + + if tz: + tz = timezones.maybe_get_tz(tz) + tz = timezones.tz_standardize(tz) + elif tz is not None: + raise pytz.UnknownTimeZoneError(tz) + if tz is None: + raise TypeError("A 'tz' is required.") + + self._unit = unit + self._tz = tz + + @cache_readonly + def _creso(self) -> int: + """ + The NPY_DATETIMEUNIT corresponding to this dtype's resolution. + """ + return abbrev_to_npy_unit(self.unit) + + @property + def unit(self) -> str_type: + """ + The precision of the datetime data. + + Examples + -------- + >>> from zoneinfo import ZoneInfo + >>> dtype = pd.DatetimeTZDtype(tz=ZoneInfo('America/Los_Angeles')) + >>> dtype.unit + 'ns' + """ + return self._unit + + @property + def tz(self) -> tzinfo: + """ + The timezone. + + Examples + -------- + >>> from zoneinfo import ZoneInfo + >>> dtype = pd.DatetimeTZDtype(tz=ZoneInfo('America/Los_Angeles')) + >>> dtype.tz + zoneinfo.ZoneInfo(key='America/Los_Angeles') + """ + return self._tz + + @classmethod + def construct_array_type(cls) -> type_t[DatetimeArray]: + """ + Return the array type associated with this dtype. + + Returns + ------- + type + """ + from pandas.core.arrays import DatetimeArray + + return DatetimeArray + + @classmethod + def construct_from_string(cls, string: str_type) -> DatetimeTZDtype: + """ + Construct a DatetimeTZDtype from a string. + + Parameters + ---------- + string : str + The string alias for this DatetimeTZDtype. + Should be formatted like ``datetime64[ns, ]``, + where ```` is the timezone name. + + Examples + -------- + >>> DatetimeTZDtype.construct_from_string('datetime64[ns, UTC]') + datetime64[ns, UTC] + """ + if not isinstance(string, str): + raise TypeError( + f"'construct_from_string' expects a string, got {type(string)}" + ) + + msg = f"Cannot construct a 'DatetimeTZDtype' from '{string}'" + match = cls._match.match(string) + if match: + d = match.groupdict() + try: + return cls(unit=d["unit"], tz=d["tz"]) + except (KeyError, TypeError, ValueError) as err: + # KeyError if maybe_get_tz tries and fails to get a + # pytz timezone (actually pytz.UnknownTimeZoneError). + # TypeError if we pass a nonsense tz; + # ValueError if we pass a unit other than "ns" + raise TypeError(msg) from err + raise TypeError(msg) + + def __str__(self) -> str_type: + return f"datetime64[{self.unit}, {self.tz}]" + + @property + def name(self) -> str_type: + """A string representation of the dtype.""" + return str(self) + + def __hash__(self) -> int: + # make myself hashable + # TODO: update this. + return hash(str(self)) + + def __eq__(self, other: object) -> bool: + if isinstance(other, str): + if other.startswith("M8["): + other = f"datetime64[{other[3:]}" + return other == self.name + + return ( + isinstance(other, DatetimeTZDtype) + and self.unit == other.unit + and tz_compare(self.tz, other.tz) + ) + + def __from_arrow__(self, array: pa.Array | pa.ChunkedArray) -> DatetimeArray: + """ + Construct DatetimeArray from pyarrow Array/ChunkedArray. + + Note: If the units in the pyarrow Array are the same as this + DatetimeDtype, then values corresponding to the integer representation + of ``NaT`` (e.g. one nanosecond before :attr:`pandas.Timestamp.min`) + are converted to ``NaT``, regardless of the null indicator in the + pyarrow array. + + Parameters + ---------- + array : pyarrow.Array or pyarrow.ChunkedArray + The Arrow array to convert to DatetimeArray. + + Returns + ------- + extension array : DatetimeArray + """ + import pyarrow + + from pandas.core.arrays import DatetimeArray + + array = array.cast(pyarrow.timestamp(unit=self._unit), safe=True) + + if isinstance(array, pyarrow.Array): + np_arr = array.to_numpy(zero_copy_only=False) + else: + np_arr = array.to_numpy() + + return DatetimeArray._simple_new(np_arr, dtype=self) + + def __setstate__(self, state) -> None: + # for pickle compat. __get_state__ is defined in the + # PandasExtensionDtype superclass and uses the public properties to + # pickle -> need to set the settable private ones here (see GH26067) + self._tz = state["tz"] + self._unit = state["unit"] + + def _get_common_dtype(self, dtypes: list[DtypeObj]) -> DtypeObj | None: + if all(isinstance(t, DatetimeTZDtype) and t.tz == self.tz for t in dtypes): + np_dtype = np.max([cast(DatetimeTZDtype, t).base for t in [self, *dtypes]]) + unit = np.datetime_data(np_dtype)[0] + return type(self)(unit=unit, tz=self.tz) + return super()._get_common_dtype(dtypes) + + @cache_readonly + def index_class(self) -> type_t[DatetimeIndex]: + from pandas import DatetimeIndex + + return DatetimeIndex + + +@register_extension_dtype +class PeriodDtype(PeriodDtypeBase, PandasExtensionDtype): + """ + An ExtensionDtype for Period data. + + **This is not an actual numpy dtype**, but a duck type. + + Parameters + ---------- + freq : str or DateOffset + The frequency of this PeriodDtype. + + Attributes + ---------- + freq + + Methods + ------- + None + + Examples + -------- + >>> pd.PeriodDtype(freq='D') + period[D] + + >>> pd.PeriodDtype(freq=pd.offsets.MonthEnd()) + period[M] + """ + + type: type[Period] = Period + kind: str_type = "O" + str = "|O08" + base = np.dtype("O") + num = 102 + _metadata = ("freq",) + _match = re.compile(r"(P|p)eriod\[(?P.+)\]") + # error: Incompatible types in assignment (expression has type + # "Dict[int, PandasExtensionDtype]", base class "PandasExtensionDtype" + # defined the type as "Dict[str, PandasExtensionDtype]") [assignment] + _cache_dtypes: dict[BaseOffset, int] = {} # type: ignore[assignment] + __hash__ = PeriodDtypeBase.__hash__ + _freq: BaseOffset + _supports_2d = True + _can_fast_transpose = True + + def __new__(cls, freq) -> PeriodDtype: # noqa: PYI034 + """ + Parameters + ---------- + freq : PeriodDtype, BaseOffset, or string + """ + if isinstance(freq, PeriodDtype): + return freq + + if not isinstance(freq, BaseOffset): + freq = cls._parse_dtype_strict(freq) + + if isinstance(freq, BDay): + # GH#53446 + # TODO(3.0): enforcing this will close GH#10575 + warnings.warn( + "PeriodDtype[B] is deprecated and will be removed in a future " + "version. Use a DatetimeIndex with freq='B' instead", + FutureWarning, + stacklevel=find_stack_level(), + ) + + try: + dtype_code = cls._cache_dtypes[freq] + except KeyError: + dtype_code = freq._period_dtype_code + cls._cache_dtypes[freq] = dtype_code + u = PeriodDtypeBase.__new__(cls, dtype_code, freq.n) + u._freq = freq + return u + + def __reduce__(self) -> tuple[type_t[Self], tuple[str_type]]: + return type(self), (self.name,) + + @property + def freq(self) -> BaseOffset: + """ + The frequency object of this PeriodDtype. + + Examples + -------- + >>> dtype = pd.PeriodDtype(freq='D') + >>> dtype.freq + + """ + return self._freq + + @classmethod + def _parse_dtype_strict(cls, freq: str_type) -> BaseOffset: + if isinstance(freq, str): # note: freq is already of type str! + if freq.startswith(("Period[", "period[")): + m = cls._match.search(freq) + if m is not None: + freq = m.group("freq") + + freq_offset = to_offset(freq, is_period=True) + if freq_offset is not None: + return freq_offset + + raise TypeError( + "PeriodDtype argument should be string or BaseOffset, " + f"got {type(freq).__name__}" + ) + + @classmethod + def construct_from_string(cls, string: str_type) -> PeriodDtype: + """ + Strict construction from a string, raise a TypeError if not + possible + """ + if ( + isinstance(string, str) + and (string.startswith(("period[", "Period["))) + or isinstance(string, BaseOffset) + ): + # do not parse string like U as period[U] + # avoid tuple to be regarded as freq + try: + return cls(freq=string) + except ValueError: + pass + if isinstance(string, str): + msg = f"Cannot construct a 'PeriodDtype' from '{string}'" + else: + msg = f"'construct_from_string' expects a string, got {type(string)}" + raise TypeError(msg) + + def __str__(self) -> str_type: + return self.name + + @property + def name(self) -> str_type: + return f"period[{self._freqstr}]" + + @property + def na_value(self) -> NaTType: + return NaT + + def __eq__(self, other: object) -> bool: + if isinstance(other, str): + return other in [self.name, capitalize_first_letter(self.name)] + + return super().__eq__(other) + + def __ne__(self, other: object) -> bool: + return not self.__eq__(other) + + @classmethod + def is_dtype(cls, dtype: object) -> bool: + """ + Return a boolean if we if the passed type is an actual dtype that we + can match (via string or type) + """ + if isinstance(dtype, str): + # PeriodDtype can be instantiated from freq string like "U", + # but doesn't regard freq str like "U" as dtype. + if dtype.startswith(("period[", "Period[")): + try: + return cls._parse_dtype_strict(dtype) is not None + except ValueError: + return False + else: + return False + return super().is_dtype(dtype) + + @classmethod + def construct_array_type(cls) -> type_t[PeriodArray]: + """ + Return the array type associated with this dtype. + + Returns + ------- + type + """ + from pandas.core.arrays import PeriodArray + + return PeriodArray + + def __from_arrow__(self, array: pa.Array | pa.ChunkedArray) -> PeriodArray: + """ + Construct PeriodArray from pyarrow Array/ChunkedArray. + """ + import pyarrow + + from pandas.core.arrays import PeriodArray + from pandas.core.arrays.arrow._arrow_utils import ( + pyarrow_array_to_numpy_and_mask, + ) + + if isinstance(array, pyarrow.Array): + chunks = [array] + else: + chunks = array.chunks + + results = [] + for arr in chunks: + data, mask = pyarrow_array_to_numpy_and_mask(arr, dtype=np.dtype(np.int64)) + parr = PeriodArray(data.copy(), dtype=self, copy=False) + # error: Invalid index type "ndarray[Any, dtype[bool_]]" for "PeriodArray"; + # expected type "Union[int, Sequence[int], Sequence[bool], slice]" + parr[~mask] = NaT # type: ignore[index] + results.append(parr) + + if not results: + return PeriodArray(np.array([], dtype="int64"), dtype=self, copy=False) + return PeriodArray._concat_same_type(results) + + @cache_readonly + def index_class(self) -> type_t[PeriodIndex]: + from pandas import PeriodIndex + + return PeriodIndex + + +@register_extension_dtype +class IntervalDtype(PandasExtensionDtype): + """ + An ExtensionDtype for Interval data. + + **This is not an actual numpy dtype**, but a duck type. + + Parameters + ---------- + subtype : str, np.dtype + The dtype of the Interval bounds. + + Attributes + ---------- + subtype + + Methods + ------- + None + + Examples + -------- + >>> pd.IntervalDtype(subtype='int64', closed='both') + interval[int64, both] + """ + + name = "interval" + kind: str_type = "O" + str = "|O08" + base = np.dtype("O") + num = 103 + _metadata = ( + "subtype", + "closed", + ) + + _match = re.compile( + r"(I|i)nterval\[(?P[^,]+(\[.+\])?)" + r"(, (?P(right|left|both|neither)))?\]" + ) + + _cache_dtypes: dict[str_type, PandasExtensionDtype] = {} + _subtype: None | np.dtype + _closed: IntervalClosedType | None + + def __init__(self, subtype=None, closed: IntervalClosedType | None = None) -> None: + from pandas.core.dtypes.common import ( + is_string_dtype, + pandas_dtype, + ) + + if closed is not None and closed not in {"right", "left", "both", "neither"}: + raise ValueError("closed must be one of 'right', 'left', 'both', 'neither'") + + if isinstance(subtype, IntervalDtype): + if closed is not None and closed != subtype.closed: + raise ValueError( + "dtype.closed and 'closed' do not match. " + "Try IntervalDtype(dtype.subtype, closed) instead." + ) + self._subtype = subtype._subtype + self._closed = subtype._closed + elif subtype is None: + # we are called as an empty constructor + # generally for pickle compat + self._subtype = None + self._closed = closed + elif isinstance(subtype, str) and subtype.lower() == "interval": + self._subtype = None + self._closed = closed + else: + if isinstance(subtype, str): + m = IntervalDtype._match.search(subtype) + if m is not None: + gd = m.groupdict() + subtype = gd["subtype"] + if gd.get("closed", None) is not None: + if closed is not None: + if closed != gd["closed"]: + raise ValueError( + "'closed' keyword does not match value " + "specified in dtype string" + ) + closed = gd["closed"] # type: ignore[assignment] + + try: + subtype = pandas_dtype(subtype) + except TypeError as err: + raise TypeError("could not construct IntervalDtype") from err + if CategoricalDtype.is_dtype(subtype) or is_string_dtype(subtype): + # GH 19016 + msg = ( + "category, object, and string subtypes are not supported " + "for IntervalDtype" + ) + raise TypeError(msg) + self._subtype = subtype + self._closed = closed + + @cache_readonly + def _can_hold_na(self) -> bool: + subtype = self._subtype + if subtype is None: + # partially-initialized + raise NotImplementedError( + "_can_hold_na is not defined for partially-initialized IntervalDtype" + ) + if subtype.kind in "iu": + return False + return True + + @property + def closed(self) -> IntervalClosedType: + return self._closed # type: ignore[return-value] + + @property + def subtype(self): + """ + The dtype of the Interval bounds. + + Examples + -------- + >>> dtype = pd.IntervalDtype(subtype='int64', closed='both') + >>> dtype.subtype + dtype('int64') + """ + return self._subtype + + @classmethod + def construct_array_type(cls) -> type[IntervalArray]: + """ + Return the array type associated with this dtype. + + Returns + ------- + type + """ + from pandas.core.arrays import IntervalArray + + return IntervalArray + + @classmethod + def construct_from_string(cls, string: str_type) -> IntervalDtype: + """ + attempt to construct this type from a string, raise a TypeError + if its not possible + """ + if not isinstance(string, str): + raise TypeError( + f"'construct_from_string' expects a string, got {type(string)}" + ) + + if string.lower() == "interval" or cls._match.search(string) is not None: + return cls(string) + + msg = ( + f"Cannot construct a 'IntervalDtype' from '{string}'.\n\n" + "Incorrectly formatted string passed to constructor. " + "Valid formats include Interval or Interval[dtype] " + "where dtype is numeric, datetime, or timedelta" + ) + raise TypeError(msg) + + @property + def type(self) -> type[Interval]: + return Interval + + def __str__(self) -> str_type: + if self.subtype is None: + return "interval" + if self.closed is None: + # Only partially initialized GH#38394 + return f"interval[{self.subtype}]" + return f"interval[{self.subtype}, {self.closed}]" + + def __hash__(self) -> int: + # make myself hashable + return hash(str(self)) + + def __eq__(self, other: object) -> bool: + if isinstance(other, str): + return other.lower() in (self.name.lower(), str(self).lower()) + elif not isinstance(other, IntervalDtype): + return False + elif self.subtype is None or other.subtype is None: + # None should match any subtype + return True + elif self.closed != other.closed: + return False + else: + return self.subtype == other.subtype + + def __setstate__(self, state) -> None: + # for pickle compat. __get_state__ is defined in the + # PandasExtensionDtype superclass and uses the public properties to + # pickle -> need to set the settable private ones here (see GH26067) + self._subtype = state["subtype"] + + # backward-compat older pickles won't have "closed" key + self._closed = state.pop("closed", None) + + @classmethod + def is_dtype(cls, dtype: object) -> bool: + """ + Return a boolean if we if the passed type is an actual dtype that we + can match (via string or type) + """ + if isinstance(dtype, str): + if dtype.lower().startswith("interval"): + try: + return cls.construct_from_string(dtype) is not None + except (ValueError, TypeError): + return False + else: + return False + return super().is_dtype(dtype) + + def __from_arrow__(self, array: pa.Array | pa.ChunkedArray) -> IntervalArray: + """ + Construct IntervalArray from pyarrow Array/ChunkedArray. + """ + import pyarrow + + from pandas.core.arrays import IntervalArray + + if isinstance(array, pyarrow.Array): + chunks = [array] + else: + chunks = array.chunks + + results = [] + for arr in chunks: + if isinstance(arr, pyarrow.ExtensionArray): + arr = arr.storage + left = np.asarray(arr.field("left"), dtype=self.subtype) + right = np.asarray(arr.field("right"), dtype=self.subtype) + iarr = IntervalArray.from_arrays(left, right, closed=self.closed) + results.append(iarr) + + if not results: + return IntervalArray.from_arrays( + np.array([], dtype=self.subtype), + np.array([], dtype=self.subtype), + closed=self.closed, + ) + return IntervalArray._concat_same_type(results) + + def _get_common_dtype(self, dtypes: list[DtypeObj]) -> DtypeObj | None: + if not all(isinstance(x, IntervalDtype) for x in dtypes): + return None + + closed = cast("IntervalDtype", dtypes[0]).closed + if not all(cast("IntervalDtype", x).closed == closed for x in dtypes): + return np.dtype(object) + + from pandas.core.dtypes.cast import find_common_type + + common = find_common_type([cast("IntervalDtype", x).subtype for x in dtypes]) + if common == object: + return np.dtype(object) + return IntervalDtype(common, closed=closed) + + @cache_readonly + def index_class(self) -> type_t[IntervalIndex]: + from pandas import IntervalIndex + + return IntervalIndex + + +class NumpyEADtype(ExtensionDtype): + """ + A Pandas ExtensionDtype for NumPy dtypes. + + This is mostly for internal compatibility, and is not especially + useful on its own. + + Parameters + ---------- + dtype : object + Object to be converted to a NumPy data type object. + + See Also + -------- + numpy.dtype + """ + + _metadata = ("_dtype",) + _supports_2d = False + _can_fast_transpose = False + + def __init__(self, dtype: npt.DTypeLike | NumpyEADtype | None) -> None: + if isinstance(dtype, NumpyEADtype): + # make constructor idempotent + dtype = dtype.numpy_dtype + self._dtype = np.dtype(dtype) + + def __repr__(self) -> str: + return f"NumpyEADtype({repr(self.name)})" + + @property + def numpy_dtype(self) -> np.dtype: + """ + The NumPy dtype this NumpyEADtype wraps. + """ + return self._dtype + + @property + def name(self) -> str: + """ + A bit-width name for this data-type. + """ + return self._dtype.name + + @property + def type(self) -> type[np.generic]: + """ + The type object used to instantiate a scalar of this NumPy data-type. + """ + return self._dtype.type + + @property + def _is_numeric(self) -> bool: + # exclude object, str, unicode, void. + return self.kind in set("biufc") + + @property + def _is_boolean(self) -> bool: + return self.kind == "b" + + @classmethod + def construct_from_string(cls, string: str) -> NumpyEADtype: + try: + dtype = np.dtype(string) + except TypeError as err: + if not isinstance(string, str): + msg = f"'construct_from_string' expects a string, got {type(string)}" + else: + msg = f"Cannot construct a 'NumpyEADtype' from '{string}'" + raise TypeError(msg) from err + return cls(dtype) + + @classmethod + def construct_array_type(cls) -> type_t[NumpyExtensionArray]: + """ + Return the array type associated with this dtype. + + Returns + ------- + type + """ + from pandas.core.arrays import NumpyExtensionArray + + return NumpyExtensionArray + + @property + def kind(self) -> str: + """ + A character code (one of 'biufcmMOSUV') identifying the general kind of data. + """ + return self._dtype.kind + + @property + def itemsize(self) -> int: + """ + The element size of this data-type object. + """ + return self._dtype.itemsize + + +class BaseMaskedDtype(ExtensionDtype): + """ + Base class for dtypes for BaseMaskedArray subclasses. + """ + + base = None + type: type + + @property + def na_value(self) -> libmissing.NAType: + return libmissing.NA + + @cache_readonly + def numpy_dtype(self) -> np.dtype: + """Return an instance of our numpy dtype""" + return np.dtype(self.type) + + @cache_readonly + def kind(self) -> str: + return self.numpy_dtype.kind + + @cache_readonly + def itemsize(self) -> int: + """Return the number of bytes in this dtype""" + return self.numpy_dtype.itemsize + + @classmethod + def construct_array_type(cls) -> type_t[BaseMaskedArray]: + """ + Return the array type associated with this dtype. + + Returns + ------- + type + """ + raise NotImplementedError + + @classmethod + def from_numpy_dtype(cls, dtype: np.dtype) -> BaseMaskedDtype: + """ + Construct the MaskedDtype corresponding to the given numpy dtype. + """ + if dtype.kind == "b": + from pandas.core.arrays.boolean import BooleanDtype + + return BooleanDtype() + elif dtype.kind in "iu": + from pandas.core.arrays.integer import NUMPY_INT_TO_DTYPE + + return NUMPY_INT_TO_DTYPE[dtype] + elif dtype.kind == "f": + from pandas.core.arrays.floating import NUMPY_FLOAT_TO_DTYPE + + return NUMPY_FLOAT_TO_DTYPE[dtype] + else: + raise NotImplementedError(dtype) + + def _get_common_dtype(self, dtypes: list[DtypeObj]) -> DtypeObj | None: + # We unwrap any masked dtypes, find the common dtype we would use + # for that, then re-mask the result. + from pandas.core.dtypes.cast import find_common_type + + new_dtype = find_common_type( + [ + dtype.numpy_dtype if isinstance(dtype, BaseMaskedDtype) else dtype + for dtype in dtypes + ] + ) + if not isinstance(new_dtype, np.dtype): + # If we ever support e.g. Masked[DatetimeArray] then this will change + return None + try: + return type(self).from_numpy_dtype(new_dtype) + except (KeyError, NotImplementedError): + return None + + +@register_extension_dtype +class SparseDtype(ExtensionDtype): + """ + Dtype for data stored in :class:`SparseArray`. + + This dtype implements the pandas ExtensionDtype interface. + + Parameters + ---------- + dtype : str, ExtensionDtype, numpy.dtype, type, default numpy.float64 + The dtype of the underlying array storing the non-fill value values. + fill_value : scalar, optional + The scalar value not stored in the SparseArray. By default, this + depends on `dtype`. + + =========== ========== + dtype na_value + =========== ========== + float ``np.nan`` + int ``0`` + bool ``False`` + datetime64 ``pd.NaT`` + timedelta64 ``pd.NaT`` + =========== ========== + + The default value may be overridden by specifying a `fill_value`. + + Attributes + ---------- + None + + Methods + ------- + None + + Examples + -------- + >>> ser = pd.Series([1, 0, 0], dtype=pd.SparseDtype(dtype=int, fill_value=0)) + >>> ser + 0 1 + 1 0 + 2 0 + dtype: Sparse[int64, 0] + >>> ser.sparse.density + 0.3333333333333333 + """ + + _is_immutable = True + + # We include `_is_na_fill_value` in the metadata to avoid hash collisions + # between SparseDtype(float, 0.0) and SparseDtype(float, nan). + # Without is_na_fill_value in the comparison, those would be equal since + # hash(nan) is (sometimes?) 0. + _metadata = ("_dtype", "_fill_value", "_is_na_fill_value") + + def __init__(self, dtype: Dtype = np.float64, fill_value: Any = None) -> None: + if isinstance(dtype, type(self)): + if fill_value is None: + fill_value = dtype.fill_value + dtype = dtype.subtype + + from pandas.core.dtypes.common import ( + is_string_dtype, + pandas_dtype, + ) + from pandas.core.dtypes.missing import na_value_for_dtype + + dtype = pandas_dtype(dtype) + if is_string_dtype(dtype): + dtype = np.dtype("object") + if not isinstance(dtype, np.dtype): + # GH#53160 + raise TypeError("SparseDtype subtype must be a numpy dtype") + + if fill_value is None: + fill_value = na_value_for_dtype(dtype) + + self._dtype = dtype + self._fill_value = fill_value + self._check_fill_value() + + def __hash__(self) -> int: + # Python3 doesn't inherit __hash__ when a base class overrides + # __eq__, so we explicitly do it here. + return super().__hash__() + + def __eq__(self, other: object) -> bool: + # We have to override __eq__ to handle NA values in _metadata. + # The base class does simple == checks, which fail for NA. + if isinstance(other, str): + try: + other = self.construct_from_string(other) + except TypeError: + return False + + if isinstance(other, type(self)): + subtype = self.subtype == other.subtype + if self._is_na_fill_value: + # this case is complicated by two things: + # SparseDtype(float, float(nan)) == SparseDtype(float, np.nan) + # SparseDtype(float, np.nan) != SparseDtype(float, pd.NaT) + # i.e. we want to treat any floating-point NaN as equal, but + # not a floating-point NaN and a datetime NaT. + fill_value = ( + other._is_na_fill_value + and isinstance(self.fill_value, type(other.fill_value)) + or isinstance(other.fill_value, type(self.fill_value)) + ) + else: + with warnings.catch_warnings(): + # Ignore spurious numpy warning + warnings.filterwarnings( + "ignore", + "elementwise comparison failed", + category=DeprecationWarning, + ) + + fill_value = self.fill_value == other.fill_value + + return subtype and fill_value + return False + + @property + def fill_value(self): + """ + The fill value of the array. + + Converting the SparseArray to a dense ndarray will fill the + array with this value. + + .. warning:: + + It's possible to end up with a SparseArray that has ``fill_value`` + values in ``sp_values``. This can occur, for example, when setting + ``SparseArray.fill_value`` directly. + """ + return self._fill_value + + def _check_fill_value(self) -> None: + if not lib.is_scalar(self._fill_value): + raise ValueError( + f"fill_value must be a scalar. Got {self._fill_value} instead" + ) + + from pandas.core.dtypes.cast import can_hold_element + from pandas.core.dtypes.missing import ( + is_valid_na_for_dtype, + isna, + ) + + from pandas.core.construction import ensure_wrapped_if_datetimelike + + # GH#23124 require fill_value and subtype to match + val = self._fill_value + if isna(val): + if not is_valid_na_for_dtype(val, self.subtype): + warnings.warn( + "Allowing arbitrary scalar fill_value in SparseDtype is " + "deprecated. In a future version, the fill_value must be " + "a valid value for the SparseDtype.subtype.", + FutureWarning, + stacklevel=find_stack_level(), + ) + else: + dummy = np.empty(0, dtype=self.subtype) + dummy = ensure_wrapped_if_datetimelike(dummy) + + if not can_hold_element(dummy, val): + warnings.warn( + "Allowing arbitrary scalar fill_value in SparseDtype is " + "deprecated. In a future version, the fill_value must be " + "a valid value for the SparseDtype.subtype.", + FutureWarning, + stacklevel=find_stack_level(), + ) + + @property + def _is_na_fill_value(self) -> bool: + from pandas import isna + + return isna(self.fill_value) + + @property + def _is_numeric(self) -> bool: + return not self.subtype == object + + @property + def _is_boolean(self) -> bool: + return self.subtype.kind == "b" + + @property + def kind(self) -> str: + """ + The sparse kind. Either 'integer', or 'block'. + """ + return self.subtype.kind + + @property + def type(self): + return self.subtype.type + + @property + def subtype(self): + return self._dtype + + @property + def name(self) -> str: + return f"Sparse[{self.subtype.name}, {repr(self.fill_value)}]" + + def __repr__(self) -> str: + return self.name + + @classmethod + def construct_array_type(cls) -> type_t[SparseArray]: + """ + Return the array type associated with this dtype. + + Returns + ------- + type + """ + from pandas.core.arrays.sparse.array import SparseArray + + return SparseArray + + @classmethod + def construct_from_string(cls, string: str) -> SparseDtype: + """ + Construct a SparseDtype from a string form. + + Parameters + ---------- + string : str + Can take the following forms. + + string dtype + ================ ============================ + 'int' SparseDtype[np.int64, 0] + 'Sparse' SparseDtype[np.float64, nan] + 'Sparse[int]' SparseDtype[np.int64, 0] + 'Sparse[int, 0]' SparseDtype[np.int64, 0] + ================ ============================ + + It is not possible to specify non-default fill values + with a string. An argument like ``'Sparse[int, 1]'`` + will raise a ``TypeError`` because the default fill value + for integers is 0. + + Returns + ------- + SparseDtype + """ + if not isinstance(string, str): + raise TypeError( + f"'construct_from_string' expects a string, got {type(string)}" + ) + msg = f"Cannot construct a 'SparseDtype' from '{string}'" + if string.startswith("Sparse"): + try: + sub_type, has_fill_value = cls._parse_subtype(string) + except ValueError as err: + raise TypeError(msg) from err + else: + result = SparseDtype(sub_type) + msg = ( + f"Cannot construct a 'SparseDtype' from '{string}'.\n\nIt " + "looks like the fill_value in the string is not " + "the default for the dtype. Non-default fill_values " + "are not supported. Use the 'SparseDtype()' " + "constructor instead." + ) + if has_fill_value and str(result) != string: + raise TypeError(msg) + return result + else: + raise TypeError(msg) + + @staticmethod + def _parse_subtype(dtype: str) -> tuple[str, bool]: + """ + Parse a string to get the subtype + + Parameters + ---------- + dtype : str + A string like + + * Sparse[subtype] + * Sparse[subtype, fill_value] + + Returns + ------- + subtype : str + + Raises + ------ + ValueError + When the subtype cannot be extracted. + """ + xpr = re.compile(r"Sparse\[(?P[^,]*)(, )?(?P.*?)?\]$") + m = xpr.match(dtype) + has_fill_value = False + if m: + subtype = m.groupdict()["subtype"] + has_fill_value = bool(m.groupdict()["fill_value"]) + elif dtype == "Sparse": + subtype = "float64" + else: + raise ValueError(f"Cannot parse {dtype}") + return subtype, has_fill_value + + @classmethod + def is_dtype(cls, dtype: object) -> bool: + dtype = getattr(dtype, "dtype", dtype) + if isinstance(dtype, str) and dtype.startswith("Sparse"): + sub_type, _ = cls._parse_subtype(dtype) + dtype = np.dtype(sub_type) + elif isinstance(dtype, cls): + return True + return isinstance(dtype, np.dtype) or dtype == "Sparse" + + def update_dtype(self, dtype) -> SparseDtype: + """ + Convert the SparseDtype to a new dtype. + + This takes care of converting the ``fill_value``. + + Parameters + ---------- + dtype : Union[str, numpy.dtype, SparseDtype] + The new dtype to use. + + * For a SparseDtype, it is simply returned + * For a NumPy dtype (or str), the current fill value + is converted to the new dtype, and a SparseDtype + with `dtype` and the new fill value is returned. + + Returns + ------- + SparseDtype + A new SparseDtype with the correct `dtype` and fill value + for that `dtype`. + + Raises + ------ + ValueError + When the current fill value cannot be converted to the + new `dtype` (e.g. trying to convert ``np.nan`` to an + integer dtype). + + + Examples + -------- + >>> SparseDtype(int, 0).update_dtype(float) + Sparse[float64, 0.0] + + >>> SparseDtype(int, 1).update_dtype(SparseDtype(float, np.nan)) + Sparse[float64, nan] + """ + from pandas.core.dtypes.astype import astype_array + from pandas.core.dtypes.common import pandas_dtype + + cls = type(self) + dtype = pandas_dtype(dtype) + + if not isinstance(dtype, cls): + if not isinstance(dtype, np.dtype): + raise TypeError("sparse arrays of extension dtypes not supported") + + fv_asarray = np.atleast_1d(np.array(self.fill_value)) + fvarr = astype_array(fv_asarray, dtype) + # NB: not fv_0d.item(), as that casts dt64->int + fill_value = fvarr[0] + dtype = cls(dtype, fill_value=fill_value) + + return dtype + + @property + def _subtype_with_str(self): + """ + Whether the SparseDtype's subtype should be considered ``str``. + + Typically, pandas will store string data in an object-dtype array. + When converting values to a dtype, e.g. in ``.astype``, we need to + be more specific, we need the actual underlying type. + + Returns + ------- + >>> SparseDtype(int, 1)._subtype_with_str + dtype('int64') + + >>> SparseDtype(object, 1)._subtype_with_str + dtype('O') + + >>> dtype = SparseDtype(str, '') + >>> dtype.subtype + dtype('O') + + >>> dtype._subtype_with_str + + """ + if isinstance(self.fill_value, str): + return type(self.fill_value) + return self.subtype + + def _get_common_dtype(self, dtypes: list[DtypeObj]) -> DtypeObj | None: + # TODO for now only handle SparseDtypes and numpy dtypes => extend + # with other compatible extension dtypes + from pandas.core.dtypes.cast import np_find_common_type + + if any( + isinstance(x, ExtensionDtype) and not isinstance(x, SparseDtype) + for x in dtypes + ): + return None + + fill_values = [x.fill_value for x in dtypes if isinstance(x, SparseDtype)] + fill_value = fill_values[0] + + from pandas import isna + + # np.nan isn't a singleton, so we may end up with multiple + # NaNs here, so we ignore the all NA case too. + if not (len(set(fill_values)) == 1 or isna(fill_values).all()): + warnings.warn( + "Concatenating sparse arrays with multiple fill " + f"values: '{fill_values}'. Picking the first and " + "converting the rest.", + PerformanceWarning, + stacklevel=find_stack_level(), + ) + + np_dtypes = (x.subtype if isinstance(x, SparseDtype) else x for x in dtypes) + return SparseDtype(np_find_common_type(*np_dtypes), fill_value=fill_value) + + +@register_extension_dtype +class ArrowDtype(StorageExtensionDtype): + """ + An ExtensionDtype for PyArrow data types. + + .. warning:: + + ArrowDtype is considered experimental. The implementation and + parts of the API may change without warning. + + While most ``dtype`` arguments can accept the "string" + constructor, e.g. ``"int64[pyarrow]"``, ArrowDtype is useful + if the data type contains parameters like ``pyarrow.timestamp``. + + Parameters + ---------- + pyarrow_dtype : pa.DataType + An instance of a `pyarrow.DataType `__. + + Attributes + ---------- + pyarrow_dtype + + Methods + ------- + None + + Returns + ------- + ArrowDtype + + Examples + -------- + >>> import pyarrow as pa + >>> pd.ArrowDtype(pa.int64()) + int64[pyarrow] + + Types with parameters must be constructed with ArrowDtype. + + >>> pd.ArrowDtype(pa.timestamp("s", tz="America/New_York")) + timestamp[s, tz=America/New_York][pyarrow] + >>> pd.ArrowDtype(pa.list_(pa.int64())) + list[pyarrow] + """ + + _metadata = ("storage", "pyarrow_dtype") # type: ignore[assignment] + + def __init__(self, pyarrow_dtype: pa.DataType) -> None: + super().__init__("pyarrow") + if pa_version_under10p1: + raise ImportError("pyarrow>=10.0.1 is required for ArrowDtype") + if not isinstance(pyarrow_dtype, pa.DataType): + raise ValueError( + f"pyarrow_dtype ({pyarrow_dtype}) must be an instance " + f"of a pyarrow.DataType. Got {type(pyarrow_dtype)} instead." + ) + self.pyarrow_dtype = pyarrow_dtype + + def __repr__(self) -> str: + return self.name + + def __hash__(self) -> int: + # make myself hashable + return hash(str(self)) + + def __eq__(self, other: object) -> bool: + if not isinstance(other, type(self)): + return super().__eq__(other) + return self.pyarrow_dtype == other.pyarrow_dtype + + @property + def type(self): + """ + Returns associated scalar type. + """ + pa_type = self.pyarrow_dtype + if pa.types.is_integer(pa_type): + return int + elif pa.types.is_floating(pa_type): + return float + elif pa.types.is_string(pa_type) or pa.types.is_large_string(pa_type): + return str + elif ( + pa.types.is_binary(pa_type) + or pa.types.is_fixed_size_binary(pa_type) + or pa.types.is_large_binary(pa_type) + ): + return bytes + elif pa.types.is_boolean(pa_type): + return bool + elif pa.types.is_duration(pa_type): + if pa_type.unit == "ns": + return Timedelta + else: + return timedelta + elif pa.types.is_timestamp(pa_type): + if pa_type.unit == "ns": + return Timestamp + else: + return datetime + elif pa.types.is_date(pa_type): + return date + elif pa.types.is_time(pa_type): + return time + elif pa.types.is_decimal(pa_type): + return Decimal + elif pa.types.is_dictionary(pa_type): + # TODO: Potentially change this & CategoricalDtype.type to + # something more representative of the scalar + return CategoricalDtypeType + elif pa.types.is_list(pa_type) or pa.types.is_large_list(pa_type): + return list + elif pa.types.is_fixed_size_list(pa_type): + return list + elif pa.types.is_map(pa_type): + return list + elif pa.types.is_struct(pa_type): + return dict + elif pa.types.is_null(pa_type): + # TODO: None? pd.NA? pa.null? + return type(pa_type) + elif isinstance(pa_type, pa.ExtensionType): + return type(self)(pa_type.storage_type).type + raise NotImplementedError(pa_type) + + @property + def name(self) -> str: # type: ignore[override] + """ + A string identifying the data type. + """ + return f"{str(self.pyarrow_dtype)}[{self.storage}]" + + @cache_readonly + def numpy_dtype(self) -> np.dtype: + """Return an instance of the related numpy dtype""" + if pa.types.is_timestamp(self.pyarrow_dtype): + # pa.timestamp(unit).to_pandas_dtype() returns ns units + # regardless of the pyarrow timestamp units. + # This can be removed if/when pyarrow addresses it: + # https://github.com/apache/arrow/issues/34462 + return np.dtype(f"datetime64[{self.pyarrow_dtype.unit}]") + if pa.types.is_duration(self.pyarrow_dtype): + # pa.duration(unit).to_pandas_dtype() returns ns units + # regardless of the pyarrow duration units + # This can be removed if/when pyarrow addresses it: + # https://github.com/apache/arrow/issues/34462 + return np.dtype(f"timedelta64[{self.pyarrow_dtype.unit}]") + if pa.types.is_string(self.pyarrow_dtype) or pa.types.is_large_string( + self.pyarrow_dtype + ): + # pa.string().to_pandas_dtype() = object which we don't want + return np.dtype(str) + try: + return np.dtype(self.pyarrow_dtype.to_pandas_dtype()) + except (NotImplementedError, TypeError): + return np.dtype(object) + + @cache_readonly + def kind(self) -> str: + if pa.types.is_timestamp(self.pyarrow_dtype): + # To mirror DatetimeTZDtype + return "M" + return self.numpy_dtype.kind + + @cache_readonly + def itemsize(self) -> int: + """Return the number of bytes in this dtype""" + return self.numpy_dtype.itemsize + + @classmethod + def construct_array_type(cls) -> type_t[ArrowExtensionArray]: + """ + Return the array type associated with this dtype. + + Returns + ------- + type + """ + from pandas.core.arrays.arrow import ArrowExtensionArray + + return ArrowExtensionArray + + @classmethod + def construct_from_string(cls, string: str) -> ArrowDtype: + """ + Construct this type from a string. + + Parameters + ---------- + string : str + string should follow the format f"{pyarrow_type}[pyarrow]" + e.g. int64[pyarrow] + """ + if not isinstance(string, str): + raise TypeError( + f"'construct_from_string' expects a string, got {type(string)}" + ) + if not string.endswith("[pyarrow]"): + raise TypeError(f"'{string}' must end with '[pyarrow]'") + if string == "string[pyarrow]": + # Ensure Registry.find skips ArrowDtype to use StringDtype instead + raise TypeError("string[pyarrow] should be constructed by StringDtype") + + base_type = string[:-9] # get rid of "[pyarrow]" + try: + pa_dtype = pa.type_for_alias(base_type) + except ValueError as err: + has_parameters = re.search(r"[\[\(].*[\]\)]", base_type) + if has_parameters: + # Fallback to try common temporal types + try: + return cls._parse_temporal_dtype_string(base_type) + except (NotImplementedError, ValueError): + # Fall through to raise with nice exception message below + pass + + raise NotImplementedError( + "Passing pyarrow type specific parameters " + f"({has_parameters.group()}) in the string is not supported. " + "Please construct an ArrowDtype object with a pyarrow_dtype " + "instance with specific parameters." + ) from err + raise TypeError(f"'{base_type}' is not a valid pyarrow data type.") from err + return cls(pa_dtype) + + # TODO(arrow#33642): This can be removed once supported by pyarrow + @classmethod + def _parse_temporal_dtype_string(cls, string: str) -> ArrowDtype: + """ + Construct a temporal ArrowDtype from string. + """ + # we assume + # 1) "[pyarrow]" has already been stripped from the end of our string. + # 2) we know "[" is present + head, tail = string.split("[", 1) + + if not tail.endswith("]"): + raise ValueError + tail = tail[:-1] + + if head == "timestamp": + assert "," in tail # otherwise type_for_alias should work + unit, tz = tail.split(",", 1) + unit = unit.strip() + tz = tz.strip() + if tz.startswith("tz="): + tz = tz[3:] + + pa_type = pa.timestamp(unit, tz=tz) + dtype = cls(pa_type) + return dtype + + raise NotImplementedError(string) + + @property + def _is_numeric(self) -> bool: + """ + Whether columns with this dtype should be considered numeric. + """ + # TODO: pa.types.is_boolean? + return ( + pa.types.is_integer(self.pyarrow_dtype) + or pa.types.is_floating(self.pyarrow_dtype) + or pa.types.is_decimal(self.pyarrow_dtype) + ) + + @property + def _is_boolean(self) -> bool: + """ + Whether this dtype should be considered boolean. + """ + return pa.types.is_boolean(self.pyarrow_dtype) + + def _get_common_dtype(self, dtypes: list[DtypeObj]) -> DtypeObj | None: + # We unwrap any masked dtypes, find the common dtype we would use + # for that, then re-mask the result. + # Mirrors BaseMaskedDtype + from pandas.core.dtypes.cast import find_common_type + + null_dtype = type(self)(pa.null()) + + new_dtype = find_common_type( + [ + dtype.numpy_dtype if isinstance(dtype, ArrowDtype) else dtype + for dtype in dtypes + if dtype != null_dtype + ] + ) + if not isinstance(new_dtype, np.dtype): + return None + try: + pa_dtype = pa.from_numpy_dtype(new_dtype) + return type(self)(pa_dtype) + except NotImplementedError: + return None + + def __from_arrow__(self, array: pa.Array | pa.ChunkedArray): + """ + Construct IntegerArray/FloatingArray from pyarrow Array/ChunkedArray. + """ + array_class = self.construct_array_type() + arr = array.cast(self.pyarrow_dtype, safe=True) + return array_class(arr) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/dtypes/generic.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/dtypes/generic.py new file mode 100644 index 0000000000000000000000000000000000000000..9718ad600cb80b6e38f069a83aaf35ddb376fb00 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/dtypes/generic.py @@ -0,0 +1,147 @@ +""" define generic base classes for pandas objects """ +from __future__ import annotations + +from typing import ( + TYPE_CHECKING, + Type, + cast, +) + +if TYPE_CHECKING: + from pandas import ( + Categorical, + CategoricalIndex, + DataFrame, + DatetimeIndex, + Index, + IntervalIndex, + MultiIndex, + PeriodIndex, + RangeIndex, + Series, + TimedeltaIndex, + ) + from pandas.core.arrays import ( + DatetimeArray, + ExtensionArray, + NumpyExtensionArray, + PeriodArray, + TimedeltaArray, + ) + from pandas.core.generic import NDFrame + + +# define abstract base classes to enable isinstance type checking on our +# objects +def create_pandas_abc_type(name, attr, comp): + def _check(inst) -> bool: + return getattr(inst, attr, "_typ") in comp + + # https://github.com/python/mypy/issues/1006 + # error: 'classmethod' used with a non-method + @classmethod # type: ignore[misc] + def _instancecheck(cls, inst) -> bool: + return _check(inst) and not isinstance(inst, type) + + @classmethod # type: ignore[misc] + def _subclasscheck(cls, inst) -> bool: + # Raise instead of returning False + # This is consistent with default __subclasscheck__ behavior + if not isinstance(inst, type): + raise TypeError("issubclass() arg 1 must be a class") + + return _check(inst) + + dct = {"__instancecheck__": _instancecheck, "__subclasscheck__": _subclasscheck} + meta = type("ABCBase", (type,), dct) + return meta(name, (), dct) + + +ABCRangeIndex = cast( + "Type[RangeIndex]", + create_pandas_abc_type("ABCRangeIndex", "_typ", ("rangeindex",)), +) +ABCMultiIndex = cast( + "Type[MultiIndex]", + create_pandas_abc_type("ABCMultiIndex", "_typ", ("multiindex",)), +) +ABCDatetimeIndex = cast( + "Type[DatetimeIndex]", + create_pandas_abc_type("ABCDatetimeIndex", "_typ", ("datetimeindex",)), +) +ABCTimedeltaIndex = cast( + "Type[TimedeltaIndex]", + create_pandas_abc_type("ABCTimedeltaIndex", "_typ", ("timedeltaindex",)), +) +ABCPeriodIndex = cast( + "Type[PeriodIndex]", + create_pandas_abc_type("ABCPeriodIndex", "_typ", ("periodindex",)), +) +ABCCategoricalIndex = cast( + "Type[CategoricalIndex]", + create_pandas_abc_type("ABCCategoricalIndex", "_typ", ("categoricalindex",)), +) +ABCIntervalIndex = cast( + "Type[IntervalIndex]", + create_pandas_abc_type("ABCIntervalIndex", "_typ", ("intervalindex",)), +) +ABCIndex = cast( + "Type[Index]", + create_pandas_abc_type( + "ABCIndex", + "_typ", + { + "index", + "rangeindex", + "multiindex", + "datetimeindex", + "timedeltaindex", + "periodindex", + "categoricalindex", + "intervalindex", + }, + ), +) + + +ABCNDFrame = cast( + "Type[NDFrame]", + create_pandas_abc_type("ABCNDFrame", "_typ", ("series", "dataframe")), +) +ABCSeries = cast( + "Type[Series]", + create_pandas_abc_type("ABCSeries", "_typ", ("series",)), +) +ABCDataFrame = cast( + "Type[DataFrame]", create_pandas_abc_type("ABCDataFrame", "_typ", ("dataframe",)) +) + +ABCCategorical = cast( + "Type[Categorical]", + create_pandas_abc_type("ABCCategorical", "_typ", ("categorical")), +) +ABCDatetimeArray = cast( + "Type[DatetimeArray]", + create_pandas_abc_type("ABCDatetimeArray", "_typ", ("datetimearray")), +) +ABCTimedeltaArray = cast( + "Type[TimedeltaArray]", + create_pandas_abc_type("ABCTimedeltaArray", "_typ", ("timedeltaarray")), +) +ABCPeriodArray = cast( + "Type[PeriodArray]", + create_pandas_abc_type("ABCPeriodArray", "_typ", ("periodarray",)), +) +ABCExtensionArray = cast( + "Type[ExtensionArray]", + create_pandas_abc_type( + "ABCExtensionArray", + "_typ", + # Note: IntervalArray and SparseArray are included bc they have _typ="extension" + {"extension", "categorical", "periodarray", "datetimearray", "timedeltaarray"}, + ), +) +ABCNumpyExtensionArray = cast( + "Type[NumpyExtensionArray]", + create_pandas_abc_type("ABCNumpyExtensionArray", "_typ", ("npy_extension",)), +) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/dtypes/inference.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/dtypes/inference.py new file mode 100644 index 0000000000000000000000000000000000000000..f551716772f61455a330bdb308cee830bd54fb03 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/dtypes/inference.py @@ -0,0 +1,437 @@ +""" basic inference routines """ + +from __future__ import annotations + +from collections import abc +from numbers import Number +import re +from re import Pattern +from typing import TYPE_CHECKING + +import numpy as np + +from pandas._libs import lib + +if TYPE_CHECKING: + from collections.abc import Hashable + + from pandas._typing import TypeGuard + +is_bool = lib.is_bool + +is_integer = lib.is_integer + +is_float = lib.is_float + +is_complex = lib.is_complex + +is_scalar = lib.is_scalar + +is_decimal = lib.is_decimal + +is_interval = lib.is_interval + +is_list_like = lib.is_list_like + +is_iterator = lib.is_iterator + + +def is_number(obj) -> TypeGuard[Number | np.number]: + """ + Check if the object is a number. + + Returns True when the object is a number, and False if is not. + + Parameters + ---------- + obj : any type + The object to check if is a number. + + Returns + ------- + bool + Whether `obj` is a number or not. + + See Also + -------- + api.types.is_integer: Checks a subgroup of numbers. + + Examples + -------- + >>> from pandas.api.types import is_number + >>> is_number(1) + True + >>> is_number(7.15) + True + + Booleans are valid because they are int subclass. + + >>> is_number(False) + True + + >>> is_number("foo") + False + >>> is_number("5") + False + """ + return isinstance(obj, (Number, np.number)) + + +def iterable_not_string(obj) -> bool: + """ + Check if the object is an iterable but not a string. + + Parameters + ---------- + obj : The object to check. + + Returns + ------- + is_iter_not_string : bool + Whether `obj` is a non-string iterable. + + Examples + -------- + >>> iterable_not_string([1, 2, 3]) + True + >>> iterable_not_string("foo") + False + >>> iterable_not_string(1) + False + """ + return isinstance(obj, abc.Iterable) and not isinstance(obj, str) + + +def is_file_like(obj) -> bool: + """ + Check if the object is a file-like object. + + For objects to be considered file-like, they must + be an iterator AND have either a `read` and/or `write` + method as an attribute. + + Note: file-like objects must be iterable, but + iterable objects need not be file-like. + + Parameters + ---------- + obj : The object to check + + Returns + ------- + bool + Whether `obj` has file-like properties. + + Examples + -------- + >>> import io + >>> from pandas.api.types import is_file_like + >>> buffer = io.StringIO("data") + >>> is_file_like(buffer) + True + >>> is_file_like([1, 2, 3]) + False + """ + if not (hasattr(obj, "read") or hasattr(obj, "write")): + return False + + return bool(hasattr(obj, "__iter__")) + + +def is_re(obj) -> TypeGuard[Pattern]: + """ + Check if the object is a regex pattern instance. + + Parameters + ---------- + obj : The object to check + + Returns + ------- + bool + Whether `obj` is a regex pattern. + + Examples + -------- + >>> from pandas.api.types import is_re + >>> import re + >>> is_re(re.compile(".*")) + True + >>> is_re("foo") + False + """ + return isinstance(obj, Pattern) + + +def is_re_compilable(obj) -> bool: + """ + Check if the object can be compiled into a regex pattern instance. + + Parameters + ---------- + obj : The object to check + + Returns + ------- + bool + Whether `obj` can be compiled as a regex pattern. + + Examples + -------- + >>> from pandas.api.types import is_re_compilable + >>> is_re_compilable(".*") + True + >>> is_re_compilable(1) + False + """ + try: + re.compile(obj) + except TypeError: + return False + else: + return True + + +def is_array_like(obj) -> bool: + """ + Check if the object is array-like. + + For an object to be considered array-like, it must be list-like and + have a `dtype` attribute. + + Parameters + ---------- + obj : The object to check + + Returns + ------- + is_array_like : bool + Whether `obj` has array-like properties. + + Examples + -------- + >>> is_array_like(np.array([1, 2, 3])) + True + >>> is_array_like(pd.Series(["a", "b"])) + True + >>> is_array_like(pd.Index(["2016-01-01"])) + True + >>> is_array_like([1, 2, 3]) + False + >>> is_array_like(("a", "b")) + False + """ + return is_list_like(obj) and hasattr(obj, "dtype") + + +def is_nested_list_like(obj) -> bool: + """ + Check if the object is list-like, and that all of its elements + are also list-like. + + Parameters + ---------- + obj : The object to check + + Returns + ------- + is_list_like : bool + Whether `obj` has list-like properties. + + Examples + -------- + >>> is_nested_list_like([[1, 2, 3]]) + True + >>> is_nested_list_like([{1, 2, 3}, {1, 2, 3}]) + True + >>> is_nested_list_like(["foo"]) + False + >>> is_nested_list_like([]) + False + >>> is_nested_list_like([[1, 2, 3], 1]) + False + + Notes + ----- + This won't reliably detect whether a consumable iterator (e. g. + a generator) is a nested-list-like without consuming the iterator. + To avoid consuming it, we always return False if the outer container + doesn't define `__len__`. + + See Also + -------- + is_list_like + """ + return ( + is_list_like(obj) + and hasattr(obj, "__len__") + and len(obj) > 0 + and all(is_list_like(item) for item in obj) + ) + + +def is_dict_like(obj) -> bool: + """ + Check if the object is dict-like. + + Parameters + ---------- + obj : The object to check + + Returns + ------- + bool + Whether `obj` has dict-like properties. + + Examples + -------- + >>> from pandas.api.types import is_dict_like + >>> is_dict_like({1: 2}) + True + >>> is_dict_like([1, 2, 3]) + False + >>> is_dict_like(dict) + False + >>> is_dict_like(dict()) + True + """ + dict_like_attrs = ("__getitem__", "keys", "__contains__") + return ( + all(hasattr(obj, attr) for attr in dict_like_attrs) + # [GH 25196] exclude classes + and not isinstance(obj, type) + ) + + +def is_named_tuple(obj) -> bool: + """ + Check if the object is a named tuple. + + Parameters + ---------- + obj : The object to check + + Returns + ------- + bool + Whether `obj` is a named tuple. + + Examples + -------- + >>> from collections import namedtuple + >>> from pandas.api.types import is_named_tuple + >>> Point = namedtuple("Point", ["x", "y"]) + >>> p = Point(1, 2) + >>> + >>> is_named_tuple(p) + True + >>> is_named_tuple((1, 2)) + False + """ + return isinstance(obj, abc.Sequence) and hasattr(obj, "_fields") + + +def is_hashable(obj) -> TypeGuard[Hashable]: + """ + Return True if hash(obj) will succeed, False otherwise. + + Some types will pass a test against collections.abc.Hashable but fail when + they are actually hashed with hash(). + + Distinguish between these and other types by trying the call to hash() and + seeing if they raise TypeError. + + Returns + ------- + bool + + Examples + -------- + >>> import collections + >>> from pandas.api.types import is_hashable + >>> a = ([],) + >>> isinstance(a, collections.abc.Hashable) + True + >>> is_hashable(a) + False + """ + # Unfortunately, we can't use isinstance(obj, collections.abc.Hashable), + # which can be faster than calling hash. That is because numpy scalars + # fail this test. + + # Reconsider this decision once this numpy bug is fixed: + # https://github.com/numpy/numpy/issues/5562 + + try: + hash(obj) + except TypeError: + return False + else: + return True + + +def is_sequence(obj) -> bool: + """ + Check if the object is a sequence of objects. + String types are not included as sequences here. + + Parameters + ---------- + obj : The object to check + + Returns + ------- + is_sequence : bool + Whether `obj` is a sequence of objects. + + Examples + -------- + >>> l = [1, 2, 3] + >>> + >>> is_sequence(l) + True + >>> is_sequence(iter(l)) + False + """ + try: + iter(obj) # Can iterate over it. + len(obj) # Has a length associated with it. + return not isinstance(obj, (str, bytes)) + except (TypeError, AttributeError): + return False + + +def is_dataclass(item) -> bool: + """ + Checks if the object is a data-class instance + + Parameters + ---------- + item : object + + Returns + -------- + is_dataclass : bool + True if the item is an instance of a data-class, + will return false if you pass the data class itself + + Examples + -------- + >>> from dataclasses import dataclass + >>> @dataclass + ... class Point: + ... x: int + ... y: int + + >>> is_dataclass(Point) + False + >>> is_dataclass(Point(0,2)) + True + + """ + try: + import dataclasses + + return dataclasses.is_dataclass(item) and not isinstance(item, type) + except ImportError: + return False diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/dtypes/missing.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/dtypes/missing.py new file mode 100644 index 0000000000000000000000000000000000000000..c341ff9dff7e613d8db2209efb5c10f170a9cd47 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/dtypes/missing.py @@ -0,0 +1,810 @@ +""" +missing types & inference +""" +from __future__ import annotations + +from decimal import Decimal +from functools import partial +from typing import ( + TYPE_CHECKING, + overload, +) +import warnings + +import numpy as np + +from pandas._config import get_option + +from pandas._libs import lib +import pandas._libs.missing as libmissing +from pandas._libs.tslibs import ( + NaT, + iNaT, +) + +from pandas.core.dtypes.common import ( + DT64NS_DTYPE, + TD64NS_DTYPE, + ensure_object, + is_scalar, + is_string_or_object_np_dtype, +) +from pandas.core.dtypes.dtypes import ( + CategoricalDtype, + DatetimeTZDtype, + ExtensionDtype, + IntervalDtype, + PeriodDtype, +) +from pandas.core.dtypes.generic import ( + ABCDataFrame, + ABCExtensionArray, + ABCIndex, + ABCMultiIndex, + ABCSeries, +) +from pandas.core.dtypes.inference import is_list_like + +if TYPE_CHECKING: + from re import Pattern + + from pandas._typing import ( + ArrayLike, + DtypeObj, + NDFrame, + NDFrameT, + Scalar, + npt, + ) + + from pandas import Series + from pandas.core.indexes.base import Index + + +isposinf_scalar = libmissing.isposinf_scalar +isneginf_scalar = libmissing.isneginf_scalar + +nan_checker = np.isnan +INF_AS_NA = False +_dtype_object = np.dtype("object") +_dtype_str = np.dtype(str) + + +@overload +def isna(obj: Scalar | Pattern) -> bool: + ... + + +@overload +def isna( + obj: ArrayLike | Index | list, +) -> npt.NDArray[np.bool_]: + ... + + +@overload +def isna(obj: NDFrameT) -> NDFrameT: + ... + + +# handle unions +@overload +def isna(obj: NDFrameT | ArrayLike | Index | list) -> NDFrameT | npt.NDArray[np.bool_]: + ... + + +@overload +def isna(obj: object) -> bool | npt.NDArray[np.bool_] | NDFrame: + ... + + +def isna(obj: object) -> bool | npt.NDArray[np.bool_] | NDFrame: + """ + Detect missing values for an array-like object. + + This function takes a scalar or array-like object and indicates + whether values are missing (``NaN`` in numeric arrays, ``None`` or ``NaN`` + in object arrays, ``NaT`` in datetimelike). + + Parameters + ---------- + obj : scalar or array-like + Object to check for null or missing values. + + Returns + ------- + bool or array-like of bool + For scalar input, returns a scalar boolean. + For array input, returns an array of boolean indicating whether each + corresponding element is missing. + + See Also + -------- + notna : Boolean inverse of pandas.isna. + Series.isna : Detect missing values in a Series. + DataFrame.isna : Detect missing values in a DataFrame. + Index.isna : Detect missing values in an Index. + + Examples + -------- + Scalar arguments (including strings) result in a scalar boolean. + + >>> pd.isna('dog') + False + + >>> pd.isna(pd.NA) + True + + >>> pd.isna(np.nan) + True + + ndarrays result in an ndarray of booleans. + + >>> array = np.array([[1, np.nan, 3], [4, 5, np.nan]]) + >>> array + array([[ 1., nan, 3.], + [ 4., 5., nan]]) + >>> pd.isna(array) + array([[False, True, False], + [False, False, True]]) + + For indexes, an ndarray of booleans is returned. + + >>> index = pd.DatetimeIndex(["2017-07-05", "2017-07-06", None, + ... "2017-07-08"]) + >>> index + DatetimeIndex(['2017-07-05', '2017-07-06', 'NaT', '2017-07-08'], + dtype='datetime64[ns]', freq=None) + >>> pd.isna(index) + array([False, False, True, False]) + + For Series and DataFrame, the same type is returned, containing booleans. + + >>> df = pd.DataFrame([['ant', 'bee', 'cat'], ['dog', None, 'fly']]) + >>> df + 0 1 2 + 0 ant bee cat + 1 dog None fly + >>> pd.isna(df) + 0 1 2 + 0 False False False + 1 False True False + + >>> pd.isna(df[1]) + 0 False + 1 True + Name: 1, dtype: bool + """ + return _isna(obj) + + +isnull = isna + + +def _isna(obj, inf_as_na: bool = False): + """ + Detect missing values, treating None, NaN or NA as null. Infinite + values will also be treated as null if inf_as_na is True. + + Parameters + ---------- + obj: ndarray or object value + Input array or scalar value. + inf_as_na: bool + Whether to treat infinity as null. + + Returns + ------- + boolean ndarray or boolean + """ + if is_scalar(obj): + return libmissing.checknull(obj, inf_as_na=inf_as_na) + elif isinstance(obj, ABCMultiIndex): + raise NotImplementedError("isna is not defined for MultiIndex") + elif isinstance(obj, type): + return False + elif isinstance(obj, (np.ndarray, ABCExtensionArray)): + return _isna_array(obj, inf_as_na=inf_as_na) + elif isinstance(obj, ABCIndex): + # Try to use cached isna, which also short-circuits for integer dtypes + # and avoids materializing RangeIndex._values + if not obj._can_hold_na: + return obj.isna() + return _isna_array(obj._values, inf_as_na=inf_as_na) + + elif isinstance(obj, ABCSeries): + result = _isna_array(obj._values, inf_as_na=inf_as_na) + # box + result = obj._constructor(result, index=obj.index, name=obj.name, copy=False) + return result + elif isinstance(obj, ABCDataFrame): + return obj.isna() + elif isinstance(obj, list): + return _isna_array(np.asarray(obj, dtype=object), inf_as_na=inf_as_na) + elif hasattr(obj, "__array__"): + return _isna_array(np.asarray(obj), inf_as_na=inf_as_na) + else: + return False + + +def _use_inf_as_na(key) -> None: + """ + Option change callback for na/inf behaviour. + + Choose which replacement for numpy.isnan / -numpy.isfinite is used. + + Parameters + ---------- + flag: bool + True means treat None, NaN, INF, -INF as null (old way), + False means None and NaN are null, but INF, -INF are not null + (new way). + + Notes + ----- + This approach to setting global module values is discussed and + approved here: + + * https://stackoverflow.com/questions/4859217/ + programmatically-creating-variables-in-python/4859312#4859312 + """ + inf_as_na = get_option(key) + globals()["_isna"] = partial(_isna, inf_as_na=inf_as_na) + if inf_as_na: + globals()["nan_checker"] = lambda x: ~np.isfinite(x) + globals()["INF_AS_NA"] = True + else: + globals()["nan_checker"] = np.isnan + globals()["INF_AS_NA"] = False + + +def _isna_array(values: ArrayLike, inf_as_na: bool = False): + """ + Return an array indicating which values of the input array are NaN / NA. + + Parameters + ---------- + obj: ndarray or ExtensionArray + The input array whose elements are to be checked. + inf_as_na: bool + Whether or not to treat infinite values as NA. + + Returns + ------- + array-like + Array of boolean values denoting the NA status of each element. + """ + dtype = values.dtype + + if not isinstance(values, np.ndarray): + # i.e. ExtensionArray + if inf_as_na and isinstance(dtype, CategoricalDtype): + result = libmissing.isnaobj(values.to_numpy(), inf_as_na=inf_as_na) + else: + # error: Incompatible types in assignment (expression has type + # "Union[ndarray[Any, Any], ExtensionArraySupportsAnyAll]", variable has + # type "ndarray[Any, dtype[bool_]]") + result = values.isna() # type: ignore[assignment] + elif isinstance(values, np.rec.recarray): + # GH 48526 + result = _isna_recarray_dtype(values, inf_as_na=inf_as_na) + elif is_string_or_object_np_dtype(values.dtype): + result = _isna_string_dtype(values, inf_as_na=inf_as_na) + elif dtype.kind in "mM": + # this is the NaT pattern + result = values.view("i8") == iNaT + else: + if inf_as_na: + result = ~np.isfinite(values) + else: + result = np.isnan(values) + + return result + + +def _isna_string_dtype(values: np.ndarray, inf_as_na: bool) -> npt.NDArray[np.bool_]: + # Working around NumPy ticket 1542 + dtype = values.dtype + + if dtype.kind in ("S", "U"): + result = np.zeros(values.shape, dtype=bool) + else: + if values.ndim in {1, 2}: + result = libmissing.isnaobj(values, inf_as_na=inf_as_na) + else: + # 0-D, reached via e.g. mask_missing + result = libmissing.isnaobj(values.ravel(), inf_as_na=inf_as_na) + result = result.reshape(values.shape) + + return result + + +def _has_record_inf_value(record_as_array: np.ndarray) -> np.bool_: + is_inf_in_record = np.zeros(len(record_as_array), dtype=bool) + for i, value in enumerate(record_as_array): + is_element_inf = False + try: + is_element_inf = np.isinf(value) + except TypeError: + is_element_inf = False + is_inf_in_record[i] = is_element_inf + + return np.any(is_inf_in_record) + + +def _isna_recarray_dtype( + values: np.rec.recarray, inf_as_na: bool +) -> npt.NDArray[np.bool_]: + result = np.zeros(values.shape, dtype=bool) + for i, record in enumerate(values): + record_as_array = np.array(record.tolist()) + does_record_contain_nan = isna_all(record_as_array) + does_record_contain_inf = False + if inf_as_na: + does_record_contain_inf = bool(_has_record_inf_value(record_as_array)) + result[i] = np.any( + np.logical_or(does_record_contain_nan, does_record_contain_inf) + ) + + return result + + +@overload +def notna(obj: Scalar) -> bool: + ... + + +@overload +def notna( + obj: ArrayLike | Index | list, +) -> npt.NDArray[np.bool_]: + ... + + +@overload +def notna(obj: NDFrameT) -> NDFrameT: + ... + + +# handle unions +@overload +def notna(obj: NDFrameT | ArrayLike | Index | list) -> NDFrameT | npt.NDArray[np.bool_]: + ... + + +@overload +def notna(obj: object) -> bool | npt.NDArray[np.bool_] | NDFrame: + ... + + +def notna(obj: object) -> bool | npt.NDArray[np.bool_] | NDFrame: + """ + Detect non-missing values for an array-like object. + + This function takes a scalar or array-like object and indicates + whether values are valid (not missing, which is ``NaN`` in numeric + arrays, ``None`` or ``NaN`` in object arrays, ``NaT`` in datetimelike). + + Parameters + ---------- + obj : array-like or object value + Object to check for *not* null or *non*-missing values. + + Returns + ------- + bool or array-like of bool + For scalar input, returns a scalar boolean. + For array input, returns an array of boolean indicating whether each + corresponding element is valid. + + See Also + -------- + isna : Boolean inverse of pandas.notna. + Series.notna : Detect valid values in a Series. + DataFrame.notna : Detect valid values in a DataFrame. + Index.notna : Detect valid values in an Index. + + Examples + -------- + Scalar arguments (including strings) result in a scalar boolean. + + >>> pd.notna('dog') + True + + >>> pd.notna(pd.NA) + False + + >>> pd.notna(np.nan) + False + + ndarrays result in an ndarray of booleans. + + >>> array = np.array([[1, np.nan, 3], [4, 5, np.nan]]) + >>> array + array([[ 1., nan, 3.], + [ 4., 5., nan]]) + >>> pd.notna(array) + array([[ True, False, True], + [ True, True, False]]) + + For indexes, an ndarray of booleans is returned. + + >>> index = pd.DatetimeIndex(["2017-07-05", "2017-07-06", None, + ... "2017-07-08"]) + >>> index + DatetimeIndex(['2017-07-05', '2017-07-06', 'NaT', '2017-07-08'], + dtype='datetime64[ns]', freq=None) + >>> pd.notna(index) + array([ True, True, False, True]) + + For Series and DataFrame, the same type is returned, containing booleans. + + >>> df = pd.DataFrame([['ant', 'bee', 'cat'], ['dog', None, 'fly']]) + >>> df + 0 1 2 + 0 ant bee cat + 1 dog None fly + >>> pd.notna(df) + 0 1 2 + 0 True True True + 1 True False True + + >>> pd.notna(df[1]) + 0 True + 1 False + Name: 1, dtype: bool + """ + res = isna(obj) + if isinstance(res, bool): + return not res + return ~res + + +notnull = notna + + +def array_equivalent( + left, + right, + strict_nan: bool = False, + dtype_equal: bool = False, +) -> bool: + """ + True if two arrays, left and right, have equal non-NaN elements, and NaNs + in corresponding locations. False otherwise. It is assumed that left and + right are NumPy arrays of the same dtype. The behavior of this function + (particularly with respect to NaNs) is not defined if the dtypes are + different. + + Parameters + ---------- + left, right : ndarrays + strict_nan : bool, default False + If True, consider NaN and None to be different. + dtype_equal : bool, default False + Whether `left` and `right` are known to have the same dtype + according to `is_dtype_equal`. Some methods like `BlockManager.equals`. + require that the dtypes match. Setting this to ``True`` can improve + performance, but will give different results for arrays that are + equal but different dtypes. + + Returns + ------- + b : bool + Returns True if the arrays are equivalent. + + Examples + -------- + >>> array_equivalent( + ... np.array([1, 2, np.nan]), + ... np.array([1, 2, np.nan])) + True + >>> array_equivalent( + ... np.array([1, np.nan, 2]), + ... np.array([1, 2, np.nan])) + False + """ + left, right = np.asarray(left), np.asarray(right) + + # shape compat + if left.shape != right.shape: + return False + + if dtype_equal: + # fastpath when we require that the dtypes match (Block.equals) + if left.dtype.kind in "fc": + return _array_equivalent_float(left, right) + elif left.dtype.kind in "mM": + return _array_equivalent_datetimelike(left, right) + elif is_string_or_object_np_dtype(left.dtype): + # TODO: fastpath for pandas' StringDtype + return _array_equivalent_object(left, right, strict_nan) + else: + return np.array_equal(left, right) + + # Slow path when we allow comparing different dtypes. + # Object arrays can contain None, NaN and NaT. + # string dtypes must be come to this path for NumPy 1.7.1 compat + if left.dtype.kind in "OSU" or right.dtype.kind in "OSU": + # Note: `in "OSU"` is non-trivially faster than `in ["O", "S", "U"]` + # or `in ("O", "S", "U")` + return _array_equivalent_object(left, right, strict_nan) + + # NaNs can occur in float and complex arrays. + if left.dtype.kind in "fc": + if not (left.size and right.size): + return True + return ((left == right) | (isna(left) & isna(right))).all() + + elif left.dtype.kind in "mM" or right.dtype.kind in "mM": + # datetime64, timedelta64, Period + if left.dtype != right.dtype: + return False + + left = left.view("i8") + right = right.view("i8") + + # if we have structured dtypes, compare first + if ( + left.dtype.type is np.void or right.dtype.type is np.void + ) and left.dtype != right.dtype: + return False + + return np.array_equal(left, right) + + +def _array_equivalent_float(left: np.ndarray, right: np.ndarray) -> bool: + return bool(((left == right) | (np.isnan(left) & np.isnan(right))).all()) + + +def _array_equivalent_datetimelike(left: np.ndarray, right: np.ndarray): + return np.array_equal(left.view("i8"), right.view("i8")) + + +def _array_equivalent_object(left: np.ndarray, right: np.ndarray, strict_nan: bool): + left = ensure_object(left) + right = ensure_object(right) + + mask: npt.NDArray[np.bool_] | None = None + if strict_nan: + mask = isna(left) & isna(right) + if not mask.any(): + mask = None + + try: + if mask is None: + return lib.array_equivalent_object(left, right) + if not lib.array_equivalent_object(left[~mask], right[~mask]): + return False + left_remaining = left[mask] + right_remaining = right[mask] + except ValueError: + # can raise a ValueError if left and right cannot be + # compared (e.g. nested arrays) + left_remaining = left + right_remaining = right + + for left_value, right_value in zip(left_remaining, right_remaining): + if left_value is NaT and right_value is not NaT: + return False + + elif left_value is libmissing.NA and right_value is not libmissing.NA: + return False + + elif isinstance(left_value, float) and np.isnan(left_value): + if not isinstance(right_value, float) or not np.isnan(right_value): + return False + else: + with warnings.catch_warnings(): + # suppress numpy's "elementwise comparison failed" + warnings.simplefilter("ignore", DeprecationWarning) + try: + if np.any(np.asarray(left_value != right_value)): + return False + except TypeError as err: + if "boolean value of NA is ambiguous" in str(err): + return False + raise + except ValueError: + # numpy can raise a ValueError if left and right cannot be + # compared (e.g. nested arrays) + return False + return True + + +def array_equals(left: ArrayLike, right: ArrayLike) -> bool: + """ + ExtensionArray-compatible implementation of array_equivalent. + """ + if left.dtype != right.dtype: + return False + elif isinstance(left, ABCExtensionArray): + return left.equals(right) + else: + return array_equivalent(left, right, dtype_equal=True) + + +def infer_fill_value(val): + """ + infer the fill value for the nan/NaT from the provided + scalar/ndarray/list-like if we are a NaT, return the correct dtyped + element to provide proper block construction + """ + if not is_list_like(val): + val = [val] + val = np.asarray(val) + if val.dtype.kind in "mM": + return np.array("NaT", dtype=val.dtype) + elif val.dtype == object: + dtype = lib.infer_dtype(ensure_object(val), skipna=False) + if dtype in ["datetime", "datetime64"]: + return np.array("NaT", dtype=DT64NS_DTYPE) + elif dtype in ["timedelta", "timedelta64"]: + return np.array("NaT", dtype=TD64NS_DTYPE) + return np.array(np.nan, dtype=object) + elif val.dtype.kind == "U": + return np.array(np.nan, dtype=val.dtype) + return np.nan + + +def construct_1d_array_from_inferred_fill_value( + value: object, length: int +) -> ArrayLike: + # Find our empty_value dtype by constructing an array + # from our value and doing a .take on it + from pandas.core.algorithms import take_nd + from pandas.core.construction import sanitize_array + from pandas.core.indexes.base import Index + + arr = sanitize_array(value, Index(range(1)), copy=False) + taker = -1 * np.ones(length, dtype=np.intp) + return take_nd(arr, taker) + + +def maybe_fill(arr: np.ndarray) -> np.ndarray: + """ + Fill numpy.ndarray with NaN, unless we have a integer or boolean dtype. + """ + if arr.dtype.kind not in "iub": + arr.fill(np.nan) + return arr + + +def na_value_for_dtype(dtype: DtypeObj, compat: bool = True): + """ + Return a dtype compat na value + + Parameters + ---------- + dtype : string / dtype + compat : bool, default True + + Returns + ------- + np.dtype or a pandas dtype + + Examples + -------- + >>> na_value_for_dtype(np.dtype('int64')) + 0 + >>> na_value_for_dtype(np.dtype('int64'), compat=False) + nan + >>> na_value_for_dtype(np.dtype('float64')) + nan + >>> na_value_for_dtype(np.dtype('bool')) + False + >>> na_value_for_dtype(np.dtype('datetime64[ns]')) + numpy.datetime64('NaT') + """ + + if isinstance(dtype, ExtensionDtype): + return dtype.na_value + elif dtype.kind in "mM": + unit = np.datetime_data(dtype)[0] + return dtype.type("NaT", unit) + elif dtype.kind == "f": + return np.nan + elif dtype.kind in "iu": + if compat: + return 0 + return np.nan + elif dtype.kind == "b": + if compat: + return False + return np.nan + return np.nan + + +def remove_na_arraylike(arr: Series | Index | np.ndarray): + """ + Return array-like containing only true/non-NaN values, possibly empty. + """ + if isinstance(arr.dtype, ExtensionDtype): + return arr[notna(arr)] + else: + return arr[notna(np.asarray(arr))] + + +def is_valid_na_for_dtype(obj, dtype: DtypeObj) -> bool: + """ + isna check that excludes incompatible dtypes + + Parameters + ---------- + obj : object + dtype : np.datetime64, np.timedelta64, DatetimeTZDtype, or PeriodDtype + + Returns + ------- + bool + """ + if not lib.is_scalar(obj) or not isna(obj): + return False + elif dtype.kind == "M": + if isinstance(dtype, np.dtype): + # i.e. not tzaware + return not isinstance(obj, (np.timedelta64, Decimal)) + # we have to rule out tznaive dt64("NaT") + return not isinstance(obj, (np.timedelta64, np.datetime64, Decimal)) + elif dtype.kind == "m": + return not isinstance(obj, (np.datetime64, Decimal)) + elif dtype.kind in "iufc": + # Numeric + return obj is not NaT and not isinstance(obj, (np.datetime64, np.timedelta64)) + elif dtype.kind == "b": + # We allow pd.NA, None, np.nan in BooleanArray (same as IntervalDtype) + return lib.is_float(obj) or obj is None or obj is libmissing.NA + + elif dtype == _dtype_str: + # numpy string dtypes to avoid float np.nan + return not isinstance(obj, (np.datetime64, np.timedelta64, Decimal, float)) + + elif dtype == _dtype_object: + # This is needed for Categorical, but is kind of weird + return True + + elif isinstance(dtype, PeriodDtype): + return not isinstance(obj, (np.datetime64, np.timedelta64, Decimal)) + + elif isinstance(dtype, IntervalDtype): + return lib.is_float(obj) or obj is None or obj is libmissing.NA + + elif isinstance(dtype, CategoricalDtype): + return is_valid_na_for_dtype(obj, dtype.categories.dtype) + + # fallback, default to allowing NaN, None, NA, NaT + return not isinstance(obj, (np.datetime64, np.timedelta64, Decimal)) + + +def isna_all(arr: ArrayLike) -> bool: + """ + Optimized equivalent to isna(arr).all() + """ + total_len = len(arr) + + # Usually it's enough to check but a small fraction of values to see if + # a block is NOT null, chunks should help in such cases. + # parameters 1000 and 40 were chosen arbitrarily + chunk_len = max(total_len // 40, 1000) + + dtype = arr.dtype + if lib.is_np_dtype(dtype, "f"): + checker = nan_checker + + elif (lib.is_np_dtype(dtype, "mM")) or isinstance( + dtype, (DatetimeTZDtype, PeriodDtype) + ): + # error: Incompatible types in assignment (expression has type + # "Callable[[Any], Any]", variable has type "ufunc") + checker = lambda x: np.asarray(x.view("i8")) == iNaT # type: ignore[assignment] + + else: + # error: Incompatible types in assignment (expression has type "Callable[[Any], + # Any]", variable has type "ufunc") + checker = lambda x: _isna_array( # type: ignore[assignment] + x, inf_as_na=INF_AS_NA + ) + + return all( + checker(arr[i : i + chunk_len]).all() for i in range(0, total_len, chunk_len) + ) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/groupby/__init__.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/groupby/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..8248f378e2c1acea37bdc2d41065c591360b902a --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/groupby/__init__.py @@ -0,0 +1,15 @@ +from pandas.core.groupby.generic import ( + DataFrameGroupBy, + NamedAgg, + SeriesGroupBy, +) +from pandas.core.groupby.groupby import GroupBy +from pandas.core.groupby.grouper import Grouper + +__all__ = [ + "DataFrameGroupBy", + "NamedAgg", + "SeriesGroupBy", + "GroupBy", + "Grouper", +] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/groupby/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/groupby/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e8120af79fb5a3b29c130d12e838a251d00874f3 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/groupby/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/groupby/__pycache__/base.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/groupby/__pycache__/base.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..7366f7543e8c13f57b3711a68df4a8e36649526e Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/groupby/__pycache__/base.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/groupby/__pycache__/categorical.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/groupby/__pycache__/categorical.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..0dd7ebeb0603ea3014c615cba01781dcdab57953 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/groupby/__pycache__/categorical.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/groupby/__pycache__/grouper.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/groupby/__pycache__/grouper.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..6de1559de5be180bca6711c1ca9c37ffd83e70f7 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/groupby/__pycache__/grouper.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/groupby/__pycache__/indexing.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/groupby/__pycache__/indexing.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..004bd83bc5b38222db5a056668663f9297df892d Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/groupby/__pycache__/indexing.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/groupby/__pycache__/numba_.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/groupby/__pycache__/numba_.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f8b2893b4460f40b78b2ac1b562ca5d760604045 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/groupby/__pycache__/numba_.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/groupby/__pycache__/ops.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/groupby/__pycache__/ops.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..927f2174a2b7a31d4ae093c74100544fd69db1f2 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/groupby/__pycache__/ops.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/groupby/base.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/groupby/base.py new file mode 100644 index 0000000000000000000000000000000000000000..a443597347283887deb9cbd3eafb5f6d3bb6d9a6 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/groupby/base.py @@ -0,0 +1,121 @@ +""" +Provide basic components for groupby. +""" +from __future__ import annotations + +import dataclasses +from typing import TYPE_CHECKING + +if TYPE_CHECKING: + from collections.abc import Hashable + + +@dataclasses.dataclass(order=True, frozen=True) +class OutputKey: + label: Hashable + position: int + + +# special case to prevent duplicate plots when catching exceptions when +# forwarding methods from NDFrames +plotting_methods = frozenset(["plot", "hist"]) + +# cythonized transformations or canned "agg+broadcast", which do not +# require postprocessing of the result by transform. +cythonized_kernels = frozenset(["cumprod", "cumsum", "shift", "cummin", "cummax"]) + +# List of aggregation/reduction functions. +# These map each group to a single numeric value +reduction_kernels = frozenset( + [ + "all", + "any", + "corrwith", + "count", + "first", + "idxmax", + "idxmin", + "last", + "max", + "mean", + "median", + "min", + "nunique", + "prod", + # as long as `quantile`'s signature accepts only + # a single quantile value, it's a reduction. + # GH#27526 might change that. + "quantile", + "sem", + "size", + "skew", + "std", + "sum", + "var", + ] +) + +# List of transformation functions. +# a transformation is a function that, for each group, +# produces a result that has the same shape as the group. + + +transformation_kernels = frozenset( + [ + "bfill", + "cumcount", + "cummax", + "cummin", + "cumprod", + "cumsum", + "diff", + "ffill", + "fillna", + "ngroup", + "pct_change", + "rank", + "shift", + ] +) + +# these are all the public methods on Grouper which don't belong +# in either of the above lists +groupby_other_methods = frozenset( + [ + "agg", + "aggregate", + "apply", + "boxplot", + # corr and cov return ngroups*ncolumns rows, so they + # are neither a transformation nor a reduction + "corr", + "cov", + "describe", + "dtypes", + "expanding", + "ewm", + "filter", + "get_group", + "groups", + "head", + "hist", + "indices", + "ndim", + "ngroups", + "nth", + "ohlc", + "pipe", + "plot", + "resample", + "rolling", + "tail", + "take", + "transform", + "sample", + "value_counts", + ] +) +# Valid values of `name` for `groupby.transform(name)` +# NOTE: do NOT edit this directly. New additions should be inserted +# into the appropriate list above. +transform_kernel_allowlist = reduction_kernels | transformation_kernels diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/groupby/categorical.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/groupby/categorical.py new file mode 100644 index 0000000000000000000000000000000000000000..6ab98cf4fe55e9b064db99e61d1245cb83b63dc1 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/groupby/categorical.py @@ -0,0 +1,87 @@ +from __future__ import annotations + +import numpy as np + +from pandas.core.algorithms import unique1d +from pandas.core.arrays.categorical import ( + Categorical, + CategoricalDtype, + recode_for_categories, +) + + +def recode_for_groupby( + c: Categorical, sort: bool, observed: bool +) -> tuple[Categorical, Categorical | None]: + """ + Code the categories to ensure we can groupby for categoricals. + + If observed=True, we return a new Categorical with the observed + categories only. + + If sort=False, return a copy of self, coded with categories as + returned by .unique(), followed by any categories not appearing in + the data. If sort=True, return self. + + This method is needed solely to ensure the categorical index of the + GroupBy result has categories in the order of appearance in the data + (GH-8868). + + Parameters + ---------- + c : Categorical + sort : bool + The value of the sort parameter groupby was called with. + observed : bool + Account only for the observed values + + Returns + ------- + Categorical + If sort=False, the new categories are set to the order of + appearance in codes (unless ordered=True, in which case the + original order is preserved), followed by any unrepresented + categories in the original order. + Categorical or None + If we are observed, return the original categorical, otherwise None + """ + # we only care about observed values + if observed: + # In cases with c.ordered, this is equivalent to + # return c.remove_unused_categories(), c + + unique_codes = unique1d(c.codes) + + take_codes = unique_codes[unique_codes != -1] + if sort: + take_codes = np.sort(take_codes) + + # we recode according to the uniques + categories = c.categories.take(take_codes) + codes = recode_for_categories(c.codes, c.categories, categories) + + # return a new categorical that maps our new codes + # and categories + dtype = CategoricalDtype(categories, ordered=c.ordered) + return Categorical._simple_new(codes, dtype=dtype), c + + # Already sorted according to c.categories; all is fine + if sort: + return c, None + + # sort=False should order groups in as-encountered order (GH-8868) + + # xref GH:46909: Re-ordering codes faster than using (set|add|reorder)_categories + all_codes = np.arange(c.categories.nunique()) + # GH 38140: exclude nan from indexer for categories + unique_notnan_codes = unique1d(c.codes[c.codes != -1]) + if sort: + unique_notnan_codes = np.sort(unique_notnan_codes) + if len(all_codes) > len(unique_notnan_codes): + # GH 13179: All categories need to be present, even if missing from the data + missing_codes = np.setdiff1d(all_codes, unique_notnan_codes, assume_unique=True) + take_codes = np.concatenate((unique_notnan_codes, missing_codes)) + else: + take_codes = unique_notnan_codes + + return Categorical(c, c.unique().categories.take(take_codes)), None diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/groupby/generic.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/groupby/generic.py new file mode 100644 index 0000000000000000000000000000000000000000..f2e314046fb749e7d32bf7fc76ca22eb1194a328 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/groupby/generic.py @@ -0,0 +1,2852 @@ +""" +Define the SeriesGroupBy and DataFrameGroupBy +classes that hold the groupby interfaces (and some implementations). + +These are user facing as the result of the ``df.groupby(...)`` operations, +which here returns a DataFrameGroupBy object. +""" +from __future__ import annotations + +from collections import abc +from functools import partial +from textwrap import dedent +from typing import ( + TYPE_CHECKING, + Any, + Callable, + Literal, + NamedTuple, + TypeVar, + Union, + cast, +) +import warnings + +import numpy as np + +from pandas._libs import ( + Interval, + lib, +) +from pandas._libs.hashtable import duplicated +from pandas.errors import SpecificationError +from pandas.util._decorators import ( + Appender, + Substitution, + doc, +) +from pandas.util._exceptions import find_stack_level + +from pandas.core.dtypes.common import ( + ensure_int64, + is_bool, + is_dict_like, + is_integer_dtype, + is_list_like, + is_numeric_dtype, + is_scalar, +) +from pandas.core.dtypes.dtypes import ( + CategoricalDtype, + IntervalDtype, +) +from pandas.core.dtypes.inference import is_hashable +from pandas.core.dtypes.missing import ( + isna, + notna, +) + +from pandas.core import algorithms +from pandas.core.apply import ( + GroupByApply, + maybe_mangle_lambdas, + reconstruct_func, + validate_func_kwargs, + warn_alias_replacement, +) +import pandas.core.common as com +from pandas.core.frame import DataFrame +from pandas.core.groupby import ( + base, + ops, +) +from pandas.core.groupby.groupby import ( + GroupBy, + GroupByPlot, + _agg_template_frame, + _agg_template_series, + _apply_docs, + _transform_template, +) +from pandas.core.indexes.api import ( + Index, + MultiIndex, + all_indexes_same, + default_index, +) +from pandas.core.series import Series +from pandas.core.sorting import get_group_index +from pandas.core.util.numba_ import maybe_use_numba + +from pandas.plotting import boxplot_frame_groupby + +if TYPE_CHECKING: + from collections.abc import ( + Hashable, + Mapping, + Sequence, + ) + + from pandas._typing import ( + ArrayLike, + Axis, + AxisInt, + CorrelationMethod, + FillnaOptions, + IndexLabel, + Manager, + Manager2D, + SingleManager, + TakeIndexer, + ) + + from pandas import Categorical + from pandas.core.generic import NDFrame + +# TODO(typing) the return value on this callable should be any *scalar*. +AggScalar = Union[str, Callable[..., Any]] +# TODO: validate types on ScalarResult and move to _typing +# Blocked from using by https://github.com/python/mypy/issues/1484 +# See note at _mangle_lambda_list +ScalarResult = TypeVar("ScalarResult") + + +class NamedAgg(NamedTuple): + """ + Helper for column specific aggregation with control over output column names. + + Subclass of typing.NamedTuple. + + Parameters + ---------- + column : Hashable + Column label in the DataFrame to apply aggfunc. + aggfunc : function or str + Function to apply to the provided column. If string, the name of a built-in + pandas function. + + Examples + -------- + >>> df = pd.DataFrame({"key": [1, 1, 2], "a": [-1, 0, 1], 1: [10, 11, 12]}) + >>> agg_a = pd.NamedAgg(column="a", aggfunc="min") + >>> agg_1 = pd.NamedAgg(column=1, aggfunc=lambda x: np.mean(x)) + >>> df.groupby("key").agg(result_a=agg_a, result_1=agg_1) + result_a result_1 + key + 1 -1 10.5 + 2 1 12.0 + """ + + column: Hashable + aggfunc: AggScalar + + +class SeriesGroupBy(GroupBy[Series]): + def _wrap_agged_manager(self, mgr: Manager) -> Series: + out = self.obj._constructor_from_mgr(mgr, axes=mgr.axes) + out._name = self.obj.name + return out + + def _get_data_to_aggregate( + self, *, numeric_only: bool = False, name: str | None = None + ) -> SingleManager: + ser = self._obj_with_exclusions + single = ser._mgr + if numeric_only and not is_numeric_dtype(ser.dtype): + # GH#41291 match Series behavior + kwd_name = "numeric_only" + raise TypeError( + f"Cannot use {kwd_name}=True with " + f"{type(self).__name__}.{name} and non-numeric dtypes." + ) + return single + + _agg_examples_doc = dedent( + """ + Examples + -------- + >>> s = pd.Series([1, 2, 3, 4]) + + >>> s + 0 1 + 1 2 + 2 3 + 3 4 + dtype: int64 + + >>> s.groupby([1, 1, 2, 2]).min() + 1 1 + 2 3 + dtype: int64 + + >>> s.groupby([1, 1, 2, 2]).agg('min') + 1 1 + 2 3 + dtype: int64 + + >>> s.groupby([1, 1, 2, 2]).agg(['min', 'max']) + min max + 1 1 2 + 2 3 4 + + The output column names can be controlled by passing + the desired column names and aggregations as keyword arguments. + + >>> s.groupby([1, 1, 2, 2]).agg( + ... minimum='min', + ... maximum='max', + ... ) + minimum maximum + 1 1 2 + 2 3 4 + + .. versionchanged:: 1.3.0 + + The resulting dtype will reflect the return value of the aggregating function. + + >>> s.groupby([1, 1, 2, 2]).agg(lambda x: x.astype(float).min()) + 1 1.0 + 2 3.0 + dtype: float64 + """ + ) + + @Appender( + _apply_docs["template"].format( + input="series", examples=_apply_docs["series_examples"] + ) + ) + def apply(self, func, *args, **kwargs) -> Series: + return super().apply(func, *args, **kwargs) + + @doc(_agg_template_series, examples=_agg_examples_doc, klass="Series") + def aggregate(self, func=None, *args, engine=None, engine_kwargs=None, **kwargs): + relabeling = func is None + columns = None + if relabeling: + columns, func = validate_func_kwargs(kwargs) + kwargs = {} + + if isinstance(func, str): + if maybe_use_numba(engine) and engine is not None: + # Not all agg functions support numba, only propagate numba kwargs + # if user asks for numba, and engine is not None + # (if engine is None, the called function will handle the case where + # numba is requested via the global option) + kwargs["engine"] = engine + if engine_kwargs is not None: + kwargs["engine_kwargs"] = engine_kwargs + return getattr(self, func)(*args, **kwargs) + + elif isinstance(func, abc.Iterable): + # Catch instances of lists / tuples + # but not the class list / tuple itself. + func = maybe_mangle_lambdas(func) + kwargs["engine"] = engine + kwargs["engine_kwargs"] = engine_kwargs + ret = self._aggregate_multiple_funcs(func, *args, **kwargs) + if relabeling: + # columns is not narrowed by mypy from relabeling flag + assert columns is not None # for mypy + ret.columns = columns + if not self.as_index: + ret = ret.reset_index() + return ret + + else: + cyfunc = com.get_cython_func(func) + if cyfunc and not args and not kwargs: + warn_alias_replacement(self, func, cyfunc) + return getattr(self, cyfunc)() + + if maybe_use_numba(engine): + return self._aggregate_with_numba( + func, *args, engine_kwargs=engine_kwargs, **kwargs + ) + + if self.ngroups == 0: + # e.g. test_evaluate_with_empty_groups without any groups to + # iterate over, we have no output on which to do dtype + # inference. We default to using the existing dtype. + # xref GH#51445 + obj = self._obj_with_exclusions + return self.obj._constructor( + [], + name=self.obj.name, + index=self._grouper.result_index, + dtype=obj.dtype, + ) + + if self._grouper.nkeys > 1: + return self._python_agg_general(func, *args, **kwargs) + + try: + return self._python_agg_general(func, *args, **kwargs) + except KeyError: + # KeyError raised in test_groupby.test_basic is bc the func does + # a dictionary lookup on group.name, but group name is not + # pinned in _python_agg_general, only in _aggregate_named + result = self._aggregate_named(func, *args, **kwargs) + + warnings.warn( + "Pinning the groupby key to each group in " + f"{type(self).__name__}.agg is deprecated, and cases that " + "relied on it will raise in a future version. " + "If your operation requires utilizing the groupby keys, " + "iterate over the groupby object instead.", + FutureWarning, + stacklevel=find_stack_level(), + ) + + # result is a dict whose keys are the elements of result_index + result = Series(result, index=self._grouper.result_index) + result = self._wrap_aggregated_output(result) + return result + + agg = aggregate + + def _python_agg_general(self, func, *args, **kwargs): + orig_func = func + func = com.is_builtin_func(func) + if orig_func != func: + alias = com._builtin_table_alias[func] + warn_alias_replacement(self, orig_func, alias) + f = lambda x: func(x, *args, **kwargs) + + obj = self._obj_with_exclusions + result = self._grouper.agg_series(obj, f) + res = obj._constructor(result, name=obj.name) + return self._wrap_aggregated_output(res) + + def _aggregate_multiple_funcs(self, arg, *args, **kwargs) -> DataFrame: + if isinstance(arg, dict): + if self.as_index: + # GH 15931 + raise SpecificationError("nested renamer is not supported") + else: + # GH#50684 - This accidentally worked in 1.x + msg = ( + "Passing a dictionary to SeriesGroupBy.agg is deprecated " + "and will raise in a future version of pandas. Pass a list " + "of aggregations instead." + ) + warnings.warn( + message=msg, + category=FutureWarning, + stacklevel=find_stack_level(), + ) + arg = list(arg.items()) + elif any(isinstance(x, (tuple, list)) for x in arg): + arg = [(x, x) if not isinstance(x, (tuple, list)) else x for x in arg] + else: + # list of functions / function names + columns = (com.get_callable_name(f) or f for f in arg) + arg = zip(columns, arg) + + results: dict[base.OutputKey, DataFrame | Series] = {} + with com.temp_setattr(self, "as_index", True): + # Combine results using the index, need to adjust index after + # if as_index=False (GH#50724) + for idx, (name, func) in enumerate(arg): + key = base.OutputKey(label=name, position=idx) + results[key] = self.aggregate(func, *args, **kwargs) + + if any(isinstance(x, DataFrame) for x in results.values()): + from pandas import concat + + res_df = concat( + results.values(), axis=1, keys=[key.label for key in results] + ) + return res_df + + indexed_output = {key.position: val for key, val in results.items()} + output = self.obj._constructor_expanddim(indexed_output, index=None) + output.columns = Index(key.label for key in results) + + return output + + def _wrap_applied_output( + self, + data: Series, + values: list[Any], + not_indexed_same: bool = False, + is_transform: bool = False, + ) -> DataFrame | Series: + """ + Wrap the output of SeriesGroupBy.apply into the expected result. + + Parameters + ---------- + data : Series + Input data for groupby operation. + values : List[Any] + Applied output for each group. + not_indexed_same : bool, default False + Whether the applied outputs are not indexed the same as the group axes. + + Returns + ------- + DataFrame or Series + """ + if len(values) == 0: + # GH #6265 + if is_transform: + # GH#47787 see test_group_on_empty_multiindex + res_index = data.index + else: + res_index = self._grouper.result_index + + return self.obj._constructor( + [], + name=self.obj.name, + index=res_index, + dtype=data.dtype, + ) + assert values is not None + + if isinstance(values[0], dict): + # GH #823 #24880 + index = self._grouper.result_index + res_df = self.obj._constructor_expanddim(values, index=index) + res_df = self._reindex_output(res_df) + # if self.observed is False, + # keep all-NaN rows created while re-indexing + res_ser = res_df.stack(future_stack=True) + res_ser.name = self.obj.name + return res_ser + elif isinstance(values[0], (Series, DataFrame)): + result = self._concat_objects( + values, + not_indexed_same=not_indexed_same, + is_transform=is_transform, + ) + if isinstance(result, Series): + result.name = self.obj.name + if not self.as_index and not_indexed_same: + result = self._insert_inaxis_grouper(result) + result.index = default_index(len(result)) + return result + else: + # GH #6265 #24880 + result = self.obj._constructor( + data=values, index=self._grouper.result_index, name=self.obj.name + ) + if not self.as_index: + result = self._insert_inaxis_grouper(result) + result.index = default_index(len(result)) + return self._reindex_output(result) + + def _aggregate_named(self, func, *args, **kwargs): + # Note: this is very similar to _aggregate_series_pure_python, + # but that does not pin group.name + result = {} + initialized = False + + for name, group in self._grouper.get_iterator( + self._obj_with_exclusions, axis=self.axis + ): + # needed for pandas/tests/groupby/test_groupby.py::test_basic_aggregations + object.__setattr__(group, "name", name) + + output = func(group, *args, **kwargs) + output = ops.extract_result(output) + if not initialized: + # We only do this validation on the first iteration + ops.check_result_array(output, group.dtype) + initialized = True + result[name] = output + + return result + + __examples_series_doc = dedent( + """ + >>> ser = pd.Series([390.0, 350.0, 30.0, 20.0], + ... index=["Falcon", "Falcon", "Parrot", "Parrot"], + ... name="Max Speed") + >>> grouped = ser.groupby([1, 1, 2, 2]) + >>> grouped.transform(lambda x: (x - x.mean()) / x.std()) + Falcon 0.707107 + Falcon -0.707107 + Parrot 0.707107 + Parrot -0.707107 + Name: Max Speed, dtype: float64 + + Broadcast result of the transformation + + >>> grouped.transform(lambda x: x.max() - x.min()) + Falcon 40.0 + Falcon 40.0 + Parrot 10.0 + Parrot 10.0 + Name: Max Speed, dtype: float64 + + >>> grouped.transform("mean") + Falcon 370.0 + Falcon 370.0 + Parrot 25.0 + Parrot 25.0 + Name: Max Speed, dtype: float64 + + .. versionchanged:: 1.3.0 + + The resulting dtype will reflect the return value of the passed ``func``, + for example: + + >>> grouped.transform(lambda x: x.astype(int).max()) + Falcon 390 + Falcon 390 + Parrot 30 + Parrot 30 + Name: Max Speed, dtype: int64 + """ + ) + + @Substitution(klass="Series", example=__examples_series_doc) + @Appender(_transform_template) + def transform(self, func, *args, engine=None, engine_kwargs=None, **kwargs): + return self._transform( + func, *args, engine=engine, engine_kwargs=engine_kwargs, **kwargs + ) + + def _cython_transform( + self, how: str, numeric_only: bool = False, axis: AxisInt = 0, **kwargs + ): + assert axis == 0 # handled by caller + + obj = self._obj_with_exclusions + + try: + result = self._grouper._cython_operation( + "transform", obj._values, how, axis, **kwargs + ) + except NotImplementedError as err: + # e.g. test_groupby_raises_string + raise TypeError(f"{how} is not supported for {obj.dtype} dtype") from err + + return obj._constructor(result, index=self.obj.index, name=obj.name) + + def _transform_general( + self, func: Callable, engine, engine_kwargs, *args, **kwargs + ) -> Series: + """ + Transform with a callable `func`. + """ + if maybe_use_numba(engine): + return self._transform_with_numba( + func, *args, engine_kwargs=engine_kwargs, **kwargs + ) + assert callable(func) + klass = type(self.obj) + + results = [] + for name, group in self._grouper.get_iterator( + self._obj_with_exclusions, axis=self.axis + ): + # this setattr is needed for test_transform_lambda_with_datetimetz + object.__setattr__(group, "name", name) + res = func(group, *args, **kwargs) + + results.append(klass(res, index=group.index)) + + # check for empty "results" to avoid concat ValueError + if results: + from pandas.core.reshape.concat import concat + + concatenated = concat(results) + result = self._set_result_index_ordered(concatenated) + else: + result = self.obj._constructor(dtype=np.float64) + + result.name = self.obj.name + return result + + def filter(self, func, dropna: bool = True, *args, **kwargs): + """ + Filter elements from groups that don't satisfy a criterion. + + Elements from groups are filtered if they do not satisfy the + boolean criterion specified by func. + + Parameters + ---------- + func : function + Criterion to apply to each group. Should return True or False. + dropna : bool + Drop groups that do not pass the filter. True by default; if False, + groups that evaluate False are filled with NaNs. + + Returns + ------- + Series + + Notes + ----- + Functions that mutate the passed object can produce unexpected + behavior or errors and are not supported. See :ref:`gotchas.udf-mutation` + for more details. + + Examples + -------- + >>> df = pd.DataFrame({'A' : ['foo', 'bar', 'foo', 'bar', + ... 'foo', 'bar'], + ... 'B' : [1, 2, 3, 4, 5, 6], + ... 'C' : [2.0, 5., 8., 1., 2., 9.]}) + >>> grouped = df.groupby('A') + >>> df.groupby('A').B.filter(lambda x: x.mean() > 3.) + 1 2 + 3 4 + 5 6 + Name: B, dtype: int64 + """ + if isinstance(func, str): + wrapper = lambda x: getattr(x, func)(*args, **kwargs) + else: + wrapper = lambda x: func(x, *args, **kwargs) + + # Interpret np.nan as False. + def true_and_notna(x) -> bool: + b = wrapper(x) + return notna(b) and b + + try: + indices = [ + self._get_index(name) + for name, group in self._grouper.get_iterator( + self._obj_with_exclusions, axis=self.axis + ) + if true_and_notna(group) + ] + except (ValueError, TypeError) as err: + raise TypeError("the filter must return a boolean result") from err + + filtered = self._apply_filter(indices, dropna) + return filtered + + def nunique(self, dropna: bool = True) -> Series | DataFrame: + """ + Return number of unique elements in the group. + + Returns + ------- + Series + Number of unique values within each group. + + Examples + -------- + For SeriesGroupby: + + >>> lst = ['a', 'a', 'b', 'b'] + >>> ser = pd.Series([1, 2, 3, 3], index=lst) + >>> ser + a 1 + a 2 + b 3 + b 3 + dtype: int64 + >>> ser.groupby(level=0).nunique() + a 2 + b 1 + dtype: int64 + + For Resampler: + + >>> ser = pd.Series([1, 2, 3, 3], index=pd.DatetimeIndex( + ... ['2023-01-01', '2023-01-15', '2023-02-01', '2023-02-15'])) + >>> ser + 2023-01-01 1 + 2023-01-15 2 + 2023-02-01 3 + 2023-02-15 3 + dtype: int64 + >>> ser.resample('MS').nunique() + 2023-01-01 2 + 2023-02-01 1 + Freq: MS, dtype: int64 + """ + ids, _, ngroups = self._grouper.group_info + val = self.obj._values + codes, uniques = algorithms.factorize(val, use_na_sentinel=dropna, sort=False) + + if self._grouper.has_dropped_na: + mask = ids >= 0 + ids = ids[mask] + codes = codes[mask] + + group_index = get_group_index( + labels=[ids, codes], + shape=(ngroups, len(uniques)), + sort=False, + xnull=dropna, + ) + + if dropna: + mask = group_index >= 0 + if (~mask).any(): + ids = ids[mask] + group_index = group_index[mask] + + mask = duplicated(group_index, "first") + res = np.bincount(ids[~mask], minlength=ngroups) + res = ensure_int64(res) + + ri = self._grouper.result_index + result: Series | DataFrame = self.obj._constructor( + res, index=ri, name=self.obj.name + ) + if not self.as_index: + result = self._insert_inaxis_grouper(result) + result.index = default_index(len(result)) + return self._reindex_output(result, fill_value=0) + + @doc(Series.describe) + def describe(self, percentiles=None, include=None, exclude=None) -> Series: + return super().describe( + percentiles=percentiles, include=include, exclude=exclude + ) + + def value_counts( + self, + normalize: bool = False, + sort: bool = True, + ascending: bool = False, + bins=None, + dropna: bool = True, + ) -> Series | DataFrame: + name = "proportion" if normalize else "count" + + if bins is None: + result = self._value_counts( + normalize=normalize, sort=sort, ascending=ascending, dropna=dropna + ) + result.name = name + return result + + from pandas.core.reshape.merge import get_join_indexers + from pandas.core.reshape.tile import cut + + ids, _, _ = self._grouper.group_info + val = self.obj._values + + index_names = self._grouper.names + [self.obj.name] + + if isinstance(val.dtype, CategoricalDtype) or ( + bins is not None and not np.iterable(bins) + ): + # scalar bins cannot be done at top level + # in a backward compatible way + # GH38672 relates to categorical dtype + ser = self.apply( + Series.value_counts, + normalize=normalize, + sort=sort, + ascending=ascending, + bins=bins, + ) + ser.name = name + ser.index.names = index_names + return ser + + # groupby removes null keys from groupings + mask = ids != -1 + ids, val = ids[mask], val[mask] + + lab: Index | np.ndarray + if bins is None: + lab, lev = algorithms.factorize(val, sort=True) + llab = lambda lab, inc: lab[inc] + else: + # lab is a Categorical with categories an IntervalIndex + cat_ser = cut(Series(val, copy=False), bins, include_lowest=True) + cat_obj = cast("Categorical", cat_ser._values) + lev = cat_obj.categories + lab = lev.take( + cat_obj.codes, + allow_fill=True, + fill_value=lev._na_value, + ) + llab = lambda lab, inc: lab[inc]._multiindex.codes[-1] + + if isinstance(lab.dtype, IntervalDtype): + # TODO: should we do this inside II? + lab_interval = cast(Interval, lab) + + sorter = np.lexsort((lab_interval.left, lab_interval.right, ids)) + else: + sorter = np.lexsort((lab, ids)) + + ids, lab = ids[sorter], lab[sorter] + + # group boundaries are where group ids change + idchanges = 1 + np.nonzero(ids[1:] != ids[:-1])[0] + idx = np.r_[0, idchanges] + if not len(ids): + idx = idchanges + + # new values are where sorted labels change + lchanges = llab(lab, slice(1, None)) != llab(lab, slice(None, -1)) + inc = np.r_[True, lchanges] + if not len(val): + inc = lchanges + inc[idx] = True # group boundaries are also new values + out = np.diff(np.nonzero(np.r_[inc, True])[0]) # value counts + + # num. of times each group should be repeated + rep = partial(np.repeat, repeats=np.add.reduceat(inc, idx)) + + # multi-index components + codes = self._grouper.reconstructed_codes + codes = [rep(level_codes) for level_codes in codes] + [llab(lab, inc)] + levels = [ping._group_index for ping in self._grouper.groupings] + [lev] + + if dropna: + mask = codes[-1] != -1 + if mask.all(): + dropna = False + else: + out, codes = out[mask], [level_codes[mask] for level_codes in codes] + + if normalize: + out = out.astype("float") + d = np.diff(np.r_[idx, len(ids)]) + if dropna: + m = ids[lab == -1] + np.add.at(d, m, -1) + acc = rep(d)[mask] + else: + acc = rep(d) + out /= acc + + if sort and bins is None: + cat = ids[inc][mask] if dropna else ids[inc] + sorter = np.lexsort((out if ascending else -out, cat)) + out, codes[-1] = out[sorter], codes[-1][sorter] + + if bins is not None: + # for compat. with libgroupby.value_counts need to ensure every + # bin is present at every index level, null filled with zeros + diff = np.zeros(len(out), dtype="bool") + for level_codes in codes[:-1]: + diff |= np.r_[True, level_codes[1:] != level_codes[:-1]] + + ncat, nbin = diff.sum(), len(levels[-1]) + + left = [np.repeat(np.arange(ncat), nbin), np.tile(np.arange(nbin), ncat)] + + right = [diff.cumsum() - 1, codes[-1]] + + # error: Argument 1 to "get_join_indexers" has incompatible type + # "List[ndarray[Any, Any]]"; expected "List[Union[Union[ExtensionArray, + # ndarray[Any, Any]], Index, Series]] + _, idx = get_join_indexers( + left, right, sort=False, how="left" # type: ignore[arg-type] + ) + if idx is not None: + out = np.where(idx != -1, out[idx], 0) + + if sort: + sorter = np.lexsort((out if ascending else -out, left[0])) + out, left[-1] = out[sorter], left[-1][sorter] + + # build the multi-index w/ full levels + def build_codes(lev_codes: np.ndarray) -> np.ndarray: + return np.repeat(lev_codes[diff], nbin) + + codes = [build_codes(lev_codes) for lev_codes in codes[:-1]] + codes.append(left[-1]) + + mi = MultiIndex( + levels=levels, codes=codes, names=index_names, verify_integrity=False + ) + + if is_integer_dtype(out.dtype): + out = ensure_int64(out) + result = self.obj._constructor(out, index=mi, name=name) + if not self.as_index: + result = result.reset_index() + return result + + def fillna( + self, + value: object | ArrayLike | None = None, + method: FillnaOptions | None = None, + axis: Axis | None | lib.NoDefault = lib.no_default, + inplace: bool = False, + limit: int | None = None, + downcast: dict | None | lib.NoDefault = lib.no_default, + ) -> Series | None: + """ + Fill NA/NaN values using the specified method within groups. + + .. deprecated:: 2.2.0 + This method is deprecated and will be removed in a future version. + Use the :meth:`.SeriesGroupBy.ffill` or :meth:`.SeriesGroupBy.bfill` + for forward or backward filling instead. If you want to fill with a + single value, use :meth:`Series.fillna` instead. + + Parameters + ---------- + value : scalar, dict, Series, or DataFrame + Value to use to fill holes (e.g. 0), alternately a + dict/Series/DataFrame of values specifying which value to use for + each index (for a Series) or column (for a DataFrame). Values not + in the dict/Series/DataFrame will not be filled. This value cannot + be a list. Users wanting to use the ``value`` argument and not ``method`` + should prefer :meth:`.Series.fillna` as this + will produce the same result and be more performant. + method : {{'bfill', 'ffill', None}}, default None + Method to use for filling holes. ``'ffill'`` will propagate + the last valid observation forward within a group. + ``'bfill'`` will use next valid observation to fill the gap. + axis : {0 or 'index', 1 or 'columns'} + Unused, only for compatibility with :meth:`DataFrameGroupBy.fillna`. + inplace : bool, default False + Broken. Do not set to True. + limit : int, default None + If method is specified, this is the maximum number of consecutive + NaN values to forward/backward fill within a group. In other words, + if there is a gap with more than this number of consecutive NaNs, + it will only be partially filled. If method is not specified, this is the + maximum number of entries along the entire axis where NaNs will be + filled. Must be greater than 0 if not None. + downcast : dict, default is None + A dict of item->dtype of what to downcast if possible, + or the string 'infer' which will try to downcast to an appropriate + equal type (e.g. float64 to int64 if possible). + + Returns + ------- + Series + Object with missing values filled within groups. + + See Also + -------- + ffill : Forward fill values within a group. + bfill : Backward fill values within a group. + + Examples + -------- + For SeriesGroupBy: + + >>> lst = ['cat', 'cat', 'cat', 'mouse', 'mouse'] + >>> ser = pd.Series([1, None, None, 2, None], index=lst) + >>> ser + cat 1.0 + cat NaN + cat NaN + mouse 2.0 + mouse NaN + dtype: float64 + >>> ser.groupby(level=0).fillna(0, limit=1) + cat 1.0 + cat 0.0 + cat NaN + mouse 2.0 + mouse 0.0 + dtype: float64 + """ + warnings.warn( + f"{type(self).__name__}.fillna is deprecated and " + "will be removed in a future version. Use obj.ffill() or obj.bfill() " + "for forward or backward filling instead. If you want to fill with a " + f"single value, use {type(self.obj).__name__}.fillna instead", + FutureWarning, + stacklevel=find_stack_level(), + ) + result = self._op_via_apply( + "fillna", + value=value, + method=method, + axis=axis, + inplace=inplace, + limit=limit, + downcast=downcast, + ) + return result + + def take( + self, + indices: TakeIndexer, + axis: Axis | lib.NoDefault = lib.no_default, + **kwargs, + ) -> Series: + """ + Return the elements in the given *positional* indices in each group. + + This means that we are not indexing according to actual values in + the index attribute of the object. We are indexing according to the + actual position of the element in the object. + + If a requested index does not exist for some group, this method will raise. + To get similar behavior that ignores indices that don't exist, see + :meth:`.SeriesGroupBy.nth`. + + Parameters + ---------- + indices : array-like + An array of ints indicating which positions to take in each group. + axis : {0 or 'index', 1 or 'columns', None}, default 0 + The axis on which to select elements. ``0`` means that we are + selecting rows, ``1`` means that we are selecting columns. + For `SeriesGroupBy` this parameter is unused and defaults to 0. + + .. deprecated:: 2.1.0 + For axis=1, operate on the underlying object instead. Otherwise + the axis keyword is not necessary. + + **kwargs + For compatibility with :meth:`numpy.take`. Has no effect on the + output. + + Returns + ------- + Series + A Series containing the elements taken from each group. + + See Also + -------- + Series.take : Take elements from a Series along an axis. + Series.loc : Select a subset of a DataFrame by labels. + Series.iloc : Select a subset of a DataFrame by positions. + numpy.take : Take elements from an array along an axis. + SeriesGroupBy.nth : Similar to take, won't raise if indices don't exist. + + Examples + -------- + >>> df = pd.DataFrame([('falcon', 'bird', 389.0), + ... ('parrot', 'bird', 24.0), + ... ('lion', 'mammal', 80.5), + ... ('monkey', 'mammal', np.nan), + ... ('rabbit', 'mammal', 15.0)], + ... columns=['name', 'class', 'max_speed'], + ... index=[4, 3, 2, 1, 0]) + >>> df + name class max_speed + 4 falcon bird 389.0 + 3 parrot bird 24.0 + 2 lion mammal 80.5 + 1 monkey mammal NaN + 0 rabbit mammal 15.0 + >>> gb = df["name"].groupby([1, 1, 2, 2, 2]) + + Take elements at positions 0 and 1 along the axis 0 in each group (default). + + >>> gb.take([0, 1]) + 1 4 falcon + 3 parrot + 2 2 lion + 1 monkey + Name: name, dtype: object + + We may take elements using negative integers for positive indices, + starting from the end of the object, just like with Python lists. + + >>> gb.take([-1, -2]) + 1 3 parrot + 4 falcon + 2 0 rabbit + 1 monkey + Name: name, dtype: object + """ + result = self._op_via_apply("take", indices=indices, axis=axis, **kwargs) + return result + + def skew( + self, + axis: Axis | lib.NoDefault = lib.no_default, + skipna: bool = True, + numeric_only: bool = False, + **kwargs, + ) -> Series: + """ + Return unbiased skew within groups. + + Normalized by N-1. + + Parameters + ---------- + axis : {0 or 'index', 1 or 'columns', None}, default 0 + Axis for the function to be applied on. + This parameter is only for compatibility with DataFrame and is unused. + + .. deprecated:: 2.1.0 + For axis=1, operate on the underlying object instead. Otherwise + the axis keyword is not necessary. + + skipna : bool, default True + Exclude NA/null values when computing the result. + + numeric_only : bool, default False + Include only float, int, boolean columns. Not implemented for Series. + + **kwargs + Additional keyword arguments to be passed to the function. + + Returns + ------- + Series + + See Also + -------- + Series.skew : Return unbiased skew over requested axis. + + Examples + -------- + >>> ser = pd.Series([390., 350., 357., np.nan, 22., 20., 30.], + ... index=['Falcon', 'Falcon', 'Falcon', 'Falcon', + ... 'Parrot', 'Parrot', 'Parrot'], + ... name="Max Speed") + >>> ser + Falcon 390.0 + Falcon 350.0 + Falcon 357.0 + Falcon NaN + Parrot 22.0 + Parrot 20.0 + Parrot 30.0 + Name: Max Speed, dtype: float64 + >>> ser.groupby(level=0).skew() + Falcon 1.525174 + Parrot 1.457863 + Name: Max Speed, dtype: float64 + >>> ser.groupby(level=0).skew(skipna=False) + Falcon NaN + Parrot 1.457863 + Name: Max Speed, dtype: float64 + """ + if axis is lib.no_default: + axis = 0 + + if axis != 0: + result = self._op_via_apply( + "skew", + axis=axis, + skipna=skipna, + numeric_only=numeric_only, + **kwargs, + ) + return result + + def alt(obj): + # This should not be reached since the cython path should raise + # TypeError and not NotImplementedError. + raise TypeError(f"'skew' is not supported for dtype={obj.dtype}") + + return self._cython_agg_general( + "skew", alt=alt, skipna=skipna, numeric_only=numeric_only, **kwargs + ) + + @property + @doc(Series.plot.__doc__) + def plot(self) -> GroupByPlot: + result = GroupByPlot(self) + return result + + @doc(Series.nlargest.__doc__) + def nlargest( + self, n: int = 5, keep: Literal["first", "last", "all"] = "first" + ) -> Series: + f = partial(Series.nlargest, n=n, keep=keep) + data = self._obj_with_exclusions + # Don't change behavior if result index happens to be the same, i.e. + # already ordered and n >= all group sizes. + result = self._python_apply_general(f, data, not_indexed_same=True) + return result + + @doc(Series.nsmallest.__doc__) + def nsmallest( + self, n: int = 5, keep: Literal["first", "last", "all"] = "first" + ) -> Series: + f = partial(Series.nsmallest, n=n, keep=keep) + data = self._obj_with_exclusions + # Don't change behavior if result index happens to be the same, i.e. + # already ordered and n >= all group sizes. + result = self._python_apply_general(f, data, not_indexed_same=True) + return result + + @doc(Series.idxmin.__doc__) + def idxmin( + self, axis: Axis | lib.NoDefault = lib.no_default, skipna: bool = True + ) -> Series: + return self._idxmax_idxmin("idxmin", axis=axis, skipna=skipna) + + @doc(Series.idxmax.__doc__) + def idxmax( + self, axis: Axis | lib.NoDefault = lib.no_default, skipna: bool = True + ) -> Series: + return self._idxmax_idxmin("idxmax", axis=axis, skipna=skipna) + + @doc(Series.corr.__doc__) + def corr( + self, + other: Series, + method: CorrelationMethod = "pearson", + min_periods: int | None = None, + ) -> Series: + result = self._op_via_apply( + "corr", other=other, method=method, min_periods=min_periods + ) + return result + + @doc(Series.cov.__doc__) + def cov( + self, other: Series, min_periods: int | None = None, ddof: int | None = 1 + ) -> Series: + result = self._op_via_apply( + "cov", other=other, min_periods=min_periods, ddof=ddof + ) + return result + + @property + def is_monotonic_increasing(self) -> Series: + """ + Return whether each group's values are monotonically increasing. + + Returns + ------- + Series + + Examples + -------- + >>> s = pd.Series([2, 1, 3, 4], index=['Falcon', 'Falcon', 'Parrot', 'Parrot']) + >>> s.groupby(level=0).is_monotonic_increasing + Falcon False + Parrot True + dtype: bool + """ + return self.apply(lambda ser: ser.is_monotonic_increasing) + + @property + def is_monotonic_decreasing(self) -> Series: + """ + Return whether each group's values are monotonically decreasing. + + Returns + ------- + Series + + Examples + -------- + >>> s = pd.Series([2, 1, 3, 4], index=['Falcon', 'Falcon', 'Parrot', 'Parrot']) + >>> s.groupby(level=0).is_monotonic_decreasing + Falcon True + Parrot False + dtype: bool + """ + return self.apply(lambda ser: ser.is_monotonic_decreasing) + + @doc(Series.hist.__doc__) + def hist( + self, + by=None, + ax=None, + grid: bool = True, + xlabelsize: int | None = None, + xrot: float | None = None, + ylabelsize: int | None = None, + yrot: float | None = None, + figsize: tuple[int, int] | None = None, + bins: int | Sequence[int] = 10, + backend: str | None = None, + legend: bool = False, + **kwargs, + ): + result = self._op_via_apply( + "hist", + by=by, + ax=ax, + grid=grid, + xlabelsize=xlabelsize, + xrot=xrot, + ylabelsize=ylabelsize, + yrot=yrot, + figsize=figsize, + bins=bins, + backend=backend, + legend=legend, + **kwargs, + ) + return result + + @property + @doc(Series.dtype.__doc__) + def dtype(self) -> Series: + return self.apply(lambda ser: ser.dtype) + + def unique(self) -> Series: + """ + Return unique values for each group. + + It returns unique values for each of the grouped values. Returned in + order of appearance. Hash table-based unique, therefore does NOT sort. + + Returns + ------- + Series + Unique values for each of the grouped values. + + See Also + -------- + Series.unique : Return unique values of Series object. + + Examples + -------- + >>> df = pd.DataFrame([('Chihuahua', 'dog', 6.1), + ... ('Beagle', 'dog', 15.2), + ... ('Chihuahua', 'dog', 6.9), + ... ('Persian', 'cat', 9.2), + ... ('Chihuahua', 'dog', 7), + ... ('Persian', 'cat', 8.8)], + ... columns=['breed', 'animal', 'height_in']) + >>> df + breed animal height_in + 0 Chihuahua dog 6.1 + 1 Beagle dog 15.2 + 2 Chihuahua dog 6.9 + 3 Persian cat 9.2 + 4 Chihuahua dog 7.0 + 5 Persian cat 8.8 + >>> ser = df.groupby('animal')['breed'].unique() + >>> ser + animal + cat [Persian] + dog [Chihuahua, Beagle] + Name: breed, dtype: object + """ + result = self._op_via_apply("unique") + return result + + +class DataFrameGroupBy(GroupBy[DataFrame]): + _agg_examples_doc = dedent( + """ + Examples + -------- + >>> data = {"A": [1, 1, 2, 2], + ... "B": [1, 2, 3, 4], + ... "C": [0.362838, 0.227877, 1.267767, -0.562860]} + >>> df = pd.DataFrame(data) + >>> df + A B C + 0 1 1 0.362838 + 1 1 2 0.227877 + 2 2 3 1.267767 + 3 2 4 -0.562860 + + The aggregation is for each column. + + >>> df.groupby('A').agg('min') + B C + A + 1 1 0.227877 + 2 3 -0.562860 + + Multiple aggregations + + >>> df.groupby('A').agg(['min', 'max']) + B C + min max min max + A + 1 1 2 0.227877 0.362838 + 2 3 4 -0.562860 1.267767 + + Select a column for aggregation + + >>> df.groupby('A').B.agg(['min', 'max']) + min max + A + 1 1 2 + 2 3 4 + + User-defined function for aggregation + + >>> df.groupby('A').agg(lambda x: sum(x) + 2) + B C + A + 1 5 2.590715 + 2 9 2.704907 + + Different aggregations per column + + >>> df.groupby('A').agg({'B': ['min', 'max'], 'C': 'sum'}) + B C + min max sum + A + 1 1 2 0.590715 + 2 3 4 0.704907 + + To control the output names with different aggregations per column, + pandas supports "named aggregation" + + >>> df.groupby("A").agg( + ... b_min=pd.NamedAgg(column="B", aggfunc="min"), + ... c_sum=pd.NamedAgg(column="C", aggfunc="sum") + ... ) + b_min c_sum + A + 1 1 0.590715 + 2 3 0.704907 + + - The keywords are the *output* column names + - The values are tuples whose first element is the column to select + and the second element is the aggregation to apply to that column. + Pandas provides the ``pandas.NamedAgg`` namedtuple with the fields + ``['column', 'aggfunc']`` to make it clearer what the arguments are. + As usual, the aggregation can be a callable or a string alias. + + See :ref:`groupby.aggregate.named` for more. + + .. versionchanged:: 1.3.0 + + The resulting dtype will reflect the return value of the aggregating function. + + >>> df.groupby("A")[["B"]].agg(lambda x: x.astype(float).min()) + B + A + 1 1.0 + 2 3.0 + """ + ) + + @doc(_agg_template_frame, examples=_agg_examples_doc, klass="DataFrame") + def aggregate(self, func=None, *args, engine=None, engine_kwargs=None, **kwargs): + relabeling, func, columns, order = reconstruct_func(func, **kwargs) + func = maybe_mangle_lambdas(func) + + if maybe_use_numba(engine): + # Not all agg functions support numba, only propagate numba kwargs + # if user asks for numba + kwargs["engine"] = engine + kwargs["engine_kwargs"] = engine_kwargs + + op = GroupByApply(self, func, args=args, kwargs=kwargs) + result = op.agg() + if not is_dict_like(func) and result is not None: + # GH #52849 + if not self.as_index and is_list_like(func): + return result.reset_index() + else: + return result + elif relabeling: + # this should be the only (non-raising) case with relabeling + # used reordered index of columns + result = cast(DataFrame, result) + result = result.iloc[:, order] + result = cast(DataFrame, result) + # error: Incompatible types in assignment (expression has type + # "Optional[List[str]]", variable has type + # "Union[Union[Union[ExtensionArray, ndarray[Any, Any]], + # Index, Series], Sequence[Any]]") + result.columns = columns # type: ignore[assignment] + + if result is None: + # Remove the kwargs we inserted + # (already stored in engine, engine_kwargs arguments) + if "engine" in kwargs: + del kwargs["engine"] + del kwargs["engine_kwargs"] + # at this point func is not a str, list-like, dict-like, + # or a known callable(e.g. sum) + if maybe_use_numba(engine): + return self._aggregate_with_numba( + func, *args, engine_kwargs=engine_kwargs, **kwargs + ) + # grouper specific aggregations + if self._grouper.nkeys > 1: + # test_groupby_as_index_series_scalar gets here with 'not self.as_index' + return self._python_agg_general(func, *args, **kwargs) + elif args or kwargs: + # test_pass_args_kwargs gets here (with and without as_index) + # can't return early + result = self._aggregate_frame(func, *args, **kwargs) + + elif self.axis == 1: + # _aggregate_multiple_funcs does not allow self.axis == 1 + # Note: axis == 1 precludes 'not self.as_index', see __init__ + result = self._aggregate_frame(func) + return result + + else: + # try to treat as if we are passing a list + gba = GroupByApply(self, [func], args=(), kwargs={}) + try: + result = gba.agg() + + except ValueError as err: + if "No objects to concatenate" not in str(err): + raise + # _aggregate_frame can fail with e.g. func=Series.mode, + # where it expects 1D values but would be getting 2D values + # In other tests, using aggregate_frame instead of GroupByApply + # would give correct values but incorrect dtypes + # object vs float64 in test_cython_agg_empty_buckets + # float64 vs int64 in test_category_order_apply + result = self._aggregate_frame(func) + + else: + # GH#32040, GH#35246 + # e.g. test_groupby_as_index_select_column_sum_empty_df + result = cast(DataFrame, result) + result.columns = self._obj_with_exclusions.columns.copy() + + if not self.as_index: + result = self._insert_inaxis_grouper(result) + result.index = default_index(len(result)) + + return result + + agg = aggregate + + def _python_agg_general(self, func, *args, **kwargs): + orig_func = func + func = com.is_builtin_func(func) + if orig_func != func: + alias = com._builtin_table_alias[func] + warn_alias_replacement(self, orig_func, alias) + f = lambda x: func(x, *args, **kwargs) + + if self.ngroups == 0: + # e.g. test_evaluate_with_empty_groups different path gets different + # result dtype in empty case. + return self._python_apply_general(f, self._selected_obj, is_agg=True) + + obj = self._obj_with_exclusions + if self.axis == 1: + obj = obj.T + + if not len(obj.columns): + # e.g. test_margins_no_values_no_cols + return self._python_apply_general(f, self._selected_obj) + + output: dict[int, ArrayLike] = {} + for idx, (name, ser) in enumerate(obj.items()): + result = self._grouper.agg_series(ser, f) + output[idx] = result + + res = self.obj._constructor(output) + res.columns = obj.columns.copy(deep=False) + return self._wrap_aggregated_output(res) + + def _aggregate_frame(self, func, *args, **kwargs) -> DataFrame: + if self._grouper.nkeys != 1: + raise AssertionError("Number of keys must be 1") + + obj = self._obj_with_exclusions + + result: dict[Hashable, NDFrame | np.ndarray] = {} + for name, grp_df in self._grouper.get_iterator(obj, self.axis): + fres = func(grp_df, *args, **kwargs) + result[name] = fres + + result_index = self._grouper.result_index + other_ax = obj.axes[1 - self.axis] + out = self.obj._constructor(result, index=other_ax, columns=result_index) + if self.axis == 0: + out = out.T + + return out + + def _wrap_applied_output( + self, + data: DataFrame, + values: list, + not_indexed_same: bool = False, + is_transform: bool = False, + ): + if len(values) == 0: + if is_transform: + # GH#47787 see test_group_on_empty_multiindex + res_index = data.index + else: + res_index = self._grouper.result_index + + result = self.obj._constructor(index=res_index, columns=data.columns) + result = result.astype(data.dtypes, copy=False) + return result + + # GH12824 + # using values[0] here breaks test_groupby_apply_none_first + first_not_none = next(com.not_none(*values), None) + + if first_not_none is None: + # GH9684 - All values are None, return an empty frame. + return self.obj._constructor() + elif isinstance(first_not_none, DataFrame): + return self._concat_objects( + values, + not_indexed_same=not_indexed_same, + is_transform=is_transform, + ) + + key_index = self._grouper.result_index if self.as_index else None + + if isinstance(first_not_none, (np.ndarray, Index)): + # GH#1738: values is list of arrays of unequal lengths + # fall through to the outer else clause + # TODO: sure this is right? we used to do this + # after raising AttributeError above + # GH 18930 + if not is_hashable(self._selection): + # error: Need type annotation for "name" + name = tuple(self._selection) # type: ignore[var-annotated, arg-type] + else: + # error: Incompatible types in assignment + # (expression has type "Hashable", variable + # has type "Tuple[Any, ...]") + name = self._selection # type: ignore[assignment] + return self.obj._constructor_sliced(values, index=key_index, name=name) + elif not isinstance(first_not_none, Series): + # values are not series or array-like but scalars + # self._selection not passed through to Series as the + # result should not take the name of original selection + # of columns + if self.as_index: + return self.obj._constructor_sliced(values, index=key_index) + else: + result = self.obj._constructor(values, columns=[self._selection]) + result = self._insert_inaxis_grouper(result) + return result + else: + # values are Series + return self._wrap_applied_output_series( + values, + not_indexed_same, + first_not_none, + key_index, + is_transform, + ) + + def _wrap_applied_output_series( + self, + values: list[Series], + not_indexed_same: bool, + first_not_none, + key_index: Index | None, + is_transform: bool, + ) -> DataFrame | Series: + kwargs = first_not_none._construct_axes_dict() + backup = Series(**kwargs) + values = [x if (x is not None) else backup for x in values] + + all_indexed_same = all_indexes_same(x.index for x in values) + + if not all_indexed_same: + # GH 8467 + return self._concat_objects( + values, + not_indexed_same=True, + is_transform=is_transform, + ) + + # Combine values + # vstack+constructor is faster than concat and handles MI-columns + stacked_values = np.vstack([np.asarray(v) for v in values]) + + if self.axis == 0: + index = key_index + columns = first_not_none.index.copy() + if columns.name is None: + # GH6124 - propagate name of Series when it's consistent + names = {v.name for v in values} + if len(names) == 1: + columns.name = next(iter(names)) + else: + index = first_not_none.index + columns = key_index + stacked_values = stacked_values.T + + if stacked_values.dtype == object: + # We'll have the DataFrame constructor do inference + stacked_values = stacked_values.tolist() + result = self.obj._constructor(stacked_values, index=index, columns=columns) + + if not self.as_index: + result = self._insert_inaxis_grouper(result) + + return self._reindex_output(result) + + def _cython_transform( + self, + how: str, + numeric_only: bool = False, + axis: AxisInt = 0, + **kwargs, + ) -> DataFrame: + assert axis == 0 # handled by caller + + # With self.axis == 0, we have multi-block tests + # e.g. test_rank_min_int, test_cython_transform_frame + # test_transform_numeric_ret + # With self.axis == 1, _get_data_to_aggregate does a transpose + # so we always have a single block. + mgr: Manager2D = self._get_data_to_aggregate( + numeric_only=numeric_only, name=how + ) + + def arr_func(bvalues: ArrayLike) -> ArrayLike: + return self._grouper._cython_operation( + "transform", bvalues, how, 1, **kwargs + ) + + # We could use `mgr.apply` here and not have to set_axis, but + # we would have to do shape gymnastics for ArrayManager compat + res_mgr = mgr.grouped_reduce(arr_func) + res_mgr.set_axis(1, mgr.axes[1]) + + res_df = self.obj._constructor_from_mgr(res_mgr, axes=res_mgr.axes) + res_df = self._maybe_transpose_result(res_df) + return res_df + + def _transform_general(self, func, engine, engine_kwargs, *args, **kwargs): + if maybe_use_numba(engine): + return self._transform_with_numba( + func, *args, engine_kwargs=engine_kwargs, **kwargs + ) + from pandas.core.reshape.concat import concat + + applied = [] + obj = self._obj_with_exclusions + gen = self._grouper.get_iterator(obj, axis=self.axis) + fast_path, slow_path = self._define_paths(func, *args, **kwargs) + + # Determine whether to use slow or fast path by evaluating on the first group. + # Need to handle the case of an empty generator and process the result so that + # it does not need to be computed again. + try: + name, group = next(gen) + except StopIteration: + pass + else: + # 2023-02-27 No tests broken by disabling this pinning + object.__setattr__(group, "name", name) + try: + path, res = self._choose_path(fast_path, slow_path, group) + except ValueError as err: + # e.g. test_transform_with_non_scalar_group + msg = "transform must return a scalar value for each group" + raise ValueError(msg) from err + if group.size > 0: + res = _wrap_transform_general_frame(self.obj, group, res) + applied.append(res) + + # Compute and process with the remaining groups + for name, group in gen: + if group.size == 0: + continue + # 2023-02-27 No tests broken by disabling this pinning + object.__setattr__(group, "name", name) + res = path(group) + + res = _wrap_transform_general_frame(self.obj, group, res) + applied.append(res) + + concat_index = obj.columns if self.axis == 0 else obj.index + other_axis = 1 if self.axis == 0 else 0 # switches between 0 & 1 + concatenated = concat(applied, axis=self.axis, verify_integrity=False) + concatenated = concatenated.reindex(concat_index, axis=other_axis, copy=False) + return self._set_result_index_ordered(concatenated) + + __examples_dataframe_doc = dedent( + """ + >>> df = pd.DataFrame({'A' : ['foo', 'bar', 'foo', 'bar', + ... 'foo', 'bar'], + ... 'B' : ['one', 'one', 'two', 'three', + ... 'two', 'two'], + ... 'C' : [1, 5, 5, 2, 5, 5], + ... 'D' : [2.0, 5., 8., 1., 2., 9.]}) + >>> grouped = df.groupby('A')[['C', 'D']] + >>> grouped.transform(lambda x: (x - x.mean()) / x.std()) + C D + 0 -1.154701 -0.577350 + 1 0.577350 0.000000 + 2 0.577350 1.154701 + 3 -1.154701 -1.000000 + 4 0.577350 -0.577350 + 5 0.577350 1.000000 + + Broadcast result of the transformation + + >>> grouped.transform(lambda x: x.max() - x.min()) + C D + 0 4.0 6.0 + 1 3.0 8.0 + 2 4.0 6.0 + 3 3.0 8.0 + 4 4.0 6.0 + 5 3.0 8.0 + + >>> grouped.transform("mean") + C D + 0 3.666667 4.0 + 1 4.000000 5.0 + 2 3.666667 4.0 + 3 4.000000 5.0 + 4 3.666667 4.0 + 5 4.000000 5.0 + + .. versionchanged:: 1.3.0 + + The resulting dtype will reflect the return value of the passed ``func``, + for example: + + >>> grouped.transform(lambda x: x.astype(int).max()) + C D + 0 5 8 + 1 5 9 + 2 5 8 + 3 5 9 + 4 5 8 + 5 5 9 + """ + ) + + @Substitution(klass="DataFrame", example=__examples_dataframe_doc) + @Appender(_transform_template) + def transform(self, func, *args, engine=None, engine_kwargs=None, **kwargs): + return self._transform( + func, *args, engine=engine, engine_kwargs=engine_kwargs, **kwargs + ) + + def _define_paths(self, func, *args, **kwargs): + if isinstance(func, str): + fast_path = lambda group: getattr(group, func)(*args, **kwargs) + slow_path = lambda group: group.apply( + lambda x: getattr(x, func)(*args, **kwargs), axis=self.axis + ) + else: + fast_path = lambda group: func(group, *args, **kwargs) + slow_path = lambda group: group.apply( + lambda x: func(x, *args, **kwargs), axis=self.axis + ) + return fast_path, slow_path + + def _choose_path(self, fast_path: Callable, slow_path: Callable, group: DataFrame): + path = slow_path + res = slow_path(group) + + if self.ngroups == 1: + # no need to evaluate multiple paths when only + # a single group exists + return path, res + + # if we make it here, test if we can use the fast path + try: + res_fast = fast_path(group) + except AssertionError: + raise # pragma: no cover + except Exception: + # GH#29631 For user-defined function, we can't predict what may be + # raised; see test_transform.test_transform_fastpath_raises + return path, res + + # verify fast path returns either: + # a DataFrame with columns equal to group.columns + # OR a Series with index equal to group.columns + if isinstance(res_fast, DataFrame): + if not res_fast.columns.equals(group.columns): + return path, res + elif isinstance(res_fast, Series): + if not res_fast.index.equals(group.columns): + return path, res + else: + return path, res + + if res_fast.equals(res): + path = fast_path + + return path, res + + def filter(self, func, dropna: bool = True, *args, **kwargs): + """ + Filter elements from groups that don't satisfy a criterion. + + Elements from groups are filtered if they do not satisfy the + boolean criterion specified by func. + + Parameters + ---------- + func : function + Criterion to apply to each group. Should return True or False. + dropna : bool + Drop groups that do not pass the filter. True by default; if False, + groups that evaluate False are filled with NaNs. + + Returns + ------- + DataFrame + + Notes + ----- + Each subframe is endowed the attribute 'name' in case you need to know + which group you are working on. + + Functions that mutate the passed object can produce unexpected + behavior or errors and are not supported. See :ref:`gotchas.udf-mutation` + for more details. + + Examples + -------- + >>> df = pd.DataFrame({'A' : ['foo', 'bar', 'foo', 'bar', + ... 'foo', 'bar'], + ... 'B' : [1, 2, 3, 4, 5, 6], + ... 'C' : [2.0, 5., 8., 1., 2., 9.]}) + >>> grouped = df.groupby('A') + >>> grouped.filter(lambda x: x['B'].mean() > 3.) + A B C + 1 bar 2 5.0 + 3 bar 4 1.0 + 5 bar 6 9.0 + """ + indices = [] + + obj = self._selected_obj + gen = self._grouper.get_iterator(obj, axis=self.axis) + + for name, group in gen: + # 2023-02-27 no tests are broken this pinning, but it is documented in the + # docstring above. + object.__setattr__(group, "name", name) + + res = func(group, *args, **kwargs) + + try: + res = res.squeeze() + except AttributeError: # allow e.g., scalars and frames to pass + pass + + # interpret the result of the filter + if is_bool(res) or (is_scalar(res) and isna(res)): + if notna(res) and res: + indices.append(self._get_index(name)) + else: + # non scalars aren't allowed + raise TypeError( + f"filter function returned a {type(res).__name__}, " + "but expected a scalar bool" + ) + + return self._apply_filter(indices, dropna) + + def __getitem__(self, key) -> DataFrameGroupBy | SeriesGroupBy: + if self.axis == 1: + # GH 37725 + raise ValueError("Cannot subset columns when using axis=1") + # per GH 23566 + if isinstance(key, tuple) and len(key) > 1: + # if len == 1, then it becomes a SeriesGroupBy and this is actually + # valid syntax, so don't raise + raise ValueError( + "Cannot subset columns with a tuple with more than one element. " + "Use a list instead." + ) + return super().__getitem__(key) + + def _gotitem(self, key, ndim: int, subset=None): + """ + sub-classes to define + return a sliced object + + Parameters + ---------- + key : string / list of selections + ndim : {1, 2} + requested ndim of result + subset : object, default None + subset to act on + """ + if ndim == 2: + if subset is None: + subset = self.obj + return DataFrameGroupBy( + subset, + self.keys, + axis=self.axis, + level=self.level, + grouper=self._grouper, + exclusions=self.exclusions, + selection=key, + as_index=self.as_index, + sort=self.sort, + group_keys=self.group_keys, + observed=self.observed, + dropna=self.dropna, + ) + elif ndim == 1: + if subset is None: + subset = self.obj[key] + return SeriesGroupBy( + subset, + self.keys, + level=self.level, + grouper=self._grouper, + exclusions=self.exclusions, + selection=key, + as_index=self.as_index, + sort=self.sort, + group_keys=self.group_keys, + observed=self.observed, + dropna=self.dropna, + ) + + raise AssertionError("invalid ndim for _gotitem") + + def _get_data_to_aggregate( + self, *, numeric_only: bool = False, name: str | None = None + ) -> Manager2D: + obj = self._obj_with_exclusions + if self.axis == 1: + mgr = obj.T._mgr + else: + mgr = obj._mgr + + if numeric_only: + mgr = mgr.get_numeric_data() + return mgr + + def _wrap_agged_manager(self, mgr: Manager2D) -> DataFrame: + return self.obj._constructor_from_mgr(mgr, axes=mgr.axes) + + def _apply_to_column_groupbys(self, func) -> DataFrame: + from pandas.core.reshape.concat import concat + + obj = self._obj_with_exclusions + columns = obj.columns + sgbs = [ + SeriesGroupBy( + obj.iloc[:, i], + selection=colname, + grouper=self._grouper, + exclusions=self.exclusions, + observed=self.observed, + ) + for i, colname in enumerate(obj.columns) + ] + results = [func(sgb) for sgb in sgbs] + + if not len(results): + # concat would raise + res_df = DataFrame([], columns=columns, index=self._grouper.result_index) + else: + res_df = concat(results, keys=columns, axis=1) + + if not self.as_index: + res_df.index = default_index(len(res_df)) + res_df = self._insert_inaxis_grouper(res_df) + return res_df + + def nunique(self, dropna: bool = True) -> DataFrame: + """ + Return DataFrame with counts of unique elements in each position. + + Parameters + ---------- + dropna : bool, default True + Don't include NaN in the counts. + + Returns + ------- + nunique: DataFrame + + Examples + -------- + >>> df = pd.DataFrame({'id': ['spam', 'egg', 'egg', 'spam', + ... 'ham', 'ham'], + ... 'value1': [1, 5, 5, 2, 5, 5], + ... 'value2': list('abbaxy')}) + >>> df + id value1 value2 + 0 spam 1 a + 1 egg 5 b + 2 egg 5 b + 3 spam 2 a + 4 ham 5 x + 5 ham 5 y + + >>> df.groupby('id').nunique() + value1 value2 + id + egg 1 1 + ham 1 2 + spam 2 1 + + Check for rows with the same id but conflicting values: + + >>> df.groupby('id').filter(lambda g: (g.nunique() > 1).any()) + id value1 value2 + 0 spam 1 a + 3 spam 2 a + 4 ham 5 x + 5 ham 5 y + """ + + if self.axis != 0: + # see test_groupby_crash_on_nunique + return self._python_apply_general( + lambda sgb: sgb.nunique(dropna), self._obj_with_exclusions, is_agg=True + ) + + return self._apply_to_column_groupbys(lambda sgb: sgb.nunique(dropna)) + + def idxmax( + self, + axis: Axis | None | lib.NoDefault = lib.no_default, + skipna: bool = True, + numeric_only: bool = False, + ) -> DataFrame: + """ + Return index of first occurrence of maximum over requested axis. + + NA/null values are excluded. + + Parameters + ---------- + axis : {{0 or 'index', 1 or 'columns'}}, default None + The axis to use. 0 or 'index' for row-wise, 1 or 'columns' for column-wise. + If axis is not provided, grouper's axis is used. + + .. versionchanged:: 2.0.0 + + .. deprecated:: 2.1.0 + For axis=1, operate on the underlying object instead. Otherwise + the axis keyword is not necessary. + + skipna : bool, default True + Exclude NA/null values. If an entire row/column is NA, the result + will be NA. + numeric_only : bool, default False + Include only `float`, `int` or `boolean` data. + + .. versionadded:: 1.5.0 + + Returns + ------- + Series + Indexes of maxima along the specified axis. + + Raises + ------ + ValueError + * If the row/column is empty + + See Also + -------- + Series.idxmax : Return index of the maximum element. + + Notes + ----- + This method is the DataFrame version of ``ndarray.argmax``. + + Examples + -------- + Consider a dataset containing food consumption in Argentina. + + >>> df = pd.DataFrame({'consumption': [10.51, 103.11, 55.48], + ... 'co2_emissions': [37.2, 19.66, 1712]}, + ... index=['Pork', 'Wheat Products', 'Beef']) + + >>> df + consumption co2_emissions + Pork 10.51 37.20 + Wheat Products 103.11 19.66 + Beef 55.48 1712.00 + + By default, it returns the index for the maximum value in each column. + + >>> df.idxmax() + consumption Wheat Products + co2_emissions Beef + dtype: object + + To return the index for the maximum value in each row, use ``axis="columns"``. + + >>> df.idxmax(axis="columns") + Pork co2_emissions + Wheat Products consumption + Beef co2_emissions + dtype: object + """ + return self._idxmax_idxmin( + "idxmax", axis=axis, numeric_only=numeric_only, skipna=skipna + ) + + def idxmin( + self, + axis: Axis | None | lib.NoDefault = lib.no_default, + skipna: bool = True, + numeric_only: bool = False, + ) -> DataFrame: + """ + Return index of first occurrence of minimum over requested axis. + + NA/null values are excluded. + + Parameters + ---------- + axis : {{0 or 'index', 1 or 'columns'}}, default None + The axis to use. 0 or 'index' for row-wise, 1 or 'columns' for column-wise. + If axis is not provided, grouper's axis is used. + + .. versionchanged:: 2.0.0 + + .. deprecated:: 2.1.0 + For axis=1, operate on the underlying object instead. Otherwise + the axis keyword is not necessary. + + skipna : bool, default True + Exclude NA/null values. If an entire row/column is NA, the result + will be NA. + numeric_only : bool, default False + Include only `float`, `int` or `boolean` data. + + .. versionadded:: 1.5.0 + + Returns + ------- + Series + Indexes of minima along the specified axis. + + Raises + ------ + ValueError + * If the row/column is empty + + See Also + -------- + Series.idxmin : Return index of the minimum element. + + Notes + ----- + This method is the DataFrame version of ``ndarray.argmin``. + + Examples + -------- + Consider a dataset containing food consumption in Argentina. + + >>> df = pd.DataFrame({'consumption': [10.51, 103.11, 55.48], + ... 'co2_emissions': [37.2, 19.66, 1712]}, + ... index=['Pork', 'Wheat Products', 'Beef']) + + >>> df + consumption co2_emissions + Pork 10.51 37.20 + Wheat Products 103.11 19.66 + Beef 55.48 1712.00 + + By default, it returns the index for the minimum value in each column. + + >>> df.idxmin() + consumption Pork + co2_emissions Wheat Products + dtype: object + + To return the index for the minimum value in each row, use ``axis="columns"``. + + >>> df.idxmin(axis="columns") + Pork consumption + Wheat Products co2_emissions + Beef consumption + dtype: object + """ + return self._idxmax_idxmin( + "idxmin", axis=axis, numeric_only=numeric_only, skipna=skipna + ) + + boxplot = boxplot_frame_groupby + + def value_counts( + self, + subset: Sequence[Hashable] | None = None, + normalize: bool = False, + sort: bool = True, + ascending: bool = False, + dropna: bool = True, + ) -> DataFrame | Series: + """ + Return a Series or DataFrame containing counts of unique rows. + + .. versionadded:: 1.4.0 + + Parameters + ---------- + subset : list-like, optional + Columns to use when counting unique combinations. + normalize : bool, default False + Return proportions rather than frequencies. + sort : bool, default True + Sort by frequencies. + ascending : bool, default False + Sort in ascending order. + dropna : bool, default True + Don't include counts of rows that contain NA values. + + Returns + ------- + Series or DataFrame + Series if the groupby as_index is True, otherwise DataFrame. + + See Also + -------- + Series.value_counts: Equivalent method on Series. + DataFrame.value_counts: Equivalent method on DataFrame. + SeriesGroupBy.value_counts: Equivalent method on SeriesGroupBy. + + Notes + ----- + - If the groupby as_index is True then the returned Series will have a + MultiIndex with one level per input column. + - If the groupby as_index is False then the returned DataFrame will have an + additional column with the value_counts. The column is labelled 'count' or + 'proportion', depending on the ``normalize`` parameter. + + By default, rows that contain any NA values are omitted from + the result. + + By default, the result will be in descending order so that the + first element of each group is the most frequently-occurring row. + + Examples + -------- + >>> df = pd.DataFrame({ + ... 'gender': ['male', 'male', 'female', 'male', 'female', 'male'], + ... 'education': ['low', 'medium', 'high', 'low', 'high', 'low'], + ... 'country': ['US', 'FR', 'US', 'FR', 'FR', 'FR'] + ... }) + + >>> df + gender education country + 0 male low US + 1 male medium FR + 2 female high US + 3 male low FR + 4 female high FR + 5 male low FR + + >>> df.groupby('gender').value_counts() + gender education country + female high FR 1 + US 1 + male low FR 2 + US 1 + medium FR 1 + Name: count, dtype: int64 + + >>> df.groupby('gender').value_counts(ascending=True) + gender education country + female high FR 1 + US 1 + male low US 1 + medium FR 1 + low FR 2 + Name: count, dtype: int64 + + >>> df.groupby('gender').value_counts(normalize=True) + gender education country + female high FR 0.50 + US 0.50 + male low FR 0.50 + US 0.25 + medium FR 0.25 + Name: proportion, dtype: float64 + + >>> df.groupby('gender', as_index=False).value_counts() + gender education country count + 0 female high FR 1 + 1 female high US 1 + 2 male low FR 2 + 3 male low US 1 + 4 male medium FR 1 + + >>> df.groupby('gender', as_index=False).value_counts(normalize=True) + gender education country proportion + 0 female high FR 0.50 + 1 female high US 0.50 + 2 male low FR 0.50 + 3 male low US 0.25 + 4 male medium FR 0.25 + """ + return self._value_counts(subset, normalize, sort, ascending, dropna) + + def fillna( + self, + value: Hashable | Mapping | Series | DataFrame | None = None, + method: FillnaOptions | None = None, + axis: Axis | None | lib.NoDefault = lib.no_default, + inplace: bool = False, + limit: int | None = None, + downcast=lib.no_default, + ) -> DataFrame | None: + """ + Fill NA/NaN values using the specified method within groups. + + .. deprecated:: 2.2.0 + This method is deprecated and will be removed in a future version. + Use the :meth:`.DataFrameGroupBy.ffill` or :meth:`.DataFrameGroupBy.bfill` + for forward or backward filling instead. If you want to fill with a + single value, use :meth:`DataFrame.fillna` instead. + + Parameters + ---------- + value : scalar, dict, Series, or DataFrame + Value to use to fill holes (e.g. 0), alternately a + dict/Series/DataFrame of values specifying which value to use for + each index (for a Series) or column (for a DataFrame). Values not + in the dict/Series/DataFrame will not be filled. This value cannot + be a list. Users wanting to use the ``value`` argument and not ``method`` + should prefer :meth:`.DataFrame.fillna` as this + will produce the same result and be more performant. + method : {{'bfill', 'ffill', None}}, default None + Method to use for filling holes. ``'ffill'`` will propagate + the last valid observation forward within a group. + ``'bfill'`` will use next valid observation to fill the gap. + axis : {0 or 'index', 1 or 'columns'} + Axis along which to fill missing values. When the :class:`DataFrameGroupBy` + ``axis`` argument is ``0``, using ``axis=1`` here will produce + the same results as :meth:`.DataFrame.fillna`. When the + :class:`DataFrameGroupBy` ``axis`` argument is ``1``, using ``axis=0`` + or ``axis=1`` here will produce the same results. + inplace : bool, default False + Broken. Do not set to True. + limit : int, default None + If method is specified, this is the maximum number of consecutive + NaN values to forward/backward fill within a group. In other words, + if there is a gap with more than this number of consecutive NaNs, + it will only be partially filled. If method is not specified, this is the + maximum number of entries along the entire axis where NaNs will be + filled. Must be greater than 0 if not None. + downcast : dict, default is None + A dict of item->dtype of what to downcast if possible, + or the string 'infer' which will try to downcast to an appropriate + equal type (e.g. float64 to int64 if possible). + + Returns + ------- + DataFrame + Object with missing values filled. + + See Also + -------- + ffill : Forward fill values within a group. + bfill : Backward fill values within a group. + + Examples + -------- + >>> df = pd.DataFrame( + ... { + ... "key": [0, 0, 1, 1, 1], + ... "A": [np.nan, 2, np.nan, 3, np.nan], + ... "B": [2, 3, np.nan, np.nan, np.nan], + ... "C": [np.nan, np.nan, 2, np.nan, np.nan], + ... } + ... ) + >>> df + key A B C + 0 0 NaN 2.0 NaN + 1 0 2.0 3.0 NaN + 2 1 NaN NaN 2.0 + 3 1 3.0 NaN NaN + 4 1 NaN NaN NaN + + Propagate non-null values forward or backward within each group along columns. + + >>> df.groupby("key").fillna(method="ffill") + A B C + 0 NaN 2.0 NaN + 1 2.0 3.0 NaN + 2 NaN NaN 2.0 + 3 3.0 NaN 2.0 + 4 3.0 NaN 2.0 + + >>> df.groupby("key").fillna(method="bfill") + A B C + 0 2.0 2.0 NaN + 1 2.0 3.0 NaN + 2 3.0 NaN 2.0 + 3 3.0 NaN NaN + 4 NaN NaN NaN + + Propagate non-null values forward or backward within each group along rows. + + >>> df.T.groupby(np.array([0, 0, 1, 1])).fillna(method="ffill").T + key A B C + 0 0.0 0.0 2.0 2.0 + 1 0.0 2.0 3.0 3.0 + 2 1.0 1.0 NaN 2.0 + 3 1.0 3.0 NaN NaN + 4 1.0 1.0 NaN NaN + + >>> df.T.groupby(np.array([0, 0, 1, 1])).fillna(method="bfill").T + key A B C + 0 0.0 NaN 2.0 NaN + 1 0.0 2.0 3.0 NaN + 2 1.0 NaN 2.0 2.0 + 3 1.0 3.0 NaN NaN + 4 1.0 NaN NaN NaN + + Only replace the first NaN element within a group along rows. + + >>> df.groupby("key").fillna(method="ffill", limit=1) + A B C + 0 NaN 2.0 NaN + 1 2.0 3.0 NaN + 2 NaN NaN 2.0 + 3 3.0 NaN 2.0 + 4 3.0 NaN NaN + """ + warnings.warn( + f"{type(self).__name__}.fillna is deprecated and " + "will be removed in a future version. Use obj.ffill() or obj.bfill() " + "for forward or backward filling instead. If you want to fill with a " + f"single value, use {type(self.obj).__name__}.fillna instead", + FutureWarning, + stacklevel=find_stack_level(), + ) + + result = self._op_via_apply( + "fillna", + value=value, + method=method, + axis=axis, + inplace=inplace, + limit=limit, + downcast=downcast, + ) + return result + + def take( + self, + indices: TakeIndexer, + axis: Axis | None | lib.NoDefault = lib.no_default, + **kwargs, + ) -> DataFrame: + """ + Return the elements in the given *positional* indices in each group. + + This means that we are not indexing according to actual values in + the index attribute of the object. We are indexing according to the + actual position of the element in the object. + + If a requested index does not exist for some group, this method will raise. + To get similar behavior that ignores indices that don't exist, see + :meth:`.DataFrameGroupBy.nth`. + + Parameters + ---------- + indices : array-like + An array of ints indicating which positions to take. + axis : {0 or 'index', 1 or 'columns', None}, default 0 + The axis on which to select elements. ``0`` means that we are + selecting rows, ``1`` means that we are selecting columns. + + .. deprecated:: 2.1.0 + For axis=1, operate on the underlying object instead. Otherwise + the axis keyword is not necessary. + + **kwargs + For compatibility with :meth:`numpy.take`. Has no effect on the + output. + + Returns + ------- + DataFrame + An DataFrame containing the elements taken from each group. + + See Also + -------- + DataFrame.take : Take elements from a Series along an axis. + DataFrame.loc : Select a subset of a DataFrame by labels. + DataFrame.iloc : Select a subset of a DataFrame by positions. + numpy.take : Take elements from an array along an axis. + + Examples + -------- + >>> df = pd.DataFrame([('falcon', 'bird', 389.0), + ... ('parrot', 'bird', 24.0), + ... ('lion', 'mammal', 80.5), + ... ('monkey', 'mammal', np.nan), + ... ('rabbit', 'mammal', 15.0)], + ... columns=['name', 'class', 'max_speed'], + ... index=[4, 3, 2, 1, 0]) + >>> df + name class max_speed + 4 falcon bird 389.0 + 3 parrot bird 24.0 + 2 lion mammal 80.5 + 1 monkey mammal NaN + 0 rabbit mammal 15.0 + >>> gb = df.groupby([1, 1, 2, 2, 2]) + + Take elements at positions 0 and 1 along the axis 0 (default). + + Note how the indices selected in the result do not correspond to + our input indices 0 and 1. That's because we are selecting the 0th + and 1st rows, not rows whose indices equal 0 and 1. + + >>> gb.take([0, 1]) + name class max_speed + 1 4 falcon bird 389.0 + 3 parrot bird 24.0 + 2 2 lion mammal 80.5 + 1 monkey mammal NaN + + The order of the specified indices influences the order in the result. + Here, the order is swapped from the previous example. + + >>> gb.take([1, 0]) + name class max_speed + 1 3 parrot bird 24.0 + 4 falcon bird 389.0 + 2 1 monkey mammal NaN + 2 lion mammal 80.5 + + Take elements at indices 1 and 2 along the axis 1 (column selection). + + We may take elements using negative integers for positive indices, + starting from the end of the object, just like with Python lists. + + >>> gb.take([-1, -2]) + name class max_speed + 1 3 parrot bird 24.0 + 4 falcon bird 389.0 + 2 0 rabbit mammal 15.0 + 1 monkey mammal NaN + """ + result = self._op_via_apply("take", indices=indices, axis=axis, **kwargs) + return result + + def skew( + self, + axis: Axis | None | lib.NoDefault = lib.no_default, + skipna: bool = True, + numeric_only: bool = False, + **kwargs, + ) -> DataFrame: + """ + Return unbiased skew within groups. + + Normalized by N-1. + + Parameters + ---------- + axis : {0 or 'index', 1 or 'columns', None}, default 0 + Axis for the function to be applied on. + + Specifying ``axis=None`` will apply the aggregation across both axes. + + .. versionadded:: 2.0.0 + + .. deprecated:: 2.1.0 + For axis=1, operate on the underlying object instead. Otherwise + the axis keyword is not necessary. + + skipna : bool, default True + Exclude NA/null values when computing the result. + + numeric_only : bool, default False + Include only float, int, boolean columns. + + **kwargs + Additional keyword arguments to be passed to the function. + + Returns + ------- + DataFrame + + See Also + -------- + DataFrame.skew : Return unbiased skew over requested axis. + + Examples + -------- + >>> arrays = [['falcon', 'parrot', 'cockatoo', 'kiwi', + ... 'lion', 'monkey', 'rabbit'], + ... ['bird', 'bird', 'bird', 'bird', + ... 'mammal', 'mammal', 'mammal']] + >>> index = pd.MultiIndex.from_arrays(arrays, names=('name', 'class')) + >>> df = pd.DataFrame({'max_speed': [389.0, 24.0, 70.0, np.nan, + ... 80.5, 21.5, 15.0]}, + ... index=index) + >>> df + max_speed + name class + falcon bird 389.0 + parrot bird 24.0 + cockatoo bird 70.0 + kiwi bird NaN + lion mammal 80.5 + monkey mammal 21.5 + rabbit mammal 15.0 + >>> gb = df.groupby(["class"]) + >>> gb.skew() + max_speed + class + bird 1.628296 + mammal 1.669046 + >>> gb.skew(skipna=False) + max_speed + class + bird NaN + mammal 1.669046 + """ + if axis is lib.no_default: + axis = 0 + + if axis != 0: + result = self._op_via_apply( + "skew", + axis=axis, + skipna=skipna, + numeric_only=numeric_only, + **kwargs, + ) + return result + + def alt(obj): + # This should not be reached since the cython path should raise + # TypeError and not NotImplementedError. + raise TypeError(f"'skew' is not supported for dtype={obj.dtype}") + + return self._cython_agg_general( + "skew", alt=alt, skipna=skipna, numeric_only=numeric_only, **kwargs + ) + + @property + @doc(DataFrame.plot.__doc__) + def plot(self) -> GroupByPlot: + result = GroupByPlot(self) + return result + + @doc(DataFrame.corr.__doc__) + def corr( + self, + method: str | Callable[[np.ndarray, np.ndarray], float] = "pearson", + min_periods: int = 1, + numeric_only: bool = False, + ) -> DataFrame: + result = self._op_via_apply( + "corr", method=method, min_periods=min_periods, numeric_only=numeric_only + ) + return result + + @doc(DataFrame.cov.__doc__) + def cov( + self, + min_periods: int | None = None, + ddof: int | None = 1, + numeric_only: bool = False, + ) -> DataFrame: + result = self._op_via_apply( + "cov", min_periods=min_periods, ddof=ddof, numeric_only=numeric_only + ) + return result + + @doc(DataFrame.hist.__doc__) + def hist( + self, + column: IndexLabel | None = None, + by=None, + grid: bool = True, + xlabelsize: int | None = None, + xrot: float | None = None, + ylabelsize: int | None = None, + yrot: float | None = None, + ax=None, + sharex: bool = False, + sharey: bool = False, + figsize: tuple[int, int] | None = None, + layout: tuple[int, int] | None = None, + bins: int | Sequence[int] = 10, + backend: str | None = None, + legend: bool = False, + **kwargs, + ): + result = self._op_via_apply( + "hist", + column=column, + by=by, + grid=grid, + xlabelsize=xlabelsize, + xrot=xrot, + ylabelsize=ylabelsize, + yrot=yrot, + ax=ax, + sharex=sharex, + sharey=sharey, + figsize=figsize, + layout=layout, + bins=bins, + backend=backend, + legend=legend, + **kwargs, + ) + return result + + @property + @doc(DataFrame.dtypes.__doc__) + def dtypes(self) -> Series: + # GH#51045 + warnings.warn( + f"{type(self).__name__}.dtypes is deprecated and will be removed in " + "a future version. Check the dtypes on the base object instead", + FutureWarning, + stacklevel=find_stack_level(), + ) + + # error: Incompatible return value type (got "DataFrame", expected "Series") + return self._python_apply_general( # type: ignore[return-value] + lambda df: df.dtypes, self._selected_obj + ) + + @doc(DataFrame.corrwith.__doc__) + def corrwith( + self, + other: DataFrame | Series, + axis: Axis | lib.NoDefault = lib.no_default, + drop: bool = False, + method: CorrelationMethod = "pearson", + numeric_only: bool = False, + ) -> DataFrame: + result = self._op_via_apply( + "corrwith", + other=other, + axis=axis, + drop=drop, + method=method, + numeric_only=numeric_only, + ) + return result + + +def _wrap_transform_general_frame( + obj: DataFrame, group: DataFrame, res: DataFrame | Series +) -> DataFrame: + from pandas import concat + + if isinstance(res, Series): + # we need to broadcast across the + # other dimension; this will preserve dtypes + # GH14457 + if res.index.is_(obj.index): + res_frame = concat([res] * len(group.columns), axis=1) + res_frame.columns = group.columns + res_frame.index = group.index + else: + res_frame = obj._constructor( + np.tile(res.values, (len(group.index), 1)), + columns=group.columns, + index=group.index, + ) + assert isinstance(res_frame, DataFrame) + return res_frame + elif isinstance(res, DataFrame) and not res.index.is_(group.index): + return res._align_frame(group)[0] + else: + return res diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/groupby/groupby.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/groupby/groupby.py new file mode 100644 index 0000000000000000000000000000000000000000..db8949788567b76bd8df0580d19dcc3e5069e923 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/groupby/groupby.py @@ -0,0 +1,5997 @@ +""" +Provide the groupby split-apply-combine paradigm. Define the GroupBy +class providing the base-class of operations. + +The SeriesGroupBy and DataFrameGroupBy sub-class +(defined in pandas.core.groupby.generic) +expose these user-facing objects to provide specific functionality. +""" +from __future__ import annotations + +from collections.abc import ( + Hashable, + Iterator, + Mapping, + Sequence, +) +import datetime +from functools import ( + partial, + wraps, +) +import inspect +from textwrap import dedent +from typing import ( + TYPE_CHECKING, + Callable, + Literal, + TypeVar, + Union, + cast, + final, +) +import warnings + +import numpy as np + +from pandas._config.config import option_context + +from pandas._libs import ( + Timestamp, + lib, +) +from pandas._libs.algos import rank_1d +import pandas._libs.groupby as libgroupby +from pandas._libs.missing import NA +from pandas._typing import ( + AnyArrayLike, + ArrayLike, + Axis, + AxisInt, + DtypeObj, + FillnaOptions, + IndexLabel, + NDFrameT, + PositionalIndexer, + RandomState, + Scalar, + T, + npt, +) +from pandas.compat.numpy import function as nv +from pandas.errors import ( + AbstractMethodError, + DataError, +) +from pandas.util._decorators import ( + Appender, + Substitution, + cache_readonly, + doc, +) +from pandas.util._exceptions import find_stack_level + +from pandas.core.dtypes.cast import ( + coerce_indexer_dtype, + ensure_dtype_can_hold_na, +) +from pandas.core.dtypes.common import ( + is_bool_dtype, + is_float_dtype, + is_hashable, + is_integer, + is_integer_dtype, + is_list_like, + is_numeric_dtype, + is_object_dtype, + is_scalar, + needs_i8_conversion, + pandas_dtype, +) +from pandas.core.dtypes.missing import ( + isna, + na_value_for_dtype, + notna, +) + +from pandas.core import ( + algorithms, + sample, +) +from pandas.core._numba import executor +from pandas.core.apply import warn_alias_replacement +from pandas.core.arrays import ( + ArrowExtensionArray, + BaseMaskedArray, + Categorical, + ExtensionArray, + FloatingArray, + IntegerArray, + SparseArray, +) +from pandas.core.arrays.string_ import StringDtype +from pandas.core.arrays.string_arrow import ( + ArrowStringArray, + ArrowStringArrayNumpySemantics, +) +from pandas.core.base import ( + PandasObject, + SelectionMixin, +) +import pandas.core.common as com +from pandas.core.frame import DataFrame +from pandas.core.generic import NDFrame +from pandas.core.groupby import ( + base, + numba_, + ops, +) +from pandas.core.groupby.grouper import get_grouper +from pandas.core.groupby.indexing import ( + GroupByIndexingMixin, + GroupByNthSelector, +) +from pandas.core.indexes.api import ( + CategoricalIndex, + Index, + MultiIndex, + RangeIndex, + default_index, +) +from pandas.core.internals.blocks import ensure_block_shape +from pandas.core.series import Series +from pandas.core.sorting import get_group_index_sorter +from pandas.core.util.numba_ import ( + get_jit_arguments, + maybe_use_numba, +) + +if TYPE_CHECKING: + from typing import Any + + from pandas.core.resample import Resampler + from pandas.core.window import ( + ExpandingGroupby, + ExponentialMovingWindowGroupby, + RollingGroupby, + ) + +_common_see_also = """ + See Also + -------- + Series.%(name)s : Apply a function %(name)s to a Series. + DataFrame.%(name)s : Apply a function %(name)s + to each row or column of a DataFrame. +""" + +_apply_docs = { + "template": """ + Apply function ``func`` group-wise and combine the results together. + + The function passed to ``apply`` must take a {input} as its first + argument and return a DataFrame, Series or scalar. ``apply`` will + then take care of combining the results back together into a single + dataframe or series. ``apply`` is therefore a highly flexible + grouping method. + + While ``apply`` is a very flexible method, its downside is that + using it can be quite a bit slower than using more specific methods + like ``agg`` or ``transform``. Pandas offers a wide range of method that will + be much faster than using ``apply`` for their specific purposes, so try to + use them before reaching for ``apply``. + + Parameters + ---------- + func : callable + A callable that takes a {input} as its first argument, and + returns a dataframe, a series or a scalar. In addition the + callable may take positional and keyword arguments. + include_groups : bool, default True + When True, will attempt to apply ``func`` to the groupings in + the case that they are columns of the DataFrame. If this raises a + TypeError, the result will be computed with the groupings excluded. + When False, the groupings will be excluded when applying ``func``. + + .. versionadded:: 2.2.0 + + .. deprecated:: 2.2.0 + + Setting include_groups to True is deprecated. Only the value + False will be allowed in a future version of pandas. + + args, kwargs : tuple and dict + Optional positional and keyword arguments to pass to ``func``. + + Returns + ------- + Series or DataFrame + + See Also + -------- + pipe : Apply function to the full GroupBy object instead of to each + group. + aggregate : Apply aggregate function to the GroupBy object. + transform : Apply function column-by-column to the GroupBy object. + Series.apply : Apply a function to a Series. + DataFrame.apply : Apply a function to each row or column of a DataFrame. + + Notes + ----- + + .. versionchanged:: 1.3.0 + + The resulting dtype will reflect the return value of the passed ``func``, + see the examples below. + + Functions that mutate the passed object can produce unexpected + behavior or errors and are not supported. See :ref:`gotchas.udf-mutation` + for more details. + + Examples + -------- + {examples} + """, + "dataframe_examples": """ + >>> df = pd.DataFrame({'A': 'a a b'.split(), + ... 'B': [1, 2, 3], + ... 'C': [4, 6, 5]}) + >>> g1 = df.groupby('A', group_keys=False) + >>> g2 = df.groupby('A', group_keys=True) + + Notice that ``g1`` and ``g2`` have two groups, ``a`` and ``b``, and only + differ in their ``group_keys`` argument. Calling `apply` in various ways, + we can get different grouping results: + + Example 1: below the function passed to `apply` takes a DataFrame as + its argument and returns a DataFrame. `apply` combines the result for + each group together into a new DataFrame: + + >>> g1[['B', 'C']].apply(lambda x: x / x.sum()) + B C + 0 0.333333 0.4 + 1 0.666667 0.6 + 2 1.000000 1.0 + + In the above, the groups are not part of the index. We can have them included + by using ``g2`` where ``group_keys=True``: + + >>> g2[['B', 'C']].apply(lambda x: x / x.sum()) + B C + A + a 0 0.333333 0.4 + 1 0.666667 0.6 + b 2 1.000000 1.0 + + Example 2: The function passed to `apply` takes a DataFrame as + its argument and returns a Series. `apply` combines the result for + each group together into a new DataFrame. + + .. versionchanged:: 1.3.0 + + The resulting dtype will reflect the return value of the passed ``func``. + + >>> g1[['B', 'C']].apply(lambda x: x.astype(float).max() - x.min()) + B C + A + a 1.0 2.0 + b 0.0 0.0 + + >>> g2[['B', 'C']].apply(lambda x: x.astype(float).max() - x.min()) + B C + A + a 1.0 2.0 + b 0.0 0.0 + + The ``group_keys`` argument has no effect here because the result is not + like-indexed (i.e. :ref:`a transform `) when compared + to the input. + + Example 3: The function passed to `apply` takes a DataFrame as + its argument and returns a scalar. `apply` combines the result for + each group together into a Series, including setting the index as + appropriate: + + >>> g1.apply(lambda x: x.C.max() - x.B.min(), include_groups=False) + A + a 5 + b 2 + dtype: int64""", + "series_examples": """ + >>> s = pd.Series([0, 1, 2], index='a a b'.split()) + >>> g1 = s.groupby(s.index, group_keys=False) + >>> g2 = s.groupby(s.index, group_keys=True) + + From ``s`` above we can see that ``g`` has two groups, ``a`` and ``b``. + Notice that ``g1`` have ``g2`` have two groups, ``a`` and ``b``, and only + differ in their ``group_keys`` argument. Calling `apply` in various ways, + we can get different grouping results: + + Example 1: The function passed to `apply` takes a Series as + its argument and returns a Series. `apply` combines the result for + each group together into a new Series. + + .. versionchanged:: 1.3.0 + + The resulting dtype will reflect the return value of the passed ``func``. + + >>> g1.apply(lambda x: x * 2 if x.name == 'a' else x / 2) + a 0.0 + a 2.0 + b 1.0 + dtype: float64 + + In the above, the groups are not part of the index. We can have them included + by using ``g2`` where ``group_keys=True``: + + >>> g2.apply(lambda x: x * 2 if x.name == 'a' else x / 2) + a a 0.0 + a 2.0 + b b 1.0 + dtype: float64 + + Example 2: The function passed to `apply` takes a Series as + its argument and returns a scalar. `apply` combines the result for + each group together into a Series, including setting the index as + appropriate: + + >>> g1.apply(lambda x: x.max() - x.min()) + a 1 + b 0 + dtype: int64 + + The ``group_keys`` argument has no effect here because the result is not + like-indexed (i.e. :ref:`a transform `) when compared + to the input. + + >>> g2.apply(lambda x: x.max() - x.min()) + a 1 + b 0 + dtype: int64""", +} + +_groupby_agg_method_template = """ +Compute {fname} of group values. + +Parameters +---------- +numeric_only : bool, default {no} + Include only float, int, boolean columns. + + .. versionchanged:: 2.0.0 + + numeric_only no longer accepts ``None``. + +min_count : int, default {mc} + The required number of valid values to perform the operation. If fewer + than ``min_count`` non-NA values are present the result will be NA. + +Returns +------- +Series or DataFrame + Computed {fname} of values within each group. + +Examples +-------- +{example} +""" + +_groupby_agg_method_engine_template = """ +Compute {fname} of group values. + +Parameters +---------- +numeric_only : bool, default {no} + Include only float, int, boolean columns. + + .. versionchanged:: 2.0.0 + + numeric_only no longer accepts ``None``. + +min_count : int, default {mc} + The required number of valid values to perform the operation. If fewer + than ``min_count`` non-NA values are present the result will be NA. + +engine : str, default None {e} + * ``'cython'`` : Runs rolling apply through C-extensions from cython. + * ``'numba'`` : Runs rolling apply through JIT compiled code from numba. + Only available when ``raw`` is set to ``True``. + * ``None`` : Defaults to ``'cython'`` or globally setting ``compute.use_numba`` + +engine_kwargs : dict, default None {ek} + * For ``'cython'`` engine, there are no accepted ``engine_kwargs`` + * For ``'numba'`` engine, the engine can accept ``nopython``, ``nogil`` + and ``parallel`` dictionary keys. The values must either be ``True`` or + ``False``. The default ``engine_kwargs`` for the ``'numba'`` engine is + ``{{'nopython': True, 'nogil': False, 'parallel': False}}`` and will be + applied to both the ``func`` and the ``apply`` groupby aggregation. + +Returns +------- +Series or DataFrame + Computed {fname} of values within each group. + +Examples +-------- +{example} +""" + +_pipe_template = """ +Apply a ``func`` with arguments to this %(klass)s object and return its result. + +Use `.pipe` when you want to improve readability by chaining together +functions that expect Series, DataFrames, GroupBy or Resampler objects. +Instead of writing + +>>> h = lambda x, arg2, arg3: x + 1 - arg2 * arg3 +>>> g = lambda x, arg1: x * 5 / arg1 +>>> f = lambda x: x ** 4 +>>> df = pd.DataFrame([["a", 4], ["b", 5]], columns=["group", "value"]) +>>> h(g(f(df.groupby('group')), arg1=1), arg2=2, arg3=3) # doctest: +SKIP + +You can write + +>>> (df.groupby('group') +... .pipe(f) +... .pipe(g, arg1=1) +... .pipe(h, arg2=2, arg3=3)) # doctest: +SKIP + +which is much more readable. + +Parameters +---------- +func : callable or tuple of (callable, str) + Function to apply to this %(klass)s object or, alternatively, + a `(callable, data_keyword)` tuple where `data_keyword` is a + string indicating the keyword of `callable` that expects the + %(klass)s object. +args : iterable, optional + Positional arguments passed into `func`. +kwargs : dict, optional + A dictionary of keyword arguments passed into `func`. + +Returns +------- +the return type of `func`. + +See Also +-------- +Series.pipe : Apply a function with arguments to a series. +DataFrame.pipe: Apply a function with arguments to a dataframe. +apply : Apply function to each group instead of to the + full %(klass)s object. + +Notes +----- +See more `here +`_ + +Examples +-------- +%(examples)s +""" + +_transform_template = """ +Call function producing a same-indexed %(klass)s on each group. + +Returns a %(klass)s having the same indexes as the original object +filled with the transformed values. + +Parameters +---------- +f : function, str + Function to apply to each group. See the Notes section below for requirements. + + Accepted inputs are: + + - String + - Python function + - Numba JIT function with ``engine='numba'`` specified. + + Only passing a single function is supported with this engine. + If the ``'numba'`` engine is chosen, the function must be + a user defined function with ``values`` and ``index`` as the + first and second arguments respectively in the function signature. + Each group's index will be passed to the user defined function + and optionally available for use. + + If a string is chosen, then it needs to be the name + of the groupby method you want to use. +*args + Positional arguments to pass to func. +engine : str, default None + * ``'cython'`` : Runs the function through C-extensions from cython. + * ``'numba'`` : Runs the function through JIT compiled code from numba. + * ``None`` : Defaults to ``'cython'`` or the global setting ``compute.use_numba`` + +engine_kwargs : dict, default None + * For ``'cython'`` engine, there are no accepted ``engine_kwargs`` + * For ``'numba'`` engine, the engine can accept ``nopython``, ``nogil`` + and ``parallel`` dictionary keys. The values must either be ``True`` or + ``False``. The default ``engine_kwargs`` for the ``'numba'`` engine is + ``{'nopython': True, 'nogil': False, 'parallel': False}`` and will be + applied to the function + +**kwargs + Keyword arguments to be passed into func. + +Returns +------- +%(klass)s + +See Also +-------- +%(klass)s.groupby.apply : Apply function ``func`` group-wise and combine + the results together. +%(klass)s.groupby.aggregate : Aggregate using one or more + operations over the specified axis. +%(klass)s.transform : Call ``func`` on self producing a %(klass)s with the + same axis shape as self. + +Notes +----- +Each group is endowed the attribute 'name' in case you need to know +which group you are working on. + +The current implementation imposes three requirements on f: + +* f must return a value that either has the same shape as the input + subframe or can be broadcast to the shape of the input subframe. + For example, if `f` returns a scalar it will be broadcast to have the + same shape as the input subframe. +* if this is a DataFrame, f must support application column-by-column + in the subframe. If f also supports application to the entire subframe, + then a fast path is used starting from the second chunk. +* f must not mutate groups. Mutation is not supported and may + produce unexpected results. See :ref:`gotchas.udf-mutation` for more details. + +When using ``engine='numba'``, there will be no "fall back" behavior internally. +The group data and group index will be passed as numpy arrays to the JITed +user defined function, and no alternative execution attempts will be tried. + +.. versionchanged:: 1.3.0 + + The resulting dtype will reflect the return value of the passed ``func``, + see the examples below. + +.. versionchanged:: 2.0.0 + + When using ``.transform`` on a grouped DataFrame and the transformation function + returns a DataFrame, pandas now aligns the result's index + with the input's index. You can call ``.to_numpy()`` on the + result of the transformation function to avoid alignment. + +Examples +-------- +%(example)s""" + +_agg_template_series = """ +Aggregate using one or more operations over the specified axis. + +Parameters +---------- +func : function, str, list, dict or None + Function to use for aggregating the data. If a function, must either + work when passed a {klass} or when passed to {klass}.apply. + + Accepted combinations are: + + - function + - string function name + - list of functions and/or function names, e.g. ``[np.sum, 'mean']`` + - None, in which case ``**kwargs`` are used with Named Aggregation. Here the + output has one column for each element in ``**kwargs``. The name of the + column is keyword, whereas the value determines the aggregation used to compute + the values in the column. + + Can also accept a Numba JIT function with + ``engine='numba'`` specified. Only passing a single function is supported + with this engine. + + If the ``'numba'`` engine is chosen, the function must be + a user defined function with ``values`` and ``index`` as the + first and second arguments respectively in the function signature. + Each group's index will be passed to the user defined function + and optionally available for use. + + .. deprecated:: 2.1.0 + + Passing a dictionary is deprecated and will raise in a future version + of pandas. Pass a list of aggregations instead. +*args + Positional arguments to pass to func. +engine : str, default None + * ``'cython'`` : Runs the function through C-extensions from cython. + * ``'numba'`` : Runs the function through JIT compiled code from numba. + * ``None`` : Defaults to ``'cython'`` or globally setting ``compute.use_numba`` + +engine_kwargs : dict, default None + * For ``'cython'`` engine, there are no accepted ``engine_kwargs`` + * For ``'numba'`` engine, the engine can accept ``nopython``, ``nogil`` + and ``parallel`` dictionary keys. The values must either be ``True`` or + ``False``. The default ``engine_kwargs`` for the ``'numba'`` engine is + ``{{'nopython': True, 'nogil': False, 'parallel': False}}`` and will be + applied to the function + +**kwargs + * If ``func`` is None, ``**kwargs`` are used to define the output names and + aggregations via Named Aggregation. See ``func`` entry. + * Otherwise, keyword arguments to be passed into func. + +Returns +------- +{klass} + +See Also +-------- +{klass}.groupby.apply : Apply function func group-wise + and combine the results together. +{klass}.groupby.transform : Transforms the Series on each group + based on the given function. +{klass}.aggregate : Aggregate using one or more + operations over the specified axis. + +Notes +----- +When using ``engine='numba'``, there will be no "fall back" behavior internally. +The group data and group index will be passed as numpy arrays to the JITed +user defined function, and no alternative execution attempts will be tried. + +Functions that mutate the passed object can produce unexpected +behavior or errors and are not supported. See :ref:`gotchas.udf-mutation` +for more details. + +.. versionchanged:: 1.3.0 + + The resulting dtype will reflect the return value of the passed ``func``, + see the examples below. +{examples}""" + +_agg_template_frame = """ +Aggregate using one or more operations over the specified axis. + +Parameters +---------- +func : function, str, list, dict or None + Function to use for aggregating the data. If a function, must either + work when passed a {klass} or when passed to {klass}.apply. + + Accepted combinations are: + + - function + - string function name + - list of functions and/or function names, e.g. ``[np.sum, 'mean']`` + - dict of axis labels -> functions, function names or list of such. + - None, in which case ``**kwargs`` are used with Named Aggregation. Here the + output has one column for each element in ``**kwargs``. The name of the + column is keyword, whereas the value determines the aggregation used to compute + the values in the column. + + Can also accept a Numba JIT function with + ``engine='numba'`` specified. Only passing a single function is supported + with this engine. + + If the ``'numba'`` engine is chosen, the function must be + a user defined function with ``values`` and ``index`` as the + first and second arguments respectively in the function signature. + Each group's index will be passed to the user defined function + and optionally available for use. + +*args + Positional arguments to pass to func. +engine : str, default None + * ``'cython'`` : Runs the function through C-extensions from cython. + * ``'numba'`` : Runs the function through JIT compiled code from numba. + * ``None`` : Defaults to ``'cython'`` or globally setting ``compute.use_numba`` + +engine_kwargs : dict, default None + * For ``'cython'`` engine, there are no accepted ``engine_kwargs`` + * For ``'numba'`` engine, the engine can accept ``nopython``, ``nogil`` + and ``parallel`` dictionary keys. The values must either be ``True`` or + ``False``. The default ``engine_kwargs`` for the ``'numba'`` engine is + ``{{'nopython': True, 'nogil': False, 'parallel': False}}`` and will be + applied to the function + +**kwargs + * If ``func`` is None, ``**kwargs`` are used to define the output names and + aggregations via Named Aggregation. See ``func`` entry. + * Otherwise, keyword arguments to be passed into func. + +Returns +------- +{klass} + +See Also +-------- +{klass}.groupby.apply : Apply function func group-wise + and combine the results together. +{klass}.groupby.transform : Transforms the Series on each group + based on the given function. +{klass}.aggregate : Aggregate using one or more + operations over the specified axis. + +Notes +----- +When using ``engine='numba'``, there will be no "fall back" behavior internally. +The group data and group index will be passed as numpy arrays to the JITed +user defined function, and no alternative execution attempts will be tried. + +Functions that mutate the passed object can produce unexpected +behavior or errors and are not supported. See :ref:`gotchas.udf-mutation` +for more details. + +.. versionchanged:: 1.3.0 + + The resulting dtype will reflect the return value of the passed ``func``, + see the examples below. +{examples}""" + + +@final +class GroupByPlot(PandasObject): + """ + Class implementing the .plot attribute for groupby objects. + """ + + def __init__(self, groupby: GroupBy) -> None: + self._groupby = groupby + + def __call__(self, *args, **kwargs): + def f(self): + return self.plot(*args, **kwargs) + + f.__name__ = "plot" + return self._groupby._python_apply_general(f, self._groupby._selected_obj) + + def __getattr__(self, name: str): + def attr(*args, **kwargs): + def f(self): + return getattr(self.plot, name)(*args, **kwargs) + + return self._groupby._python_apply_general(f, self._groupby._selected_obj) + + return attr + + +_KeysArgType = Union[ + Hashable, + list[Hashable], + Callable[[Hashable], Hashable], + list[Callable[[Hashable], Hashable]], + Mapping[Hashable, Hashable], +] + + +class BaseGroupBy(PandasObject, SelectionMixin[NDFrameT], GroupByIndexingMixin): + _hidden_attrs = PandasObject._hidden_attrs | { + "as_index", + "axis", + "dropna", + "exclusions", + "grouper", + "group_keys", + "keys", + "level", + "obj", + "observed", + "sort", + } + + axis: AxisInt + _grouper: ops.BaseGrouper + keys: _KeysArgType | None = None + level: IndexLabel | None = None + group_keys: bool + + @final + def __len__(self) -> int: + return len(self.groups) + + @final + def __repr__(self) -> str: + # TODO: Better repr for GroupBy object + return object.__repr__(self) + + @final + @property + def grouper(self) -> ops.BaseGrouper: + warnings.warn( + f"{type(self).__name__}.grouper is deprecated and will be removed in a " + "future version of pandas.", + category=FutureWarning, + stacklevel=find_stack_level(), + ) + return self._grouper + + @final + @property + def groups(self) -> dict[Hashable, np.ndarray]: + """ + Dict {group name -> group labels}. + + Examples + -------- + + For SeriesGroupBy: + + >>> lst = ['a', 'a', 'b'] + >>> ser = pd.Series([1, 2, 3], index=lst) + >>> ser + a 1 + a 2 + b 3 + dtype: int64 + >>> ser.groupby(level=0).groups + {'a': ['a', 'a'], 'b': ['b']} + + For DataFrameGroupBy: + + >>> data = [[1, 2, 3], [1, 5, 6], [7, 8, 9]] + >>> df = pd.DataFrame(data, columns=["a", "b", "c"]) + >>> df + a b c + 0 1 2 3 + 1 1 5 6 + 2 7 8 9 + >>> df.groupby(by=["a"]).groups + {1: [0, 1], 7: [2]} + + For Resampler: + + >>> ser = pd.Series([1, 2, 3, 4], index=pd.DatetimeIndex( + ... ['2023-01-01', '2023-01-15', '2023-02-01', '2023-02-15'])) + >>> ser + 2023-01-01 1 + 2023-01-15 2 + 2023-02-01 3 + 2023-02-15 4 + dtype: int64 + >>> ser.resample('MS').groups + {Timestamp('2023-01-01 00:00:00'): 2, Timestamp('2023-02-01 00:00:00'): 4} + """ + return self._grouper.groups + + @final + @property + def ngroups(self) -> int: + return self._grouper.ngroups + + @final + @property + def indices(self) -> dict[Hashable, npt.NDArray[np.intp]]: + """ + Dict {group name -> group indices}. + + Examples + -------- + + For SeriesGroupBy: + + >>> lst = ['a', 'a', 'b'] + >>> ser = pd.Series([1, 2, 3], index=lst) + >>> ser + a 1 + a 2 + b 3 + dtype: int64 + >>> ser.groupby(level=0).indices + {'a': array([0, 1]), 'b': array([2])} + + For DataFrameGroupBy: + + >>> data = [[1, 2, 3], [1, 5, 6], [7, 8, 9]] + >>> df = pd.DataFrame(data, columns=["a", "b", "c"], + ... index=["owl", "toucan", "eagle"]) + >>> df + a b c + owl 1 2 3 + toucan 1 5 6 + eagle 7 8 9 + >>> df.groupby(by=["a"]).indices + {1: array([0, 1]), 7: array([2])} + + For Resampler: + + >>> ser = pd.Series([1, 2, 3, 4], index=pd.DatetimeIndex( + ... ['2023-01-01', '2023-01-15', '2023-02-01', '2023-02-15'])) + >>> ser + 2023-01-01 1 + 2023-01-15 2 + 2023-02-01 3 + 2023-02-15 4 + dtype: int64 + >>> ser.resample('MS').indices + defaultdict(, {Timestamp('2023-01-01 00:00:00'): [0, 1], + Timestamp('2023-02-01 00:00:00'): [2, 3]}) + """ + return self._grouper.indices + + @final + def _get_indices(self, names): + """ + Safe get multiple indices, translate keys for + datelike to underlying repr. + """ + + def get_converter(s): + # possibly convert to the actual key types + # in the indices, could be a Timestamp or a np.datetime64 + if isinstance(s, datetime.datetime): + return lambda key: Timestamp(key) + elif isinstance(s, np.datetime64): + return lambda key: Timestamp(key).asm8 + else: + return lambda key: key + + if len(names) == 0: + return [] + + if len(self.indices) > 0: + index_sample = next(iter(self.indices)) + else: + index_sample = None # Dummy sample + + name_sample = names[0] + if isinstance(index_sample, tuple): + if not isinstance(name_sample, tuple): + msg = "must supply a tuple to get_group with multiple grouping keys" + raise ValueError(msg) + if not len(name_sample) == len(index_sample): + try: + # If the original grouper was a tuple + return [self.indices[name] for name in names] + except KeyError as err: + # turns out it wasn't a tuple + msg = ( + "must supply a same-length tuple to get_group " + "with multiple grouping keys" + ) + raise ValueError(msg) from err + + converters = [get_converter(s) for s in index_sample] + names = (tuple(f(n) for f, n in zip(converters, name)) for name in names) + + else: + converter = get_converter(index_sample) + names = (converter(name) for name in names) + + return [self.indices.get(name, []) for name in names] + + @final + def _get_index(self, name): + """ + Safe get index, translate keys for datelike to underlying repr. + """ + return self._get_indices([name])[0] + + @final + @cache_readonly + def _selected_obj(self): + # Note: _selected_obj is always just `self.obj` for SeriesGroupBy + if isinstance(self.obj, Series): + return self.obj + + if self._selection is not None: + if is_hashable(self._selection): + # i.e. a single key, so selecting it will return a Series. + # In this case, _obj_with_exclusions would wrap the key + # in a list and return a single-column DataFrame. + return self.obj[self._selection] + + # Otherwise _selection is equivalent to _selection_list, so + # _selected_obj matches _obj_with_exclusions, so we can reuse + # that and avoid making a copy. + return self._obj_with_exclusions + + return self.obj + + @final + def _dir_additions(self) -> set[str]: + return self.obj._dir_additions() + + @Substitution( + klass="GroupBy", + examples=dedent( + """\ + >>> df = pd.DataFrame({'A': 'a b a b'.split(), 'B': [1, 2, 3, 4]}) + >>> df + A B + 0 a 1 + 1 b 2 + 2 a 3 + 3 b 4 + + To get the difference between each groups maximum and minimum value in one + pass, you can do + + >>> df.groupby('A').pipe(lambda x: x.max() - x.min()) + B + A + a 2 + b 2""" + ), + ) + @Appender(_pipe_template) + def pipe( + self, + func: Callable[..., T] | tuple[Callable[..., T], str], + *args, + **kwargs, + ) -> T: + return com.pipe(self, func, *args, **kwargs) + + @final + def get_group(self, name, obj=None) -> DataFrame | Series: + """ + Construct DataFrame from group with provided name. + + Parameters + ---------- + name : object + The name of the group to get as a DataFrame. + obj : DataFrame, default None + The DataFrame to take the DataFrame out of. If + it is None, the object groupby was called on will + be used. + + .. deprecated:: 2.1.0 + The obj is deprecated and will be removed in a future version. + Do ``df.iloc[gb.indices.get(name)]`` + instead of ``gb.get_group(name, obj=df)``. + + Returns + ------- + same type as obj + + Examples + -------- + + For SeriesGroupBy: + + >>> lst = ['a', 'a', 'b'] + >>> ser = pd.Series([1, 2, 3], index=lst) + >>> ser + a 1 + a 2 + b 3 + dtype: int64 + >>> ser.groupby(level=0).get_group("a") + a 1 + a 2 + dtype: int64 + + For DataFrameGroupBy: + + >>> data = [[1, 2, 3], [1, 5, 6], [7, 8, 9]] + >>> df = pd.DataFrame(data, columns=["a", "b", "c"], + ... index=["owl", "toucan", "eagle"]) + >>> df + a b c + owl 1 2 3 + toucan 1 5 6 + eagle 7 8 9 + >>> df.groupby(by=["a"]).get_group((1,)) + a b c + owl 1 2 3 + toucan 1 5 6 + + For Resampler: + + >>> ser = pd.Series([1, 2, 3, 4], index=pd.DatetimeIndex( + ... ['2023-01-01', '2023-01-15', '2023-02-01', '2023-02-15'])) + >>> ser + 2023-01-01 1 + 2023-01-15 2 + 2023-02-01 3 + 2023-02-15 4 + dtype: int64 + >>> ser.resample('MS').get_group('2023-01-01') + 2023-01-01 1 + 2023-01-15 2 + dtype: int64 + """ + keys = self.keys + level = self.level + # mypy doesn't recognize level/keys as being sized when passed to len + if (is_list_like(level) and len(level) == 1) or ( # type: ignore[arg-type] + is_list_like(keys) and len(keys) == 1 # type: ignore[arg-type] + ): + # GH#25971 + if isinstance(name, tuple) and len(name) == 1: + # Allow users to pass tuples of length 1 to silence warning + name = name[0] + elif not isinstance(name, tuple): + warnings.warn( + "When grouping with a length-1 list-like, " + "you will need to pass a length-1 tuple to get_group in a future " + "version of pandas. Pass `(name,)` instead of `name` to silence " + "this warning.", + FutureWarning, + stacklevel=find_stack_level(), + ) + + inds = self._get_index(name) + if not len(inds): + raise KeyError(name) + + if obj is None: + indexer = inds if self.axis == 0 else (slice(None), inds) + return self._selected_obj.iloc[indexer] + else: + warnings.warn( + "obj is deprecated and will be removed in a future version. " + "Do ``df.iloc[gb.indices.get(name)]`` " + "instead of ``gb.get_group(name, obj=df)``.", + FutureWarning, + stacklevel=find_stack_level(), + ) + return obj._take_with_is_copy(inds, axis=self.axis) + + @final + def __iter__(self) -> Iterator[tuple[Hashable, NDFrameT]]: + """ + Groupby iterator. + + Returns + ------- + Generator yielding sequence of (name, subsetted object) + for each group + + Examples + -------- + + For SeriesGroupBy: + + >>> lst = ['a', 'a', 'b'] + >>> ser = pd.Series([1, 2, 3], index=lst) + >>> ser + a 1 + a 2 + b 3 + dtype: int64 + >>> for x, y in ser.groupby(level=0): + ... print(f'{x}\\n{y}\\n') + a + a 1 + a 2 + dtype: int64 + b + b 3 + dtype: int64 + + For DataFrameGroupBy: + + >>> data = [[1, 2, 3], [1, 5, 6], [7, 8, 9]] + >>> df = pd.DataFrame(data, columns=["a", "b", "c"]) + >>> df + a b c + 0 1 2 3 + 1 1 5 6 + 2 7 8 9 + >>> for x, y in df.groupby(by=["a"]): + ... print(f'{x}\\n{y}\\n') + (1,) + a b c + 0 1 2 3 + 1 1 5 6 + (7,) + a b c + 2 7 8 9 + + For Resampler: + + >>> ser = pd.Series([1, 2, 3, 4], index=pd.DatetimeIndex( + ... ['2023-01-01', '2023-01-15', '2023-02-01', '2023-02-15'])) + >>> ser + 2023-01-01 1 + 2023-01-15 2 + 2023-02-01 3 + 2023-02-15 4 + dtype: int64 + >>> for x, y in ser.resample('MS'): + ... print(f'{x}\\n{y}\\n') + 2023-01-01 00:00:00 + 2023-01-01 1 + 2023-01-15 2 + dtype: int64 + 2023-02-01 00:00:00 + 2023-02-01 3 + 2023-02-15 4 + dtype: int64 + """ + keys = self.keys + level = self.level + result = self._grouper.get_iterator(self._selected_obj, axis=self.axis) + # error: Argument 1 to "len" has incompatible type "Hashable"; expected "Sized" + if is_list_like(level) and len(level) == 1: # type: ignore[arg-type] + # GH 51583 + warnings.warn( + "Creating a Groupby object with a length-1 list-like " + "level parameter will yield indexes as tuples in a future version. " + "To keep indexes as scalars, create Groupby objects with " + "a scalar level parameter instead.", + FutureWarning, + stacklevel=find_stack_level(), + ) + if isinstance(keys, list) and len(keys) == 1: + # GH#42795 - when keys is a list, return tuples even when length is 1 + result = (((key,), group) for key, group in result) + return result + + +# To track operations that expand dimensions, like ohlc +OutputFrameOrSeries = TypeVar("OutputFrameOrSeries", bound=NDFrame) + + +class GroupBy(BaseGroupBy[NDFrameT]): + """ + Class for grouping and aggregating relational data. + + See aggregate, transform, and apply functions on this object. + + It's easiest to use obj.groupby(...) to use GroupBy, but you can also do: + + :: + + grouped = groupby(obj, ...) + + Parameters + ---------- + obj : pandas object + axis : int, default 0 + level : int, default None + Level of MultiIndex + groupings : list of Grouping objects + Most users should ignore this + exclusions : array-like, optional + List of columns to exclude + name : str + Most users should ignore this + + Returns + ------- + **Attributes** + groups : dict + {group name -> group labels} + len(grouped) : int + Number of groups + + Notes + ----- + After grouping, see aggregate, apply, and transform functions. Here are + some other brief notes about usage. When grouping by multiple groups, the + result index will be a MultiIndex (hierarchical) by default. + + Iteration produces (key, group) tuples, i.e. chunking the data by group. So + you can write code like: + + :: + + grouped = obj.groupby(keys, axis=axis) + for key, group in grouped: + # do something with the data + + Function calls on GroupBy, if not specially implemented, "dispatch" to the + grouped data. So if you group a DataFrame and wish to invoke the std() + method on each group, you can simply do: + + :: + + df.groupby(mapper).std() + + rather than + + :: + + df.groupby(mapper).aggregate(np.std) + + You can pass arguments to these "wrapped" functions, too. + + See the online documentation for full exposition on these topics and much + more + """ + + _grouper: ops.BaseGrouper + as_index: bool + + @final + def __init__( + self, + obj: NDFrameT, + keys: _KeysArgType | None = None, + axis: Axis = 0, + level: IndexLabel | None = None, + grouper: ops.BaseGrouper | None = None, + exclusions: frozenset[Hashable] | None = None, + selection: IndexLabel | None = None, + as_index: bool = True, + sort: bool = True, + group_keys: bool = True, + observed: bool | lib.NoDefault = lib.no_default, + dropna: bool = True, + ) -> None: + self._selection = selection + + assert isinstance(obj, NDFrame), type(obj) + + self.level = level + + if not as_index: + if axis != 0: + raise ValueError("as_index=False only valid for axis=0") + + self.as_index = as_index + self.keys = keys + self.sort = sort + self.group_keys = group_keys + self.dropna = dropna + + if grouper is None: + grouper, exclusions, obj = get_grouper( + obj, + keys, + axis=axis, + level=level, + sort=sort, + observed=False if observed is lib.no_default else observed, + dropna=self.dropna, + ) + + if observed is lib.no_default: + if any(ping._passed_categorical for ping in grouper.groupings): + warnings.warn( + "The default of observed=False is deprecated and will be changed " + "to True in a future version of pandas. Pass observed=False to " + "retain current behavior or observed=True to adopt the future " + "default and silence this warning.", + FutureWarning, + stacklevel=find_stack_level(), + ) + observed = False + self.observed = observed + + self.obj = obj + self.axis = obj._get_axis_number(axis) + self._grouper = grouper + self.exclusions = frozenset(exclusions) if exclusions else frozenset() + + def __getattr__(self, attr: str): + if attr in self._internal_names_set: + return object.__getattribute__(self, attr) + if attr in self.obj: + return self[attr] + + raise AttributeError( + f"'{type(self).__name__}' object has no attribute '{attr}'" + ) + + @final + def _deprecate_axis(self, axis: int, name: str) -> None: + if axis == 1: + warnings.warn( + f"{type(self).__name__}.{name} with axis=1 is deprecated and " + "will be removed in a future version. Operate on the un-grouped " + "DataFrame instead", + FutureWarning, + stacklevel=find_stack_level(), + ) + else: + warnings.warn( + f"The 'axis' keyword in {type(self).__name__}.{name} is deprecated " + "and will be removed in a future version. " + "Call without passing 'axis' instead.", + FutureWarning, + stacklevel=find_stack_level(), + ) + + @final + def _op_via_apply(self, name: str, *args, **kwargs): + """Compute the result of an operation by using GroupBy's apply.""" + f = getattr(type(self._obj_with_exclusions), name) + sig = inspect.signature(f) + + if "axis" in kwargs and kwargs["axis"] is not lib.no_default: + axis = self.obj._get_axis_number(kwargs["axis"]) + self._deprecate_axis(axis, name) + elif "axis" in kwargs: + # exclude skew here because that was already defaulting to lib.no_default + # before this deprecation was instituted + if name == "skew": + pass + elif name == "fillna": + # maintain the behavior from before the deprecation + kwargs["axis"] = None + else: + kwargs["axis"] = 0 + + # a little trickery for aggregation functions that need an axis + # argument + if "axis" in sig.parameters: + if kwargs.get("axis", None) is None or kwargs.get("axis") is lib.no_default: + kwargs["axis"] = self.axis + + def curried(x): + return f(x, *args, **kwargs) + + # preserve the name so we can detect it when calling plot methods, + # to avoid duplicates + curried.__name__ = name + + # special case otherwise extra plots are created when catching the + # exception below + if name in base.plotting_methods: + return self._python_apply_general(curried, self._selected_obj) + + is_transform = name in base.transformation_kernels + result = self._python_apply_general( + curried, + self._obj_with_exclusions, + is_transform=is_transform, + not_indexed_same=not is_transform, + ) + + if self._grouper.has_dropped_na and is_transform: + # result will have dropped rows due to nans, fill with null + # and ensure index is ordered same as the input + result = self._set_result_index_ordered(result) + return result + + # ----------------------------------------------------------------- + # Dispatch/Wrapping + + @final + def _concat_objects( + self, + values, + not_indexed_same: bool = False, + is_transform: bool = False, + ): + from pandas.core.reshape.concat import concat + + if self.group_keys and not is_transform: + if self.as_index: + # possible MI return case + group_keys = self._grouper.result_index + group_levels = self._grouper.levels + group_names = self._grouper.names + + result = concat( + values, + axis=self.axis, + keys=group_keys, + levels=group_levels, + names=group_names, + sort=False, + ) + else: + # GH5610, returns a MI, with the first level being a + # range index + keys = list(range(len(values))) + result = concat(values, axis=self.axis, keys=keys) + + elif not not_indexed_same: + result = concat(values, axis=self.axis) + + ax = self._selected_obj._get_axis(self.axis) + if self.dropna: + labels = self._grouper.group_info[0] + mask = labels != -1 + ax = ax[mask] + + # this is a very unfortunate situation + # we can't use reindex to restore the original order + # when the ax has duplicates + # so we resort to this + # GH 14776, 30667 + # TODO: can we reuse e.g. _reindex_non_unique? + if ax.has_duplicates and not result.axes[self.axis].equals(ax): + # e.g. test_category_order_transformer + target = algorithms.unique1d(ax._values) + indexer, _ = result.index.get_indexer_non_unique(target) + result = result.take(indexer, axis=self.axis) + else: + result = result.reindex(ax, axis=self.axis, copy=False) + + else: + result = concat(values, axis=self.axis) + + if self.obj.ndim == 1: + name = self.obj.name + elif is_hashable(self._selection): + name = self._selection + else: + name = None + + if isinstance(result, Series) and name is not None: + result.name = name + + return result + + @final + def _set_result_index_ordered( + self, result: OutputFrameOrSeries + ) -> OutputFrameOrSeries: + # set the result index on the passed values object and + # return the new object, xref 8046 + + obj_axis = self.obj._get_axis(self.axis) + + if self._grouper.is_monotonic and not self._grouper.has_dropped_na: + # shortcut if we have an already ordered grouper + result = result.set_axis(obj_axis, axis=self.axis, copy=False) + return result + + # row order is scrambled => sort the rows by position in original index + original_positions = Index(self._grouper.result_ilocs()) + result = result.set_axis(original_positions, axis=self.axis, copy=False) + result = result.sort_index(axis=self.axis) + if self._grouper.has_dropped_na: + # Add back in any missing rows due to dropna - index here is integral + # with values referring to the row of the input so can use RangeIndex + result = result.reindex(RangeIndex(len(obj_axis)), axis=self.axis) + result = result.set_axis(obj_axis, axis=self.axis, copy=False) + + return result + + @final + def _insert_inaxis_grouper(self, result: Series | DataFrame) -> DataFrame: + if isinstance(result, Series): + result = result.to_frame() + + # zip in reverse so we can always insert at loc 0 + columns = result.columns + for name, lev, in_axis in zip( + reversed(self._grouper.names), + reversed(self._grouper.get_group_levels()), + reversed([grp.in_axis for grp in self._grouper.groupings]), + ): + # GH #28549 + # When using .apply(-), name will be in columns already + if name not in columns: + if in_axis: + result.insert(0, name, lev) + else: + msg = ( + "A grouping was used that is not in the columns of the " + "DataFrame and so was excluded from the result. This grouping " + "will be included in a future version of pandas. Add the " + "grouping as a column of the DataFrame to silence this warning." + ) + warnings.warn( + message=msg, + category=FutureWarning, + stacklevel=find_stack_level(), + ) + + return result + + @final + def _maybe_transpose_result(self, result: NDFrameT) -> NDFrameT: + if self.axis == 1: + # Only relevant for DataFrameGroupBy, no-op for SeriesGroupBy + result = result.T + if result.index.equals(self.obj.index): + # Retain e.g. DatetimeIndex/TimedeltaIndex freq + # e.g. test_groupby_crash_on_nunique + result.index = self.obj.index.copy() + return result + + @final + def _wrap_aggregated_output( + self, + result: Series | DataFrame, + qs: npt.NDArray[np.float64] | None = None, + ): + """ + Wraps the output of GroupBy aggregations into the expected result. + + Parameters + ---------- + result : Series, DataFrame + + Returns + ------- + Series or DataFrame + """ + # ATM we do not get here for SeriesGroupBy; when we do, we will + # need to require that result.name already match self.obj.name + + if not self.as_index: + # `not self.as_index` is only relevant for DataFrameGroupBy, + # enforced in __init__ + result = self._insert_inaxis_grouper(result) + result = result._consolidate() + index = Index(range(self._grouper.ngroups)) + + else: + index = self._grouper.result_index + + if qs is not None: + # We get here with len(qs) != 1 and not self.as_index + # in test_pass_args_kwargs + index = _insert_quantile_level(index, qs) + + result.index = index + + # error: Argument 1 to "_maybe_transpose_result" of "GroupBy" has + # incompatible type "Union[Series, DataFrame]"; expected "NDFrameT" + res = self._maybe_transpose_result(result) # type: ignore[arg-type] + return self._reindex_output(res, qs=qs) + + def _wrap_applied_output( + self, + data, + values: list, + not_indexed_same: bool = False, + is_transform: bool = False, + ): + raise AbstractMethodError(self) + + # ----------------------------------------------------------------- + # numba + + @final + def _numba_prep(self, data: DataFrame): + ids, _, ngroups = self._grouper.group_info + sorted_index = self._grouper._sort_idx + sorted_ids = self._grouper._sorted_ids + + sorted_data = data.take(sorted_index, axis=self.axis).to_numpy() + # GH 46867 + index_data = data.index + if isinstance(index_data, MultiIndex): + if len(self._grouper.groupings) > 1: + raise NotImplementedError( + "Grouping with more than 1 grouping labels and " + "a MultiIndex is not supported with engine='numba'" + ) + group_key = self._grouper.groupings[0].name + index_data = index_data.get_level_values(group_key) + sorted_index_data = index_data.take(sorted_index).to_numpy() + + starts, ends = lib.generate_slices(sorted_ids, ngroups) + return ( + starts, + ends, + sorted_index_data, + sorted_data, + ) + + def _numba_agg_general( + self, + func: Callable, + dtype_mapping: dict[np.dtype, Any], + engine_kwargs: dict[str, bool] | None, + **aggregator_kwargs, + ): + """ + Perform groupby with a standard numerical aggregation function (e.g. mean) + with Numba. + """ + if not self.as_index: + raise NotImplementedError( + "as_index=False is not supported. Use .reset_index() instead." + ) + if self.axis == 1: + raise NotImplementedError("axis=1 is not supported.") + + data = self._obj_with_exclusions + df = data if data.ndim == 2 else data.to_frame() + + aggregator = executor.generate_shared_aggregator( + func, + dtype_mapping, + True, # is_grouped_kernel + **get_jit_arguments(engine_kwargs), + ) + # Pass group ids to kernel directly if it can handle it + # (This is faster since it doesn't require a sort) + ids, _, _ = self._grouper.group_info + ngroups = self._grouper.ngroups + + res_mgr = df._mgr.apply( + aggregator, labels=ids, ngroups=ngroups, **aggregator_kwargs + ) + res_mgr.axes[1] = self._grouper.result_index + result = df._constructor_from_mgr(res_mgr, axes=res_mgr.axes) + + if data.ndim == 1: + result = result.squeeze("columns") + result.name = data.name + else: + result.columns = data.columns + return result + + @final + def _transform_with_numba(self, func, *args, engine_kwargs=None, **kwargs): + """ + Perform groupby transform routine with the numba engine. + + This routine mimics the data splitting routine of the DataSplitter class + to generate the indices of each group in the sorted data and then passes the + data and indices into a Numba jitted function. + """ + data = self._obj_with_exclusions + df = data if data.ndim == 2 else data.to_frame() + + starts, ends, sorted_index, sorted_data = self._numba_prep(df) + numba_.validate_udf(func) + numba_transform_func = numba_.generate_numba_transform_func( + func, **get_jit_arguments(engine_kwargs, kwargs) + ) + result = numba_transform_func( + sorted_data, + sorted_index, + starts, + ends, + len(df.columns), + *args, + ) + # result values needs to be resorted to their original positions since we + # evaluated the data sorted by group + result = result.take(np.argsort(sorted_index), axis=0) + index = data.index + if data.ndim == 1: + result_kwargs = {"name": data.name} + result = result.ravel() + else: + result_kwargs = {"columns": data.columns} + return data._constructor(result, index=index, **result_kwargs) + + @final + def _aggregate_with_numba(self, func, *args, engine_kwargs=None, **kwargs): + """ + Perform groupby aggregation routine with the numba engine. + + This routine mimics the data splitting routine of the DataSplitter class + to generate the indices of each group in the sorted data and then passes the + data and indices into a Numba jitted function. + """ + data = self._obj_with_exclusions + df = data if data.ndim == 2 else data.to_frame() + + starts, ends, sorted_index, sorted_data = self._numba_prep(df) + numba_.validate_udf(func) + numba_agg_func = numba_.generate_numba_agg_func( + func, **get_jit_arguments(engine_kwargs, kwargs) + ) + result = numba_agg_func( + sorted_data, + sorted_index, + starts, + ends, + len(df.columns), + *args, + ) + index = self._grouper.result_index + if data.ndim == 1: + result_kwargs = {"name": data.name} + result = result.ravel() + else: + result_kwargs = {"columns": data.columns} + res = data._constructor(result, index=index, **result_kwargs) + if not self.as_index: + res = self._insert_inaxis_grouper(res) + res.index = default_index(len(res)) + return res + + # ----------------------------------------------------------------- + # apply/agg/transform + + @Appender( + _apply_docs["template"].format( + input="dataframe", examples=_apply_docs["dataframe_examples"] + ) + ) + def apply(self, func, *args, include_groups: bool = True, **kwargs) -> NDFrameT: + orig_func = func + func = com.is_builtin_func(func) + if orig_func != func: + alias = com._builtin_table_alias[orig_func] + warn_alias_replacement(self, orig_func, alias) + + if isinstance(func, str): + if hasattr(self, func): + res = getattr(self, func) + if callable(res): + return res(*args, **kwargs) + elif args or kwargs: + raise ValueError(f"Cannot pass arguments to property {func}") + return res + + else: + raise TypeError(f"apply func should be callable, not '{func}'") + + elif args or kwargs: + if callable(func): + + @wraps(func) + def f(g): + return func(g, *args, **kwargs) + + else: + raise ValueError( + "func must be a callable if args or kwargs are supplied" + ) + else: + f = func + + if not include_groups: + return self._python_apply_general(f, self._obj_with_exclusions) + + # ignore SettingWithCopy here in case the user mutates + with option_context("mode.chained_assignment", None): + try: + result = self._python_apply_general(f, self._selected_obj) + if ( + not isinstance(self.obj, Series) + and self._selection is None + and self._selected_obj.shape != self._obj_with_exclusions.shape + ): + warnings.warn( + message=_apply_groupings_depr.format( + type(self).__name__, "apply" + ), + category=DeprecationWarning, + stacklevel=find_stack_level(), + ) + except TypeError: + # gh-20949 + # try again, with .apply acting as a filtering + # operation, by excluding the grouping column + # This would normally not be triggered + # except if the udf is trying an operation that + # fails on *some* columns, e.g. a numeric operation + # on a string grouper column + + return self._python_apply_general(f, self._obj_with_exclusions) + + return result + + @final + def _python_apply_general( + self, + f: Callable, + data: DataFrame | Series, + not_indexed_same: bool | None = None, + is_transform: bool = False, + is_agg: bool = False, + ) -> NDFrameT: + """ + Apply function f in python space + + Parameters + ---------- + f : callable + Function to apply + data : Series or DataFrame + Data to apply f to + not_indexed_same: bool, optional + When specified, overrides the value of not_indexed_same. Apply behaves + differently when the result index is equal to the input index, but + this can be coincidental leading to value-dependent behavior. + is_transform : bool, default False + Indicator for whether the function is actually a transform + and should not have group keys prepended. + is_agg : bool, default False + Indicator for whether the function is an aggregation. When the + result is empty, we don't want to warn for this case. + See _GroupBy._python_agg_general. + + Returns + ------- + Series or DataFrame + data after applying f + """ + values, mutated = self._grouper.apply_groupwise(f, data, self.axis) + if not_indexed_same is None: + not_indexed_same = mutated + + return self._wrap_applied_output( + data, + values, + not_indexed_same, + is_transform, + ) + + @final + def _agg_general( + self, + numeric_only: bool = False, + min_count: int = -1, + *, + alias: str, + npfunc: Callable | None = None, + **kwargs, + ): + result = self._cython_agg_general( + how=alias, + alt=npfunc, + numeric_only=numeric_only, + min_count=min_count, + **kwargs, + ) + return result.__finalize__(self.obj, method="groupby") + + def _agg_py_fallback( + self, how: str, values: ArrayLike, ndim: int, alt: Callable + ) -> ArrayLike: + """ + Fallback to pure-python aggregation if _cython_operation raises + NotImplementedError. + """ + # We get here with a) EADtypes and b) object dtype + assert alt is not None + + if values.ndim == 1: + # For DataFrameGroupBy we only get here with ExtensionArray + ser = Series(values, copy=False) + else: + # We only get here with values.dtype == object + df = DataFrame(values.T, dtype=values.dtype) + # bc we split object blocks in grouped_reduce, we have only 1 col + # otherwise we'd have to worry about block-splitting GH#39329 + assert df.shape[1] == 1 + # Avoid call to self.values that can occur in DataFrame + # reductions; see GH#28949 + ser = df.iloc[:, 0] + + # We do not get here with UDFs, so we know that our dtype + # should always be preserved by the implemented aggregations + # TODO: Is this exactly right; see WrappedCythonOp get_result_dtype? + try: + res_values = self._grouper.agg_series(ser, alt, preserve_dtype=True) + except Exception as err: + msg = f"agg function failed [how->{how},dtype->{ser.dtype}]" + # preserve the kind of exception that raised + raise type(err)(msg) from err + + if ser.dtype == object: + res_values = res_values.astype(object, copy=False) + + # If we are DataFrameGroupBy and went through a SeriesGroupByPath + # then we need to reshape + # GH#32223 includes case with IntegerArray values, ndarray res_values + # test_groupby_duplicate_columns with object dtype values + return ensure_block_shape(res_values, ndim=ndim) + + @final + def _cython_agg_general( + self, + how: str, + alt: Callable | None = None, + numeric_only: bool = False, + min_count: int = -1, + **kwargs, + ): + # Note: we never get here with how="ohlc" for DataFrameGroupBy; + # that goes through SeriesGroupBy + + data = self._get_data_to_aggregate(numeric_only=numeric_only, name=how) + + def array_func(values: ArrayLike) -> ArrayLike: + try: + result = self._grouper._cython_operation( + "aggregate", + values, + how, + axis=data.ndim - 1, + min_count=min_count, + **kwargs, + ) + except NotImplementedError: + # generally if we have numeric_only=False + # and non-applicable functions + # try to python agg + # TODO: shouldn't min_count matter? + # TODO: avoid special casing SparseArray here + if how in ["any", "all"] and isinstance(values, SparseArray): + pass + elif alt is None or how in ["any", "all", "std", "sem"]: + raise # TODO: re-raise as TypeError? should not be reached + else: + return result + + assert alt is not None + result = self._agg_py_fallback(how, values, ndim=data.ndim, alt=alt) + return result + + new_mgr = data.grouped_reduce(array_func) + res = self._wrap_agged_manager(new_mgr) + if how in ["idxmin", "idxmax"]: + res = self._wrap_idxmax_idxmin(res) + out = self._wrap_aggregated_output(res) + if self.axis == 1: + out = out.infer_objects(copy=False) + return out + + def _cython_transform( + self, how: str, numeric_only: bool = False, axis: AxisInt = 0, **kwargs + ): + raise AbstractMethodError(self) + + @final + def _transform(self, func, *args, engine=None, engine_kwargs=None, **kwargs): + # optimized transforms + orig_func = func + func = com.get_cython_func(func) or func + if orig_func != func: + warn_alias_replacement(self, orig_func, func) + + if not isinstance(func, str): + return self._transform_general(func, engine, engine_kwargs, *args, **kwargs) + + elif func not in base.transform_kernel_allowlist: + msg = f"'{func}' is not a valid function name for transform(name)" + raise ValueError(msg) + elif func in base.cythonized_kernels or func in base.transformation_kernels: + # cythonized transform or canned "agg+broadcast" + if engine is not None: + kwargs["engine"] = engine + kwargs["engine_kwargs"] = engine_kwargs + return getattr(self, func)(*args, **kwargs) + + else: + # i.e. func in base.reduction_kernels + + # GH#30918 Use _transform_fast only when we know func is an aggregation + # If func is a reduction, we need to broadcast the + # result to the whole group. Compute func result + # and deal with possible broadcasting below. + with com.temp_setattr(self, "as_index", True): + # GH#49834 - result needs groups in the index for + # _wrap_transform_fast_result + if func in ["idxmin", "idxmax"]: + func = cast(Literal["idxmin", "idxmax"], func) + result = self._idxmax_idxmin(func, True, *args, **kwargs) + else: + if engine is not None: + kwargs["engine"] = engine + kwargs["engine_kwargs"] = engine_kwargs + result = getattr(self, func)(*args, **kwargs) + + return self._wrap_transform_fast_result(result) + + @final + def _wrap_transform_fast_result(self, result: NDFrameT) -> NDFrameT: + """ + Fast transform path for aggregations. + """ + obj = self._obj_with_exclusions + + # for each col, reshape to size of original frame by take operation + ids, _, _ = self._grouper.group_info + result = result.reindex(self._grouper.result_index, axis=self.axis, copy=False) + + if self.obj.ndim == 1: + # i.e. SeriesGroupBy + out = algorithms.take_nd(result._values, ids) + output = obj._constructor(out, index=obj.index, name=obj.name) + else: + # `.size()` gives Series output on DataFrame input, need axis 0 + axis = 0 if result.ndim == 1 else self.axis + # GH#46209 + # Don't convert indices: negative indices need to give rise + # to null values in the result + new_ax = result.axes[axis].take(ids) + output = result._reindex_with_indexers( + {axis: (new_ax, ids)}, allow_dups=True, copy=False + ) + output = output.set_axis(obj._get_axis(self.axis), axis=axis) + return output + + # ----------------------------------------------------------------- + # Utilities + + @final + def _apply_filter(self, indices, dropna): + if len(indices) == 0: + indices = np.array([], dtype="int64") + else: + indices = np.sort(np.concatenate(indices)) + if dropna: + filtered = self._selected_obj.take(indices, axis=self.axis) + else: + mask = np.empty(len(self._selected_obj.index), dtype=bool) + mask.fill(False) + mask[indices.astype(int)] = True + # mask fails to broadcast when passed to where; broadcast manually. + mask = np.tile(mask, list(self._selected_obj.shape[1:]) + [1]).T + filtered = self._selected_obj.where(mask) # Fill with NaNs. + return filtered + + @final + def _cumcount_array(self, ascending: bool = True) -> np.ndarray: + """ + Parameters + ---------- + ascending : bool, default True + If False, number in reverse, from length of group - 1 to 0. + + Notes + ----- + this is currently implementing sort=False + (though the default is sort=True) for groupby in general + """ + ids, _, ngroups = self._grouper.group_info + sorter = get_group_index_sorter(ids, ngroups) + ids, count = ids[sorter], len(ids) + + if count == 0: + return np.empty(0, dtype=np.int64) + + run = np.r_[True, ids[:-1] != ids[1:]] + rep = np.diff(np.r_[np.nonzero(run)[0], count]) + out = (~run).cumsum() + + if ascending: + out -= np.repeat(out[run], rep) + else: + out = np.repeat(out[np.r_[run[1:], True]], rep) - out + + if self._grouper.has_dropped_na: + out = np.where(ids == -1, np.nan, out.astype(np.float64, copy=False)) + else: + out = out.astype(np.int64, copy=False) + + rev = np.empty(count, dtype=np.intp) + rev[sorter] = np.arange(count, dtype=np.intp) + return out[rev] + + # ----------------------------------------------------------------- + + @final + @property + def _obj_1d_constructor(self) -> Callable: + # GH28330 preserve subclassed Series/DataFrames + if isinstance(self.obj, DataFrame): + return self.obj._constructor_sliced + assert isinstance(self.obj, Series) + return self.obj._constructor + + @final + @Substitution(name="groupby") + @Substitution(see_also=_common_see_also) + def any(self, skipna: bool = True) -> NDFrameT: + """ + Return True if any value in the group is truthful, else False. + + Parameters + ---------- + skipna : bool, default True + Flag to ignore nan values during truth testing. + + Returns + ------- + Series or DataFrame + DataFrame or Series of boolean values, where a value is True if any element + is True within its respective group, False otherwise. + %(see_also)s + Examples + -------- + For SeriesGroupBy: + + >>> lst = ['a', 'a', 'b'] + >>> ser = pd.Series([1, 2, 0], index=lst) + >>> ser + a 1 + a 2 + b 0 + dtype: int64 + >>> ser.groupby(level=0).any() + a True + b False + dtype: bool + + For DataFrameGroupBy: + + >>> data = [[1, 0, 3], [1, 0, 6], [7, 1, 9]] + >>> df = pd.DataFrame(data, columns=["a", "b", "c"], + ... index=["ostrich", "penguin", "parrot"]) + >>> df + a b c + ostrich 1 0 3 + penguin 1 0 6 + parrot 7 1 9 + >>> df.groupby(by=["a"]).any() + b c + a + 1 False True + 7 True True + """ + return self._cython_agg_general( + "any", + alt=lambda x: Series(x, copy=False).any(skipna=skipna), + skipna=skipna, + ) + + @final + @Substitution(name="groupby") + @Substitution(see_also=_common_see_also) + def all(self, skipna: bool = True) -> NDFrameT: + """ + Return True if all values in the group are truthful, else False. + + Parameters + ---------- + skipna : bool, default True + Flag to ignore nan values during truth testing. + + Returns + ------- + Series or DataFrame + DataFrame or Series of boolean values, where a value is True if all elements + are True within its respective group, False otherwise. + %(see_also)s + Examples + -------- + + For SeriesGroupBy: + + >>> lst = ['a', 'a', 'b'] + >>> ser = pd.Series([1, 2, 0], index=lst) + >>> ser + a 1 + a 2 + b 0 + dtype: int64 + >>> ser.groupby(level=0).all() + a True + b False + dtype: bool + + For DataFrameGroupBy: + + >>> data = [[1, 0, 3], [1, 5, 6], [7, 8, 9]] + >>> df = pd.DataFrame(data, columns=["a", "b", "c"], + ... index=["ostrich", "penguin", "parrot"]) + >>> df + a b c + ostrich 1 0 3 + penguin 1 5 6 + parrot 7 8 9 + >>> df.groupby(by=["a"]).all() + b c + a + 1 False True + 7 True True + """ + return self._cython_agg_general( + "all", + alt=lambda x: Series(x, copy=False).all(skipna=skipna), + skipna=skipna, + ) + + @final + @Substitution(name="groupby") + @Substitution(see_also=_common_see_also) + def count(self) -> NDFrameT: + """ + Compute count of group, excluding missing values. + + Returns + ------- + Series or DataFrame + Count of values within each group. + %(see_also)s + Examples + -------- + For SeriesGroupBy: + + >>> lst = ['a', 'a', 'b'] + >>> ser = pd.Series([1, 2, np.nan], index=lst) + >>> ser + a 1.0 + a 2.0 + b NaN + dtype: float64 + >>> ser.groupby(level=0).count() + a 2 + b 0 + dtype: int64 + + For DataFrameGroupBy: + + >>> data = [[1, np.nan, 3], [1, np.nan, 6], [7, 8, 9]] + >>> df = pd.DataFrame(data, columns=["a", "b", "c"], + ... index=["cow", "horse", "bull"]) + >>> df + a b c + cow 1 NaN 3 + horse 1 NaN 6 + bull 7 8.0 9 + >>> df.groupby("a").count() + b c + a + 1 0 2 + 7 1 1 + + For Resampler: + + >>> ser = pd.Series([1, 2, 3, 4], index=pd.DatetimeIndex( + ... ['2023-01-01', '2023-01-15', '2023-02-01', '2023-02-15'])) + >>> ser + 2023-01-01 1 + 2023-01-15 2 + 2023-02-01 3 + 2023-02-15 4 + dtype: int64 + >>> ser.resample('MS').count() + 2023-01-01 2 + 2023-02-01 2 + Freq: MS, dtype: int64 + """ + data = self._get_data_to_aggregate() + ids, _, ngroups = self._grouper.group_info + mask = ids != -1 + + is_series = data.ndim == 1 + + def hfunc(bvalues: ArrayLike) -> ArrayLike: + # TODO(EA2D): reshape would not be necessary with 2D EAs + if bvalues.ndim == 1: + # EA + masked = mask & ~isna(bvalues).reshape(1, -1) + else: + masked = mask & ~isna(bvalues) + + counted = lib.count_level_2d(masked, labels=ids, max_bin=ngroups) + if isinstance(bvalues, BaseMaskedArray): + return IntegerArray( + counted[0], mask=np.zeros(counted.shape[1], dtype=np.bool_) + ) + elif isinstance(bvalues, ArrowExtensionArray) and not isinstance( + bvalues.dtype, StringDtype + ): + dtype = pandas_dtype("int64[pyarrow]") + return type(bvalues)._from_sequence(counted[0], dtype=dtype) + if is_series: + assert counted.ndim == 2 + assert counted.shape[0] == 1 + return counted[0] + return counted + + new_mgr = data.grouped_reduce(hfunc) + new_obj = self._wrap_agged_manager(new_mgr) + + # If we are grouping on categoricals we want unobserved categories to + # return zero, rather than the default of NaN which the reindexing in + # _wrap_aggregated_output() returns. GH 35028 + # e.g. test_dataframe_groupby_on_2_categoricals_when_observed_is_false + with com.temp_setattr(self, "observed", True): + result = self._wrap_aggregated_output(new_obj) + + return self._reindex_output(result, fill_value=0) + + @final + @Substitution(name="groupby") + @Substitution(see_also=_common_see_also) + def mean( + self, + numeric_only: bool = False, + engine: Literal["cython", "numba"] | None = None, + engine_kwargs: dict[str, bool] | None = None, + ): + """ + Compute mean of groups, excluding missing values. + + Parameters + ---------- + numeric_only : bool, default False + Include only float, int, boolean columns. + + .. versionchanged:: 2.0.0 + + numeric_only no longer accepts ``None`` and defaults to ``False``. + + engine : str, default None + * ``'cython'`` : Runs the operation through C-extensions from cython. + * ``'numba'`` : Runs the operation through JIT compiled code from numba. + * ``None`` : Defaults to ``'cython'`` or globally setting + ``compute.use_numba`` + + .. versionadded:: 1.4.0 + + engine_kwargs : dict, default None + * For ``'cython'`` engine, there are no accepted ``engine_kwargs`` + * For ``'numba'`` engine, the engine can accept ``nopython``, ``nogil`` + and ``parallel`` dictionary keys. The values must either be ``True`` or + ``False``. The default ``engine_kwargs`` for the ``'numba'`` engine is + ``{{'nopython': True, 'nogil': False, 'parallel': False}}`` + + .. versionadded:: 1.4.0 + + Returns + ------- + pandas.Series or pandas.DataFrame + %(see_also)s + Examples + -------- + >>> df = pd.DataFrame({'A': [1, 1, 2, 1, 2], + ... 'B': [np.nan, 2, 3, 4, 5], + ... 'C': [1, 2, 1, 1, 2]}, columns=['A', 'B', 'C']) + + Groupby one column and return the mean of the remaining columns in + each group. + + >>> df.groupby('A').mean() + B C + A + 1 3.0 1.333333 + 2 4.0 1.500000 + + Groupby two columns and return the mean of the remaining column. + + >>> df.groupby(['A', 'B']).mean() + C + A B + 1 2.0 2.0 + 4.0 1.0 + 2 3.0 1.0 + 5.0 2.0 + + Groupby one column and return the mean of only particular column in + the group. + + >>> df.groupby('A')['B'].mean() + A + 1 3.0 + 2 4.0 + Name: B, dtype: float64 + """ + + if maybe_use_numba(engine): + from pandas.core._numba.kernels import grouped_mean + + return self._numba_agg_general( + grouped_mean, + executor.float_dtype_mapping, + engine_kwargs, + min_periods=0, + ) + else: + result = self._cython_agg_general( + "mean", + alt=lambda x: Series(x, copy=False).mean(numeric_only=numeric_only), + numeric_only=numeric_only, + ) + return result.__finalize__(self.obj, method="groupby") + + @final + def median(self, numeric_only: bool = False) -> NDFrameT: + """ + Compute median of groups, excluding missing values. + + For multiple groupings, the result index will be a MultiIndex + + Parameters + ---------- + numeric_only : bool, default False + Include only float, int, boolean columns. + + .. versionchanged:: 2.0.0 + + numeric_only no longer accepts ``None`` and defaults to False. + + Returns + ------- + Series or DataFrame + Median of values within each group. + + Examples + -------- + For SeriesGroupBy: + + >>> lst = ['a', 'a', 'a', 'b', 'b', 'b'] + >>> ser = pd.Series([7, 2, 8, 4, 3, 3], index=lst) + >>> ser + a 7 + a 2 + a 8 + b 4 + b 3 + b 3 + dtype: int64 + >>> ser.groupby(level=0).median() + a 7.0 + b 3.0 + dtype: float64 + + For DataFrameGroupBy: + + >>> data = {'a': [1, 3, 5, 7, 7, 8, 3], 'b': [1, 4, 8, 4, 4, 2, 1]} + >>> df = pd.DataFrame(data, index=['dog', 'dog', 'dog', + ... 'mouse', 'mouse', 'mouse', 'mouse']) + >>> df + a b + dog 1 1 + dog 3 4 + dog 5 8 + mouse 7 4 + mouse 7 4 + mouse 8 2 + mouse 3 1 + >>> df.groupby(level=0).median() + a b + dog 3.0 4.0 + mouse 7.0 3.0 + + For Resampler: + + >>> ser = pd.Series([1, 2, 3, 3, 4, 5], + ... index=pd.DatetimeIndex(['2023-01-01', + ... '2023-01-10', + ... '2023-01-15', + ... '2023-02-01', + ... '2023-02-10', + ... '2023-02-15'])) + >>> ser.resample('MS').median() + 2023-01-01 2.0 + 2023-02-01 4.0 + Freq: MS, dtype: float64 + """ + result = self._cython_agg_general( + "median", + alt=lambda x: Series(x, copy=False).median(numeric_only=numeric_only), + numeric_only=numeric_only, + ) + return result.__finalize__(self.obj, method="groupby") + + @final + @Substitution(name="groupby") + @Substitution(see_also=_common_see_also) + def std( + self, + ddof: int = 1, + engine: Literal["cython", "numba"] | None = None, + engine_kwargs: dict[str, bool] | None = None, + numeric_only: bool = False, + ): + """ + Compute standard deviation of groups, excluding missing values. + + For multiple groupings, the result index will be a MultiIndex. + + Parameters + ---------- + ddof : int, default 1 + Degrees of freedom. + + engine : str, default None + * ``'cython'`` : Runs the operation through C-extensions from cython. + * ``'numba'`` : Runs the operation through JIT compiled code from numba. + * ``None`` : Defaults to ``'cython'`` or globally setting + ``compute.use_numba`` + + .. versionadded:: 1.4.0 + + engine_kwargs : dict, default None + * For ``'cython'`` engine, there are no accepted ``engine_kwargs`` + * For ``'numba'`` engine, the engine can accept ``nopython``, ``nogil`` + and ``parallel`` dictionary keys. The values must either be ``True`` or + ``False``. The default ``engine_kwargs`` for the ``'numba'`` engine is + ``{{'nopython': True, 'nogil': False, 'parallel': False}}`` + + .. versionadded:: 1.4.0 + + numeric_only : bool, default False + Include only `float`, `int` or `boolean` data. + + .. versionadded:: 1.5.0 + + .. versionchanged:: 2.0.0 + + numeric_only now defaults to ``False``. + + Returns + ------- + Series or DataFrame + Standard deviation of values within each group. + %(see_also)s + Examples + -------- + For SeriesGroupBy: + + >>> lst = ['a', 'a', 'a', 'b', 'b', 'b'] + >>> ser = pd.Series([7, 2, 8, 4, 3, 3], index=lst) + >>> ser + a 7 + a 2 + a 8 + b 4 + b 3 + b 3 + dtype: int64 + >>> ser.groupby(level=0).std() + a 3.21455 + b 0.57735 + dtype: float64 + + For DataFrameGroupBy: + + >>> data = {'a': [1, 3, 5, 7, 7, 8, 3], 'b': [1, 4, 8, 4, 4, 2, 1]} + >>> df = pd.DataFrame(data, index=['dog', 'dog', 'dog', + ... 'mouse', 'mouse', 'mouse', 'mouse']) + >>> df + a b + dog 1 1 + dog 3 4 + dog 5 8 + mouse 7 4 + mouse 7 4 + mouse 8 2 + mouse 3 1 + >>> df.groupby(level=0).std() + a b + dog 2.000000 3.511885 + mouse 2.217356 1.500000 + """ + if maybe_use_numba(engine): + from pandas.core._numba.kernels import grouped_var + + return np.sqrt( + self._numba_agg_general( + grouped_var, + executor.float_dtype_mapping, + engine_kwargs, + min_periods=0, + ddof=ddof, + ) + ) + else: + return self._cython_agg_general( + "std", + alt=lambda x: Series(x, copy=False).std(ddof=ddof), + numeric_only=numeric_only, + ddof=ddof, + ) + + @final + @Substitution(name="groupby") + @Substitution(see_also=_common_see_also) + def var( + self, + ddof: int = 1, + engine: Literal["cython", "numba"] | None = None, + engine_kwargs: dict[str, bool] | None = None, + numeric_only: bool = False, + ): + """ + Compute variance of groups, excluding missing values. + + For multiple groupings, the result index will be a MultiIndex. + + Parameters + ---------- + ddof : int, default 1 + Degrees of freedom. + + engine : str, default None + * ``'cython'`` : Runs the operation through C-extensions from cython. + * ``'numba'`` : Runs the operation through JIT compiled code from numba. + * ``None`` : Defaults to ``'cython'`` or globally setting + ``compute.use_numba`` + + .. versionadded:: 1.4.0 + + engine_kwargs : dict, default None + * For ``'cython'`` engine, there are no accepted ``engine_kwargs`` + * For ``'numba'`` engine, the engine can accept ``nopython``, ``nogil`` + and ``parallel`` dictionary keys. The values must either be ``True`` or + ``False``. The default ``engine_kwargs`` for the ``'numba'`` engine is + ``{{'nopython': True, 'nogil': False, 'parallel': False}}`` + + .. versionadded:: 1.4.0 + + numeric_only : bool, default False + Include only `float`, `int` or `boolean` data. + + .. versionadded:: 1.5.0 + + .. versionchanged:: 2.0.0 + + numeric_only now defaults to ``False``. + + Returns + ------- + Series or DataFrame + Variance of values within each group. + %(see_also)s + Examples + -------- + For SeriesGroupBy: + + >>> lst = ['a', 'a', 'a', 'b', 'b', 'b'] + >>> ser = pd.Series([7, 2, 8, 4, 3, 3], index=lst) + >>> ser + a 7 + a 2 + a 8 + b 4 + b 3 + b 3 + dtype: int64 + >>> ser.groupby(level=0).var() + a 10.333333 + b 0.333333 + dtype: float64 + + For DataFrameGroupBy: + + >>> data = {'a': [1, 3, 5, 7, 7, 8, 3], 'b': [1, 4, 8, 4, 4, 2, 1]} + >>> df = pd.DataFrame(data, index=['dog', 'dog', 'dog', + ... 'mouse', 'mouse', 'mouse', 'mouse']) + >>> df + a b + dog 1 1 + dog 3 4 + dog 5 8 + mouse 7 4 + mouse 7 4 + mouse 8 2 + mouse 3 1 + >>> df.groupby(level=0).var() + a b + dog 4.000000 12.333333 + mouse 4.916667 2.250000 + """ + if maybe_use_numba(engine): + from pandas.core._numba.kernels import grouped_var + + return self._numba_agg_general( + grouped_var, + executor.float_dtype_mapping, + engine_kwargs, + min_periods=0, + ddof=ddof, + ) + else: + return self._cython_agg_general( + "var", + alt=lambda x: Series(x, copy=False).var(ddof=ddof), + numeric_only=numeric_only, + ddof=ddof, + ) + + @final + def _value_counts( + self, + subset: Sequence[Hashable] | None = None, + normalize: bool = False, + sort: bool = True, + ascending: bool = False, + dropna: bool = True, + ) -> DataFrame | Series: + """ + Shared implementation of value_counts for SeriesGroupBy and DataFrameGroupBy. + + SeriesGroupBy additionally supports a bins argument. See the docstring of + DataFrameGroupBy.value_counts for a description of arguments. + """ + if self.axis == 1: + raise NotImplementedError( + "DataFrameGroupBy.value_counts only handles axis=0" + ) + name = "proportion" if normalize else "count" + + df = self.obj + obj = self._obj_with_exclusions + + in_axis_names = { + grouping.name for grouping in self._grouper.groupings if grouping.in_axis + } + if isinstance(obj, Series): + _name = obj.name + keys = [] if _name in in_axis_names else [obj] + else: + unique_cols = set(obj.columns) + if subset is not None: + subsetted = set(subset) + clashing = subsetted & set(in_axis_names) + if clashing: + raise ValueError( + f"Keys {clashing} in subset cannot be in " + "the groupby column keys." + ) + doesnt_exist = subsetted - unique_cols + if doesnt_exist: + raise ValueError( + f"Keys {doesnt_exist} in subset do not " + f"exist in the DataFrame." + ) + else: + subsetted = unique_cols + + keys = [ + # Can't use .values because the column label needs to be preserved + obj.iloc[:, idx] + for idx, _name in enumerate(obj.columns) + if _name not in in_axis_names and _name in subsetted + ] + + groupings = list(self._grouper.groupings) + for key in keys: + grouper, _, _ = get_grouper( + df, + key=key, + axis=self.axis, + sort=self.sort, + observed=False, + dropna=dropna, + ) + groupings += list(grouper.groupings) + + # Take the size of the overall columns + gb = df.groupby( + groupings, + sort=self.sort, + observed=self.observed, + dropna=self.dropna, + ) + result_series = cast(Series, gb.size()) + result_series.name = name + + # GH-46357 Include non-observed categories + # of non-grouping columns regardless of `observed` + if any( + isinstance(grouping.grouping_vector, (Categorical, CategoricalIndex)) + and not grouping._observed + for grouping in groupings + ): + levels_list = [ping._result_index for ping in groupings] + multi_index = MultiIndex.from_product( + levels_list, names=[ping.name for ping in groupings] + ) + result_series = result_series.reindex(multi_index, fill_value=0) + + if sort: + # Sort by the values + result_series = result_series.sort_values( + ascending=ascending, kind="stable" + ) + if self.sort: + # Sort by the groupings + names = result_series.index.names + # GH#55951 - Temporarily replace names in case they are integers + result_series.index.names = range(len(names)) + index_level = list(range(len(self._grouper.groupings))) + result_series = result_series.sort_index( + level=index_level, sort_remaining=False + ) + result_series.index.names = names + + if normalize: + # Normalize the results by dividing by the original group sizes. + # We are guaranteed to have the first N levels be the + # user-requested grouping. + levels = list( + range(len(self._grouper.groupings), result_series.index.nlevels) + ) + indexed_group_size = result_series.groupby( + result_series.index.droplevel(levels), + sort=self.sort, + dropna=self.dropna, + # GH#43999 - deprecation of observed=False + observed=False, + ).transform("sum") + result_series /= indexed_group_size + + # Handle groups of non-observed categories + result_series = result_series.fillna(0.0) + + result: Series | DataFrame + if self.as_index: + result = result_series + else: + # Convert to frame + index = result_series.index + columns = com.fill_missing_names(index.names) + if name in columns: + raise ValueError(f"Column label '{name}' is duplicate of result column") + result_series.name = name + result_series.index = index.set_names(range(len(columns))) + result_frame = result_series.reset_index() + orig_dtype = self._grouper.groupings[0].obj.columns.dtype # type: ignore[union-attr] + cols = Index(columns, dtype=orig_dtype).insert(len(columns), name) + result_frame.columns = cols + result = result_frame + return result.__finalize__(self.obj, method="value_counts") + + @final + def sem(self, ddof: int = 1, numeric_only: bool = False) -> NDFrameT: + """ + Compute standard error of the mean of groups, excluding missing values. + + For multiple groupings, the result index will be a MultiIndex. + + Parameters + ---------- + ddof : int, default 1 + Degrees of freedom. + + numeric_only : bool, default False + Include only `float`, `int` or `boolean` data. + + .. versionadded:: 1.5.0 + + .. versionchanged:: 2.0.0 + + numeric_only now defaults to ``False``. + + Returns + ------- + Series or DataFrame + Standard error of the mean of values within each group. + + Examples + -------- + For SeriesGroupBy: + + >>> lst = ['a', 'a', 'b', 'b'] + >>> ser = pd.Series([5, 10, 8, 14], index=lst) + >>> ser + a 5 + a 10 + b 8 + b 14 + dtype: int64 + >>> ser.groupby(level=0).sem() + a 2.5 + b 3.0 + dtype: float64 + + For DataFrameGroupBy: + + >>> data = [[1, 12, 11], [1, 15, 2], [2, 5, 8], [2, 6, 12]] + >>> df = pd.DataFrame(data, columns=["a", "b", "c"], + ... index=["tuna", "salmon", "catfish", "goldfish"]) + >>> df + a b c + tuna 1 12 11 + salmon 1 15 2 + catfish 2 5 8 + goldfish 2 6 12 + >>> df.groupby("a").sem() + b c + a + 1 1.5 4.5 + 2 0.5 2.0 + + For Resampler: + + >>> ser = pd.Series([1, 3, 2, 4, 3, 8], + ... index=pd.DatetimeIndex(['2023-01-01', + ... '2023-01-10', + ... '2023-01-15', + ... '2023-02-01', + ... '2023-02-10', + ... '2023-02-15'])) + >>> ser.resample('MS').sem() + 2023-01-01 0.577350 + 2023-02-01 1.527525 + Freq: MS, dtype: float64 + """ + if numeric_only and self.obj.ndim == 1 and not is_numeric_dtype(self.obj.dtype): + raise TypeError( + f"{type(self).__name__}.sem called with " + f"numeric_only={numeric_only} and dtype {self.obj.dtype}" + ) + return self._cython_agg_general( + "sem", + alt=lambda x: Series(x, copy=False).sem(ddof=ddof), + numeric_only=numeric_only, + ddof=ddof, + ) + + @final + @Substitution(name="groupby") + @Substitution(see_also=_common_see_also) + def size(self) -> DataFrame | Series: + """ + Compute group sizes. + + Returns + ------- + DataFrame or Series + Number of rows in each group as a Series if as_index is True + or a DataFrame if as_index is False. + %(see_also)s + Examples + -------- + + For SeriesGroupBy: + + >>> lst = ['a', 'a', 'b'] + >>> ser = pd.Series([1, 2, 3], index=lst) + >>> ser + a 1 + a 2 + b 3 + dtype: int64 + >>> ser.groupby(level=0).size() + a 2 + b 1 + dtype: int64 + + >>> data = [[1, 2, 3], [1, 5, 6], [7, 8, 9]] + >>> df = pd.DataFrame(data, columns=["a", "b", "c"], + ... index=["owl", "toucan", "eagle"]) + >>> df + a b c + owl 1 2 3 + toucan 1 5 6 + eagle 7 8 9 + >>> df.groupby("a").size() + a + 1 2 + 7 1 + dtype: int64 + + For Resampler: + + >>> ser = pd.Series([1, 2, 3], index=pd.DatetimeIndex( + ... ['2023-01-01', '2023-01-15', '2023-02-01'])) + >>> ser + 2023-01-01 1 + 2023-01-15 2 + 2023-02-01 3 + dtype: int64 + >>> ser.resample('MS').size() + 2023-01-01 2 + 2023-02-01 1 + Freq: MS, dtype: int64 + """ + result = self._grouper.size() + dtype_backend: None | Literal["pyarrow", "numpy_nullable"] = None + if isinstance(self.obj, Series): + if isinstance(self.obj.array, ArrowExtensionArray): + if isinstance(self.obj.array, ArrowStringArrayNumpySemantics): + dtype_backend = None + elif isinstance(self.obj.array, ArrowStringArray): + dtype_backend = "numpy_nullable" + else: + dtype_backend = "pyarrow" + elif isinstance(self.obj.array, BaseMaskedArray): + dtype_backend = "numpy_nullable" + # TODO: For DataFrames what if columns are mixed arrow/numpy/masked? + + # GH28330 preserve subclassed Series/DataFrames through calls + if isinstance(self.obj, Series): + result = self._obj_1d_constructor(result, name=self.obj.name) + else: + result = self._obj_1d_constructor(result) + + if dtype_backend is not None: + result = result.convert_dtypes( + infer_objects=False, + convert_string=False, + convert_boolean=False, + convert_floating=False, + dtype_backend=dtype_backend, + ) + + with com.temp_setattr(self, "as_index", True): + # size already has the desired behavior in GH#49519, but this makes the + # as_index=False path of _reindex_output fail on categorical groupers. + result = self._reindex_output(result, fill_value=0) + if not self.as_index: + # error: Incompatible types in assignment (expression has + # type "DataFrame", variable has type "Series") + result = result.rename("size").reset_index() # type: ignore[assignment] + return result + + @final + @doc( + _groupby_agg_method_engine_template, + fname="sum", + no=False, + mc=0, + e=None, + ek=None, + example=dedent( + """\ + For SeriesGroupBy: + + >>> lst = ['a', 'a', 'b', 'b'] + >>> ser = pd.Series([1, 2, 3, 4], index=lst) + >>> ser + a 1 + a 2 + b 3 + b 4 + dtype: int64 + >>> ser.groupby(level=0).sum() + a 3 + b 7 + dtype: int64 + + For DataFrameGroupBy: + + >>> data = [[1, 8, 2], [1, 2, 5], [2, 5, 8], [2, 6, 9]] + >>> df = pd.DataFrame(data, columns=["a", "b", "c"], + ... index=["tiger", "leopard", "cheetah", "lion"]) + >>> df + a b c + tiger 1 8 2 + leopard 1 2 5 + cheetah 2 5 8 + lion 2 6 9 + >>> df.groupby("a").sum() + b c + a + 1 10 7 + 2 11 17""" + ), + ) + def sum( + self, + numeric_only: bool = False, + min_count: int = 0, + engine: Literal["cython", "numba"] | None = None, + engine_kwargs: dict[str, bool] | None = None, + ): + if maybe_use_numba(engine): + from pandas.core._numba.kernels import grouped_sum + + return self._numba_agg_general( + grouped_sum, + executor.default_dtype_mapping, + engine_kwargs, + min_periods=min_count, + ) + else: + # If we are grouping on categoricals we want unobserved categories to + # return zero, rather than the default of NaN which the reindexing in + # _agg_general() returns. GH #31422 + with com.temp_setattr(self, "observed", True): + result = self._agg_general( + numeric_only=numeric_only, + min_count=min_count, + alias="sum", + npfunc=np.sum, + ) + + return self._reindex_output(result, fill_value=0) + + @final + @doc( + _groupby_agg_method_template, + fname="prod", + no=False, + mc=0, + example=dedent( + """\ + For SeriesGroupBy: + + >>> lst = ['a', 'a', 'b', 'b'] + >>> ser = pd.Series([1, 2, 3, 4], index=lst) + >>> ser + a 1 + a 2 + b 3 + b 4 + dtype: int64 + >>> ser.groupby(level=0).prod() + a 2 + b 12 + dtype: int64 + + For DataFrameGroupBy: + + >>> data = [[1, 8, 2], [1, 2, 5], [2, 5, 8], [2, 6, 9]] + >>> df = pd.DataFrame(data, columns=["a", "b", "c"], + ... index=["tiger", "leopard", "cheetah", "lion"]) + >>> df + a b c + tiger 1 8 2 + leopard 1 2 5 + cheetah 2 5 8 + lion 2 6 9 + >>> df.groupby("a").prod() + b c + a + 1 16 10 + 2 30 72""" + ), + ) + def prod(self, numeric_only: bool = False, min_count: int = 0) -> NDFrameT: + return self._agg_general( + numeric_only=numeric_only, min_count=min_count, alias="prod", npfunc=np.prod + ) + + @final + @doc( + _groupby_agg_method_engine_template, + fname="min", + no=False, + mc=-1, + e=None, + ek=None, + example=dedent( + """\ + For SeriesGroupBy: + + >>> lst = ['a', 'a', 'b', 'b'] + >>> ser = pd.Series([1, 2, 3, 4], index=lst) + >>> ser + a 1 + a 2 + b 3 + b 4 + dtype: int64 + >>> ser.groupby(level=0).min() + a 1 + b 3 + dtype: int64 + + For DataFrameGroupBy: + + >>> data = [[1, 8, 2], [1, 2, 5], [2, 5, 8], [2, 6, 9]] + >>> df = pd.DataFrame(data, columns=["a", "b", "c"], + ... index=["tiger", "leopard", "cheetah", "lion"]) + >>> df + a b c + tiger 1 8 2 + leopard 1 2 5 + cheetah 2 5 8 + lion 2 6 9 + >>> df.groupby("a").min() + b c + a + 1 2 2 + 2 5 8""" + ), + ) + def min( + self, + numeric_only: bool = False, + min_count: int = -1, + engine: Literal["cython", "numba"] | None = None, + engine_kwargs: dict[str, bool] | None = None, + ): + if maybe_use_numba(engine): + from pandas.core._numba.kernels import grouped_min_max + + return self._numba_agg_general( + grouped_min_max, + executor.identity_dtype_mapping, + engine_kwargs, + min_periods=min_count, + is_max=False, + ) + else: + return self._agg_general( + numeric_only=numeric_only, + min_count=min_count, + alias="min", + npfunc=np.min, + ) + + @final + @doc( + _groupby_agg_method_engine_template, + fname="max", + no=False, + mc=-1, + e=None, + ek=None, + example=dedent( + """\ + For SeriesGroupBy: + + >>> lst = ['a', 'a', 'b', 'b'] + >>> ser = pd.Series([1, 2, 3, 4], index=lst) + >>> ser + a 1 + a 2 + b 3 + b 4 + dtype: int64 + >>> ser.groupby(level=0).max() + a 2 + b 4 + dtype: int64 + + For DataFrameGroupBy: + + >>> data = [[1, 8, 2], [1, 2, 5], [2, 5, 8], [2, 6, 9]] + >>> df = pd.DataFrame(data, columns=["a", "b", "c"], + ... index=["tiger", "leopard", "cheetah", "lion"]) + >>> df + a b c + tiger 1 8 2 + leopard 1 2 5 + cheetah 2 5 8 + lion 2 6 9 + >>> df.groupby("a").max() + b c + a + 1 8 5 + 2 6 9""" + ), + ) + def max( + self, + numeric_only: bool = False, + min_count: int = -1, + engine: Literal["cython", "numba"] | None = None, + engine_kwargs: dict[str, bool] | None = None, + ): + if maybe_use_numba(engine): + from pandas.core._numba.kernels import grouped_min_max + + return self._numba_agg_general( + grouped_min_max, + executor.identity_dtype_mapping, + engine_kwargs, + min_periods=min_count, + is_max=True, + ) + else: + return self._agg_general( + numeric_only=numeric_only, + min_count=min_count, + alias="max", + npfunc=np.max, + ) + + @final + def first( + self, numeric_only: bool = False, min_count: int = -1, skipna: bool = True + ) -> NDFrameT: + """ + Compute the first entry of each column within each group. + + Defaults to skipping NA elements. + + Parameters + ---------- + numeric_only : bool, default False + Include only float, int, boolean columns. + min_count : int, default -1 + The required number of valid values to perform the operation. If fewer + than ``min_count`` valid values are present the result will be NA. + skipna : bool, default True + Exclude NA/null values. If an entire row/column is NA, the result + will be NA. + + .. versionadded:: 2.2.1 + + Returns + ------- + Series or DataFrame + First values within each group. + + See Also + -------- + DataFrame.groupby : Apply a function groupby to each row or column of a + DataFrame. + pandas.core.groupby.DataFrameGroupBy.last : Compute the last non-null entry + of each column. + pandas.core.groupby.DataFrameGroupBy.nth : Take the nth row from each group. + + Examples + -------- + >>> df = pd.DataFrame(dict(A=[1, 1, 3], B=[None, 5, 6], C=[1, 2, 3], + ... D=['3/11/2000', '3/12/2000', '3/13/2000'])) + >>> df['D'] = pd.to_datetime(df['D']) + >>> df.groupby("A").first() + B C D + A + 1 5.0 1 2000-03-11 + 3 6.0 3 2000-03-13 + >>> df.groupby("A").first(min_count=2) + B C D + A + 1 NaN 1.0 2000-03-11 + 3 NaN NaN NaT + >>> df.groupby("A").first(numeric_only=True) + B C + A + 1 5.0 1 + 3 6.0 3 + """ + + def first_compat(obj: NDFrameT, axis: AxisInt = 0): + def first(x: Series): + """Helper function for first item that isn't NA.""" + arr = x.array[notna(x.array)] + if not len(arr): + return x.array.dtype.na_value + return arr[0] + + if isinstance(obj, DataFrame): + return obj.apply(first, axis=axis) + elif isinstance(obj, Series): + return first(obj) + else: # pragma: no cover + raise TypeError(type(obj)) + + return self._agg_general( + numeric_only=numeric_only, + min_count=min_count, + alias="first", + npfunc=first_compat, + skipna=skipna, + ) + + @final + def last( + self, numeric_only: bool = False, min_count: int = -1, skipna: bool = True + ) -> NDFrameT: + """ + Compute the last entry of each column within each group. + + Defaults to skipping NA elements. + + Parameters + ---------- + numeric_only : bool, default False + Include only float, int, boolean columns. If None, will attempt to use + everything, then use only numeric data. + min_count : int, default -1 + The required number of valid values to perform the operation. If fewer + than ``min_count`` valid values are present the result will be NA. + skipna : bool, default True + Exclude NA/null values. If an entire row/column is NA, the result + will be NA. + + .. versionadded:: 2.2.1 + + Returns + ------- + Series or DataFrame + Last of values within each group. + + See Also + -------- + DataFrame.groupby : Apply a function groupby to each row or column of a + DataFrame. + pandas.core.groupby.DataFrameGroupBy.first : Compute the first non-null entry + of each column. + pandas.core.groupby.DataFrameGroupBy.nth : Take the nth row from each group. + + Examples + -------- + >>> df = pd.DataFrame(dict(A=[1, 1, 3], B=[5, None, 6], C=[1, 2, 3])) + >>> df.groupby("A").last() + B C + A + 1 5.0 2 + 3 6.0 3 + """ + + def last_compat(obj: NDFrameT, axis: AxisInt = 0): + def last(x: Series): + """Helper function for last item that isn't NA.""" + arr = x.array[notna(x.array)] + if not len(arr): + return x.array.dtype.na_value + return arr[-1] + + if isinstance(obj, DataFrame): + return obj.apply(last, axis=axis) + elif isinstance(obj, Series): + return last(obj) + else: # pragma: no cover + raise TypeError(type(obj)) + + return self._agg_general( + numeric_only=numeric_only, + min_count=min_count, + alias="last", + npfunc=last_compat, + skipna=skipna, + ) + + @final + def ohlc(self) -> DataFrame: + """ + Compute open, high, low and close values of a group, excluding missing values. + + For multiple groupings, the result index will be a MultiIndex + + Returns + ------- + DataFrame + Open, high, low and close values within each group. + + Examples + -------- + + For SeriesGroupBy: + + >>> lst = ['SPX', 'CAC', 'SPX', 'CAC', 'SPX', 'CAC', 'SPX', 'CAC',] + >>> ser = pd.Series([3.4, 9.0, 7.2, 5.2, 8.8, 9.4, 0.1, 0.5], index=lst) + >>> ser + SPX 3.4 + CAC 9.0 + SPX 7.2 + CAC 5.2 + SPX 8.8 + CAC 9.4 + SPX 0.1 + CAC 0.5 + dtype: float64 + >>> ser.groupby(level=0).ohlc() + open high low close + CAC 9.0 9.4 0.5 0.5 + SPX 3.4 8.8 0.1 0.1 + + For DataFrameGroupBy: + + >>> data = {2022: [1.2, 2.3, 8.9, 4.5, 4.4, 3, 2 , 1], + ... 2023: [3.4, 9.0, 7.2, 5.2, 8.8, 9.4, 8.2, 1.0]} + >>> df = pd.DataFrame(data, index=['SPX', 'CAC', 'SPX', 'CAC', + ... 'SPX', 'CAC', 'SPX', 'CAC']) + >>> df + 2022 2023 + SPX 1.2 3.4 + CAC 2.3 9.0 + SPX 8.9 7.2 + CAC 4.5 5.2 + SPX 4.4 8.8 + CAC 3.0 9.4 + SPX 2.0 8.2 + CAC 1.0 1.0 + >>> df.groupby(level=0).ohlc() + 2022 2023 + open high low close open high low close + CAC 2.3 4.5 1.0 1.0 9.0 9.4 1.0 1.0 + SPX 1.2 8.9 1.2 2.0 3.4 8.8 3.4 8.2 + + For Resampler: + + >>> ser = pd.Series([1, 3, 2, 4, 3, 5], + ... index=pd.DatetimeIndex(['2023-01-01', + ... '2023-01-10', + ... '2023-01-15', + ... '2023-02-01', + ... '2023-02-10', + ... '2023-02-15'])) + >>> ser.resample('MS').ohlc() + open high low close + 2023-01-01 1 3 1 2 + 2023-02-01 4 5 3 5 + """ + if self.obj.ndim == 1: + obj = self._selected_obj + + is_numeric = is_numeric_dtype(obj.dtype) + if not is_numeric: + raise DataError("No numeric types to aggregate") + + res_values = self._grouper._cython_operation( + "aggregate", obj._values, "ohlc", axis=0, min_count=-1 + ) + + agg_names = ["open", "high", "low", "close"] + result = self.obj._constructor_expanddim( + res_values, index=self._grouper.result_index, columns=agg_names + ) + return self._reindex_output(result) + + result = self._apply_to_column_groupbys(lambda sgb: sgb.ohlc()) + return result + + @doc(DataFrame.describe) + def describe( + self, + percentiles=None, + include=None, + exclude=None, + ) -> NDFrameT: + obj = self._obj_with_exclusions + + if len(obj) == 0: + described = obj.describe( + percentiles=percentiles, include=include, exclude=exclude + ) + if obj.ndim == 1: + result = described + else: + result = described.unstack() + return result.to_frame().T.iloc[:0] + + with com.temp_setattr(self, "as_index", True): + result = self._python_apply_general( + lambda x: x.describe( + percentiles=percentiles, include=include, exclude=exclude + ), + obj, + not_indexed_same=True, + ) + if self.axis == 1: + return result.T + + # GH#49256 - properly handle the grouping column(s) + result = result.unstack() + if not self.as_index: + result = self._insert_inaxis_grouper(result) + result.index = default_index(len(result)) + + return result + + @final + def resample(self, rule, *args, include_groups: bool = True, **kwargs) -> Resampler: + """ + Provide resampling when using a TimeGrouper. + + Given a grouper, the function resamples it according to a string + "string" -> "frequency". + + See the :ref:`frequency aliases ` + documentation for more details. + + Parameters + ---------- + rule : str or DateOffset + The offset string or object representing target grouper conversion. + *args + Possible arguments are `how`, `fill_method`, `limit`, `kind` and + `on`, and other arguments of `TimeGrouper`. + include_groups : bool, default True + When True, will attempt to include the groupings in the operation in + the case that they are columns of the DataFrame. If this raises a + TypeError, the result will be computed with the groupings excluded. + When False, the groupings will be excluded when applying ``func``. + + .. versionadded:: 2.2.0 + + .. deprecated:: 2.2.0 + + Setting include_groups to True is deprecated. Only the value + False will be allowed in a future version of pandas. + + **kwargs + Possible arguments are `how`, `fill_method`, `limit`, `kind` and + `on`, and other arguments of `TimeGrouper`. + + Returns + ------- + pandas.api.typing.DatetimeIndexResamplerGroupby, + pandas.api.typing.PeriodIndexResamplerGroupby, or + pandas.api.typing.TimedeltaIndexResamplerGroupby + Return a new groupby object, with type depending on the data + being resampled. + + See Also + -------- + Grouper : Specify a frequency to resample with when + grouping by a key. + DatetimeIndex.resample : Frequency conversion and resampling of + time series. + + Examples + -------- + >>> idx = pd.date_range('1/1/2000', periods=4, freq='min') + >>> df = pd.DataFrame(data=4 * [range(2)], + ... index=idx, + ... columns=['a', 'b']) + >>> df.iloc[2, 0] = 5 + >>> df + a b + 2000-01-01 00:00:00 0 1 + 2000-01-01 00:01:00 0 1 + 2000-01-01 00:02:00 5 1 + 2000-01-01 00:03:00 0 1 + + Downsample the DataFrame into 3 minute bins and sum the values of + the timestamps falling into a bin. + + >>> df.groupby('a').resample('3min', include_groups=False).sum() + b + a + 0 2000-01-01 00:00:00 2 + 2000-01-01 00:03:00 1 + 5 2000-01-01 00:00:00 1 + + Upsample the series into 30 second bins. + + >>> df.groupby('a').resample('30s', include_groups=False).sum() + b + a + 0 2000-01-01 00:00:00 1 + 2000-01-01 00:00:30 0 + 2000-01-01 00:01:00 1 + 2000-01-01 00:01:30 0 + 2000-01-01 00:02:00 0 + 2000-01-01 00:02:30 0 + 2000-01-01 00:03:00 1 + 5 2000-01-01 00:02:00 1 + + Resample by month. Values are assigned to the month of the period. + + >>> df.groupby('a').resample('ME', include_groups=False).sum() + b + a + 0 2000-01-31 3 + 5 2000-01-31 1 + + Downsample the series into 3 minute bins as above, but close the right + side of the bin interval. + + >>> ( + ... df.groupby('a') + ... .resample('3min', closed='right', include_groups=False) + ... .sum() + ... ) + b + a + 0 1999-12-31 23:57:00 1 + 2000-01-01 00:00:00 2 + 5 2000-01-01 00:00:00 1 + + Downsample the series into 3 minute bins and close the right side of + the bin interval, but label each bin using the right edge instead of + the left. + + >>> ( + ... df.groupby('a') + ... .resample('3min', closed='right', label='right', include_groups=False) + ... .sum() + ... ) + b + a + 0 2000-01-01 00:00:00 1 + 2000-01-01 00:03:00 2 + 5 2000-01-01 00:03:00 1 + """ + from pandas.core.resample import get_resampler_for_grouping + + # mypy flags that include_groups could be specified via `*args` or `**kwargs` + # GH#54961 would resolve. + return get_resampler_for_grouping( # type: ignore[misc] + self, rule, *args, include_groups=include_groups, **kwargs + ) + + @final + def rolling(self, *args, **kwargs) -> RollingGroupby: + """ + Return a rolling grouper, providing rolling functionality per group. + + Parameters + ---------- + window : int, timedelta, str, offset, or BaseIndexer subclass + Size of the moving window. + + If an integer, the fixed number of observations used for + each window. + + If a timedelta, str, or offset, the time period of each window. Each + window will be a variable sized based on the observations included in + the time-period. This is only valid for datetimelike indexes. + To learn more about the offsets & frequency strings, please see `this link + `__. + + If a BaseIndexer subclass, the window boundaries + based on the defined ``get_window_bounds`` method. Additional rolling + keyword arguments, namely ``min_periods``, ``center``, ``closed`` and + ``step`` will be passed to ``get_window_bounds``. + + min_periods : int, default None + Minimum number of observations in window required to have a value; + otherwise, result is ``np.nan``. + + For a window that is specified by an offset, + ``min_periods`` will default to 1. + + For a window that is specified by an integer, ``min_periods`` will default + to the size of the window. + + center : bool, default False + If False, set the window labels as the right edge of the window index. + + If True, set the window labels as the center of the window index. + + win_type : str, default None + If ``None``, all points are evenly weighted. + + If a string, it must be a valid `scipy.signal window function + `__. + + Certain Scipy window types require additional parameters to be passed + in the aggregation function. The additional parameters must match + the keywords specified in the Scipy window type method signature. + + on : str, optional + For a DataFrame, a column label or Index level on which + to calculate the rolling window, rather than the DataFrame's index. + + Provided integer column is ignored and excluded from result since + an integer index is not used to calculate the rolling window. + + axis : int or str, default 0 + If ``0`` or ``'index'``, roll across the rows. + + If ``1`` or ``'columns'``, roll across the columns. + + For `Series` this parameter is unused and defaults to 0. + + closed : str, default None + If ``'right'``, the first point in the window is excluded from calculations. + + If ``'left'``, the last point in the window is excluded from calculations. + + If ``'both'``, no points in the window are excluded from calculations. + + If ``'neither'``, the first and last points in the window are excluded + from calculations. + + Default ``None`` (``'right'``). + + method : str {'single', 'table'}, default 'single' + Execute the rolling operation per single column or row (``'single'``) + or over the entire object (``'table'``). + + This argument is only implemented when specifying ``engine='numba'`` + in the method call. + + Returns + ------- + pandas.api.typing.RollingGroupby + Return a new grouper with our rolling appended. + + See Also + -------- + Series.rolling : Calling object with Series data. + DataFrame.rolling : Calling object with DataFrames. + Series.groupby : Apply a function groupby to a Series. + DataFrame.groupby : Apply a function groupby. + + Examples + -------- + >>> df = pd.DataFrame({'A': [1, 1, 2, 2], + ... 'B': [1, 2, 3, 4], + ... 'C': [0.362, 0.227, 1.267, -0.562]}) + >>> df + A B C + 0 1 1 0.362 + 1 1 2 0.227 + 2 2 3 1.267 + 3 2 4 -0.562 + + >>> df.groupby('A').rolling(2).sum() + B C + A + 1 0 NaN NaN + 1 3.0 0.589 + 2 2 NaN NaN + 3 7.0 0.705 + + >>> df.groupby('A').rolling(2, min_periods=1).sum() + B C + A + 1 0 1.0 0.362 + 1 3.0 0.589 + 2 2 3.0 1.267 + 3 7.0 0.705 + + >>> df.groupby('A').rolling(2, on='B').sum() + B C + A + 1 0 1 NaN + 1 2 0.589 + 2 2 3 NaN + 3 4 0.705 + """ + from pandas.core.window import RollingGroupby + + return RollingGroupby( + self._selected_obj, + *args, + _grouper=self._grouper, + _as_index=self.as_index, + **kwargs, + ) + + @final + @Substitution(name="groupby") + @Appender(_common_see_also) + def expanding(self, *args, **kwargs) -> ExpandingGroupby: + """ + Return an expanding grouper, providing expanding + functionality per group. + + Returns + ------- + pandas.api.typing.ExpandingGroupby + """ + from pandas.core.window import ExpandingGroupby + + return ExpandingGroupby( + self._selected_obj, + *args, + _grouper=self._grouper, + **kwargs, + ) + + @final + @Substitution(name="groupby") + @Appender(_common_see_also) + def ewm(self, *args, **kwargs) -> ExponentialMovingWindowGroupby: + """ + Return an ewm grouper, providing ewm functionality per group. + + Returns + ------- + pandas.api.typing.ExponentialMovingWindowGroupby + """ + from pandas.core.window import ExponentialMovingWindowGroupby + + return ExponentialMovingWindowGroupby( + self._selected_obj, + *args, + _grouper=self._grouper, + **kwargs, + ) + + @final + def _fill(self, direction: Literal["ffill", "bfill"], limit: int | None = None): + """ + Shared function for `pad` and `backfill` to call Cython method. + + Parameters + ---------- + direction : {'ffill', 'bfill'} + Direction passed to underlying Cython function. `bfill` will cause + values to be filled backwards. `ffill` and any other values will + default to a forward fill + limit : int, default None + Maximum number of consecutive values to fill. If `None`, this + method will convert to -1 prior to passing to Cython + + Returns + ------- + `Series` or `DataFrame` with filled values + + See Also + -------- + pad : Returns Series with minimum number of char in object. + backfill : Backward fill the missing values in the dataset. + """ + # Need int value for Cython + if limit is None: + limit = -1 + + ids, _, _ = self._grouper.group_info + sorted_labels = np.argsort(ids, kind="mergesort").astype(np.intp, copy=False) + if direction == "bfill": + sorted_labels = sorted_labels[::-1] + + col_func = partial( + libgroupby.group_fillna_indexer, + labels=ids, + sorted_labels=sorted_labels, + limit=limit, + dropna=self.dropna, + ) + + def blk_func(values: ArrayLike) -> ArrayLike: + mask = isna(values) + if values.ndim == 1: + indexer = np.empty(values.shape, dtype=np.intp) + col_func(out=indexer, mask=mask) + return algorithms.take_nd(values, indexer) + + else: + # We broadcast algorithms.take_nd analogous to + # np.take_along_axis + if isinstance(values, np.ndarray): + dtype = values.dtype + if self._grouper.has_dropped_na: + # dropped null groups give rise to nan in the result + dtype = ensure_dtype_can_hold_na(values.dtype) + out = np.empty(values.shape, dtype=dtype) + else: + # Note: we only get here with backfill/pad, + # so if we have a dtype that cannot hold NAs, + # then there will be no -1s in indexer, so we can use + # the original dtype (no need to ensure_dtype_can_hold_na) + out = type(values)._empty(values.shape, dtype=values.dtype) + + for i, value_element in enumerate(values): + # call group_fillna_indexer column-wise + indexer = np.empty(values.shape[1], dtype=np.intp) + col_func(out=indexer, mask=mask[i]) + out[i, :] = algorithms.take_nd(value_element, indexer) + return out + + mgr = self._get_data_to_aggregate() + res_mgr = mgr.apply(blk_func) + + new_obj = self._wrap_agged_manager(res_mgr) + + if self.axis == 1: + # Only relevant for DataFrameGroupBy + new_obj = new_obj.T + new_obj.columns = self.obj.columns + + new_obj.index = self.obj.index + return new_obj + + @final + @Substitution(name="groupby") + def ffill(self, limit: int | None = None): + """ + Forward fill the values. + + Parameters + ---------- + limit : int, optional + Limit of how many values to fill. + + Returns + ------- + Series or DataFrame + Object with missing values filled. + + See Also + -------- + Series.ffill: Returns Series with minimum number of char in object. + DataFrame.ffill: Object with missing values filled or None if inplace=True. + Series.fillna: Fill NaN values of a Series. + DataFrame.fillna: Fill NaN values of a DataFrame. + + Examples + -------- + + For SeriesGroupBy: + + >>> key = [0, 0, 1, 1] + >>> ser = pd.Series([np.nan, 2, 3, np.nan], index=key) + >>> ser + 0 NaN + 0 2.0 + 1 3.0 + 1 NaN + dtype: float64 + >>> ser.groupby(level=0).ffill() + 0 NaN + 0 2.0 + 1 3.0 + 1 3.0 + dtype: float64 + + For DataFrameGroupBy: + + >>> df = pd.DataFrame( + ... { + ... "key": [0, 0, 1, 1, 1], + ... "A": [np.nan, 2, np.nan, 3, np.nan], + ... "B": [2, 3, np.nan, np.nan, np.nan], + ... "C": [np.nan, np.nan, 2, np.nan, np.nan], + ... } + ... ) + >>> df + key A B C + 0 0 NaN 2.0 NaN + 1 0 2.0 3.0 NaN + 2 1 NaN NaN 2.0 + 3 1 3.0 NaN NaN + 4 1 NaN NaN NaN + + Propagate non-null values forward or backward within each group along columns. + + >>> df.groupby("key").ffill() + A B C + 0 NaN 2.0 NaN + 1 2.0 3.0 NaN + 2 NaN NaN 2.0 + 3 3.0 NaN 2.0 + 4 3.0 NaN 2.0 + + Propagate non-null values forward or backward within each group along rows. + + >>> df.T.groupby(np.array([0, 0, 1, 1])).ffill().T + key A B C + 0 0.0 0.0 2.0 2.0 + 1 0.0 2.0 3.0 3.0 + 2 1.0 1.0 NaN 2.0 + 3 1.0 3.0 NaN NaN + 4 1.0 1.0 NaN NaN + + Only replace the first NaN element within a group along rows. + + >>> df.groupby("key").ffill(limit=1) + A B C + 0 NaN 2.0 NaN + 1 2.0 3.0 NaN + 2 NaN NaN 2.0 + 3 3.0 NaN 2.0 + 4 3.0 NaN NaN + """ + return self._fill("ffill", limit=limit) + + @final + @Substitution(name="groupby") + def bfill(self, limit: int | None = None): + """ + Backward fill the values. + + Parameters + ---------- + limit : int, optional + Limit of how many values to fill. + + Returns + ------- + Series or DataFrame + Object with missing values filled. + + See Also + -------- + Series.bfill : Backward fill the missing values in the dataset. + DataFrame.bfill: Backward fill the missing values in the dataset. + Series.fillna: Fill NaN values of a Series. + DataFrame.fillna: Fill NaN values of a DataFrame. + + Examples + -------- + + With Series: + + >>> index = ['Falcon', 'Falcon', 'Parrot', 'Parrot', 'Parrot'] + >>> s = pd.Series([None, 1, None, None, 3], index=index) + >>> s + Falcon NaN + Falcon 1.0 + Parrot NaN + Parrot NaN + Parrot 3.0 + dtype: float64 + >>> s.groupby(level=0).bfill() + Falcon 1.0 + Falcon 1.0 + Parrot 3.0 + Parrot 3.0 + Parrot 3.0 + dtype: float64 + >>> s.groupby(level=0).bfill(limit=1) + Falcon 1.0 + Falcon 1.0 + Parrot NaN + Parrot 3.0 + Parrot 3.0 + dtype: float64 + + With DataFrame: + + >>> df = pd.DataFrame({'A': [1, None, None, None, 4], + ... 'B': [None, None, 5, None, 7]}, index=index) + >>> df + A B + Falcon 1.0 NaN + Falcon NaN NaN + Parrot NaN 5.0 + Parrot NaN NaN + Parrot 4.0 7.0 + >>> df.groupby(level=0).bfill() + A B + Falcon 1.0 NaN + Falcon NaN NaN + Parrot 4.0 5.0 + Parrot 4.0 7.0 + Parrot 4.0 7.0 + >>> df.groupby(level=0).bfill(limit=1) + A B + Falcon 1.0 NaN + Falcon NaN NaN + Parrot NaN 5.0 + Parrot 4.0 7.0 + Parrot 4.0 7.0 + """ + return self._fill("bfill", limit=limit) + + @final + @property + @Substitution(name="groupby") + @Substitution(see_also=_common_see_also) + def nth(self) -> GroupByNthSelector: + """ + Take the nth row from each group if n is an int, otherwise a subset of rows. + + Can be either a call or an index. dropna is not available with index notation. + Index notation accepts a comma separated list of integers and slices. + + If dropna, will take the nth non-null row, dropna is either + 'all' or 'any'; this is equivalent to calling dropna(how=dropna) + before the groupby. + + Parameters + ---------- + n : int, slice or list of ints and slices + A single nth value for the row or a list of nth values or slices. + + .. versionchanged:: 1.4.0 + Added slice and lists containing slices. + Added index notation. + + dropna : {'any', 'all', None}, default None + Apply the specified dropna operation before counting which row is + the nth row. Only supported if n is an int. + + Returns + ------- + Series or DataFrame + N-th value within each group. + %(see_also)s + Examples + -------- + + >>> df = pd.DataFrame({'A': [1, 1, 2, 1, 2], + ... 'B': [np.nan, 2, 3, 4, 5]}, columns=['A', 'B']) + >>> g = df.groupby('A') + >>> g.nth(0) + A B + 0 1 NaN + 2 2 3.0 + >>> g.nth(1) + A B + 1 1 2.0 + 4 2 5.0 + >>> g.nth(-1) + A B + 3 1 4.0 + 4 2 5.0 + >>> g.nth([0, 1]) + A B + 0 1 NaN + 1 1 2.0 + 2 2 3.0 + 4 2 5.0 + >>> g.nth(slice(None, -1)) + A B + 0 1 NaN + 1 1 2.0 + 2 2 3.0 + + Index notation may also be used + + >>> g.nth[0, 1] + A B + 0 1 NaN + 1 1 2.0 + 2 2 3.0 + 4 2 5.0 + >>> g.nth[:-1] + A B + 0 1 NaN + 1 1 2.0 + 2 2 3.0 + + Specifying `dropna` allows ignoring ``NaN`` values + + >>> g.nth(0, dropna='any') + A B + 1 1 2.0 + 2 2 3.0 + + When the specified ``n`` is larger than any of the groups, an + empty DataFrame is returned + + >>> g.nth(3, dropna='any') + Empty DataFrame + Columns: [A, B] + Index: [] + """ + return GroupByNthSelector(self) + + def _nth( + self, + n: PositionalIndexer | tuple, + dropna: Literal["any", "all", None] = None, + ) -> NDFrameT: + if not dropna: + mask = self._make_mask_from_positional_indexer(n) + + ids, _, _ = self._grouper.group_info + + # Drop NA values in grouping + mask = mask & (ids != -1) + + out = self._mask_selected_obj(mask) + return out + + # dropna is truthy + if not is_integer(n): + raise ValueError("dropna option only supported for an integer argument") + + if dropna not in ["any", "all"]: + # Note: when agg-ing picker doesn't raise this, just returns NaN + raise ValueError( + "For a DataFrame or Series groupby.nth, dropna must be " + "either None, 'any' or 'all', " + f"(was passed {dropna})." + ) + + # old behaviour, but with all and any support for DataFrames. + # modified in GH 7559 to have better perf + n = cast(int, n) + dropped = self._selected_obj.dropna(how=dropna, axis=self.axis) + + # get a new grouper for our dropped obj + grouper: np.ndarray | Index | ops.BaseGrouper + if len(dropped) == len(self._selected_obj): + # Nothing was dropped, can use the same grouper + grouper = self._grouper + else: + # we don't have the grouper info available + # (e.g. we have selected out + # a column that is not in the current object) + axis = self._grouper.axis + grouper = self._grouper.codes_info[axis.isin(dropped.index)] + if self._grouper.has_dropped_na: + # Null groups need to still be encoded as -1 when passed to groupby + nulls = grouper == -1 + # error: No overload variant of "where" matches argument types + # "Any", "NAType", "Any" + values = np.where(nulls, NA, grouper) # type: ignore[call-overload] + grouper = Index(values, dtype="Int64") + + if self.axis == 1: + grb = dropped.T.groupby(grouper, as_index=self.as_index, sort=self.sort) + else: + grb = dropped.groupby(grouper, as_index=self.as_index, sort=self.sort) + return grb.nth(n) + + @final + def quantile( + self, + q: float | AnyArrayLike = 0.5, + interpolation: str = "linear", + numeric_only: bool = False, + ): + """ + Return group values at the given quantile, a la numpy.percentile. + + Parameters + ---------- + q : float or array-like, default 0.5 (50% quantile) + Value(s) between 0 and 1 providing the quantile(s) to compute. + interpolation : {'linear', 'lower', 'higher', 'midpoint', 'nearest'} + Method to use when the desired quantile falls between two points. + numeric_only : bool, default False + Include only `float`, `int` or `boolean` data. + + .. versionadded:: 1.5.0 + + .. versionchanged:: 2.0.0 + + numeric_only now defaults to ``False``. + + Returns + ------- + Series or DataFrame + Return type determined by caller of GroupBy object. + + See Also + -------- + Series.quantile : Similar method for Series. + DataFrame.quantile : Similar method for DataFrame. + numpy.percentile : NumPy method to compute qth percentile. + + Examples + -------- + >>> df = pd.DataFrame([ + ... ['a', 1], ['a', 2], ['a', 3], + ... ['b', 1], ['b', 3], ['b', 5] + ... ], columns=['key', 'val']) + >>> df.groupby('key').quantile() + val + key + a 2.0 + b 3.0 + """ + mgr = self._get_data_to_aggregate(numeric_only=numeric_only, name="quantile") + obj = self._wrap_agged_manager(mgr) + if self.axis == 1: + splitter = self._grouper._get_splitter(obj.T, axis=self.axis) + sdata = splitter._sorted_data.T + else: + splitter = self._grouper._get_splitter(obj, axis=self.axis) + sdata = splitter._sorted_data + + starts, ends = lib.generate_slices(splitter._slabels, splitter.ngroups) + + def pre_processor(vals: ArrayLike) -> tuple[np.ndarray, DtypeObj | None]: + if is_object_dtype(vals.dtype): + raise TypeError( + "'quantile' cannot be performed against 'object' dtypes!" + ) + + inference: DtypeObj | None = None + if isinstance(vals, BaseMaskedArray) and is_numeric_dtype(vals.dtype): + out = vals.to_numpy(dtype=float, na_value=np.nan) + inference = vals.dtype + elif is_integer_dtype(vals.dtype): + if isinstance(vals, ExtensionArray): + out = vals.to_numpy(dtype=float, na_value=np.nan) + else: + out = vals + inference = np.dtype(np.int64) + elif is_bool_dtype(vals.dtype) and isinstance(vals, ExtensionArray): + out = vals.to_numpy(dtype=float, na_value=np.nan) + elif is_bool_dtype(vals.dtype): + # GH#51424 deprecate to match Series/DataFrame behavior + warnings.warn( + f"Allowing bool dtype in {type(self).__name__}.quantile is " + "deprecated and will raise in a future version, matching " + "the Series/DataFrame behavior. Cast to uint8 dtype before " + "calling quantile instead.", + FutureWarning, + stacklevel=find_stack_level(), + ) + out = np.asarray(vals) + elif needs_i8_conversion(vals.dtype): + inference = vals.dtype + # In this case we need to delay the casting until after the + # np.lexsort below. + # error: Incompatible return value type (got + # "Tuple[Union[ExtensionArray, ndarray[Any, Any]], Union[Any, + # ExtensionDtype]]", expected "Tuple[ndarray[Any, Any], + # Optional[Union[dtype[Any], ExtensionDtype]]]") + return vals, inference # type: ignore[return-value] + elif isinstance(vals, ExtensionArray) and is_float_dtype(vals.dtype): + inference = np.dtype(np.float64) + out = vals.to_numpy(dtype=float, na_value=np.nan) + else: + out = np.asarray(vals) + + return out, inference + + def post_processor( + vals: np.ndarray, + inference: DtypeObj | None, + result_mask: np.ndarray | None, + orig_vals: ArrayLike, + ) -> ArrayLike: + if inference: + # Check for edge case + if isinstance(orig_vals, BaseMaskedArray): + assert result_mask is not None # for mypy + + if interpolation in {"linear", "midpoint"} and not is_float_dtype( + orig_vals + ): + return FloatingArray(vals, result_mask) + else: + # Item "ExtensionDtype" of "Union[ExtensionDtype, str, + # dtype[Any], Type[object]]" has no attribute "numpy_dtype" + # [union-attr] + with warnings.catch_warnings(): + # vals.astype with nan can warn with numpy >1.24 + warnings.filterwarnings("ignore", category=RuntimeWarning) + return type(orig_vals)( + vals.astype( + inference.numpy_dtype # type: ignore[union-attr] + ), + result_mask, + ) + + elif not ( + is_integer_dtype(inference) + and interpolation in {"linear", "midpoint"} + ): + if needs_i8_conversion(inference): + # error: Item "ExtensionArray" of "Union[ExtensionArray, + # ndarray[Any, Any]]" has no attribute "_ndarray" + vals = vals.astype("i8").view( + orig_vals._ndarray.dtype # type: ignore[union-attr] + ) + # error: Item "ExtensionArray" of "Union[ExtensionArray, + # ndarray[Any, Any]]" has no attribute "_from_backing_data" + return orig_vals._from_backing_data( # type: ignore[union-attr] + vals + ) + + assert isinstance(inference, np.dtype) # for mypy + return vals.astype(inference) + + return vals + + qs = np.array(q, dtype=np.float64) + pass_qs: np.ndarray | None = qs + if is_scalar(q): + qs = np.array([q], dtype=np.float64) + pass_qs = None + + ids, _, ngroups = self._grouper.group_info + nqs = len(qs) + + func = partial( + libgroupby.group_quantile, + labels=ids, + qs=qs, + interpolation=interpolation, + starts=starts, + ends=ends, + ) + + def blk_func(values: ArrayLike) -> ArrayLike: + orig_vals = values + if isinstance(values, BaseMaskedArray): + mask = values._mask + result_mask = np.zeros((ngroups, nqs), dtype=np.bool_) + else: + mask = isna(values) + result_mask = None + + is_datetimelike = needs_i8_conversion(values.dtype) + + vals, inference = pre_processor(values) + + ncols = 1 + if vals.ndim == 2: + ncols = vals.shape[0] + + out = np.empty((ncols, ngroups, nqs), dtype=np.float64) + + if is_datetimelike: + vals = vals.view("i8") + + if vals.ndim == 1: + # EA is always 1d + func( + out[0], + values=vals, + mask=mask, + result_mask=result_mask, + is_datetimelike=is_datetimelike, + ) + else: + for i in range(ncols): + func( + out[i], + values=vals[i], + mask=mask[i], + result_mask=None, + is_datetimelike=is_datetimelike, + ) + + if vals.ndim == 1: + out = out.ravel("K") + if result_mask is not None: + result_mask = result_mask.ravel("K") + else: + out = out.reshape(ncols, ngroups * nqs) + + return post_processor(out, inference, result_mask, orig_vals) + + res_mgr = sdata._mgr.grouped_reduce(blk_func) + + res = self._wrap_agged_manager(res_mgr) + return self._wrap_aggregated_output(res, qs=pass_qs) + + @final + @Substitution(name="groupby") + def ngroup(self, ascending: bool = True): + """ + Number each group from 0 to the number of groups - 1. + + This is the enumerative complement of cumcount. Note that the + numbers given to the groups match the order in which the groups + would be seen when iterating over the groupby object, not the + order they are first observed. + + Groups with missing keys (where `pd.isna()` is True) will be labeled with `NaN` + and will be skipped from the count. + + Parameters + ---------- + ascending : bool, default True + If False, number in reverse, from number of group - 1 to 0. + + Returns + ------- + Series + Unique numbers for each group. + + See Also + -------- + .cumcount : Number the rows in each group. + + Examples + -------- + >>> df = pd.DataFrame({"color": ["red", None, "red", "blue", "blue", "red"]}) + >>> df + color + 0 red + 1 None + 2 red + 3 blue + 4 blue + 5 red + >>> df.groupby("color").ngroup() + 0 1.0 + 1 NaN + 2 1.0 + 3 0.0 + 4 0.0 + 5 1.0 + dtype: float64 + >>> df.groupby("color", dropna=False).ngroup() + 0 1 + 1 2 + 2 1 + 3 0 + 4 0 + 5 1 + dtype: int64 + >>> df.groupby("color", dropna=False).ngroup(ascending=False) + 0 1 + 1 0 + 2 1 + 3 2 + 4 2 + 5 1 + dtype: int64 + """ + obj = self._obj_with_exclusions + index = obj._get_axis(self.axis) + comp_ids = self._grouper.group_info[0] + + dtype: type + if self._grouper.has_dropped_na: + comp_ids = np.where(comp_ids == -1, np.nan, comp_ids) + dtype = np.float64 + else: + dtype = np.int64 + + if any(ping._passed_categorical for ping in self._grouper.groupings): + # comp_ids reflect non-observed groups, we need only observed + comp_ids = rank_1d(comp_ids, ties_method="dense") - 1 + + result = self._obj_1d_constructor(comp_ids, index, dtype=dtype) + if not ascending: + result = self.ngroups - 1 - result + return result + + @final + @Substitution(name="groupby") + def cumcount(self, ascending: bool = True): + """ + Number each item in each group from 0 to the length of that group - 1. + + Essentially this is equivalent to + + .. code-block:: python + + self.apply(lambda x: pd.Series(np.arange(len(x)), x.index)) + + Parameters + ---------- + ascending : bool, default True + If False, number in reverse, from length of group - 1 to 0. + + Returns + ------- + Series + Sequence number of each element within each group. + + See Also + -------- + .ngroup : Number the groups themselves. + + Examples + -------- + >>> df = pd.DataFrame([['a'], ['a'], ['a'], ['b'], ['b'], ['a']], + ... columns=['A']) + >>> df + A + 0 a + 1 a + 2 a + 3 b + 4 b + 5 a + >>> df.groupby('A').cumcount() + 0 0 + 1 1 + 2 2 + 3 0 + 4 1 + 5 3 + dtype: int64 + >>> df.groupby('A').cumcount(ascending=False) + 0 3 + 1 2 + 2 1 + 3 1 + 4 0 + 5 0 + dtype: int64 + """ + index = self._obj_with_exclusions._get_axis(self.axis) + cumcounts = self._cumcount_array(ascending=ascending) + return self._obj_1d_constructor(cumcounts, index) + + @final + @Substitution(name="groupby") + @Substitution(see_also=_common_see_also) + def rank( + self, + method: str = "average", + ascending: bool = True, + na_option: str = "keep", + pct: bool = False, + axis: AxisInt | lib.NoDefault = lib.no_default, + ) -> NDFrameT: + """ + Provide the rank of values within each group. + + Parameters + ---------- + method : {'average', 'min', 'max', 'first', 'dense'}, default 'average' + * average: average rank of group. + * min: lowest rank in group. + * max: highest rank in group. + * first: ranks assigned in order they appear in the array. + * dense: like 'min', but rank always increases by 1 between groups. + ascending : bool, default True + False for ranks by high (1) to low (N). + na_option : {'keep', 'top', 'bottom'}, default 'keep' + * keep: leave NA values where they are. + * top: smallest rank if ascending. + * bottom: smallest rank if descending. + pct : bool, default False + Compute percentage rank of data within each group. + axis : int, default 0 + The axis of the object over which to compute the rank. + + .. deprecated:: 2.1.0 + For axis=1, operate on the underlying object instead. Otherwise + the axis keyword is not necessary. + + Returns + ------- + DataFrame with ranking of values within each group + %(see_also)s + Examples + -------- + >>> df = pd.DataFrame( + ... { + ... "group": ["a", "a", "a", "a", "a", "b", "b", "b", "b", "b"], + ... "value": [2, 4, 2, 3, 5, 1, 2, 4, 1, 5], + ... } + ... ) + >>> df + group value + 0 a 2 + 1 a 4 + 2 a 2 + 3 a 3 + 4 a 5 + 5 b 1 + 6 b 2 + 7 b 4 + 8 b 1 + 9 b 5 + >>> for method in ['average', 'min', 'max', 'dense', 'first']: + ... df[f'{method}_rank'] = df.groupby('group')['value'].rank(method) + >>> df + group value average_rank min_rank max_rank dense_rank first_rank + 0 a 2 1.5 1.0 2.0 1.0 1.0 + 1 a 4 4.0 4.0 4.0 3.0 4.0 + 2 a 2 1.5 1.0 2.0 1.0 2.0 + 3 a 3 3.0 3.0 3.0 2.0 3.0 + 4 a 5 5.0 5.0 5.0 4.0 5.0 + 5 b 1 1.5 1.0 2.0 1.0 1.0 + 6 b 2 3.0 3.0 3.0 2.0 3.0 + 7 b 4 4.0 4.0 4.0 3.0 4.0 + 8 b 1 1.5 1.0 2.0 1.0 2.0 + 9 b 5 5.0 5.0 5.0 4.0 5.0 + """ + if na_option not in {"keep", "top", "bottom"}: + msg = "na_option must be one of 'keep', 'top', or 'bottom'" + raise ValueError(msg) + + if axis is not lib.no_default: + axis = self.obj._get_axis_number(axis) + self._deprecate_axis(axis, "rank") + else: + axis = 0 + + kwargs = { + "ties_method": method, + "ascending": ascending, + "na_option": na_option, + "pct": pct, + } + if axis != 0: + # DataFrame uses different keyword name + kwargs["method"] = kwargs.pop("ties_method") + f = lambda x: x.rank(axis=axis, numeric_only=False, **kwargs) + result = self._python_apply_general( + f, self._selected_obj, is_transform=True + ) + return result + + return self._cython_transform( + "rank", + numeric_only=False, + axis=axis, + **kwargs, + ) + + @final + @Substitution(name="groupby") + @Substitution(see_also=_common_see_also) + def cumprod( + self, axis: Axis | lib.NoDefault = lib.no_default, *args, **kwargs + ) -> NDFrameT: + """ + Cumulative product for each group. + + Returns + ------- + Series or DataFrame + %(see_also)s + Examples + -------- + For SeriesGroupBy: + + >>> lst = ['a', 'a', 'b'] + >>> ser = pd.Series([6, 2, 0], index=lst) + >>> ser + a 6 + a 2 + b 0 + dtype: int64 + >>> ser.groupby(level=0).cumprod() + a 6 + a 12 + b 0 + dtype: int64 + + For DataFrameGroupBy: + + >>> data = [[1, 8, 2], [1, 2, 5], [2, 6, 9]] + >>> df = pd.DataFrame(data, columns=["a", "b", "c"], + ... index=["cow", "horse", "bull"]) + >>> df + a b c + cow 1 8 2 + horse 1 2 5 + bull 2 6 9 + >>> df.groupby("a").groups + {1: ['cow', 'horse'], 2: ['bull']} + >>> df.groupby("a").cumprod() + b c + cow 8 2 + horse 16 10 + bull 6 9 + """ + nv.validate_groupby_func("cumprod", args, kwargs, ["numeric_only", "skipna"]) + if axis is not lib.no_default: + axis = self.obj._get_axis_number(axis) + self._deprecate_axis(axis, "cumprod") + else: + axis = 0 + + if axis != 0: + f = lambda x: x.cumprod(axis=axis, **kwargs) + return self._python_apply_general(f, self._selected_obj, is_transform=True) + + return self._cython_transform("cumprod", **kwargs) + + @final + @Substitution(name="groupby") + @Substitution(see_also=_common_see_also) + def cumsum( + self, axis: Axis | lib.NoDefault = lib.no_default, *args, **kwargs + ) -> NDFrameT: + """ + Cumulative sum for each group. + + Returns + ------- + Series or DataFrame + %(see_also)s + Examples + -------- + For SeriesGroupBy: + + >>> lst = ['a', 'a', 'b'] + >>> ser = pd.Series([6, 2, 0], index=lst) + >>> ser + a 6 + a 2 + b 0 + dtype: int64 + >>> ser.groupby(level=0).cumsum() + a 6 + a 8 + b 0 + dtype: int64 + + For DataFrameGroupBy: + + >>> data = [[1, 8, 2], [1, 2, 5], [2, 6, 9]] + >>> df = pd.DataFrame(data, columns=["a", "b", "c"], + ... index=["fox", "gorilla", "lion"]) + >>> df + a b c + fox 1 8 2 + gorilla 1 2 5 + lion 2 6 9 + >>> df.groupby("a").groups + {1: ['fox', 'gorilla'], 2: ['lion']} + >>> df.groupby("a").cumsum() + b c + fox 8 2 + gorilla 10 7 + lion 6 9 + """ + nv.validate_groupby_func("cumsum", args, kwargs, ["numeric_only", "skipna"]) + if axis is not lib.no_default: + axis = self.obj._get_axis_number(axis) + self._deprecate_axis(axis, "cumsum") + else: + axis = 0 + + if axis != 0: + f = lambda x: x.cumsum(axis=axis, **kwargs) + return self._python_apply_general(f, self._selected_obj, is_transform=True) + + return self._cython_transform("cumsum", **kwargs) + + @final + @Substitution(name="groupby") + @Substitution(see_also=_common_see_also) + def cummin( + self, + axis: AxisInt | lib.NoDefault = lib.no_default, + numeric_only: bool = False, + **kwargs, + ) -> NDFrameT: + """ + Cumulative min for each group. + + Returns + ------- + Series or DataFrame + %(see_also)s + Examples + -------- + For SeriesGroupBy: + + >>> lst = ['a', 'a', 'a', 'b', 'b', 'b'] + >>> ser = pd.Series([1, 6, 2, 3, 0, 4], index=lst) + >>> ser + a 1 + a 6 + a 2 + b 3 + b 0 + b 4 + dtype: int64 + >>> ser.groupby(level=0).cummin() + a 1 + a 1 + a 1 + b 3 + b 0 + b 0 + dtype: int64 + + For DataFrameGroupBy: + + >>> data = [[1, 0, 2], [1, 1, 5], [6, 6, 9]] + >>> df = pd.DataFrame(data, columns=["a", "b", "c"], + ... index=["snake", "rabbit", "turtle"]) + >>> df + a b c + snake 1 0 2 + rabbit 1 1 5 + turtle 6 6 9 + >>> df.groupby("a").groups + {1: ['snake', 'rabbit'], 6: ['turtle']} + >>> df.groupby("a").cummin() + b c + snake 0 2 + rabbit 0 2 + turtle 6 9 + """ + skipna = kwargs.get("skipna", True) + if axis is not lib.no_default: + axis = self.obj._get_axis_number(axis) + self._deprecate_axis(axis, "cummin") + else: + axis = 0 + + if axis != 0: + f = lambda x: np.minimum.accumulate(x, axis) + obj = self._selected_obj + if numeric_only: + obj = obj._get_numeric_data() + return self._python_apply_general(f, obj, is_transform=True) + + return self._cython_transform( + "cummin", numeric_only=numeric_only, skipna=skipna + ) + + @final + @Substitution(name="groupby") + @Substitution(see_also=_common_see_also) + def cummax( + self, + axis: AxisInt | lib.NoDefault = lib.no_default, + numeric_only: bool = False, + **kwargs, + ) -> NDFrameT: + """ + Cumulative max for each group. + + Returns + ------- + Series or DataFrame + %(see_also)s + Examples + -------- + For SeriesGroupBy: + + >>> lst = ['a', 'a', 'a', 'b', 'b', 'b'] + >>> ser = pd.Series([1, 6, 2, 3, 1, 4], index=lst) + >>> ser + a 1 + a 6 + a 2 + b 3 + b 1 + b 4 + dtype: int64 + >>> ser.groupby(level=0).cummax() + a 1 + a 6 + a 6 + b 3 + b 3 + b 4 + dtype: int64 + + For DataFrameGroupBy: + + >>> data = [[1, 8, 2], [1, 1, 0], [2, 6, 9]] + >>> df = pd.DataFrame(data, columns=["a", "b", "c"], + ... index=["cow", "horse", "bull"]) + >>> df + a b c + cow 1 8 2 + horse 1 1 0 + bull 2 6 9 + >>> df.groupby("a").groups + {1: ['cow', 'horse'], 2: ['bull']} + >>> df.groupby("a").cummax() + b c + cow 8 2 + horse 8 2 + bull 6 9 + """ + skipna = kwargs.get("skipna", True) + if axis is not lib.no_default: + axis = self.obj._get_axis_number(axis) + self._deprecate_axis(axis, "cummax") + else: + axis = 0 + + if axis != 0: + f = lambda x: np.maximum.accumulate(x, axis) + obj = self._selected_obj + if numeric_only: + obj = obj._get_numeric_data() + return self._python_apply_general(f, obj, is_transform=True) + + return self._cython_transform( + "cummax", numeric_only=numeric_only, skipna=skipna + ) + + @final + @Substitution(name="groupby") + def shift( + self, + periods: int | Sequence[int] = 1, + freq=None, + axis: Axis | lib.NoDefault = lib.no_default, + fill_value=lib.no_default, + suffix: str | None = None, + ): + """ + Shift each group by periods observations. + + If freq is passed, the index will be increased using the periods and the freq. + + Parameters + ---------- + periods : int | Sequence[int], default 1 + Number of periods to shift. If a list of values, shift each group by + each period. + freq : str, optional + Frequency string. + axis : axis to shift, default 0 + Shift direction. + + .. deprecated:: 2.1.0 + For axis=1, operate on the underlying object instead. Otherwise + the axis keyword is not necessary. + + fill_value : optional + The scalar value to use for newly introduced missing values. + + .. versionchanged:: 2.1.0 + Will raise a ``ValueError`` if ``freq`` is provided too. + + suffix : str, optional + A string to add to each shifted column if there are multiple periods. + Ignored otherwise. + + Returns + ------- + Series or DataFrame + Object shifted within each group. + + See Also + -------- + Index.shift : Shift values of Index. + + Examples + -------- + + For SeriesGroupBy: + + >>> lst = ['a', 'a', 'b', 'b'] + >>> ser = pd.Series([1, 2, 3, 4], index=lst) + >>> ser + a 1 + a 2 + b 3 + b 4 + dtype: int64 + >>> ser.groupby(level=0).shift(1) + a NaN + a 1.0 + b NaN + b 3.0 + dtype: float64 + + For DataFrameGroupBy: + + >>> data = [[1, 2, 3], [1, 5, 6], [2, 5, 8], [2, 6, 9]] + >>> df = pd.DataFrame(data, columns=["a", "b", "c"], + ... index=["tuna", "salmon", "catfish", "goldfish"]) + >>> df + a b c + tuna 1 2 3 + salmon 1 5 6 + catfish 2 5 8 + goldfish 2 6 9 + >>> df.groupby("a").shift(1) + b c + tuna NaN NaN + salmon 2.0 3.0 + catfish NaN NaN + goldfish 5.0 8.0 + """ + if axis is not lib.no_default: + axis = self.obj._get_axis_number(axis) + self._deprecate_axis(axis, "shift") + else: + axis = 0 + + if is_list_like(periods): + if axis == 1: + raise ValueError( + "If `periods` contains multiple shifts, `axis` cannot be 1." + ) + periods = cast(Sequence, periods) + if len(periods) == 0: + raise ValueError("If `periods` is an iterable, it cannot be empty.") + from pandas.core.reshape.concat import concat + + add_suffix = True + else: + if not is_integer(periods): + raise TypeError( + f"Periods must be integer, but {periods} is {type(periods)}." + ) + if suffix: + raise ValueError("Cannot specify `suffix` if `periods` is an int.") + periods = [cast(int, periods)] + add_suffix = False + + shifted_dataframes = [] + for period in periods: + if not is_integer(period): + raise TypeError( + f"Periods must be integer, but {period} is {type(period)}." + ) + period = cast(int, period) + if freq is not None or axis != 0: + f = lambda x: x.shift( + period, freq, axis, fill_value # pylint: disable=cell-var-from-loop + ) + shifted = self._python_apply_general( + f, self._selected_obj, is_transform=True + ) + else: + if fill_value is lib.no_default: + fill_value = None + ids, _, ngroups = self._grouper.group_info + res_indexer = np.zeros(len(ids), dtype=np.int64) + + libgroupby.group_shift_indexer(res_indexer, ids, ngroups, period) + + obj = self._obj_with_exclusions + + shifted = obj._reindex_with_indexers( + {self.axis: (obj.axes[self.axis], res_indexer)}, + fill_value=fill_value, + allow_dups=True, + ) + + if add_suffix: + if isinstance(shifted, Series): + shifted = cast(NDFrameT, shifted.to_frame()) + shifted = shifted.add_suffix( + f"{suffix}_{period}" if suffix else f"_{period}" + ) + shifted_dataframes.append(cast(Union[Series, DataFrame], shifted)) + + return ( + shifted_dataframes[0] + if len(shifted_dataframes) == 1 + else concat(shifted_dataframes, axis=1) + ) + + @final + @Substitution(name="groupby") + @Substitution(see_also=_common_see_also) + def diff( + self, periods: int = 1, axis: AxisInt | lib.NoDefault = lib.no_default + ) -> NDFrameT: + """ + First discrete difference of element. + + Calculates the difference of each element compared with another + element in the group (default is element in previous row). + + Parameters + ---------- + periods : int, default 1 + Periods to shift for calculating difference, accepts negative values. + axis : axis to shift, default 0 + Take difference over rows (0) or columns (1). + + .. deprecated:: 2.1.0 + For axis=1, operate on the underlying object instead. Otherwise + the axis keyword is not necessary. + + Returns + ------- + Series or DataFrame + First differences. + %(see_also)s + Examples + -------- + For SeriesGroupBy: + + >>> lst = ['a', 'a', 'a', 'b', 'b', 'b'] + >>> ser = pd.Series([7, 2, 8, 4, 3, 3], index=lst) + >>> ser + a 7 + a 2 + a 8 + b 4 + b 3 + b 3 + dtype: int64 + >>> ser.groupby(level=0).diff() + a NaN + a -5.0 + a 6.0 + b NaN + b -1.0 + b 0.0 + dtype: float64 + + For DataFrameGroupBy: + + >>> data = {'a': [1, 3, 5, 7, 7, 8, 3], 'b': [1, 4, 8, 4, 4, 2, 1]} + >>> df = pd.DataFrame(data, index=['dog', 'dog', 'dog', + ... 'mouse', 'mouse', 'mouse', 'mouse']) + >>> df + a b + dog 1 1 + dog 3 4 + dog 5 8 + mouse 7 4 + mouse 7 4 + mouse 8 2 + mouse 3 1 + >>> df.groupby(level=0).diff() + a b + dog NaN NaN + dog 2.0 3.0 + dog 2.0 4.0 + mouse NaN NaN + mouse 0.0 0.0 + mouse 1.0 -2.0 + mouse -5.0 -1.0 + """ + if axis is not lib.no_default: + axis = self.obj._get_axis_number(axis) + self._deprecate_axis(axis, "diff") + else: + axis = 0 + + if axis != 0: + return self.apply(lambda x: x.diff(periods=periods, axis=axis)) + + obj = self._obj_with_exclusions + shifted = self.shift(periods=periods) + + # GH45562 - to retain existing behavior and match behavior of Series.diff(), + # int8 and int16 are coerced to float32 rather than float64. + dtypes_to_f32 = ["int8", "int16"] + if obj.ndim == 1: + if obj.dtype in dtypes_to_f32: + shifted = shifted.astype("float32") + else: + to_coerce = [c for c, dtype in obj.dtypes.items() if dtype in dtypes_to_f32] + if len(to_coerce): + shifted = shifted.astype({c: "float32" for c in to_coerce}) + + return obj - shifted + + @final + @Substitution(name="groupby") + @Substitution(see_also=_common_see_also) + def pct_change( + self, + periods: int = 1, + fill_method: FillnaOptions | None | lib.NoDefault = lib.no_default, + limit: int | None | lib.NoDefault = lib.no_default, + freq=None, + axis: Axis | lib.NoDefault = lib.no_default, + ): + """ + Calculate pct_change of each value to previous entry in group. + + Returns + ------- + Series or DataFrame + Percentage changes within each group. + %(see_also)s + Examples + -------- + + For SeriesGroupBy: + + >>> lst = ['a', 'a', 'b', 'b'] + >>> ser = pd.Series([1, 2, 3, 4], index=lst) + >>> ser + a 1 + a 2 + b 3 + b 4 + dtype: int64 + >>> ser.groupby(level=0).pct_change() + a NaN + a 1.000000 + b NaN + b 0.333333 + dtype: float64 + + For DataFrameGroupBy: + + >>> data = [[1, 2, 3], [1, 5, 6], [2, 5, 8], [2, 6, 9]] + >>> df = pd.DataFrame(data, columns=["a", "b", "c"], + ... index=["tuna", "salmon", "catfish", "goldfish"]) + >>> df + a b c + tuna 1 2 3 + salmon 1 5 6 + catfish 2 5 8 + goldfish 2 6 9 + >>> df.groupby("a").pct_change() + b c + tuna NaN NaN + salmon 1.5 1.000 + catfish NaN NaN + goldfish 0.2 0.125 + """ + # GH#53491 + if fill_method not in (lib.no_default, None) or limit is not lib.no_default: + warnings.warn( + "The 'fill_method' keyword being not None and the 'limit' keyword in " + f"{type(self).__name__}.pct_change are deprecated and will be removed " + "in a future version. Either fill in any non-leading NA values prior " + "to calling pct_change or specify 'fill_method=None' to not fill NA " + "values.", + FutureWarning, + stacklevel=find_stack_level(), + ) + if fill_method is lib.no_default: + if limit is lib.no_default and any( + grp.isna().values.any() for _, grp in self + ): + warnings.warn( + "The default fill_method='ffill' in " + f"{type(self).__name__}.pct_change is deprecated and will " + "be removed in a future version. Either fill in any " + "non-leading NA values prior to calling pct_change or " + "specify 'fill_method=None' to not fill NA values.", + FutureWarning, + stacklevel=find_stack_level(), + ) + fill_method = "ffill" + if limit is lib.no_default: + limit = None + + if axis is not lib.no_default: + axis = self.obj._get_axis_number(axis) + self._deprecate_axis(axis, "pct_change") + else: + axis = 0 + + # TODO(GH#23918): Remove this conditional for SeriesGroupBy when + # GH#23918 is fixed + if freq is not None or axis != 0: + f = lambda x: x.pct_change( + periods=periods, + fill_method=fill_method, + limit=limit, + freq=freq, + axis=axis, + ) + return self._python_apply_general(f, self._selected_obj, is_transform=True) + + if fill_method is None: # GH30463 + fill_method = "ffill" + limit = 0 + filled = getattr(self, fill_method)(limit=limit) + if self.axis == 0: + fill_grp = filled.groupby(self._grouper.codes, group_keys=self.group_keys) + else: + fill_grp = filled.T.groupby(self._grouper.codes, group_keys=self.group_keys) + shifted = fill_grp.shift(periods=periods, freq=freq) + if self.axis == 1: + shifted = shifted.T + return (filled / shifted) - 1 + + @final + @Substitution(name="groupby") + @Substitution(see_also=_common_see_also) + def head(self, n: int = 5) -> NDFrameT: + """ + Return first n rows of each group. + + Similar to ``.apply(lambda x: x.head(n))``, but it returns a subset of rows + from the original DataFrame with original index and order preserved + (``as_index`` flag is ignored). + + Parameters + ---------- + n : int + If positive: number of entries to include from start of each group. + If negative: number of entries to exclude from end of each group. + + Returns + ------- + Series or DataFrame + Subset of original Series or DataFrame as determined by n. + %(see_also)s + Examples + -------- + + >>> df = pd.DataFrame([[1, 2], [1, 4], [5, 6]], + ... columns=['A', 'B']) + >>> df.groupby('A').head(1) + A B + 0 1 2 + 2 5 6 + >>> df.groupby('A').head(-1) + A B + 0 1 2 + """ + mask = self._make_mask_from_positional_indexer(slice(None, n)) + return self._mask_selected_obj(mask) + + @final + @Substitution(name="groupby") + @Substitution(see_also=_common_see_also) + def tail(self, n: int = 5) -> NDFrameT: + """ + Return last n rows of each group. + + Similar to ``.apply(lambda x: x.tail(n))``, but it returns a subset of rows + from the original DataFrame with original index and order preserved + (``as_index`` flag is ignored). + + Parameters + ---------- + n : int + If positive: number of entries to include from end of each group. + If negative: number of entries to exclude from start of each group. + + Returns + ------- + Series or DataFrame + Subset of original Series or DataFrame as determined by n. + %(see_also)s + Examples + -------- + + >>> df = pd.DataFrame([['a', 1], ['a', 2], ['b', 1], ['b', 2]], + ... columns=['A', 'B']) + >>> df.groupby('A').tail(1) + A B + 1 a 2 + 3 b 2 + >>> df.groupby('A').tail(-1) + A B + 1 a 2 + 3 b 2 + """ + if n: + mask = self._make_mask_from_positional_indexer(slice(-n, None)) + else: + mask = self._make_mask_from_positional_indexer([]) + + return self._mask_selected_obj(mask) + + @final + def _mask_selected_obj(self, mask: npt.NDArray[np.bool_]) -> NDFrameT: + """ + Return _selected_obj with mask applied to the correct axis. + + Parameters + ---------- + mask : np.ndarray[bool] + Boolean mask to apply. + + Returns + ------- + Series or DataFrame + Filtered _selected_obj. + """ + ids = self._grouper.group_info[0] + mask = mask & (ids != -1) + + if self.axis == 0: + return self._selected_obj[mask] + else: + return self._selected_obj.iloc[:, mask] + + @final + def _reindex_output( + self, + output: OutputFrameOrSeries, + fill_value: Scalar = np.nan, + qs: npt.NDArray[np.float64] | None = None, + ) -> OutputFrameOrSeries: + """ + If we have categorical groupers, then we might want to make sure that + we have a fully re-indexed output to the levels. This means expanding + the output space to accommodate all values in the cartesian product of + our groups, regardless of whether they were observed in the data or + not. This will expand the output space if there are missing groups. + + The method returns early without modifying the input if the number of + groupings is less than 2, self.observed == True or none of the groupers + are categorical. + + Parameters + ---------- + output : Series or DataFrame + Object resulting from grouping and applying an operation. + fill_value : scalar, default np.nan + Value to use for unobserved categories if self.observed is False. + qs : np.ndarray[float64] or None, default None + quantile values, only relevant for quantile. + + Returns + ------- + Series or DataFrame + Object (potentially) re-indexed to include all possible groups. + """ + groupings = self._grouper.groupings + if len(groupings) == 1: + return output + + # if we only care about the observed values + # we are done + elif self.observed: + return output + + # reindexing only applies to a Categorical grouper + elif not any( + isinstance(ping.grouping_vector, (Categorical, CategoricalIndex)) + for ping in groupings + ): + return output + + levels_list = [ping._group_index for ping in groupings] + names = self._grouper.names + if qs is not None: + # error: Argument 1 to "append" of "list" has incompatible type + # "ndarray[Any, dtype[floating[_64Bit]]]"; expected "Index" + levels_list.append(qs) # type: ignore[arg-type] + names = names + [None] + index = MultiIndex.from_product(levels_list, names=names) + if self.sort: + index = index.sort_values() + + if self.as_index: + # Always holds for SeriesGroupBy unless GH#36507 is implemented + d = { + self.obj._get_axis_name(self.axis): index, + "copy": False, + "fill_value": fill_value, + } + return output.reindex(**d) # type: ignore[arg-type] + + # GH 13204 + # Here, the categorical in-axis groupers, which need to be fully + # expanded, are columns in `output`. An idea is to do: + # output = output.set_index(self._grouper.names) + # .reindex(index).reset_index() + # but special care has to be taken because of possible not-in-axis + # groupers. + # So, we manually select and drop the in-axis grouper columns, + # reindex `output`, and then reset the in-axis grouper columns. + + # Select in-axis groupers + in_axis_grps = [ + (i, ping.name) for (i, ping) in enumerate(groupings) if ping.in_axis + ] + if len(in_axis_grps) > 0: + g_nums, g_names = zip(*in_axis_grps) + output = output.drop(labels=list(g_names), axis=1) + + # Set a temp index and reindex (possibly expanding) + output = output.set_index(self._grouper.result_index).reindex( + index, copy=False, fill_value=fill_value + ) + + # Reset in-axis grouper columns + # (using level numbers `g_nums` because level names may not be unique) + if len(in_axis_grps) > 0: + output = output.reset_index(level=g_nums) + + return output.reset_index(drop=True) + + @final + def sample( + self, + n: int | None = None, + frac: float | None = None, + replace: bool = False, + weights: Sequence | Series | None = None, + random_state: RandomState | None = None, + ): + """ + Return a random sample of items from each group. + + You can use `random_state` for reproducibility. + + Parameters + ---------- + n : int, optional + Number of items to return for each group. Cannot be used with + `frac` and must be no larger than the smallest group unless + `replace` is True. Default is one if `frac` is None. + frac : float, optional + Fraction of items to return. Cannot be used with `n`. + replace : bool, default False + Allow or disallow sampling of the same row more than once. + weights : list-like, optional + Default None results in equal probability weighting. + If passed a list-like then values must have the same length as + the underlying DataFrame or Series object and will be used as + sampling probabilities after normalization within each group. + Values must be non-negative with at least one positive element + within each group. + random_state : int, array-like, BitGenerator, np.random.RandomState, np.random.Generator, optional + If int, array-like, or BitGenerator, seed for random number generator. + If np.random.RandomState or np.random.Generator, use as given. + + .. versionchanged:: 1.4.0 + + np.random.Generator objects now accepted + + Returns + ------- + Series or DataFrame + A new object of same type as caller containing items randomly + sampled within each group from the caller object. + + See Also + -------- + DataFrame.sample: Generate random samples from a DataFrame object. + numpy.random.choice: Generate a random sample from a given 1-D numpy + array. + + Examples + -------- + >>> df = pd.DataFrame( + ... {"a": ["red"] * 2 + ["blue"] * 2 + ["black"] * 2, "b": range(6)} + ... ) + >>> df + a b + 0 red 0 + 1 red 1 + 2 blue 2 + 3 blue 3 + 4 black 4 + 5 black 5 + + Select one row at random for each distinct value in column a. The + `random_state` argument can be used to guarantee reproducibility: + + >>> df.groupby("a").sample(n=1, random_state=1) + a b + 4 black 4 + 2 blue 2 + 1 red 1 + + Set `frac` to sample fixed proportions rather than counts: + + >>> df.groupby("a")["b"].sample(frac=0.5, random_state=2) + 5 5 + 2 2 + 0 0 + Name: b, dtype: int64 + + Control sample probabilities within groups by setting weights: + + >>> df.groupby("a").sample( + ... n=1, + ... weights=[1, 1, 1, 0, 0, 1], + ... random_state=1, + ... ) + a b + 5 black 5 + 2 blue 2 + 0 red 0 + """ # noqa: E501 + if self._selected_obj.empty: + # GH48459 prevent ValueError when object is empty + return self._selected_obj + size = sample.process_sampling_size(n, frac, replace) + if weights is not None: + weights_arr = sample.preprocess_weights( + self._selected_obj, weights, axis=self.axis + ) + + random_state = com.random_state(random_state) + + group_iterator = self._grouper.get_iterator(self._selected_obj, self.axis) + + sampled_indices = [] + for labels, obj in group_iterator: + grp_indices = self.indices[labels] + group_size = len(grp_indices) + if size is not None: + sample_size = size + else: + assert frac is not None + sample_size = round(frac * group_size) + + grp_sample = sample.sample( + group_size, + size=sample_size, + replace=replace, + weights=None if weights is None else weights_arr[grp_indices], + random_state=random_state, + ) + sampled_indices.append(grp_indices[grp_sample]) + + sampled_indices = np.concatenate(sampled_indices) + return self._selected_obj.take(sampled_indices, axis=self.axis) + + def _idxmax_idxmin( + self, + how: Literal["idxmax", "idxmin"], + ignore_unobserved: bool = False, + axis: Axis | None | lib.NoDefault = lib.no_default, + skipna: bool = True, + numeric_only: bool = False, + ) -> NDFrameT: + """Compute idxmax/idxmin. + + Parameters + ---------- + how : {'idxmin', 'idxmax'} + Whether to compute idxmin or idxmax. + axis : {{0 or 'index', 1 or 'columns'}}, default None + The axis to use. 0 or 'index' for row-wise, 1 or 'columns' for column-wise. + If axis is not provided, grouper's axis is used. + numeric_only : bool, default False + Include only float, int, boolean columns. + skipna : bool, default True + Exclude NA/null values. If an entire row/column is NA, the result + will be NA. + ignore_unobserved : bool, default False + When True and an unobserved group is encountered, do not raise. This used + for transform where unobserved groups do not play an impact on the result. + + Returns + ------- + Series or DataFrame + idxmax or idxmin for the groupby operation. + """ + if axis is not lib.no_default: + if axis is None: + axis = self.axis + axis = self.obj._get_axis_number(axis) + self._deprecate_axis(axis, how) + else: + axis = self.axis + + if not self.observed and any( + ping._passed_categorical for ping in self._grouper.groupings + ): + expected_len = np.prod( + [len(ping._group_index) for ping in self._grouper.groupings] + ) + if len(self._grouper.groupings) == 1: + result_len = len(self._grouper.groupings[0].grouping_vector.unique()) + else: + # result_index only contains observed groups in this case + result_len = len(self._grouper.result_index) + assert result_len <= expected_len + has_unobserved = result_len < expected_len + + raise_err: bool | np.bool_ = not ignore_unobserved and has_unobserved + # Only raise an error if there are columns to compute; otherwise we return + # an empty DataFrame with an index (possibly including unobserved) but no + # columns + data = self._obj_with_exclusions + if raise_err and isinstance(data, DataFrame): + if numeric_only: + data = data._get_numeric_data() + raise_err = len(data.columns) > 0 + + if raise_err: + raise ValueError( + f"Can't get {how} of an empty group due to unobserved categories. " + "Specify observed=True in groupby instead." + ) + elif not skipna: + if self._obj_with_exclusions.isna().any(axis=None): + warnings.warn( + f"The behavior of {type(self).__name__}.{how} with all-NA " + "values, or any-NA and skipna=False, is deprecated. In a future " + "version this will raise ValueError", + FutureWarning, + stacklevel=find_stack_level(), + ) + + if axis == 1: + try: + + def func(df): + method = getattr(df, how) + return method(axis=axis, skipna=skipna, numeric_only=numeric_only) + + func.__name__ = how + result = self._python_apply_general( + func, self._obj_with_exclusions, not_indexed_same=True + ) + except ValueError as err: + name = "argmax" if how == "idxmax" else "argmin" + if f"attempt to get {name} of an empty sequence" in str(err): + raise ValueError( + f"Can't get {how} of an empty group due to unobserved " + "categories. Specify observed=True in groupby instead." + ) from None + raise + return result + + result = self._agg_general( + numeric_only=numeric_only, + min_count=1, + alias=how, + skipna=skipna, + ) + return result + + def _wrap_idxmax_idxmin(self, res: NDFrameT) -> NDFrameT: + index = self.obj._get_axis(self.axis) + if res.size == 0: + result = res.astype(index.dtype) + else: + if isinstance(index, MultiIndex): + index = index.to_flat_index() + values = res._values + assert isinstance(values, np.ndarray) + na_value = na_value_for_dtype(index.dtype, compat=False) + if isinstance(res, Series): + # mypy: expression has type "Series", variable has type "NDFrameT" + result = res._constructor( # type: ignore[assignment] + index.array.take(values, allow_fill=True, fill_value=na_value), + index=res.index, + name=res.name, + ) + else: + data = {} + for k, column_values in enumerate(values.T): + data[k] = index.array.take( + column_values, allow_fill=True, fill_value=na_value + ) + result = self.obj._constructor(data, index=res.index) + result.columns = res.columns + return result + + +@doc(GroupBy) +def get_groupby( + obj: NDFrame, + by: _KeysArgType | None = None, + axis: AxisInt = 0, + grouper: ops.BaseGrouper | None = None, + group_keys: bool = True, +) -> GroupBy: + klass: type[GroupBy] + if isinstance(obj, Series): + from pandas.core.groupby.generic import SeriesGroupBy + + klass = SeriesGroupBy + elif isinstance(obj, DataFrame): + from pandas.core.groupby.generic import DataFrameGroupBy + + klass = DataFrameGroupBy + else: # pragma: no cover + raise TypeError(f"invalid type: {obj}") + + return klass( + obj=obj, + keys=by, + axis=axis, + grouper=grouper, + group_keys=group_keys, + ) + + +def _insert_quantile_level(idx: Index, qs: npt.NDArray[np.float64]) -> MultiIndex: + """ + Insert the sequence 'qs' of quantiles as the inner-most level of a MultiIndex. + + The quantile level in the MultiIndex is a repeated copy of 'qs'. + + Parameters + ---------- + idx : Index + qs : np.ndarray[float64] + + Returns + ------- + MultiIndex + """ + nqs = len(qs) + lev_codes, lev = Index(qs).factorize() + lev_codes = coerce_indexer_dtype(lev_codes, lev) + + if idx._is_multi: + idx = cast(MultiIndex, idx) + levels = list(idx.levels) + [lev] + codes = [np.repeat(x, nqs) for x in idx.codes] + [np.tile(lev_codes, len(idx))] + mi = MultiIndex(levels=levels, codes=codes, names=idx.names + [None]) + else: + nidx = len(idx) + idx_codes = coerce_indexer_dtype(np.arange(nidx), idx) + levels = [idx, lev] + codes = [np.repeat(idx_codes, nqs), np.tile(lev_codes, nidx)] + mi = MultiIndex(levels=levels, codes=codes, names=[idx.name, None]) + + return mi + + +# GH#7155 +_apply_groupings_depr = ( + "{}.{} operated on the grouping columns. This behavior is deprecated, " + "and in a future version of pandas the grouping columns will be excluded " + "from the operation. Either pass `include_groups=False` to exclude the " + "groupings or explicitly select the grouping columns after groupby to silence " + "this warning." +) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/groupby/grouper.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/groupby/grouper.py new file mode 100644 index 0000000000000000000000000000000000000000..e2224caad9e846f5e386019e6da5b47d16ab3694 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/groupby/grouper.py @@ -0,0 +1,1102 @@ +""" +Provide user facing operators for doing the split part of the +split-apply-combine paradigm. +""" +from __future__ import annotations + +from typing import ( + TYPE_CHECKING, + final, +) +import warnings + +import numpy as np + +from pandas._config import ( + using_copy_on_write, + warn_copy_on_write, +) + +from pandas._libs import lib +from pandas._libs.tslibs import OutOfBoundsDatetime +from pandas.errors import InvalidIndexError +from pandas.util._decorators import cache_readonly +from pandas.util._exceptions import find_stack_level + +from pandas.core.dtypes.common import ( + is_list_like, + is_scalar, +) +from pandas.core.dtypes.dtypes import CategoricalDtype + +from pandas.core import algorithms +from pandas.core.arrays import ( + Categorical, + ExtensionArray, +) +import pandas.core.common as com +from pandas.core.frame import DataFrame +from pandas.core.groupby import ops +from pandas.core.groupby.categorical import recode_for_groupby +from pandas.core.indexes.api import ( + CategoricalIndex, + Index, + MultiIndex, +) +from pandas.core.series import Series + +from pandas.io.formats.printing import pprint_thing + +if TYPE_CHECKING: + from collections.abc import ( + Hashable, + Iterator, + ) + + from pandas._typing import ( + ArrayLike, + Axis, + NDFrameT, + npt, + ) + + from pandas.core.generic import NDFrame + + +class Grouper: + """ + A Grouper allows the user to specify a groupby instruction for an object. + + This specification will select a column via the key parameter, or if the + level and/or axis parameters are given, a level of the index of the target + object. + + If `axis` and/or `level` are passed as keywords to both `Grouper` and + `groupby`, the values passed to `Grouper` take precedence. + + Parameters + ---------- + key : str, defaults to None + Groupby key, which selects the grouping column of the target. + level : name/number, defaults to None + The level for the target index. + freq : str / frequency object, defaults to None + This will groupby the specified frequency if the target selection + (via key or level) is a datetime-like object. For full specification + of available frequencies, please see `here + `_. + axis : str, int, defaults to 0 + Number/name of the axis. + sort : bool, default to False + Whether to sort the resulting labels. + closed : {'left' or 'right'} + Closed end of interval. Only when `freq` parameter is passed. + label : {'left' or 'right'} + Interval boundary to use for labeling. + Only when `freq` parameter is passed. + convention : {'start', 'end', 'e', 's'} + If grouper is PeriodIndex and `freq` parameter is passed. + + origin : Timestamp or str, default 'start_day' + The timestamp on which to adjust the grouping. The timezone of origin must + match the timezone of the index. + If string, must be one of the following: + + - 'epoch': `origin` is 1970-01-01 + - 'start': `origin` is the first value of the timeseries + - 'start_day': `origin` is the first day at midnight of the timeseries + + - 'end': `origin` is the last value of the timeseries + - 'end_day': `origin` is the ceiling midnight of the last day + + .. versionadded:: 1.3.0 + + offset : Timedelta or str, default is None + An offset timedelta added to the origin. + + dropna : bool, default True + If True, and if group keys contain NA values, NA values together with + row/column will be dropped. If False, NA values will also be treated as + the key in groups. + + Returns + ------- + Grouper or pandas.api.typing.TimeGrouper + A TimeGrouper is returned if ``freq`` is not ``None``. Otherwise, a Grouper + is returned. + + Examples + -------- + ``df.groupby(pd.Grouper(key="Animal"))`` is equivalent to ``df.groupby('Animal')`` + + >>> df = pd.DataFrame( + ... { + ... "Animal": ["Falcon", "Parrot", "Falcon", "Falcon", "Parrot"], + ... "Speed": [100, 5, 200, 300, 15], + ... } + ... ) + >>> df + Animal Speed + 0 Falcon 100 + 1 Parrot 5 + 2 Falcon 200 + 3 Falcon 300 + 4 Parrot 15 + >>> df.groupby(pd.Grouper(key="Animal")).mean() + Speed + Animal + Falcon 200.0 + Parrot 10.0 + + Specify a resample operation on the column 'Publish date' + + >>> df = pd.DataFrame( + ... { + ... "Publish date": [ + ... pd.Timestamp("2000-01-02"), + ... pd.Timestamp("2000-01-02"), + ... pd.Timestamp("2000-01-09"), + ... pd.Timestamp("2000-01-16") + ... ], + ... "ID": [0, 1, 2, 3], + ... "Price": [10, 20, 30, 40] + ... } + ... ) + >>> df + Publish date ID Price + 0 2000-01-02 0 10 + 1 2000-01-02 1 20 + 2 2000-01-09 2 30 + 3 2000-01-16 3 40 + >>> df.groupby(pd.Grouper(key="Publish date", freq="1W")).mean() + ID Price + Publish date + 2000-01-02 0.5 15.0 + 2000-01-09 2.0 30.0 + 2000-01-16 3.0 40.0 + + If you want to adjust the start of the bins based on a fixed timestamp: + + >>> start, end = '2000-10-01 23:30:00', '2000-10-02 00:30:00' + >>> rng = pd.date_range(start, end, freq='7min') + >>> ts = pd.Series(np.arange(len(rng)) * 3, index=rng) + >>> ts + 2000-10-01 23:30:00 0 + 2000-10-01 23:37:00 3 + 2000-10-01 23:44:00 6 + 2000-10-01 23:51:00 9 + 2000-10-01 23:58:00 12 + 2000-10-02 00:05:00 15 + 2000-10-02 00:12:00 18 + 2000-10-02 00:19:00 21 + 2000-10-02 00:26:00 24 + Freq: 7min, dtype: int64 + + >>> ts.groupby(pd.Grouper(freq='17min')).sum() + 2000-10-01 23:14:00 0 + 2000-10-01 23:31:00 9 + 2000-10-01 23:48:00 21 + 2000-10-02 00:05:00 54 + 2000-10-02 00:22:00 24 + Freq: 17min, dtype: int64 + + >>> ts.groupby(pd.Grouper(freq='17min', origin='epoch')).sum() + 2000-10-01 23:18:00 0 + 2000-10-01 23:35:00 18 + 2000-10-01 23:52:00 27 + 2000-10-02 00:09:00 39 + 2000-10-02 00:26:00 24 + Freq: 17min, dtype: int64 + + >>> ts.groupby(pd.Grouper(freq='17min', origin='2000-01-01')).sum() + 2000-10-01 23:24:00 3 + 2000-10-01 23:41:00 15 + 2000-10-01 23:58:00 45 + 2000-10-02 00:15:00 45 + Freq: 17min, dtype: int64 + + If you want to adjust the start of the bins with an `offset` Timedelta, the two + following lines are equivalent: + + >>> ts.groupby(pd.Grouper(freq='17min', origin='start')).sum() + 2000-10-01 23:30:00 9 + 2000-10-01 23:47:00 21 + 2000-10-02 00:04:00 54 + 2000-10-02 00:21:00 24 + Freq: 17min, dtype: int64 + + >>> ts.groupby(pd.Grouper(freq='17min', offset='23h30min')).sum() + 2000-10-01 23:30:00 9 + 2000-10-01 23:47:00 21 + 2000-10-02 00:04:00 54 + 2000-10-02 00:21:00 24 + Freq: 17min, dtype: int64 + + To replace the use of the deprecated `base` argument, you can now use `offset`, + in this example it is equivalent to have `base=2`: + + >>> ts.groupby(pd.Grouper(freq='17min', offset='2min')).sum() + 2000-10-01 23:16:00 0 + 2000-10-01 23:33:00 9 + 2000-10-01 23:50:00 36 + 2000-10-02 00:07:00 39 + 2000-10-02 00:24:00 24 + Freq: 17min, dtype: int64 + """ + + sort: bool + dropna: bool + _gpr_index: Index | None + _grouper: Index | None + + _attributes: tuple[str, ...] = ("key", "level", "freq", "axis", "sort", "dropna") + + def __new__(cls, *args, **kwargs): + if kwargs.get("freq") is not None: + from pandas.core.resample import TimeGrouper + + cls = TimeGrouper + return super().__new__(cls) + + def __init__( + self, + key=None, + level=None, + freq=None, + axis: Axis | lib.NoDefault = lib.no_default, + sort: bool = False, + dropna: bool = True, + ) -> None: + if type(self) is Grouper: + # i.e. not TimeGrouper + if axis is not lib.no_default: + warnings.warn( + "Grouper axis keyword is deprecated and will be removed in a " + "future version. To group on axis=1, use obj.T.groupby(...) " + "instead", + FutureWarning, + stacklevel=find_stack_level(), + ) + else: + axis = 0 + if axis is lib.no_default: + axis = 0 + + self.key = key + self.level = level + self.freq = freq + self.axis = axis + self.sort = sort + self.dropna = dropna + + self._grouper_deprecated = None + self._indexer_deprecated: npt.NDArray[np.intp] | None = None + self._obj_deprecated = None + self._gpr_index = None + self.binner = None + self._grouper = None + self._indexer: npt.NDArray[np.intp] | None = None + + def _get_grouper( + self, obj: NDFrameT, validate: bool = True + ) -> tuple[ops.BaseGrouper, NDFrameT]: + """ + Parameters + ---------- + obj : Series or DataFrame + validate : bool, default True + if True, validate the grouper + + Returns + ------- + a tuple of grouper, obj (possibly sorted) + """ + obj, _, _ = self._set_grouper(obj) + grouper, _, obj = get_grouper( + obj, + [self.key], + axis=self.axis, + level=self.level, + sort=self.sort, + validate=validate, + dropna=self.dropna, + ) + # Without setting this, subsequent lookups to .groups raise + # error: Incompatible types in assignment (expression has type "BaseGrouper", + # variable has type "None") + self._grouper_deprecated = grouper # type: ignore[assignment] + + return grouper, obj + + def _set_grouper( + self, obj: NDFrameT, sort: bool = False, *, gpr_index: Index | None = None + ) -> tuple[NDFrameT, Index, npt.NDArray[np.intp] | None]: + """ + given an object and the specifications, setup the internal grouper + for this particular specification + + Parameters + ---------- + obj : Series or DataFrame + sort : bool, default False + whether the resulting grouper should be sorted + gpr_index : Index or None, default None + + Returns + ------- + NDFrame + Index + np.ndarray[np.intp] | None + """ + assert obj is not None + + if self.key is not None and self.level is not None: + raise ValueError("The Grouper cannot specify both a key and a level!") + + # Keep self._grouper value before overriding + if self._grouper is None: + # TODO: What are we assuming about subsequent calls? + self._grouper = gpr_index + self._indexer = self._indexer_deprecated + + # the key must be a valid info item + if self.key is not None: + key = self.key + # The 'on' is already defined + if getattr(gpr_index, "name", None) == key and isinstance(obj, Series): + # Sometimes self._grouper will have been resorted while + # obj has not. In this case there is a mismatch when we + # call self._grouper.take(obj.index) so we need to undo the sorting + # before we call _grouper.take. + assert self._grouper is not None + if self._indexer is not None: + reverse_indexer = self._indexer.argsort() + unsorted_ax = self._grouper.take(reverse_indexer) + ax = unsorted_ax.take(obj.index) + else: + ax = self._grouper.take(obj.index) + else: + if key not in obj._info_axis: + raise KeyError(f"The grouper name {key} is not found") + ax = Index(obj[key], name=key) + + else: + ax = obj._get_axis(self.axis) + if self.level is not None: + level = self.level + + # if a level is given it must be a mi level or + # equivalent to the axis name + if isinstance(ax, MultiIndex): + level = ax._get_level_number(level) + ax = Index(ax._get_level_values(level), name=ax.names[level]) + + else: + if level not in (0, ax.name): + raise ValueError(f"The level {level} is not valid") + + # possibly sort + indexer: npt.NDArray[np.intp] | None = None + if (self.sort or sort) and not ax.is_monotonic_increasing: + # use stable sort to support first, last, nth + # TODO: why does putting na_position="first" fix datetimelike cases? + indexer = self._indexer_deprecated = ax.array.argsort( + kind="mergesort", na_position="first" + ) + ax = ax.take(indexer) + obj = obj.take(indexer, axis=self.axis) + + # error: Incompatible types in assignment (expression has type + # "NDFrameT", variable has type "None") + self._obj_deprecated = obj # type: ignore[assignment] + self._gpr_index = ax + return obj, ax, indexer + + @final + @property + def ax(self) -> Index: + warnings.warn( + f"{type(self).__name__}.ax is deprecated and will be removed in a " + "future version. Use Resampler.ax instead", + FutureWarning, + stacklevel=find_stack_level(), + ) + index = self._gpr_index + if index is None: + raise ValueError("_set_grouper must be called before ax is accessed") + return index + + @final + @property + def indexer(self): + warnings.warn( + f"{type(self).__name__}.indexer is deprecated and will be removed " + "in a future version. Use Resampler.indexer instead.", + FutureWarning, + stacklevel=find_stack_level(), + ) + return self._indexer_deprecated + + @final + @property + def obj(self): + # TODO(3.0): enforcing these deprecations on Grouper should close + # GH#25564, GH#41930 + warnings.warn( + f"{type(self).__name__}.obj is deprecated and will be removed " + "in a future version. Use GroupBy.indexer instead.", + FutureWarning, + stacklevel=find_stack_level(), + ) + return self._obj_deprecated + + @final + @property + def grouper(self): + warnings.warn( + f"{type(self).__name__}.grouper is deprecated and will be removed " + "in a future version. Use GroupBy.grouper instead.", + FutureWarning, + stacklevel=find_stack_level(), + ) + return self._grouper_deprecated + + @final + @property + def groups(self): + warnings.warn( + f"{type(self).__name__}.groups is deprecated and will be removed " + "in a future version. Use GroupBy.groups instead.", + FutureWarning, + stacklevel=find_stack_level(), + ) + # error: "None" has no attribute "groups" + return self._grouper_deprecated.groups # type: ignore[attr-defined] + + @final + def __repr__(self) -> str: + attrs_list = ( + f"{attr_name}={repr(getattr(self, attr_name))}" + for attr_name in self._attributes + if getattr(self, attr_name) is not None + ) + attrs = ", ".join(attrs_list) + cls_name = type(self).__name__ + return f"{cls_name}({attrs})" + + +@final +class Grouping: + """ + Holds the grouping information for a single key + + Parameters + ---------- + index : Index + grouper : + obj : DataFrame or Series + name : Label + level : + observed : bool, default False + If we are a Categorical, use the observed values + in_axis : if the Grouping is a column in self.obj and hence among + Groupby.exclusions list + dropna : bool, default True + Whether to drop NA groups. + uniques : Array-like, optional + When specified, will be used for unique values. Enables including empty groups + in the result for a BinGrouper. Must not contain duplicates. + + Attributes + ------- + indices : dict + Mapping of {group -> index_list} + codes : ndarray + Group codes + group_index : Index or None + unique groups + groups : dict + Mapping of {group -> label_list} + """ + + _codes: npt.NDArray[np.signedinteger] | None = None + _all_grouper: Categorical | None + _orig_cats: Index | None + _index: Index + + def __init__( + self, + index: Index, + grouper=None, + obj: NDFrame | None = None, + level=None, + sort: bool = True, + observed: bool = False, + in_axis: bool = False, + dropna: bool = True, + uniques: ArrayLike | None = None, + ) -> None: + self.level = level + self._orig_grouper = grouper + grouping_vector = _convert_grouper(index, grouper) + self._all_grouper = None + self._orig_cats = None + self._index = index + self._sort = sort + self.obj = obj + self._observed = observed + self.in_axis = in_axis + self._dropna = dropna + self._uniques = uniques + + # we have a single grouper which may be a myriad of things, + # some of which are dependent on the passing in level + + ilevel = self._ilevel + if ilevel is not None: + # In extant tests, the new self.grouping_vector matches + # `index.get_level_values(ilevel)` whenever + # mapper is None and isinstance(index, MultiIndex) + if isinstance(index, MultiIndex): + index_level = index.get_level_values(ilevel) + else: + index_level = index + + if grouping_vector is None: + grouping_vector = index_level + else: + mapper = grouping_vector + grouping_vector = index_level.map(mapper) + + # a passed Grouper like, directly get the grouper in the same way + # as single grouper groupby, use the group_info to get codes + elif isinstance(grouping_vector, Grouper): + # get the new grouper; we already have disambiguated + # what key/level refer to exactly, don't need to + # check again as we have by this point converted these + # to an actual value (rather than a pd.Grouper) + assert self.obj is not None # for mypy + newgrouper, newobj = grouping_vector._get_grouper(self.obj, validate=False) + self.obj = newobj + + if isinstance(newgrouper, ops.BinGrouper): + # TODO: can we unwrap this and get a tighter typing + # for self.grouping_vector? + grouping_vector = newgrouper + else: + # ops.BaseGrouper + # TODO: 2023-02-03 no test cases with len(newgrouper.groupings) > 1. + # If that were to occur, would we be throwing out information? + # error: Cannot determine type of "grouping_vector" [has-type] + ng = newgrouper.groupings[0].grouping_vector # type: ignore[has-type] + # use Index instead of ndarray so we can recover the name + grouping_vector = Index(ng, name=newgrouper.result_index.name) + + elif not isinstance( + grouping_vector, (Series, Index, ExtensionArray, np.ndarray) + ): + # no level passed + if getattr(grouping_vector, "ndim", 1) != 1: + t = str(type(grouping_vector)) + raise ValueError(f"Grouper for '{t}' not 1-dimensional") + + grouping_vector = index.map(grouping_vector) + + if not ( + hasattr(grouping_vector, "__len__") + and len(grouping_vector) == len(index) + ): + grper = pprint_thing(grouping_vector) + errmsg = ( + "Grouper result violates len(labels) == " + f"len(data)\nresult: {grper}" + ) + raise AssertionError(errmsg) + + if isinstance(grouping_vector, np.ndarray): + if grouping_vector.dtype.kind in "mM": + # if we have a date/time-like grouper, make sure that we have + # Timestamps like + # TODO 2022-10-08 we only have one test that gets here and + # values are already in nanoseconds in that case. + grouping_vector = Series(grouping_vector).to_numpy() + elif isinstance(getattr(grouping_vector, "dtype", None), CategoricalDtype): + # a passed Categorical + self._orig_cats = grouping_vector.categories + grouping_vector, self._all_grouper = recode_for_groupby( + grouping_vector, sort, observed + ) + + self.grouping_vector = grouping_vector + + def __repr__(self) -> str: + return f"Grouping({self.name})" + + def __iter__(self) -> Iterator: + return iter(self.indices) + + @cache_readonly + def _passed_categorical(self) -> bool: + dtype = getattr(self.grouping_vector, "dtype", None) + return isinstance(dtype, CategoricalDtype) + + @cache_readonly + def name(self) -> Hashable: + ilevel = self._ilevel + if ilevel is not None: + return self._index.names[ilevel] + + if isinstance(self._orig_grouper, (Index, Series)): + return self._orig_grouper.name + + elif isinstance(self.grouping_vector, ops.BaseGrouper): + return self.grouping_vector.result_index.name + + elif isinstance(self.grouping_vector, Index): + return self.grouping_vector.name + + # otherwise we have ndarray or ExtensionArray -> no name + return None + + @cache_readonly + def _ilevel(self) -> int | None: + """ + If necessary, converted index level name to index level position. + """ + level = self.level + if level is None: + return None + if not isinstance(level, int): + index = self._index + if level not in index.names: + raise AssertionError(f"Level {level} not in index") + return index.names.index(level) + return level + + @property + def ngroups(self) -> int: + return len(self._group_index) + + @cache_readonly + def indices(self) -> dict[Hashable, npt.NDArray[np.intp]]: + # we have a list of groupers + if isinstance(self.grouping_vector, ops.BaseGrouper): + return self.grouping_vector.indices + + values = Categorical(self.grouping_vector) + return values._reverse_indexer() + + @property + def codes(self) -> npt.NDArray[np.signedinteger]: + return self._codes_and_uniques[0] + + @cache_readonly + def _group_arraylike(self) -> ArrayLike: + """ + Analogous to result_index, but holding an ArrayLike to ensure + we can retain ExtensionDtypes. + """ + if self._all_grouper is not None: + # retain dtype for categories, including unobserved ones + return self._result_index._values + + elif self._passed_categorical: + return self._group_index._values + + return self._codes_and_uniques[1] + + @property + def group_arraylike(self) -> ArrayLike: + """ + Analogous to result_index, but holding an ArrayLike to ensure + we can retain ExtensionDtypes. + """ + warnings.warn( + "group_arraylike is deprecated and will be removed in a future " + "version of pandas", + category=FutureWarning, + stacklevel=find_stack_level(), + ) + return self._group_arraylike + + @cache_readonly + def _result_index(self) -> Index: + # result_index retains dtype for categories, including unobserved ones, + # which group_index does not + if self._all_grouper is not None: + group_idx = self._group_index + assert isinstance(group_idx, CategoricalIndex) + cats = self._orig_cats + # set_categories is dynamically added + return group_idx.set_categories(cats) # type: ignore[attr-defined] + return self._group_index + + @property + def result_index(self) -> Index: + warnings.warn( + "result_index is deprecated and will be removed in a future " + "version of pandas", + category=FutureWarning, + stacklevel=find_stack_level(), + ) + return self._result_index + + @cache_readonly + def _group_index(self) -> Index: + codes, uniques = self._codes_and_uniques + if not self._dropna and self._passed_categorical: + assert isinstance(uniques, Categorical) + if self._sort and (codes == len(uniques)).any(): + # Add NA value on the end when sorting + uniques = Categorical.from_codes( + np.append(uniques.codes, [-1]), uniques.categories, validate=False + ) + elif len(codes) > 0: + # Need to determine proper placement of NA value when not sorting + cat = self.grouping_vector + na_idx = (cat.codes < 0).argmax() + if cat.codes[na_idx] < 0: + # count number of unique codes that comes before the nan value + na_unique_idx = algorithms.nunique_ints(cat.codes[:na_idx]) + new_codes = np.insert(uniques.codes, na_unique_idx, -1) + uniques = Categorical.from_codes( + new_codes, uniques.categories, validate=False + ) + return Index._with_infer(uniques, name=self.name) + + @property + def group_index(self) -> Index: + warnings.warn( + "group_index is deprecated and will be removed in a future " + "version of pandas", + category=FutureWarning, + stacklevel=find_stack_level(), + ) + return self._group_index + + @cache_readonly + def _codes_and_uniques(self) -> tuple[npt.NDArray[np.signedinteger], ArrayLike]: + uniques: ArrayLike + if self._passed_categorical: + # we make a CategoricalIndex out of the cat grouper + # preserving the categories / ordered attributes; + # doesn't (yet - GH#46909) handle dropna=False + cat = self.grouping_vector + categories = cat.categories + + if self._observed: + ucodes = algorithms.unique1d(cat.codes) + ucodes = ucodes[ucodes != -1] + if self._sort: + ucodes = np.sort(ucodes) + else: + ucodes = np.arange(len(categories)) + + uniques = Categorical.from_codes( + codes=ucodes, categories=categories, ordered=cat.ordered, validate=False + ) + + codes = cat.codes + if not self._dropna: + na_mask = codes < 0 + if np.any(na_mask): + if self._sort: + # Replace NA codes with `largest code + 1` + na_code = len(categories) + codes = np.where(na_mask, na_code, codes) + else: + # Insert NA code into the codes based on first appearance + # A negative code must exist, no need to check codes[na_idx] < 0 + na_idx = na_mask.argmax() + # count number of unique codes that comes before the nan value + na_code = algorithms.nunique_ints(codes[:na_idx]) + codes = np.where(codes >= na_code, codes + 1, codes) + codes = np.where(na_mask, na_code, codes) + + if not self._observed: + uniques = uniques.reorder_categories(self._orig_cats) + + return codes, uniques + + elif isinstance(self.grouping_vector, ops.BaseGrouper): + # we have a list of groupers + codes = self.grouping_vector.codes_info + uniques = self.grouping_vector.result_index._values + elif self._uniques is not None: + # GH#50486 Code grouping_vector using _uniques; allows + # including uniques that are not present in grouping_vector. + cat = Categorical(self.grouping_vector, categories=self._uniques) + codes = cat.codes + uniques = self._uniques + else: + # GH35667, replace dropna=False with use_na_sentinel=False + # error: Incompatible types in assignment (expression has type "Union[ + # ndarray[Any, Any], Index]", variable has type "Categorical") + codes, uniques = algorithms.factorize( # type: ignore[assignment] + self.grouping_vector, sort=self._sort, use_na_sentinel=self._dropna + ) + return codes, uniques + + @cache_readonly + def groups(self) -> dict[Hashable, np.ndarray]: + cats = Categorical.from_codes(self.codes, self._group_index, validate=False) + return self._index.groupby(cats) + + +def get_grouper( + obj: NDFrameT, + key=None, + axis: Axis = 0, + level=None, + sort: bool = True, + observed: bool = False, + validate: bool = True, + dropna: bool = True, +) -> tuple[ops.BaseGrouper, frozenset[Hashable], NDFrameT]: + """ + Create and return a BaseGrouper, which is an internal + mapping of how to create the grouper indexers. + This may be composed of multiple Grouping objects, indicating + multiple groupers + + Groupers are ultimately index mappings. They can originate as: + index mappings, keys to columns, functions, or Groupers + + Groupers enable local references to axis,level,sort, while + the passed in axis, level, and sort are 'global'. + + This routine tries to figure out what the passing in references + are and then creates a Grouping for each one, combined into + a BaseGrouper. + + If observed & we have a categorical grouper, only show the observed + values. + + If validate, then check for key/level overlaps. + + """ + group_axis = obj._get_axis(axis) + + # validate that the passed single level is compatible with the passed + # axis of the object + if level is not None: + # TODO: These if-block and else-block are almost same. + # MultiIndex instance check is removable, but it seems that there are + # some processes only for non-MultiIndex in else-block, + # eg. `obj.index.name != level`. We have to consider carefully whether + # these are applicable for MultiIndex. Even if these are applicable, + # we need to check if it makes no side effect to subsequent processes + # on the outside of this condition. + # (GH 17621) + if isinstance(group_axis, MultiIndex): + if is_list_like(level) and len(level) == 1: + level = level[0] + + if key is None and is_scalar(level): + # Get the level values from group_axis + key = group_axis.get_level_values(level) + level = None + + else: + # allow level to be a length-one list-like object + # (e.g., level=[0]) + # GH 13901 + if is_list_like(level): + nlevels = len(level) + if nlevels == 1: + level = level[0] + elif nlevels == 0: + raise ValueError("No group keys passed!") + else: + raise ValueError("multiple levels only valid with MultiIndex") + + if isinstance(level, str): + if obj._get_axis(axis).name != level: + raise ValueError( + f"level name {level} is not the name " + f"of the {obj._get_axis_name(axis)}" + ) + elif level > 0 or level < -1: + raise ValueError("level > 0 or level < -1 only valid with MultiIndex") + + # NOTE: `group_axis` and `group_axis.get_level_values(level)` + # are same in this section. + level = None + key = group_axis + + # a passed-in Grouper, directly convert + if isinstance(key, Grouper): + grouper, obj = key._get_grouper(obj, validate=False) + if key.key is None: + return grouper, frozenset(), obj + else: + return grouper, frozenset({key.key}), obj + + # already have a BaseGrouper, just return it + elif isinstance(key, ops.BaseGrouper): + return key, frozenset(), obj + + if not isinstance(key, list): + keys = [key] + match_axis_length = False + else: + keys = key + match_axis_length = len(keys) == len(group_axis) + + # what are we after, exactly? + any_callable = any(callable(g) or isinstance(g, dict) for g in keys) + any_groupers = any(isinstance(g, (Grouper, Grouping)) for g in keys) + any_arraylike = any( + isinstance(g, (list, tuple, Series, Index, np.ndarray)) for g in keys + ) + + # is this an index replacement? + if ( + not any_callable + and not any_arraylike + and not any_groupers + and match_axis_length + and level is None + ): + if isinstance(obj, DataFrame): + all_in_columns_index = all( + g in obj.columns or g in obj.index.names for g in keys + ) + else: + assert isinstance(obj, Series) + all_in_columns_index = all(g in obj.index.names for g in keys) + + if not all_in_columns_index: + keys = [com.asarray_tuplesafe(keys)] + + if isinstance(level, (tuple, list)): + if key is None: + keys = [None] * len(level) + levels = level + else: + levels = [level] * len(keys) + + groupings: list[Grouping] = [] + exclusions: set[Hashable] = set() + + # if the actual grouper should be obj[key] + def is_in_axis(key) -> bool: + if not _is_label_like(key): + if obj.ndim == 1: + return False + + # items -> .columns for DataFrame, .index for Series + items = obj.axes[-1] + try: + items.get_loc(key) + except (KeyError, TypeError, InvalidIndexError): + # TypeError shows up here if we pass e.g. an Index + return False + + return True + + # if the grouper is obj[name] + def is_in_obj(gpr) -> bool: + if not hasattr(gpr, "name"): + return False + if using_copy_on_write() or warn_copy_on_write(): + # For the CoW case, we check the references to determine if the + # series is part of the object + try: + obj_gpr_column = obj[gpr.name] + except (KeyError, IndexError, InvalidIndexError, OutOfBoundsDatetime): + return False + if isinstance(gpr, Series) and isinstance(obj_gpr_column, Series): + return gpr._mgr.references_same_values( # type: ignore[union-attr] + obj_gpr_column._mgr, 0 # type: ignore[arg-type] + ) + return False + try: + return gpr is obj[gpr.name] + except (KeyError, IndexError, InvalidIndexError, OutOfBoundsDatetime): + # IndexError reached in e.g. test_skip_group_keys when we pass + # lambda here + # InvalidIndexError raised on key-types inappropriate for index, + # e.g. DatetimeIndex.get_loc(tuple()) + # OutOfBoundsDatetime raised when obj is a Series with DatetimeIndex + # and gpr.name is month str + return False + + for gpr, level in zip(keys, levels): + if is_in_obj(gpr): # df.groupby(df['name']) + in_axis = True + exclusions.add(gpr.name) + + elif is_in_axis(gpr): # df.groupby('name') + if obj.ndim != 1 and gpr in obj: + if validate: + obj._check_label_or_level_ambiguity(gpr, axis=axis) + in_axis, name, gpr = True, gpr, obj[gpr] + if gpr.ndim != 1: + # non-unique columns; raise here to get the name in the + # exception message + raise ValueError(f"Grouper for '{name}' not 1-dimensional") + exclusions.add(name) + elif obj._is_level_reference(gpr, axis=axis): + in_axis, level, gpr = False, gpr, None + else: + raise KeyError(gpr) + elif isinstance(gpr, Grouper) and gpr.key is not None: + # Add key to exclusions + exclusions.add(gpr.key) + in_axis = True + else: + in_axis = False + + # create the Grouping + # allow us to passing the actual Grouping as the gpr + ping = ( + Grouping( + group_axis, + gpr, + obj=obj, + level=level, + sort=sort, + observed=observed, + in_axis=in_axis, + dropna=dropna, + ) + if not isinstance(gpr, Grouping) + else gpr + ) + + groupings.append(ping) + + if len(groupings) == 0 and len(obj): + raise ValueError("No group keys passed!") + if len(groupings) == 0: + groupings.append(Grouping(Index([], dtype="int"), np.array([], dtype=np.intp))) + + # create the internals grouper + grouper = ops.BaseGrouper(group_axis, groupings, sort=sort, dropna=dropna) + return grouper, frozenset(exclusions), obj + + +def _is_label_like(val) -> bool: + return isinstance(val, (str, tuple)) or (val is not None and is_scalar(val)) + + +def _convert_grouper(axis: Index, grouper): + if isinstance(grouper, dict): + return grouper.get + elif isinstance(grouper, Series): + if grouper.index.equals(axis): + return grouper._values + else: + return grouper.reindex(axis)._values + elif isinstance(grouper, MultiIndex): + return grouper._values + elif isinstance(grouper, (list, tuple, Index, Categorical, np.ndarray)): + if len(grouper) != len(axis): + raise ValueError("Grouper and axis must be same length") + + if isinstance(grouper, (list, tuple)): + grouper = com.asarray_tuplesafe(grouper) + return grouper + else: + return grouper diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/groupby/indexing.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/groupby/indexing.py new file mode 100644 index 0000000000000000000000000000000000000000..a3c5ab8edc94e4f91175891282252d0e8cdfd3ec --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/groupby/indexing.py @@ -0,0 +1,304 @@ +from __future__ import annotations + +from collections.abc import Iterable +from typing import ( + TYPE_CHECKING, + Literal, + cast, +) + +import numpy as np + +from pandas.util._decorators import ( + cache_readonly, + doc, +) + +from pandas.core.dtypes.common import ( + is_integer, + is_list_like, +) + +if TYPE_CHECKING: + from pandas._typing import PositionalIndexer + + from pandas import ( + DataFrame, + Series, + ) + from pandas.core.groupby import groupby + + +class GroupByIndexingMixin: + """ + Mixin for adding ._positional_selector to GroupBy. + """ + + @cache_readonly + def _positional_selector(self) -> GroupByPositionalSelector: + """ + Return positional selection for each group. + + ``groupby._positional_selector[i:j]`` is similar to + ``groupby.apply(lambda x: x.iloc[i:j])`` + but much faster and preserves the original index and order. + + ``_positional_selector[]`` is compatible with and extends :meth:`~GroupBy.head` + and :meth:`~GroupBy.tail`. For example: + + - ``head(5)`` + - ``_positional_selector[5:-5]`` + - ``tail(5)`` + + together return all the rows. + + Allowed inputs for the index are: + + - An integer valued iterable, e.g. ``range(2, 4)``. + - A comma separated list of integers and slices, e.g. ``5``, ``2, 4``, ``2:4``. + + The output format is the same as :meth:`~GroupBy.head` and + :meth:`~GroupBy.tail`, namely + a subset of the ``DataFrame`` or ``Series`` with the index and order preserved. + + Returns + ------- + Series + The filtered subset of the original Series. + DataFrame + The filtered subset of the original DataFrame. + + See Also + -------- + DataFrame.iloc : Purely integer-location based indexing for selection by + position. + GroupBy.head : Return first n rows of each group. + GroupBy.tail : Return last n rows of each group. + GroupBy.nth : Take the nth row from each group if n is an int, or a + subset of rows, if n is a list of ints. + + Notes + ----- + - The slice step cannot be negative. + - If the index specification results in overlaps, the item is not duplicated. + - If the index specification changes the order of items, then + they are returned in their original order. + By contrast, ``DataFrame.iloc`` can change the row order. + - ``groupby()`` parameters such as as_index and dropna are ignored. + + The differences between ``_positional_selector[]`` and :meth:`~GroupBy.nth` + with ``as_index=False`` are: + + - Input to ``_positional_selector`` can include + one or more slices whereas ``nth`` + just handles an integer or a list of integers. + - ``_positional_selector`` can accept a slice relative to the + last row of each group. + - ``_positional_selector`` does not have an equivalent to the + ``nth()`` ``dropna`` parameter. + + Examples + -------- + >>> df = pd.DataFrame([["a", 1], ["a", 2], ["a", 3], ["b", 4], ["b", 5]], + ... columns=["A", "B"]) + >>> df.groupby("A")._positional_selector[1:2] + A B + 1 a 2 + 4 b 5 + + >>> df.groupby("A")._positional_selector[1, -1] + A B + 1 a 2 + 2 a 3 + 4 b 5 + """ + if TYPE_CHECKING: + # pylint: disable-next=used-before-assignment + groupby_self = cast(groupby.GroupBy, self) + else: + groupby_self = self + + return GroupByPositionalSelector(groupby_self) + + def _make_mask_from_positional_indexer( + self, + arg: PositionalIndexer | tuple, + ) -> np.ndarray: + if is_list_like(arg): + if all(is_integer(i) for i in cast(Iterable, arg)): + mask = self._make_mask_from_list(cast(Iterable[int], arg)) + else: + mask = self._make_mask_from_tuple(cast(tuple, arg)) + + elif isinstance(arg, slice): + mask = self._make_mask_from_slice(arg) + elif is_integer(arg): + mask = self._make_mask_from_int(cast(int, arg)) + else: + raise TypeError( + f"Invalid index {type(arg)}. " + "Must be integer, list-like, slice or a tuple of " + "integers and slices" + ) + + if isinstance(mask, bool): + if mask: + mask = self._ascending_count >= 0 + else: + mask = self._ascending_count < 0 + + return cast(np.ndarray, mask) + + def _make_mask_from_int(self, arg: int) -> np.ndarray: + if arg >= 0: + return self._ascending_count == arg + else: + return self._descending_count == (-arg - 1) + + def _make_mask_from_list(self, args: Iterable[int]) -> bool | np.ndarray: + positive = [arg for arg in args if arg >= 0] + negative = [-arg - 1 for arg in args if arg < 0] + + mask: bool | np.ndarray = False + + if positive: + mask |= np.isin(self._ascending_count, positive) + + if negative: + mask |= np.isin(self._descending_count, negative) + + return mask + + def _make_mask_from_tuple(self, args: tuple) -> bool | np.ndarray: + mask: bool | np.ndarray = False + + for arg in args: + if is_integer(arg): + mask |= self._make_mask_from_int(cast(int, arg)) + elif isinstance(arg, slice): + mask |= self._make_mask_from_slice(arg) + else: + raise ValueError( + f"Invalid argument {type(arg)}. Should be int or slice." + ) + + return mask + + def _make_mask_from_slice(self, arg: slice) -> bool | np.ndarray: + start = arg.start + stop = arg.stop + step = arg.step + + if step is not None and step < 0: + raise ValueError(f"Invalid step {step}. Must be non-negative") + + mask: bool | np.ndarray = True + + if step is None: + step = 1 + + if start is None: + if step > 1: + mask &= self._ascending_count % step == 0 + + elif start >= 0: + mask &= self._ascending_count >= start + + if step > 1: + mask &= (self._ascending_count - start) % step == 0 + + else: + mask &= self._descending_count < -start + + offset_array = self._descending_count + start + 1 + limit_array = ( + self._ascending_count + self._descending_count + (start + 1) + ) < 0 + offset_array = np.where(limit_array, self._ascending_count, offset_array) + + mask &= offset_array % step == 0 + + if stop is not None: + if stop >= 0: + mask &= self._ascending_count < stop + else: + mask &= self._descending_count >= -stop + + return mask + + @cache_readonly + def _ascending_count(self) -> np.ndarray: + if TYPE_CHECKING: + groupby_self = cast(groupby.GroupBy, self) + else: + groupby_self = self + + return groupby_self._cumcount_array() + + @cache_readonly + def _descending_count(self) -> np.ndarray: + if TYPE_CHECKING: + groupby_self = cast(groupby.GroupBy, self) + else: + groupby_self = self + + return groupby_self._cumcount_array(ascending=False) + + +@doc(GroupByIndexingMixin._positional_selector) +class GroupByPositionalSelector: + def __init__(self, groupby_object: groupby.GroupBy) -> None: + self.groupby_object = groupby_object + + def __getitem__(self, arg: PositionalIndexer | tuple) -> DataFrame | Series: + """ + Select by positional index per group. + + Implements GroupBy._positional_selector + + Parameters + ---------- + arg : PositionalIndexer | tuple + Allowed values are: + - int + - int valued iterable such as list or range + - slice with step either None or positive + - tuple of integers and slices + + Returns + ------- + Series + The filtered subset of the original groupby Series. + DataFrame + The filtered subset of the original groupby DataFrame. + + See Also + -------- + DataFrame.iloc : Integer-location based indexing for selection by position. + GroupBy.head : Return first n rows of each group. + GroupBy.tail : Return last n rows of each group. + GroupBy._positional_selector : Return positional selection for each group. + GroupBy.nth : Take the nth row from each group if n is an int, or a + subset of rows, if n is a list of ints. + """ + mask = self.groupby_object._make_mask_from_positional_indexer(arg) + return self.groupby_object._mask_selected_obj(mask) + + +class GroupByNthSelector: + """ + Dynamically substituted for GroupBy.nth to enable both call and index + """ + + def __init__(self, groupby_object: groupby.GroupBy) -> None: + self.groupby_object = groupby_object + + def __call__( + self, + n: PositionalIndexer | tuple, + dropna: Literal["any", "all", None] = None, + ) -> DataFrame | Series: + return self.groupby_object._nth(n, dropna) + + def __getitem__(self, n: PositionalIndexer | tuple) -> DataFrame | Series: + return self.groupby_object._nth(n) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/groupby/numba_.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/groupby/numba_.py new file mode 100644 index 0000000000000000000000000000000000000000..3b7a58e87603e578216c4c80e8c88e06828d5dfa --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/groupby/numba_.py @@ -0,0 +1,181 @@ +"""Common utilities for Numba operations with groupby ops""" +from __future__ import annotations + +import functools +import inspect +from typing import ( + TYPE_CHECKING, + Any, + Callable, +) + +import numpy as np + +from pandas.compat._optional import import_optional_dependency + +from pandas.core.util.numba_ import ( + NumbaUtilError, + jit_user_function, +) + +if TYPE_CHECKING: + from pandas._typing import Scalar + + +def validate_udf(func: Callable) -> None: + """ + Validate user defined function for ops when using Numba with groupby ops. + + The first signature arguments should include: + + def f(values, index, ...): + ... + + Parameters + ---------- + func : function, default False + user defined function + + Returns + ------- + None + + Raises + ------ + NumbaUtilError + """ + if not callable(func): + raise NotImplementedError( + "Numba engine can only be used with a single function." + ) + udf_signature = list(inspect.signature(func).parameters.keys()) + expected_args = ["values", "index"] + min_number_args = len(expected_args) + if ( + len(udf_signature) < min_number_args + or udf_signature[:min_number_args] != expected_args + ): + raise NumbaUtilError( + f"The first {min_number_args} arguments to {func.__name__} must be " + f"{expected_args}" + ) + + +@functools.cache +def generate_numba_agg_func( + func: Callable[..., Scalar], + nopython: bool, + nogil: bool, + parallel: bool, +) -> Callable[[np.ndarray, np.ndarray, np.ndarray, np.ndarray, int, Any], np.ndarray]: + """ + Generate a numba jitted agg function specified by values from engine_kwargs. + + 1. jit the user's function + 2. Return a groupby agg function with the jitted function inline + + Configurations specified in engine_kwargs apply to both the user's + function _AND_ the groupby evaluation loop. + + Parameters + ---------- + func : function + function to be applied to each group and will be JITed + nopython : bool + nopython to be passed into numba.jit + nogil : bool + nogil to be passed into numba.jit + parallel : bool + parallel to be passed into numba.jit + + Returns + ------- + Numba function + """ + numba_func = jit_user_function(func) + if TYPE_CHECKING: + import numba + else: + numba = import_optional_dependency("numba") + + @numba.jit(nopython=nopython, nogil=nogil, parallel=parallel) + def group_agg( + values: np.ndarray, + index: np.ndarray, + begin: np.ndarray, + end: np.ndarray, + num_columns: int, + *args: Any, + ) -> np.ndarray: + assert len(begin) == len(end) + num_groups = len(begin) + + result = np.empty((num_groups, num_columns)) + for i in numba.prange(num_groups): + group_index = index[begin[i] : end[i]] + for j in numba.prange(num_columns): + group = values[begin[i] : end[i], j] + result[i, j] = numba_func(group, group_index, *args) + return result + + return group_agg + + +@functools.cache +def generate_numba_transform_func( + func: Callable[..., np.ndarray], + nopython: bool, + nogil: bool, + parallel: bool, +) -> Callable[[np.ndarray, np.ndarray, np.ndarray, np.ndarray, int, Any], np.ndarray]: + """ + Generate a numba jitted transform function specified by values from engine_kwargs. + + 1. jit the user's function + 2. Return a groupby transform function with the jitted function inline + + Configurations specified in engine_kwargs apply to both the user's + function _AND_ the groupby evaluation loop. + + Parameters + ---------- + func : function + function to be applied to each window and will be JITed + nopython : bool + nopython to be passed into numba.jit + nogil : bool + nogil to be passed into numba.jit + parallel : bool + parallel to be passed into numba.jit + + Returns + ------- + Numba function + """ + numba_func = jit_user_function(func) + if TYPE_CHECKING: + import numba + else: + numba = import_optional_dependency("numba") + + @numba.jit(nopython=nopython, nogil=nogil, parallel=parallel) + def group_transform( + values: np.ndarray, + index: np.ndarray, + begin: np.ndarray, + end: np.ndarray, + num_columns: int, + *args: Any, + ) -> np.ndarray: + assert len(begin) == len(end) + num_groups = len(begin) + + result = np.empty((len(values), num_columns)) + for i in numba.prange(num_groups): + group_index = index[begin[i] : end[i]] + for j in numba.prange(num_columns): + group = values[begin[i] : end[i], j] + result[begin[i] : end[i], j] = numba_func(group, group_index, *args) + return result + + return group_transform diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/groupby/ops.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/groupby/ops.py new file mode 100644 index 0000000000000000000000000000000000000000..e2ddf9aa5c0c1752e1de9b81e2e72873db050e85 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/groupby/ops.py @@ -0,0 +1,1208 @@ +""" +Provide classes to perform the groupby aggregate operations. + +These are not exposed to the user and provide implementations of the grouping +operations, primarily in cython. These classes (BaseGrouper and BinGrouper) +are contained *in* the SeriesGroupBy and DataFrameGroupBy objects. +""" +from __future__ import annotations + +import collections +import functools +from typing import ( + TYPE_CHECKING, + Callable, + Generic, + final, +) + +import numpy as np + +from pandas._libs import ( + NaT, + lib, +) +import pandas._libs.groupby as libgroupby +from pandas._typing import ( + ArrayLike, + AxisInt, + NDFrameT, + Shape, + npt, +) +from pandas.errors import AbstractMethodError +from pandas.util._decorators import cache_readonly + +from pandas.core.dtypes.cast import ( + maybe_cast_pointwise_result, + maybe_downcast_to_dtype, +) +from pandas.core.dtypes.common import ( + ensure_float64, + ensure_int64, + ensure_platform_int, + ensure_uint64, + is_1d_only_ea_dtype, +) +from pandas.core.dtypes.missing import ( + isna, + maybe_fill, +) + +from pandas.core.frame import DataFrame +from pandas.core.groupby import grouper +from pandas.core.indexes.api import ( + CategoricalIndex, + Index, + MultiIndex, + ensure_index, +) +from pandas.core.series import Series +from pandas.core.sorting import ( + compress_group_index, + decons_obs_group_ids, + get_flattened_list, + get_group_index, + get_group_index_sorter, + get_indexer_dict, +) + +if TYPE_CHECKING: + from collections.abc import ( + Hashable, + Iterator, + Sequence, + ) + + from pandas.core.generic import NDFrame + + +def check_result_array(obj, dtype) -> None: + # Our operation is supposed to be an aggregation/reduction. If + # it returns an ndarray, this likely means an invalid operation has + # been passed. See test_apply_without_aggregation, test_agg_must_agg + if isinstance(obj, np.ndarray): + if dtype != object: + # If it is object dtype, the function can be a reduction/aggregation + # and still return an ndarray e.g. test_agg_over_numpy_arrays + raise ValueError("Must produce aggregated value") + + +def extract_result(res): + """ + Extract the result object, it might be a 0-dim ndarray + or a len-1 0-dim, or a scalar + """ + if hasattr(res, "_values"): + # Preserve EA + res = res._values + if res.ndim == 1 and len(res) == 1: + # see test_agg_lambda_with_timezone, test_resampler_grouper.py::test_apply + res = res[0] + return res + + +class WrappedCythonOp: + """ + Dispatch logic for functions defined in _libs.groupby + + Parameters + ---------- + kind: str + Whether the operation is an aggregate or transform. + how: str + Operation name, e.g. "mean". + has_dropped_na: bool + True precisely when dropna=True and the grouper contains a null value. + """ + + # Functions for which we do _not_ attempt to cast the cython result + # back to the original dtype. + cast_blocklist = frozenset( + ["any", "all", "rank", "count", "size", "idxmin", "idxmax"] + ) + + def __init__(self, kind: str, how: str, has_dropped_na: bool) -> None: + self.kind = kind + self.how = how + self.has_dropped_na = has_dropped_na + + _CYTHON_FUNCTIONS: dict[str, dict] = { + "aggregate": { + "any": functools.partial(libgroupby.group_any_all, val_test="any"), + "all": functools.partial(libgroupby.group_any_all, val_test="all"), + "sum": "group_sum", + "prod": "group_prod", + "idxmin": functools.partial(libgroupby.group_idxmin_idxmax, name="idxmin"), + "idxmax": functools.partial(libgroupby.group_idxmin_idxmax, name="idxmax"), + "min": "group_min", + "max": "group_max", + "mean": "group_mean", + "median": "group_median_float64", + "var": "group_var", + "std": functools.partial(libgroupby.group_var, name="std"), + "sem": functools.partial(libgroupby.group_var, name="sem"), + "skew": "group_skew", + "first": "group_nth", + "last": "group_last", + "ohlc": "group_ohlc", + }, + "transform": { + "cumprod": "group_cumprod", + "cumsum": "group_cumsum", + "cummin": "group_cummin", + "cummax": "group_cummax", + "rank": "group_rank", + }, + } + + _cython_arity = {"ohlc": 4} # OHLC + + @classmethod + def get_kind_from_how(cls, how: str) -> str: + if how in cls._CYTHON_FUNCTIONS["aggregate"]: + return "aggregate" + return "transform" + + # Note: we make this a classmethod and pass kind+how so that caching + # works at the class level and not the instance level + @classmethod + @functools.cache + def _get_cython_function( + cls, kind: str, how: str, dtype: np.dtype, is_numeric: bool + ): + dtype_str = dtype.name + ftype = cls._CYTHON_FUNCTIONS[kind][how] + + # see if there is a fused-type version of function + # only valid for numeric + if callable(ftype): + f = ftype + else: + f = getattr(libgroupby, ftype) + if is_numeric: + return f + elif dtype == np.dtype(object): + if how in ["median", "cumprod"]: + # no fused types -> no __signatures__ + raise NotImplementedError( + f"function is not implemented for this dtype: " + f"[how->{how},dtype->{dtype_str}]" + ) + elif how in ["std", "sem", "idxmin", "idxmax"]: + # We have a partial object that does not have __signatures__ + return f + elif how == "skew": + # _get_cython_vals will convert to float64 + pass + elif "object" not in f.__signatures__: + # raise NotImplementedError here rather than TypeError later + raise NotImplementedError( + f"function is not implemented for this dtype: " + f"[how->{how},dtype->{dtype_str}]" + ) + return f + else: + raise NotImplementedError( + "This should not be reached. Please report a bug at " + "github.com/pandas-dev/pandas/", + dtype, + ) + + def _get_cython_vals(self, values: np.ndarray) -> np.ndarray: + """ + Cast numeric dtypes to float64 for functions that only support that. + + Parameters + ---------- + values : np.ndarray + + Returns + ------- + values : np.ndarray + """ + how = self.how + + if how in ["median", "std", "sem", "skew"]: + # median only has a float64 implementation + # We should only get here with is_numeric, as non-numeric cases + # should raise in _get_cython_function + values = ensure_float64(values) + + elif values.dtype.kind in "iu": + if how in ["var", "mean"] or ( + self.kind == "transform" and self.has_dropped_na + ): + # has_dropped_na check need for test_null_group_str_transformer + # result may still include NaN, so we have to cast + values = ensure_float64(values) + + elif how in ["sum", "ohlc", "prod", "cumsum", "cumprod"]: + # Avoid overflow during group op + if values.dtype.kind == "i": + values = ensure_int64(values) + else: + values = ensure_uint64(values) + + return values + + def _get_output_shape(self, ngroups: int, values: np.ndarray) -> Shape: + how = self.how + kind = self.kind + + arity = self._cython_arity.get(how, 1) + + out_shape: Shape + if how == "ohlc": + out_shape = (ngroups, arity) + elif arity > 1: + raise NotImplementedError( + "arity of more than 1 is not supported for the 'how' argument" + ) + elif kind == "transform": + out_shape = values.shape + else: + out_shape = (ngroups,) + values.shape[1:] + return out_shape + + def _get_out_dtype(self, dtype: np.dtype) -> np.dtype: + how = self.how + + if how == "rank": + out_dtype = "float64" + elif how in ["idxmin", "idxmax"]: + # The Cython implementation only produces the row number; we'll take + # from the index using this in post processing + out_dtype = "intp" + else: + if dtype.kind in "iufcb": + out_dtype = f"{dtype.kind}{dtype.itemsize}" + else: + out_dtype = "object" + return np.dtype(out_dtype) + + def _get_result_dtype(self, dtype: np.dtype) -> np.dtype: + """ + Get the desired dtype of a result based on the + input dtype and how it was computed. + + Parameters + ---------- + dtype : np.dtype + + Returns + ------- + np.dtype + The desired dtype of the result. + """ + how = self.how + + if how in ["sum", "cumsum", "sum", "prod", "cumprod"]: + if dtype == np.dtype(bool): + return np.dtype(np.int64) + elif how in ["mean", "median", "var", "std", "sem"]: + if dtype.kind in "fc": + return dtype + elif dtype.kind in "iub": + return np.dtype(np.float64) + return dtype + + @final + def _cython_op_ndim_compat( + self, + values: np.ndarray, + *, + min_count: int, + ngroups: int, + comp_ids: np.ndarray, + mask: npt.NDArray[np.bool_] | None = None, + result_mask: npt.NDArray[np.bool_] | None = None, + **kwargs, + ) -> np.ndarray: + if values.ndim == 1: + # expand to 2d, dispatch, then squeeze if appropriate + values2d = values[None, :] + if mask is not None: + mask = mask[None, :] + if result_mask is not None: + result_mask = result_mask[None, :] + res = self._call_cython_op( + values2d, + min_count=min_count, + ngroups=ngroups, + comp_ids=comp_ids, + mask=mask, + result_mask=result_mask, + **kwargs, + ) + if res.shape[0] == 1: + return res[0] + + # otherwise we have OHLC + return res.T + + return self._call_cython_op( + values, + min_count=min_count, + ngroups=ngroups, + comp_ids=comp_ids, + mask=mask, + result_mask=result_mask, + **kwargs, + ) + + @final + def _call_cython_op( + self, + values: np.ndarray, # np.ndarray[ndim=2] + *, + min_count: int, + ngroups: int, + comp_ids: np.ndarray, + mask: npt.NDArray[np.bool_] | None, + result_mask: npt.NDArray[np.bool_] | None, + **kwargs, + ) -> np.ndarray: # np.ndarray[ndim=2] + orig_values = values + + dtype = values.dtype + is_numeric = dtype.kind in "iufcb" + + is_datetimelike = dtype.kind in "mM" + + if is_datetimelike: + values = values.view("int64") + is_numeric = True + elif dtype.kind == "b": + values = values.view("uint8") + if values.dtype == "float16": + values = values.astype(np.float32) + + if self.how in ["any", "all"]: + if mask is None: + mask = isna(values) + if dtype == object: + if kwargs["skipna"]: + # GH#37501: don't raise on pd.NA when skipna=True + if mask.any(): + # mask on original values computed separately + values = values.copy() + values[mask] = True + values = values.astype(bool, copy=False).view(np.int8) + is_numeric = True + + values = values.T + if mask is not None: + mask = mask.T + if result_mask is not None: + result_mask = result_mask.T + + out_shape = self._get_output_shape(ngroups, values) + func = self._get_cython_function(self.kind, self.how, values.dtype, is_numeric) + values = self._get_cython_vals(values) + out_dtype = self._get_out_dtype(values.dtype) + + result = maybe_fill(np.empty(out_shape, dtype=out_dtype)) + if self.kind == "aggregate": + counts = np.zeros(ngroups, dtype=np.int64) + if self.how in [ + "idxmin", + "idxmax", + "min", + "max", + "mean", + "last", + "first", + "sum", + ]: + func( + out=result, + counts=counts, + values=values, + labels=comp_ids, + min_count=min_count, + mask=mask, + result_mask=result_mask, + is_datetimelike=is_datetimelike, + **kwargs, + ) + elif self.how in ["sem", "std", "var", "ohlc", "prod", "median"]: + if self.how in ["std", "sem"]: + kwargs["is_datetimelike"] = is_datetimelike + func( + result, + counts, + values, + comp_ids, + min_count=min_count, + mask=mask, + result_mask=result_mask, + **kwargs, + ) + elif self.how in ["any", "all"]: + func( + out=result, + values=values, + labels=comp_ids, + mask=mask, + result_mask=result_mask, + **kwargs, + ) + result = result.astype(bool, copy=False) + elif self.how in ["skew"]: + func( + out=result, + counts=counts, + values=values, + labels=comp_ids, + mask=mask, + result_mask=result_mask, + **kwargs, + ) + if dtype == object: + result = result.astype(object) + + else: + raise NotImplementedError(f"{self.how} is not implemented") + else: + # TODO: min_count + if self.how != "rank": + # TODO: should rank take result_mask? + kwargs["result_mask"] = result_mask + func( + out=result, + values=values, + labels=comp_ids, + ngroups=ngroups, + is_datetimelike=is_datetimelike, + mask=mask, + **kwargs, + ) + + if self.kind == "aggregate" and self.how not in ["idxmin", "idxmax"]: + # i.e. counts is defined. Locations where count None: + if values.ndim > 2: + raise NotImplementedError("number of dimensions is currently limited to 2") + if values.ndim == 2: + assert axis == 1, axis + elif not is_1d_only_ea_dtype(values.dtype): + # Note: it is *not* the case that axis is always 0 for 1-dim values, + # as we can have 1D ExtensionArrays that we need to treat as 2D + assert axis == 0 + + @final + def cython_operation( + self, + *, + values: ArrayLike, + axis: AxisInt, + min_count: int = -1, + comp_ids: np.ndarray, + ngroups: int, + **kwargs, + ) -> ArrayLike: + """ + Call our cython function, with appropriate pre- and post- processing. + """ + self._validate_axis(axis, values) + + if not isinstance(values, np.ndarray): + # i.e. ExtensionArray + return values._groupby_op( + how=self.how, + has_dropped_na=self.has_dropped_na, + min_count=min_count, + ngroups=ngroups, + ids=comp_ids, + **kwargs, + ) + + return self._cython_op_ndim_compat( + values, + min_count=min_count, + ngroups=ngroups, + comp_ids=comp_ids, + mask=None, + **kwargs, + ) + + +class BaseGrouper: + """ + This is an internal Grouper class, which actually holds + the generated groups + + Parameters + ---------- + axis : Index + groupings : Sequence[Grouping] + all the grouping instances to handle in this grouper + for example for grouper list to groupby, need to pass the list + sort : bool, default True + whether this grouper will give sorted result or not + + """ + + axis: Index + + def __init__( + self, + axis: Index, + groupings: Sequence[grouper.Grouping], + sort: bool = True, + dropna: bool = True, + ) -> None: + assert isinstance(axis, Index), axis + + self.axis = axis + self._groupings: list[grouper.Grouping] = list(groupings) + self._sort = sort + self.dropna = dropna + + @property + def groupings(self) -> list[grouper.Grouping]: + return self._groupings + + @property + def shape(self) -> Shape: + return tuple(ping.ngroups for ping in self.groupings) + + def __iter__(self) -> Iterator[Hashable]: + return iter(self.indices) + + @property + def nkeys(self) -> int: + return len(self.groupings) + + def get_iterator( + self, data: NDFrameT, axis: AxisInt = 0 + ) -> Iterator[tuple[Hashable, NDFrameT]]: + """ + Groupby iterator + + Returns + ------- + Generator yielding sequence of (name, subsetted object) + for each group + """ + splitter = self._get_splitter(data, axis=axis) + keys = self.group_keys_seq + yield from zip(keys, splitter) + + @final + def _get_splitter(self, data: NDFrame, axis: AxisInt = 0) -> DataSplitter: + """ + Returns + ------- + Generator yielding subsetted objects + """ + ids, _, ngroups = self.group_info + return _get_splitter( + data, + ids, + ngroups, + sorted_ids=self._sorted_ids, + sort_idx=self._sort_idx, + axis=axis, + ) + + @final + @cache_readonly + def group_keys_seq(self): + if len(self.groupings) == 1: + return self.levels[0] + else: + ids, _, ngroups = self.group_info + + # provide "flattened" iterator for multi-group setting + return get_flattened_list(ids, ngroups, self.levels, self.codes) + + @cache_readonly + def indices(self) -> dict[Hashable, npt.NDArray[np.intp]]: + """dict {group name -> group indices}""" + if len(self.groupings) == 1 and isinstance(self.result_index, CategoricalIndex): + # This shows unused categories in indices GH#38642 + return self.groupings[0].indices + codes_list = [ping.codes for ping in self.groupings] + keys = [ping._group_index for ping in self.groupings] + return get_indexer_dict(codes_list, keys) + + @final + def result_ilocs(self) -> npt.NDArray[np.intp]: + """ + Get the original integer locations of result_index in the input. + """ + # Original indices are where group_index would go via sorting. + # But when dropna is true, we need to remove null values while accounting for + # any gaps that then occur because of them. + group_index = get_group_index( + self.codes, self.shape, sort=self._sort, xnull=True + ) + group_index, _ = compress_group_index(group_index, sort=self._sort) + + if self.has_dropped_na: + mask = np.where(group_index >= 0) + # Count how many gaps are caused by previous null values for each position + null_gaps = np.cumsum(group_index == -1)[mask] + group_index = group_index[mask] + + result = get_group_index_sorter(group_index, self.ngroups) + + if self.has_dropped_na: + # Shift by the number of prior null gaps + result += np.take(null_gaps, result) + + return result + + @final + @property + def codes(self) -> list[npt.NDArray[np.signedinteger]]: + return [ping.codes for ping in self.groupings] + + @property + def levels(self) -> list[Index]: + return [ping._group_index for ping in self.groupings] + + @property + def names(self) -> list[Hashable]: + return [ping.name for ping in self.groupings] + + @final + def size(self) -> Series: + """ + Compute group sizes. + """ + ids, _, ngroups = self.group_info + out: np.ndarray | list + if ngroups: + out = np.bincount(ids[ids != -1], minlength=ngroups) + else: + out = [] + return Series(out, index=self.result_index, dtype="int64", copy=False) + + @cache_readonly + def groups(self) -> dict[Hashable, np.ndarray]: + """dict {group name -> group labels}""" + if len(self.groupings) == 1: + return self.groupings[0].groups + else: + to_groupby = [] + for ping in self.groupings: + gv = ping.grouping_vector + if not isinstance(gv, BaseGrouper): + to_groupby.append(gv) + else: + to_groupby.append(gv.groupings[0].grouping_vector) + index = MultiIndex.from_arrays(to_groupby) + return self.axis.groupby(index) + + @final + @cache_readonly + def is_monotonic(self) -> bool: + # return if my group orderings are monotonic + return Index(self.group_info[0]).is_monotonic_increasing + + @final + @cache_readonly + def has_dropped_na(self) -> bool: + """ + Whether grouper has null value(s) that are dropped. + """ + return bool((self.group_info[0] < 0).any()) + + @cache_readonly + def group_info(self) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp], int]: + comp_ids, obs_group_ids = self._get_compressed_codes() + + ngroups = len(obs_group_ids) + comp_ids = ensure_platform_int(comp_ids) + + return comp_ids, obs_group_ids, ngroups + + @cache_readonly + def codes_info(self) -> npt.NDArray[np.intp]: + # return the codes of items in original grouped axis + ids, _, _ = self.group_info + return ids + + @final + def _get_compressed_codes( + self, + ) -> tuple[npt.NDArray[np.signedinteger], npt.NDArray[np.intp]]: + # The first returned ndarray may have any signed integer dtype + if len(self.groupings) > 1: + group_index = get_group_index(self.codes, self.shape, sort=True, xnull=True) + return compress_group_index(group_index, sort=self._sort) + # FIXME: compress_group_index's second return value is int64, not intp + + ping = self.groupings[0] + return ping.codes, np.arange(len(ping._group_index), dtype=np.intp) + + @final + @cache_readonly + def ngroups(self) -> int: + return len(self.result_index) + + @property + def reconstructed_codes(self) -> list[npt.NDArray[np.intp]]: + codes = self.codes + ids, obs_ids, _ = self.group_info + return decons_obs_group_ids(ids, obs_ids, self.shape, codes, xnull=True) + + @cache_readonly + def result_index(self) -> Index: + if len(self.groupings) == 1: + return self.groupings[0]._result_index.rename(self.names[0]) + + codes = self.reconstructed_codes + levels = [ping._result_index for ping in self.groupings] + return MultiIndex( + levels=levels, codes=codes, verify_integrity=False, names=self.names + ) + + @final + def get_group_levels(self) -> list[ArrayLike]: + # Note: only called from _insert_inaxis_grouper, which + # is only called for BaseGrouper, never for BinGrouper + if len(self.groupings) == 1: + return [self.groupings[0]._group_arraylike] + + name_list = [] + for ping, codes in zip(self.groupings, self.reconstructed_codes): + codes = ensure_platform_int(codes) + levels = ping._group_arraylike.take(codes) + + name_list.append(levels) + + return name_list + + # ------------------------------------------------------------ + # Aggregation functions + + @final + def _cython_operation( + self, + kind: str, + values, + how: str, + axis: AxisInt, + min_count: int = -1, + **kwargs, + ) -> ArrayLike: + """ + Returns the values of a cython operation. + """ + assert kind in ["transform", "aggregate"] + + cy_op = WrappedCythonOp(kind=kind, how=how, has_dropped_na=self.has_dropped_na) + + ids, _, _ = self.group_info + ngroups = self.ngroups + return cy_op.cython_operation( + values=values, + axis=axis, + min_count=min_count, + comp_ids=ids, + ngroups=ngroups, + **kwargs, + ) + + @final + def agg_series( + self, obj: Series, func: Callable, preserve_dtype: bool = False + ) -> ArrayLike: + """ + Parameters + ---------- + obj : Series + func : function taking a Series and returning a scalar-like + preserve_dtype : bool + Whether the aggregation is known to be dtype-preserving. + + Returns + ------- + np.ndarray or ExtensionArray + """ + + if not isinstance(obj._values, np.ndarray): + # we can preserve a little bit more aggressively with EA dtype + # because maybe_cast_pointwise_result will do a try/except + # with _from_sequence. NB we are assuming here that _from_sequence + # is sufficiently strict that it casts appropriately. + preserve_dtype = True + + result = self._aggregate_series_pure_python(obj, func) + + npvalues = lib.maybe_convert_objects(result, try_float=False) + if preserve_dtype: + out = maybe_cast_pointwise_result(npvalues, obj.dtype, numeric_only=True) + else: + out = npvalues + return out + + @final + def _aggregate_series_pure_python( + self, obj: Series, func: Callable + ) -> npt.NDArray[np.object_]: + _, _, ngroups = self.group_info + + result = np.empty(ngroups, dtype="O") + initialized = False + + splitter = self._get_splitter(obj, axis=0) + + for i, group in enumerate(splitter): + res = func(group) + res = extract_result(res) + + if not initialized: + # We only do this validation on the first iteration + check_result_array(res, group.dtype) + initialized = True + + result[i] = res + + return result + + @final + def apply_groupwise( + self, f: Callable, data: DataFrame | Series, axis: AxisInt = 0 + ) -> tuple[list, bool]: + mutated = False + splitter = self._get_splitter(data, axis=axis) + group_keys = self.group_keys_seq + result_values = [] + + # This calls DataSplitter.__iter__ + zipped = zip(group_keys, splitter) + + for key, group in zipped: + # Pinning name is needed for + # test_group_apply_once_per_group, + # test_inconsistent_return_type, test_set_group_name, + # test_group_name_available_in_inference_pass, + # test_groupby_multi_timezone + object.__setattr__(group, "name", key) + + # group might be modified + group_axes = group.axes + res = f(group) + if not mutated and not _is_indexed_like(res, group_axes, axis): + mutated = True + result_values.append(res) + # getattr pattern for __name__ is needed for functools.partial objects + if len(group_keys) == 0 and getattr(f, "__name__", None) in [ + "skew", + "sum", + "prod", + ]: + # If group_keys is empty, then no function calls have been made, + # so we will not have raised even if this is an invalid dtype. + # So do one dummy call here to raise appropriate TypeError. + f(data.iloc[:0]) + + return result_values, mutated + + # ------------------------------------------------------------ + # Methods for sorting subsets of our GroupBy's object + + @final + @cache_readonly + def _sort_idx(self) -> npt.NDArray[np.intp]: + # Counting sort indexer + ids, _, ngroups = self.group_info + return get_group_index_sorter(ids, ngroups) + + @final + @cache_readonly + def _sorted_ids(self) -> npt.NDArray[np.intp]: + ids, _, _ = self.group_info + return ids.take(self._sort_idx) + + +class BinGrouper(BaseGrouper): + """ + This is an internal Grouper class + + Parameters + ---------- + bins : the split index of binlabels to group the item of axis + binlabels : the label list + indexer : np.ndarray[np.intp], optional + the indexer created by Grouper + some groupers (TimeGrouper) will sort its axis and its + group_info is also sorted, so need the indexer to reorder + + Examples + -------- + bins: [2, 4, 6, 8, 10] + binlabels: DatetimeIndex(['2005-01-01', '2005-01-03', + '2005-01-05', '2005-01-07', '2005-01-09'], + dtype='datetime64[ns]', freq='2D') + + the group_info, which contains the label of each item in grouped + axis, the index of label in label list, group number, is + + (array([0, 0, 1, 1, 2, 2, 3, 3, 4, 4]), array([0, 1, 2, 3, 4]), 5) + + means that, the grouped axis has 10 items, can be grouped into 5 + labels, the first and second items belong to the first label, the + third and forth items belong to the second label, and so on + + """ + + bins: npt.NDArray[np.int64] + binlabels: Index + + def __init__( + self, + bins, + binlabels, + indexer=None, + ) -> None: + self.bins = ensure_int64(bins) + self.binlabels = ensure_index(binlabels) + self.indexer = indexer + + # These lengths must match, otherwise we could call agg_series + # with empty self.bins, which would raise later. + assert len(self.binlabels) == len(self.bins) + + @cache_readonly + def groups(self): + """dict {group name -> group labels}""" + # this is mainly for compat + # GH 3881 + result = { + key: value + for key, value in zip(self.binlabels, self.bins) + if key is not NaT + } + return result + + @property + def nkeys(self) -> int: + # still matches len(self.groupings), but we can hard-code + return 1 + + @cache_readonly + def codes_info(self) -> npt.NDArray[np.intp]: + # return the codes of items in original grouped axis + ids, _, _ = self.group_info + if self.indexer is not None: + sorter = np.lexsort((ids, self.indexer)) + ids = ids[sorter] + return ids + + def get_iterator(self, data: NDFrame, axis: AxisInt = 0): + """ + Groupby iterator + + Returns + ------- + Generator yielding sequence of (name, subsetted object) + for each group + """ + if axis == 0: + slicer = lambda start, edge: data.iloc[start:edge] + else: + slicer = lambda start, edge: data.iloc[:, start:edge] + + length = len(data.axes[axis]) + + start = 0 + for edge, label in zip(self.bins, self.binlabels): + if label is not NaT: + yield label, slicer(start, edge) + start = edge + + if start < length: + yield self.binlabels[-1], slicer(start, None) + + @cache_readonly + def indices(self): + indices = collections.defaultdict(list) + + i = 0 + for label, bin in zip(self.binlabels, self.bins): + if i < bin: + if label is not NaT: + indices[label] = list(range(i, bin)) + i = bin + return indices + + @cache_readonly + def group_info(self) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp], int]: + ngroups = self.ngroups + obs_group_ids = np.arange(ngroups, dtype=np.intp) + rep = np.diff(np.r_[0, self.bins]) + + rep = ensure_platform_int(rep) + if ngroups == len(self.bins): + comp_ids = np.repeat(np.arange(ngroups), rep) + else: + comp_ids = np.repeat(np.r_[-1, np.arange(ngroups)], rep) + + return ( + ensure_platform_int(comp_ids), + obs_group_ids, + ngroups, + ) + + @cache_readonly + def reconstructed_codes(self) -> list[np.ndarray]: + # get unique result indices, and prepend 0 as groupby starts from the first + return [np.r_[0, np.flatnonzero(self.bins[1:] != self.bins[:-1]) + 1]] + + @cache_readonly + def result_index(self) -> Index: + if len(self.binlabels) != 0 and isna(self.binlabels[0]): + return self.binlabels[1:] + + return self.binlabels + + @property + def levels(self) -> list[Index]: + return [self.binlabels] + + @property + def names(self) -> list[Hashable]: + return [self.binlabels.name] + + @property + def groupings(self) -> list[grouper.Grouping]: + lev = self.binlabels + codes = self.group_info[0] + labels = lev.take(codes) + ping = grouper.Grouping( + labels, labels, in_axis=False, level=None, uniques=lev._values + ) + return [ping] + + +def _is_indexed_like(obj, axes, axis: AxisInt) -> bool: + if isinstance(obj, Series): + if len(axes) > 1: + return False + return obj.axes[axis].equals(axes[axis]) + elif isinstance(obj, DataFrame): + return obj.axes[axis].equals(axes[axis]) + + return False + + +# ---------------------------------------------------------------------- +# Splitting / application + + +class DataSplitter(Generic[NDFrameT]): + def __init__( + self, + data: NDFrameT, + labels: npt.NDArray[np.intp], + ngroups: int, + *, + sort_idx: npt.NDArray[np.intp], + sorted_ids: npt.NDArray[np.intp], + axis: AxisInt = 0, + ) -> None: + self.data = data + self.labels = ensure_platform_int(labels) # _should_ already be np.intp + self.ngroups = ngroups + + self._slabels = sorted_ids + self._sort_idx = sort_idx + + self.axis = axis + assert isinstance(axis, int), axis + + def __iter__(self) -> Iterator: + sdata = self._sorted_data + + if self.ngroups == 0: + # we are inside a generator, rather than raise StopIteration + # we merely return signal the end + return + + starts, ends = lib.generate_slices(self._slabels, self.ngroups) + + for start, end in zip(starts, ends): + yield self._chop(sdata, slice(start, end)) + + @cache_readonly + def _sorted_data(self) -> NDFrameT: + return self.data.take(self._sort_idx, axis=self.axis) + + def _chop(self, sdata, slice_obj: slice) -> NDFrame: + raise AbstractMethodError(self) + + +class SeriesSplitter(DataSplitter): + def _chop(self, sdata: Series, slice_obj: slice) -> Series: + # fastpath equivalent to `sdata.iloc[slice_obj]` + mgr = sdata._mgr.get_slice(slice_obj) + ser = sdata._constructor_from_mgr(mgr, axes=mgr.axes) + ser._name = sdata.name + return ser.__finalize__(sdata, method="groupby") + + +class FrameSplitter(DataSplitter): + def _chop(self, sdata: DataFrame, slice_obj: slice) -> DataFrame: + # Fastpath equivalent to: + # if self.axis == 0: + # return sdata.iloc[slice_obj] + # else: + # return sdata.iloc[:, slice_obj] + mgr = sdata._mgr.get_slice(slice_obj, axis=1 - self.axis) + df = sdata._constructor_from_mgr(mgr, axes=mgr.axes) + return df.__finalize__(sdata, method="groupby") + + +def _get_splitter( + data: NDFrame, + labels: npt.NDArray[np.intp], + ngroups: int, + *, + sort_idx: npt.NDArray[np.intp], + sorted_ids: npt.NDArray[np.intp], + axis: AxisInt = 0, +) -> DataSplitter: + if isinstance(data, Series): + klass: type[DataSplitter] = SeriesSplitter + else: + # i.e. DataFrame + klass = FrameSplitter + + return klass( + data, labels, ngroups, sort_idx=sort_idx, sorted_ids=sorted_ids, axis=axis + ) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/indexers/__init__.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/indexers/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..ba8a4f1d0ee7adb668c6b0ac49b2360d3c0dc356 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/indexers/__init__.py @@ -0,0 +1,31 @@ +from pandas.core.indexers.utils import ( + check_array_indexer, + check_key_length, + check_setitem_lengths, + disallow_ndim_indexing, + is_empty_indexer, + is_list_like_indexer, + is_scalar_indexer, + is_valid_positional_slice, + length_of_indexer, + maybe_convert_indices, + unpack_1tuple, + unpack_tuple_and_ellipses, + validate_indices, +) + +__all__ = [ + "is_valid_positional_slice", + "is_list_like_indexer", + "is_scalar_indexer", + "is_empty_indexer", + "check_setitem_lengths", + "validate_indices", + "maybe_convert_indices", + "length_of_indexer", + "disallow_ndim_indexing", + "unpack_1tuple", + "check_key_length", + "check_array_indexer", + "unpack_tuple_and_ellipses", +] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/indexers/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/indexers/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..5eac49ee2598d2d29bd365183107712ab7bcbb3c Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/indexers/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/indexers/__pycache__/objects.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/indexers/__pycache__/objects.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..92c2fc43a241c3698bbd917eaefd471cfe47c1d2 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/indexers/__pycache__/objects.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/indexers/__pycache__/utils.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/indexers/__pycache__/utils.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..3b66225f2c682a98e43e2564e0a9c621e6c7d1d9 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/indexers/__pycache__/utils.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/indexers/objects.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/indexers/objects.py new file mode 100644 index 0000000000000000000000000000000000000000..f2db4886a559017422ed41bb8bd2246d6a3f0fb0 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/indexers/objects.py @@ -0,0 +1,453 @@ +"""Indexer objects for computing start/end window bounds for rolling operations""" +from __future__ import annotations + +from datetime import timedelta + +import numpy as np + +from pandas._libs.tslibs import BaseOffset +from pandas._libs.window.indexers import calculate_variable_window_bounds +from pandas.util._decorators import Appender + +from pandas.core.dtypes.common import ensure_platform_int + +from pandas.core.indexes.datetimes import DatetimeIndex + +from pandas.tseries.offsets import Nano + +get_window_bounds_doc = """ +Computes the bounds of a window. + +Parameters +---------- +num_values : int, default 0 + number of values that will be aggregated over +window_size : int, default 0 + the number of rows in a window +min_periods : int, default None + min_periods passed from the top level rolling API +center : bool, default None + center passed from the top level rolling API +closed : str, default None + closed passed from the top level rolling API +step : int, default None + step passed from the top level rolling API + .. versionadded:: 1.5 +win_type : str, default None + win_type passed from the top level rolling API + +Returns +------- +A tuple of ndarray[int64]s, indicating the boundaries of each +window +""" + + +class BaseIndexer: + """ + Base class for window bounds calculations. + + Examples + -------- + >>> from pandas.api.indexers import BaseIndexer + >>> class CustomIndexer(BaseIndexer): + ... def get_window_bounds(self, num_values, min_periods, center, closed, step): + ... start = np.empty(num_values, dtype=np.int64) + ... end = np.empty(num_values, dtype=np.int64) + ... for i in range(num_values): + ... start[i] = i + ... end[i] = i + self.window_size + ... return start, end + >>> df = pd.DataFrame({"values": range(5)}) + >>> indexer = CustomIndexer(window_size=2) + >>> df.rolling(indexer).sum() + values + 0 1.0 + 1 3.0 + 2 5.0 + 3 7.0 + 4 4.0 + """ + + def __init__( + self, index_array: np.ndarray | None = None, window_size: int = 0, **kwargs + ) -> None: + self.index_array = index_array + self.window_size = window_size + # Set user defined kwargs as attributes that can be used in get_window_bounds + for key, value in kwargs.items(): + setattr(self, key, value) + + @Appender(get_window_bounds_doc) + def get_window_bounds( + self, + num_values: int = 0, + min_periods: int | None = None, + center: bool | None = None, + closed: str | None = None, + step: int | None = None, + ) -> tuple[np.ndarray, np.ndarray]: + raise NotImplementedError + + +class FixedWindowIndexer(BaseIndexer): + """Creates window boundaries that are of fixed length.""" + + @Appender(get_window_bounds_doc) + def get_window_bounds( + self, + num_values: int = 0, + min_periods: int | None = None, + center: bool | None = None, + closed: str | None = None, + step: int | None = None, + ) -> tuple[np.ndarray, np.ndarray]: + if center or self.window_size == 0: + offset = (self.window_size - 1) // 2 + else: + offset = 0 + + end = np.arange(1 + offset, num_values + 1 + offset, step, dtype="int64") + start = end - self.window_size + if closed in ["left", "both"]: + start -= 1 + if closed in ["left", "neither"]: + end -= 1 + + end = np.clip(end, 0, num_values) + start = np.clip(start, 0, num_values) + + return start, end + + +class VariableWindowIndexer(BaseIndexer): + """Creates window boundaries that are of variable length, namely for time series.""" + + @Appender(get_window_bounds_doc) + def get_window_bounds( + self, + num_values: int = 0, + min_periods: int | None = None, + center: bool | None = None, + closed: str | None = None, + step: int | None = None, + ) -> tuple[np.ndarray, np.ndarray]: + # error: Argument 4 to "calculate_variable_window_bounds" has incompatible + # type "Optional[bool]"; expected "bool" + # error: Argument 6 to "calculate_variable_window_bounds" has incompatible + # type "Optional[ndarray]"; expected "ndarray" + return calculate_variable_window_bounds( + num_values, + self.window_size, + min_periods, + center, # type: ignore[arg-type] + closed, + self.index_array, # type: ignore[arg-type] + ) + + +class VariableOffsetWindowIndexer(BaseIndexer): + """ + Calculate window boundaries based on a non-fixed offset such as a BusinessDay. + + Examples + -------- + >>> from pandas.api.indexers import VariableOffsetWindowIndexer + >>> df = pd.DataFrame(range(10), index=pd.date_range("2020", periods=10)) + >>> offset = pd.offsets.BDay(1) + >>> indexer = VariableOffsetWindowIndexer(index=df.index, offset=offset) + >>> df + 0 + 2020-01-01 0 + 2020-01-02 1 + 2020-01-03 2 + 2020-01-04 3 + 2020-01-05 4 + 2020-01-06 5 + 2020-01-07 6 + 2020-01-08 7 + 2020-01-09 8 + 2020-01-10 9 + >>> df.rolling(indexer).sum() + 0 + 2020-01-01 0.0 + 2020-01-02 1.0 + 2020-01-03 2.0 + 2020-01-04 3.0 + 2020-01-05 7.0 + 2020-01-06 12.0 + 2020-01-07 6.0 + 2020-01-08 7.0 + 2020-01-09 8.0 + 2020-01-10 9.0 + """ + + def __init__( + self, + index_array: np.ndarray | None = None, + window_size: int = 0, + index: DatetimeIndex | None = None, + offset: BaseOffset | None = None, + **kwargs, + ) -> None: + super().__init__(index_array, window_size, **kwargs) + if not isinstance(index, DatetimeIndex): + raise ValueError("index must be a DatetimeIndex.") + self.index = index + if not isinstance(offset, BaseOffset): + raise ValueError("offset must be a DateOffset-like object.") + self.offset = offset + + @Appender(get_window_bounds_doc) + def get_window_bounds( + self, + num_values: int = 0, + min_periods: int | None = None, + center: bool | None = None, + closed: str | None = None, + step: int | None = None, + ) -> tuple[np.ndarray, np.ndarray]: + if step is not None: + raise NotImplementedError("step not implemented for variable offset window") + if num_values <= 0: + return np.empty(0, dtype="int64"), np.empty(0, dtype="int64") + + # if windows is variable, default is 'right', otherwise default is 'both' + if closed is None: + closed = "right" if self.index is not None else "both" + + right_closed = closed in ["right", "both"] + left_closed = closed in ["left", "both"] + + if self.index[num_values - 1] < self.index[0]: + index_growth_sign = -1 + else: + index_growth_sign = 1 + offset_diff = index_growth_sign * self.offset + + start = np.empty(num_values, dtype="int64") + start.fill(-1) + end = np.empty(num_values, dtype="int64") + end.fill(-1) + + start[0] = 0 + + # right endpoint is closed + if right_closed: + end[0] = 1 + # right endpoint is open + else: + end[0] = 0 + + zero = timedelta(0) + # start is start of slice interval (including) + # end is end of slice interval (not including) + for i in range(1, num_values): + end_bound = self.index[i] + start_bound = end_bound - offset_diff + + # left endpoint is closed + if left_closed: + start_bound -= Nano(1) + + # advance the start bound until we are + # within the constraint + start[i] = i + for j in range(start[i - 1], i): + start_diff = (self.index[j] - start_bound) * index_growth_sign + if start_diff > zero: + start[i] = j + break + + # end bound is previous end + # or current index + end_diff = (self.index[end[i - 1]] - end_bound) * index_growth_sign + if end_diff == zero and not right_closed: + end[i] = end[i - 1] + 1 + elif end_diff <= zero: + end[i] = i + 1 + else: + end[i] = end[i - 1] + + # right endpoint is open + if not right_closed: + end[i] -= 1 + + return start, end + + +class ExpandingIndexer(BaseIndexer): + """Calculate expanding window bounds, mimicking df.expanding()""" + + @Appender(get_window_bounds_doc) + def get_window_bounds( + self, + num_values: int = 0, + min_periods: int | None = None, + center: bool | None = None, + closed: str | None = None, + step: int | None = None, + ) -> tuple[np.ndarray, np.ndarray]: + return ( + np.zeros(num_values, dtype=np.int64), + np.arange(1, num_values + 1, dtype=np.int64), + ) + + +class FixedForwardWindowIndexer(BaseIndexer): + """ + Creates window boundaries for fixed-length windows that include the current row. + + Examples + -------- + >>> df = pd.DataFrame({'B': [0, 1, 2, np.nan, 4]}) + >>> df + B + 0 0.0 + 1 1.0 + 2 2.0 + 3 NaN + 4 4.0 + + >>> indexer = pd.api.indexers.FixedForwardWindowIndexer(window_size=2) + >>> df.rolling(window=indexer, min_periods=1).sum() + B + 0 1.0 + 1 3.0 + 2 2.0 + 3 4.0 + 4 4.0 + """ + + @Appender(get_window_bounds_doc) + def get_window_bounds( + self, + num_values: int = 0, + min_periods: int | None = None, + center: bool | None = None, + closed: str | None = None, + step: int | None = None, + ) -> tuple[np.ndarray, np.ndarray]: + if center: + raise ValueError("Forward-looking windows can't have center=True") + if closed is not None: + raise ValueError( + "Forward-looking windows don't support setting the closed argument" + ) + if step is None: + step = 1 + + start = np.arange(0, num_values, step, dtype="int64") + end = start + self.window_size + if self.window_size: + end = np.clip(end, 0, num_values) + + return start, end + + +class GroupbyIndexer(BaseIndexer): + """Calculate bounds to compute groupby rolling, mimicking df.groupby().rolling()""" + + def __init__( + self, + index_array: np.ndarray | None = None, + window_size: int | BaseIndexer = 0, + groupby_indices: dict | None = None, + window_indexer: type[BaseIndexer] = BaseIndexer, + indexer_kwargs: dict | None = None, + **kwargs, + ) -> None: + """ + Parameters + ---------- + index_array : np.ndarray or None + np.ndarray of the index of the original object that we are performing + a chained groupby operation over. This index has been pre-sorted relative to + the groups + window_size : int or BaseIndexer + window size during the windowing operation + groupby_indices : dict or None + dict of {group label: [positional index of rows belonging to the group]} + window_indexer : BaseIndexer + BaseIndexer class determining the start and end bounds of each group + indexer_kwargs : dict or None + Custom kwargs to be passed to window_indexer + **kwargs : + keyword arguments that will be available when get_window_bounds is called + """ + self.groupby_indices = groupby_indices or {} + self.window_indexer = window_indexer + self.indexer_kwargs = indexer_kwargs.copy() if indexer_kwargs else {} + super().__init__( + index_array=index_array, + window_size=self.indexer_kwargs.pop("window_size", window_size), + **kwargs, + ) + + @Appender(get_window_bounds_doc) + def get_window_bounds( + self, + num_values: int = 0, + min_periods: int | None = None, + center: bool | None = None, + closed: str | None = None, + step: int | None = None, + ) -> tuple[np.ndarray, np.ndarray]: + # 1) For each group, get the indices that belong to the group + # 2) Use the indices to calculate the start & end bounds of the window + # 3) Append the window bounds in group order + start_arrays = [] + end_arrays = [] + window_indices_start = 0 + for key, indices in self.groupby_indices.items(): + index_array: np.ndarray | None + + if self.index_array is not None: + index_array = self.index_array.take(ensure_platform_int(indices)) + else: + index_array = self.index_array + indexer = self.window_indexer( + index_array=index_array, + window_size=self.window_size, + **self.indexer_kwargs, + ) + start, end = indexer.get_window_bounds( + len(indices), min_periods, center, closed, step + ) + start = start.astype(np.int64) + end = end.astype(np.int64) + assert len(start) == len( + end + ), "these should be equal in length from get_window_bounds" + # Cannot use groupby_indices as they might not be monotonic with the object + # we're rolling over + window_indices = np.arange( + window_indices_start, window_indices_start + len(indices) + ) + window_indices_start += len(indices) + # Extend as we'll be slicing window like [start, end) + window_indices = np.append(window_indices, [window_indices[-1] + 1]).astype( + np.int64, copy=False + ) + start_arrays.append(window_indices.take(ensure_platform_int(start))) + end_arrays.append(window_indices.take(ensure_platform_int(end))) + if len(start_arrays) == 0: + return np.array([], dtype=np.int64), np.array([], dtype=np.int64) + start = np.concatenate(start_arrays) + end = np.concatenate(end_arrays) + return start, end + + +class ExponentialMovingWindowIndexer(BaseIndexer): + """Calculate ewm window bounds (the entire window)""" + + @Appender(get_window_bounds_doc) + def get_window_bounds( + self, + num_values: int = 0, + min_periods: int | None = None, + center: bool | None = None, + closed: str | None = None, + step: int | None = None, + ) -> tuple[np.ndarray, np.ndarray]: + return np.array([0], dtype=np.int64), np.array([num_values], dtype=np.int64) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/indexers/utils.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/indexers/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..55bb58f3108c3d7004058494284ea6fb4b2fca7f --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/indexers/utils.py @@ -0,0 +1,553 @@ +""" +Low-dependency indexing utilities. +""" +from __future__ import annotations + +from typing import ( + TYPE_CHECKING, + Any, +) + +import numpy as np + +from pandas._libs import lib + +from pandas.core.dtypes.common import ( + is_array_like, + is_bool_dtype, + is_integer, + is_integer_dtype, + is_list_like, +) +from pandas.core.dtypes.dtypes import ExtensionDtype +from pandas.core.dtypes.generic import ( + ABCIndex, + ABCSeries, +) + +if TYPE_CHECKING: + from pandas._typing import AnyArrayLike + + from pandas.core.frame import DataFrame + from pandas.core.indexes.base import Index + +# ----------------------------------------------------------- +# Indexer Identification + + +def is_valid_positional_slice(slc: slice) -> bool: + """ + Check if a slice object can be interpreted as a positional indexer. + + Parameters + ---------- + slc : slice + + Returns + ------- + bool + + Notes + ----- + A valid positional slice may also be interpreted as a label-based slice + depending on the index being sliced. + """ + return ( + lib.is_int_or_none(slc.start) + and lib.is_int_or_none(slc.stop) + and lib.is_int_or_none(slc.step) + ) + + +def is_list_like_indexer(key) -> bool: + """ + Check if we have a list-like indexer that is *not* a NamedTuple. + + Parameters + ---------- + key : object + + Returns + ------- + bool + """ + # allow a list_like, but exclude NamedTuples which can be indexers + return is_list_like(key) and not (isinstance(key, tuple) and type(key) is not tuple) + + +def is_scalar_indexer(indexer, ndim: int) -> bool: + """ + Return True if we are all scalar indexers. + + Parameters + ---------- + indexer : object + ndim : int + Number of dimensions in the object being indexed. + + Returns + ------- + bool + """ + if ndim == 1 and is_integer(indexer): + # GH37748: allow indexer to be an integer for Series + return True + if isinstance(indexer, tuple) and len(indexer) == ndim: + return all(is_integer(x) for x in indexer) + return False + + +def is_empty_indexer(indexer) -> bool: + """ + Check if we have an empty indexer. + + Parameters + ---------- + indexer : object + + Returns + ------- + bool + """ + if is_list_like(indexer) and not len(indexer): + return True + if not isinstance(indexer, tuple): + indexer = (indexer,) + return any(isinstance(idx, np.ndarray) and len(idx) == 0 for idx in indexer) + + +# ----------------------------------------------------------- +# Indexer Validation + + +def check_setitem_lengths(indexer, value, values) -> bool: + """ + Validate that value and indexer are the same length. + + An special-case is allowed for when the indexer is a boolean array + and the number of true values equals the length of ``value``. In + this case, no exception is raised. + + Parameters + ---------- + indexer : sequence + Key for the setitem. + value : array-like + Value for the setitem. + values : array-like + Values being set into. + + Returns + ------- + bool + Whether this is an empty listlike setting which is a no-op. + + Raises + ------ + ValueError + When the indexer is an ndarray or list and the lengths don't match. + """ + no_op = False + + if isinstance(indexer, (np.ndarray, list)): + # We can ignore other listlikes because they are either + # a) not necessarily 1-D indexers, e.g. tuple + # b) boolean indexers e.g. BoolArray + if is_list_like(value): + if len(indexer) != len(value) and values.ndim == 1: + # boolean with truth values == len of the value is ok too + if isinstance(indexer, list): + indexer = np.array(indexer) + if not ( + isinstance(indexer, np.ndarray) + and indexer.dtype == np.bool_ + and indexer.sum() == len(value) + ): + raise ValueError( + "cannot set using a list-like indexer " + "with a different length than the value" + ) + if not len(indexer): + no_op = True + + elif isinstance(indexer, slice): + if is_list_like(value): + if len(value) != length_of_indexer(indexer, values) and values.ndim == 1: + # In case of two dimensional value is used row-wise and broadcasted + raise ValueError( + "cannot set using a slice indexer with a " + "different length than the value" + ) + if not len(value): + no_op = True + + return no_op + + +def validate_indices(indices: np.ndarray, n: int) -> None: + """ + Perform bounds-checking for an indexer. + + -1 is allowed for indicating missing values. + + Parameters + ---------- + indices : ndarray + n : int + Length of the array being indexed. + + Raises + ------ + ValueError + + Examples + -------- + >>> validate_indices(np.array([1, 2]), 3) # OK + + >>> validate_indices(np.array([1, -2]), 3) + Traceback (most recent call last): + ... + ValueError: negative dimensions are not allowed + + >>> validate_indices(np.array([1, 2, 3]), 3) + Traceback (most recent call last): + ... + IndexError: indices are out-of-bounds + + >>> validate_indices(np.array([-1, -1]), 0) # OK + + >>> validate_indices(np.array([0, 1]), 0) + Traceback (most recent call last): + ... + IndexError: indices are out-of-bounds + """ + if len(indices): + min_idx = indices.min() + if min_idx < -1: + msg = f"'indices' contains values less than allowed ({min_idx} < -1)" + raise ValueError(msg) + + max_idx = indices.max() + if max_idx >= n: + raise IndexError("indices are out-of-bounds") + + +# ----------------------------------------------------------- +# Indexer Conversion + + +def maybe_convert_indices(indices, n: int, verify: bool = True) -> np.ndarray: + """ + Attempt to convert indices into valid, positive indices. + + If we have negative indices, translate to positive here. + If we have indices that are out-of-bounds, raise an IndexError. + + Parameters + ---------- + indices : array-like + Array of indices that we are to convert. + n : int + Number of elements in the array that we are indexing. + verify : bool, default True + Check that all entries are between 0 and n - 1, inclusive. + + Returns + ------- + array-like + An array-like of positive indices that correspond to the ones + that were passed in initially to this function. + + Raises + ------ + IndexError + One of the converted indices either exceeded the number of, + elements (specified by `n`), or was still negative. + """ + if isinstance(indices, list): + indices = np.array(indices) + if len(indices) == 0: + # If `indices` is empty, np.array will return a float, + # and will cause indexing errors. + return np.empty(0, dtype=np.intp) + + mask = indices < 0 + if mask.any(): + indices = indices.copy() + indices[mask] += n + + if verify: + mask = (indices >= n) | (indices < 0) + if mask.any(): + raise IndexError("indices are out-of-bounds") + return indices + + +# ----------------------------------------------------------- +# Unsorted + + +def length_of_indexer(indexer, target=None) -> int: + """ + Return the expected length of target[indexer] + + Returns + ------- + int + """ + if target is not None and isinstance(indexer, slice): + target_len = len(target) + start = indexer.start + stop = indexer.stop + step = indexer.step + if start is None: + start = 0 + elif start < 0: + start += target_len + if stop is None or stop > target_len: + stop = target_len + elif stop < 0: + stop += target_len + if step is None: + step = 1 + elif step < 0: + start, stop = stop + 1, start + 1 + step = -step + return (stop - start + step - 1) // step + elif isinstance(indexer, (ABCSeries, ABCIndex, np.ndarray, list)): + if isinstance(indexer, list): + indexer = np.array(indexer) + + if indexer.dtype == bool: + # GH#25774 + return indexer.sum() + return len(indexer) + elif isinstance(indexer, range): + return (indexer.stop - indexer.start) // indexer.step + elif not is_list_like_indexer(indexer): + return 1 + raise AssertionError("cannot find the length of the indexer") + + +def disallow_ndim_indexing(result) -> None: + """ + Helper function to disallow multi-dimensional indexing on 1D Series/Index. + + GH#27125 indexer like idx[:, None] expands dim, but we cannot do that + and keep an index, so we used to return ndarray, which was deprecated + in GH#30588. + """ + if np.ndim(result) > 1: + raise ValueError( + "Multi-dimensional indexing (e.g. `obj[:, None]`) is no longer " + "supported. Convert to a numpy array before indexing instead." + ) + + +def unpack_1tuple(tup): + """ + If we have a length-1 tuple/list that contains a slice, unpack to just + the slice. + + Notes + ----- + The list case is deprecated. + """ + if len(tup) == 1 and isinstance(tup[0], slice): + # if we don't have a MultiIndex, we may still be able to handle + # a 1-tuple. see test_1tuple_without_multiindex + + if isinstance(tup, list): + # GH#31299 + raise ValueError( + "Indexing with a single-item list containing a " + "slice is not allowed. Pass a tuple instead.", + ) + + return tup[0] + return tup + + +def check_key_length(columns: Index, key, value: DataFrame) -> None: + """ + Checks if a key used as indexer has the same length as the columns it is + associated with. + + Parameters + ---------- + columns : Index The columns of the DataFrame to index. + key : A list-like of keys to index with. + value : DataFrame The value to set for the keys. + + Raises + ------ + ValueError: If the length of key is not equal to the number of columns in value + or if the number of columns referenced by key is not equal to number + of columns. + """ + if columns.is_unique: + if len(value.columns) != len(key): + raise ValueError("Columns must be same length as key") + else: + # Missing keys in columns are represented as -1 + if len(columns.get_indexer_non_unique(key)[0]) != len(value.columns): + raise ValueError("Columns must be same length as key") + + +def unpack_tuple_and_ellipses(item: tuple): + """ + Possibly unpack arr[..., n] to arr[n] + """ + if len(item) > 1: + # Note: we are assuming this indexing is being done on a 1D arraylike + if item[0] is Ellipsis: + item = item[1:] + elif item[-1] is Ellipsis: + item = item[:-1] + + if len(item) > 1: + raise IndexError("too many indices for array.") + + item = item[0] + return item + + +# ----------------------------------------------------------- +# Public indexer validation + + +def check_array_indexer(array: AnyArrayLike, indexer: Any) -> Any: + """ + Check if `indexer` is a valid array indexer for `array`. + + For a boolean mask, `array` and `indexer` are checked to have the same + length. The dtype is validated, and if it is an integer or boolean + ExtensionArray, it is checked if there are missing values present, and + it is converted to the appropriate numpy array. Other dtypes will raise + an error. + + Non-array indexers (integer, slice, Ellipsis, tuples, ..) are passed + through as is. + + Parameters + ---------- + array : array-like + The array that is being indexed (only used for the length). + indexer : array-like or list-like + The array-like that's used to index. List-like input that is not yet + a numpy array or an ExtensionArray is converted to one. Other input + types are passed through as is. + + Returns + ------- + numpy.ndarray + The validated indexer as a numpy array that can be used to index. + + Raises + ------ + IndexError + When the lengths don't match. + ValueError + When `indexer` cannot be converted to a numpy ndarray to index + (e.g. presence of missing values). + + See Also + -------- + api.types.is_bool_dtype : Check if `key` is of boolean dtype. + + Examples + -------- + When checking a boolean mask, a boolean ndarray is returned when the + arguments are all valid. + + >>> mask = pd.array([True, False]) + >>> arr = pd.array([1, 2]) + >>> pd.api.indexers.check_array_indexer(arr, mask) + array([ True, False]) + + An IndexError is raised when the lengths don't match. + + >>> mask = pd.array([True, False, True]) + >>> pd.api.indexers.check_array_indexer(arr, mask) + Traceback (most recent call last): + ... + IndexError: Boolean index has wrong length: 3 instead of 2. + + NA values in a boolean array are treated as False. + + >>> mask = pd.array([True, pd.NA]) + >>> pd.api.indexers.check_array_indexer(arr, mask) + array([ True, False]) + + A numpy boolean mask will get passed through (if the length is correct): + + >>> mask = np.array([True, False]) + >>> pd.api.indexers.check_array_indexer(arr, mask) + array([ True, False]) + + Similarly for integer indexers, an integer ndarray is returned when it is + a valid indexer, otherwise an error is (for integer indexers, a matching + length is not required): + + >>> indexer = pd.array([0, 2], dtype="Int64") + >>> arr = pd.array([1, 2, 3]) + >>> pd.api.indexers.check_array_indexer(arr, indexer) + array([0, 2]) + + >>> indexer = pd.array([0, pd.NA], dtype="Int64") + >>> pd.api.indexers.check_array_indexer(arr, indexer) + Traceback (most recent call last): + ... + ValueError: Cannot index with an integer indexer containing NA values + + For non-integer/boolean dtypes, an appropriate error is raised: + + >>> indexer = np.array([0., 2.], dtype="float64") + >>> pd.api.indexers.check_array_indexer(arr, indexer) + Traceback (most recent call last): + ... + IndexError: arrays used as indices must be of integer or boolean type + """ + from pandas.core.construction import array as pd_array + + # whatever is not an array-like is returned as-is (possible valid array + # indexers that are not array-like: integer, slice, Ellipsis, None) + # In this context, tuples are not considered as array-like, as they have + # a specific meaning in indexing (multi-dimensional indexing) + if is_list_like(indexer): + if isinstance(indexer, tuple): + return indexer + else: + return indexer + + # convert list-likes to array + if not is_array_like(indexer): + indexer = pd_array(indexer) + if len(indexer) == 0: + # empty list is converted to float array by pd.array + indexer = np.array([], dtype=np.intp) + + dtype = indexer.dtype + if is_bool_dtype(dtype): + if isinstance(dtype, ExtensionDtype): + indexer = indexer.to_numpy(dtype=bool, na_value=False) + else: + indexer = np.asarray(indexer, dtype=bool) + + # GH26658 + if len(indexer) != len(array): + raise IndexError( + f"Boolean index has wrong length: " + f"{len(indexer)} instead of {len(array)}" + ) + elif is_integer_dtype(dtype): + try: + indexer = np.asarray(indexer, dtype=np.intp) + except ValueError as err: + raise ValueError( + "Cannot index with an integer indexer containing NA values" + ) from err + else: + raise IndexError("arrays used as indices must be of integer or boolean type") + + return indexer diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/indexes/__init__.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/indexes/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/indexes/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/indexes/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..3c56656f67aa097a8e3db5ed3fb20090099c9710 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/indexes/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/indexes/__pycache__/accessors.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/indexes/__pycache__/accessors.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..0184e6bf43085c8b8bbea3c89f1b52c07ca726b2 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/indexes/__pycache__/accessors.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/indexes/__pycache__/api.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/indexes/__pycache__/api.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..0b05bccd27f633a2a498baab9528a9e177b7eac8 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/indexes/__pycache__/api.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/indexes/__pycache__/category.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/indexes/__pycache__/category.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..9e339597b260b3e4cac52475f86ce4894188a054 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/indexes/__pycache__/category.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/indexes/__pycache__/datetimelike.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/indexes/__pycache__/datetimelike.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..8b8b751747052d58da2c3838c32cb54c3573d265 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/indexes/__pycache__/datetimelike.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/indexes/__pycache__/datetimes.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/indexes/__pycache__/datetimes.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..4ff85a3c511c2de9a8cc9a19edac50a8f0d6680a Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/indexes/__pycache__/datetimes.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/indexes/__pycache__/extension.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/indexes/__pycache__/extension.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b06b5bb19c219b387a6e606666c2cd569ac5efa6 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/indexes/__pycache__/extension.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/indexes/__pycache__/frozen.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/indexes/__pycache__/frozen.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ec8b707faf6f3ec3a2a7fa76647e28d7d6bed022 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/indexes/__pycache__/frozen.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/indexes/__pycache__/interval.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/indexes/__pycache__/interval.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..291f81146de2b9d37b213baed3c0e0e582502a4f Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/indexes/__pycache__/interval.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/indexes/__pycache__/period.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/indexes/__pycache__/period.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..6b304227224bb1313b269becd7874b84f728fa74 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/indexes/__pycache__/period.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/indexes/__pycache__/range.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/indexes/__pycache__/range.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e0d0126fe078a0b4dcd757823022062f3342484f Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/indexes/__pycache__/range.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/indexes/__pycache__/timedeltas.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/indexes/__pycache__/timedeltas.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ca5fabc105d1abda40741935813f0f37bb3efc40 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/indexes/__pycache__/timedeltas.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/indexes/accessors.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/indexes/accessors.py new file mode 100644 index 0000000000000000000000000000000000000000..7e3ba4089ff60e9ab226f7536c6be1843be8ea59 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/indexes/accessors.py @@ -0,0 +1,643 @@ +""" +datetimelike delegation +""" +from __future__ import annotations + +from typing import ( + TYPE_CHECKING, + cast, +) +import warnings + +import numpy as np + +from pandas._libs import lib +from pandas.util._exceptions import find_stack_level + +from pandas.core.dtypes.common import ( + is_integer_dtype, + is_list_like, +) +from pandas.core.dtypes.dtypes import ( + ArrowDtype, + CategoricalDtype, + DatetimeTZDtype, + PeriodDtype, +) +from pandas.core.dtypes.generic import ABCSeries + +from pandas.core.accessor import ( + PandasDelegate, + delegate_names, +) +from pandas.core.arrays import ( + DatetimeArray, + PeriodArray, + TimedeltaArray, +) +from pandas.core.arrays.arrow.array import ArrowExtensionArray +from pandas.core.base import ( + NoNewAttributesMixin, + PandasObject, +) +from pandas.core.indexes.datetimes import DatetimeIndex +from pandas.core.indexes.timedeltas import TimedeltaIndex + +if TYPE_CHECKING: + from pandas import ( + DataFrame, + Series, + ) + + +class Properties(PandasDelegate, PandasObject, NoNewAttributesMixin): + _hidden_attrs = PandasObject._hidden_attrs | { + "orig", + "name", + } + + def __init__(self, data: Series, orig) -> None: + if not isinstance(data, ABCSeries): + raise TypeError( + f"cannot convert an object of type {type(data)} to a datetimelike index" + ) + + self._parent = data + self.orig = orig + self.name = getattr(data, "name", None) + self._freeze() + + def _get_values(self): + data = self._parent + if lib.is_np_dtype(data.dtype, "M"): + return DatetimeIndex(data, copy=False, name=self.name) + + elif isinstance(data.dtype, DatetimeTZDtype): + return DatetimeIndex(data, copy=False, name=self.name) + + elif lib.is_np_dtype(data.dtype, "m"): + return TimedeltaIndex(data, copy=False, name=self.name) + + elif isinstance(data.dtype, PeriodDtype): + return PeriodArray(data, copy=False) + + raise TypeError( + f"cannot convert an object of type {type(data)} to a datetimelike index" + ) + + def _delegate_property_get(self, name: str): + from pandas import Series + + values = self._get_values() + + result = getattr(values, name) + + # maybe need to upcast (ints) + if isinstance(result, np.ndarray): + if is_integer_dtype(result): + result = result.astype("int64") + elif not is_list_like(result): + return result + + result = np.asarray(result) + + if self.orig is not None: + index = self.orig.index + else: + index = self._parent.index + # return the result as a Series + result = Series(result, index=index, name=self.name).__finalize__(self._parent) + + # setting this object will show a SettingWithCopyWarning/Error + result._is_copy = ( + "modifications to a property of a datetimelike " + "object are not supported and are discarded. " + "Change values on the original." + ) + + return result + + def _delegate_property_set(self, name: str, value, *args, **kwargs): + raise ValueError( + "modifications to a property of a datetimelike object are not supported. " + "Change values on the original." + ) + + def _delegate_method(self, name: str, *args, **kwargs): + from pandas import Series + + values = self._get_values() + + method = getattr(values, name) + result = method(*args, **kwargs) + + if not is_list_like(result): + return result + + result = Series(result, index=self._parent.index, name=self.name).__finalize__( + self._parent + ) + + # setting this object will show a SettingWithCopyWarning/Error + result._is_copy = ( + "modifications to a method of a datetimelike " + "object are not supported and are discarded. " + "Change values on the original." + ) + + return result + + +@delegate_names( + delegate=ArrowExtensionArray, + accessors=TimedeltaArray._datetimelike_ops, + typ="property", + accessor_mapping=lambda x: f"_dt_{x}", + raise_on_missing=False, +) +@delegate_names( + delegate=ArrowExtensionArray, + accessors=TimedeltaArray._datetimelike_methods, + typ="method", + accessor_mapping=lambda x: f"_dt_{x}", + raise_on_missing=False, +) +@delegate_names( + delegate=ArrowExtensionArray, + accessors=DatetimeArray._datetimelike_ops, + typ="property", + accessor_mapping=lambda x: f"_dt_{x}", + raise_on_missing=False, +) +@delegate_names( + delegate=ArrowExtensionArray, + accessors=DatetimeArray._datetimelike_methods, + typ="method", + accessor_mapping=lambda x: f"_dt_{x}", + raise_on_missing=False, +) +class ArrowTemporalProperties(PandasDelegate, PandasObject, NoNewAttributesMixin): + def __init__(self, data: Series, orig) -> None: + if not isinstance(data, ABCSeries): + raise TypeError( + f"cannot convert an object of type {type(data)} to a datetimelike index" + ) + + self._parent = data + self._orig = orig + self._freeze() + + def _delegate_property_get(self, name: str): + if not hasattr(self._parent.array, f"_dt_{name}"): + raise NotImplementedError( + f"dt.{name} is not supported for {self._parent.dtype}" + ) + result = getattr(self._parent.array, f"_dt_{name}") + + if not is_list_like(result): + return result + + if self._orig is not None: + index = self._orig.index + else: + index = self._parent.index + # return the result as a Series, which is by definition a copy + result = type(self._parent)( + result, index=index, name=self._parent.name + ).__finalize__(self._parent) + + return result + + def _delegate_method(self, name: str, *args, **kwargs): + if not hasattr(self._parent.array, f"_dt_{name}"): + raise NotImplementedError( + f"dt.{name} is not supported for {self._parent.dtype}" + ) + + result = getattr(self._parent.array, f"_dt_{name}")(*args, **kwargs) + + if self._orig is not None: + index = self._orig.index + else: + index = self._parent.index + # return the result as a Series, which is by definition a copy + result = type(self._parent)( + result, index=index, name=self._parent.name + ).__finalize__(self._parent) + + return result + + def to_pytimedelta(self): + return cast(ArrowExtensionArray, self._parent.array)._dt_to_pytimedelta() + + def to_pydatetime(self): + # GH#20306 + warnings.warn( + f"The behavior of {type(self).__name__}.to_pydatetime is deprecated, " + "in a future version this will return a Series containing python " + "datetime objects instead of an ndarray. To retain the old behavior, " + "call `np.array` on the result", + FutureWarning, + stacklevel=find_stack_level(), + ) + return cast(ArrowExtensionArray, self._parent.array)._dt_to_pydatetime() + + def isocalendar(self) -> DataFrame: + from pandas import DataFrame + + result = ( + cast(ArrowExtensionArray, self._parent.array) + ._dt_isocalendar() + ._pa_array.combine_chunks() + ) + iso_calendar_df = DataFrame( + { + col: type(self._parent.array)(result.field(i)) # type: ignore[call-arg] + for i, col in enumerate(["year", "week", "day"]) + } + ) + return iso_calendar_df + + @property + def components(self) -> DataFrame: + from pandas import DataFrame + + components_df = DataFrame( + { + col: getattr(self._parent.array, f"_dt_{col}") + for col in [ + "days", + "hours", + "minutes", + "seconds", + "milliseconds", + "microseconds", + "nanoseconds", + ] + } + ) + return components_df + + +@delegate_names( + delegate=DatetimeArray, + accessors=DatetimeArray._datetimelike_ops + ["unit"], + typ="property", +) +@delegate_names( + delegate=DatetimeArray, + accessors=DatetimeArray._datetimelike_methods + ["as_unit"], + typ="method", +) +class DatetimeProperties(Properties): + """ + Accessor object for datetimelike properties of the Series values. + + Examples + -------- + >>> seconds_series = pd.Series(pd.date_range("2000-01-01", periods=3, freq="s")) + >>> seconds_series + 0 2000-01-01 00:00:00 + 1 2000-01-01 00:00:01 + 2 2000-01-01 00:00:02 + dtype: datetime64[ns] + >>> seconds_series.dt.second + 0 0 + 1 1 + 2 2 + dtype: int32 + + >>> hours_series = pd.Series(pd.date_range("2000-01-01", periods=3, freq="h")) + >>> hours_series + 0 2000-01-01 00:00:00 + 1 2000-01-01 01:00:00 + 2 2000-01-01 02:00:00 + dtype: datetime64[ns] + >>> hours_series.dt.hour + 0 0 + 1 1 + 2 2 + dtype: int32 + + >>> quarters_series = pd.Series(pd.date_range("2000-01-01", periods=3, freq="QE")) + >>> quarters_series + 0 2000-03-31 + 1 2000-06-30 + 2 2000-09-30 + dtype: datetime64[ns] + >>> quarters_series.dt.quarter + 0 1 + 1 2 + 2 3 + dtype: int32 + + Returns a Series indexed like the original Series. + Raises TypeError if the Series does not contain datetimelike values. + """ + + def to_pydatetime(self) -> np.ndarray: + """ + Return the data as an array of :class:`datetime.datetime` objects. + + .. deprecated:: 2.1.0 + + The current behavior of dt.to_pydatetime is deprecated. + In a future version this will return a Series containing python + datetime objects instead of a ndarray. + + Timezone information is retained if present. + + .. warning:: + + Python's datetime uses microsecond resolution, which is lower than + pandas (nanosecond). The values are truncated. + + Returns + ------- + numpy.ndarray + Object dtype array containing native Python datetime objects. + + See Also + -------- + datetime.datetime : Standard library value for a datetime. + + Examples + -------- + >>> s = pd.Series(pd.date_range('20180310', periods=2)) + >>> s + 0 2018-03-10 + 1 2018-03-11 + dtype: datetime64[ns] + + >>> s.dt.to_pydatetime() + array([datetime.datetime(2018, 3, 10, 0, 0), + datetime.datetime(2018, 3, 11, 0, 0)], dtype=object) + + pandas' nanosecond precision is truncated to microseconds. + + >>> s = pd.Series(pd.date_range('20180310', periods=2, freq='ns')) + >>> s + 0 2018-03-10 00:00:00.000000000 + 1 2018-03-10 00:00:00.000000001 + dtype: datetime64[ns] + + >>> s.dt.to_pydatetime() + array([datetime.datetime(2018, 3, 10, 0, 0), + datetime.datetime(2018, 3, 10, 0, 0)], dtype=object) + """ + # GH#20306 + warnings.warn( + f"The behavior of {type(self).__name__}.to_pydatetime is deprecated, " + "in a future version this will return a Series containing python " + "datetime objects instead of an ndarray. To retain the old behavior, " + "call `np.array` on the result", + FutureWarning, + stacklevel=find_stack_level(), + ) + return self._get_values().to_pydatetime() + + @property + def freq(self): + return self._get_values().inferred_freq + + def isocalendar(self) -> DataFrame: + """ + Calculate year, week, and day according to the ISO 8601 standard. + + Returns + ------- + DataFrame + With columns year, week and day. + + See Also + -------- + Timestamp.isocalendar : Function return a 3-tuple containing ISO year, + week number, and weekday for the given Timestamp object. + datetime.date.isocalendar : Return a named tuple object with + three components: year, week and weekday. + + Examples + -------- + >>> ser = pd.to_datetime(pd.Series(["2010-01-01", pd.NaT])) + >>> ser.dt.isocalendar() + year week day + 0 2009 53 5 + 1 + >>> ser.dt.isocalendar().week + 0 53 + 1 + Name: week, dtype: UInt32 + """ + return self._get_values().isocalendar().set_index(self._parent.index) + + +@delegate_names( + delegate=TimedeltaArray, accessors=TimedeltaArray._datetimelike_ops, typ="property" +) +@delegate_names( + delegate=TimedeltaArray, + accessors=TimedeltaArray._datetimelike_methods, + typ="method", +) +class TimedeltaProperties(Properties): + """ + Accessor object for datetimelike properties of the Series values. + + Returns a Series indexed like the original Series. + Raises TypeError if the Series does not contain datetimelike values. + + Examples + -------- + >>> seconds_series = pd.Series( + ... pd.timedelta_range(start="1 second", periods=3, freq="s") + ... ) + >>> seconds_series + 0 0 days 00:00:01 + 1 0 days 00:00:02 + 2 0 days 00:00:03 + dtype: timedelta64[ns] + >>> seconds_series.dt.seconds + 0 1 + 1 2 + 2 3 + dtype: int32 + """ + + def to_pytimedelta(self) -> np.ndarray: + """ + Return an array of native :class:`datetime.timedelta` objects. + + Python's standard `datetime` library uses a different representation + timedelta's. This method converts a Series of pandas Timedeltas + to `datetime.timedelta` format with the same length as the original + Series. + + Returns + ------- + numpy.ndarray + Array of 1D containing data with `datetime.timedelta` type. + + See Also + -------- + datetime.timedelta : A duration expressing the difference + between two date, time, or datetime. + + Examples + -------- + >>> s = pd.Series(pd.to_timedelta(np.arange(5), unit="d")) + >>> s + 0 0 days + 1 1 days + 2 2 days + 3 3 days + 4 4 days + dtype: timedelta64[ns] + + >>> s.dt.to_pytimedelta() + array([datetime.timedelta(0), datetime.timedelta(days=1), + datetime.timedelta(days=2), datetime.timedelta(days=3), + datetime.timedelta(days=4)], dtype=object) + """ + return self._get_values().to_pytimedelta() + + @property + def components(self): + """ + Return a Dataframe of the components of the Timedeltas. + + Returns + ------- + DataFrame + + Examples + -------- + >>> s = pd.Series(pd.to_timedelta(np.arange(5), unit='s')) + >>> s + 0 0 days 00:00:00 + 1 0 days 00:00:01 + 2 0 days 00:00:02 + 3 0 days 00:00:03 + 4 0 days 00:00:04 + dtype: timedelta64[ns] + >>> s.dt.components + days hours minutes seconds milliseconds microseconds nanoseconds + 0 0 0 0 0 0 0 0 + 1 0 0 0 1 0 0 0 + 2 0 0 0 2 0 0 0 + 3 0 0 0 3 0 0 0 + 4 0 0 0 4 0 0 0 + """ + return ( + self._get_values() + .components.set_index(self._parent.index) + .__finalize__(self._parent) + ) + + @property + def freq(self): + return self._get_values().inferred_freq + + +@delegate_names( + delegate=PeriodArray, accessors=PeriodArray._datetimelike_ops, typ="property" +) +@delegate_names( + delegate=PeriodArray, accessors=PeriodArray._datetimelike_methods, typ="method" +) +class PeriodProperties(Properties): + """ + Accessor object for datetimelike properties of the Series values. + + Returns a Series indexed like the original Series. + Raises TypeError if the Series does not contain datetimelike values. + + Examples + -------- + >>> seconds_series = pd.Series( + ... pd.period_range( + ... start="2000-01-01 00:00:00", end="2000-01-01 00:00:03", freq="s" + ... ) + ... ) + >>> seconds_series + 0 2000-01-01 00:00:00 + 1 2000-01-01 00:00:01 + 2 2000-01-01 00:00:02 + 3 2000-01-01 00:00:03 + dtype: period[s] + >>> seconds_series.dt.second + 0 0 + 1 1 + 2 2 + 3 3 + dtype: int64 + + >>> hours_series = pd.Series( + ... pd.period_range(start="2000-01-01 00:00", end="2000-01-01 03:00", freq="h") + ... ) + >>> hours_series + 0 2000-01-01 00:00 + 1 2000-01-01 01:00 + 2 2000-01-01 02:00 + 3 2000-01-01 03:00 + dtype: period[h] + >>> hours_series.dt.hour + 0 0 + 1 1 + 2 2 + 3 3 + dtype: int64 + + >>> quarters_series = pd.Series( + ... pd.period_range(start="2000-01-01", end="2000-12-31", freq="Q-DEC") + ... ) + >>> quarters_series + 0 2000Q1 + 1 2000Q2 + 2 2000Q3 + 3 2000Q4 + dtype: period[Q-DEC] + >>> quarters_series.dt.quarter + 0 1 + 1 2 + 2 3 + 3 4 + dtype: int64 + """ + + +class CombinedDatetimelikeProperties( + DatetimeProperties, TimedeltaProperties, PeriodProperties +): + def __new__(cls, data: Series): # pyright: ignore[reportInconsistentConstructor] + # CombinedDatetimelikeProperties isn't really instantiated. Instead + # we need to choose which parent (datetime or timedelta) is + # appropriate. Since we're checking the dtypes anyway, we'll just + # do all the validation here. + + if not isinstance(data, ABCSeries): + raise TypeError( + f"cannot convert an object of type {type(data)} to a datetimelike index" + ) + + orig = data if isinstance(data.dtype, CategoricalDtype) else None + if orig is not None: + data = data._constructor( + orig.array, + name=orig.name, + copy=False, + dtype=orig._values.categories.dtype, + index=orig.index, + ) + + if isinstance(data.dtype, ArrowDtype) and data.dtype.kind in "Mm": + return ArrowTemporalProperties(data, orig) + if lib.is_np_dtype(data.dtype, "M"): + return DatetimeProperties(data, orig) + elif isinstance(data.dtype, DatetimeTZDtype): + return DatetimeProperties(data, orig) + elif lib.is_np_dtype(data.dtype, "m"): + return TimedeltaProperties(data, orig) + elif isinstance(data.dtype, PeriodDtype): + return PeriodProperties(data, orig) + + raise AttributeError("Can only use .dt accessor with datetimelike values") diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/indexes/api.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/indexes/api.py new file mode 100644 index 0000000000000000000000000000000000000000..15292953e72d00a8f57c34d2e2cc8a43f6863d39 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/indexes/api.py @@ -0,0 +1,388 @@ +from __future__ import annotations + +import textwrap +from typing import ( + TYPE_CHECKING, + cast, +) + +import numpy as np + +from pandas._libs import ( + NaT, + lib, +) +from pandas.errors import InvalidIndexError + +from pandas.core.dtypes.cast import find_common_type + +from pandas.core.algorithms import safe_sort +from pandas.core.indexes.base import ( + Index, + _new_Index, + ensure_index, + ensure_index_from_sequences, + get_unanimous_names, +) +from pandas.core.indexes.category import CategoricalIndex +from pandas.core.indexes.datetimes import DatetimeIndex +from pandas.core.indexes.interval import IntervalIndex +from pandas.core.indexes.multi import MultiIndex +from pandas.core.indexes.period import PeriodIndex +from pandas.core.indexes.range import RangeIndex +from pandas.core.indexes.timedeltas import TimedeltaIndex + +if TYPE_CHECKING: + from pandas._typing import Axis +_sort_msg = textwrap.dedent( + """\ +Sorting because non-concatenation axis is not aligned. A future version +of pandas will change to not sort by default. + +To accept the future behavior, pass 'sort=False'. + +To retain the current behavior and silence the warning, pass 'sort=True'. +""" +) + + +__all__ = [ + "Index", + "MultiIndex", + "CategoricalIndex", + "IntervalIndex", + "RangeIndex", + "InvalidIndexError", + "TimedeltaIndex", + "PeriodIndex", + "DatetimeIndex", + "_new_Index", + "NaT", + "ensure_index", + "ensure_index_from_sequences", + "get_objs_combined_axis", + "union_indexes", + "get_unanimous_names", + "all_indexes_same", + "default_index", + "safe_sort_index", +] + + +def get_objs_combined_axis( + objs, + intersect: bool = False, + axis: Axis = 0, + sort: bool = True, + copy: bool = False, +) -> Index: + """ + Extract combined index: return intersection or union (depending on the + value of "intersect") of indexes on given axis, or None if all objects + lack indexes (e.g. they are numpy arrays). + + Parameters + ---------- + objs : list + Series or DataFrame objects, may be mix of the two. + intersect : bool, default False + If True, calculate the intersection between indexes. Otherwise, + calculate the union. + axis : {0 or 'index', 1 or 'outer'}, default 0 + The axis to extract indexes from. + sort : bool, default True + Whether the result index should come out sorted or not. + copy : bool, default False + If True, return a copy of the combined index. + + Returns + ------- + Index + """ + obs_idxes = [obj._get_axis(axis) for obj in objs] + return _get_combined_index(obs_idxes, intersect=intersect, sort=sort, copy=copy) + + +def _get_distinct_objs(objs: list[Index]) -> list[Index]: + """ + Return a list with distinct elements of "objs" (different ids). + Preserves order. + """ + ids: set[int] = set() + res = [] + for obj in objs: + if id(obj) not in ids: + ids.add(id(obj)) + res.append(obj) + return res + + +def _get_combined_index( + indexes: list[Index], + intersect: bool = False, + sort: bool = False, + copy: bool = False, +) -> Index: + """ + Return the union or intersection of indexes. + + Parameters + ---------- + indexes : list of Index or list objects + When intersect=True, do not accept list of lists. + intersect : bool, default False + If True, calculate the intersection between indexes. Otherwise, + calculate the union. + sort : bool, default False + Whether the result index should come out sorted or not. + copy : bool, default False + If True, return a copy of the combined index. + + Returns + ------- + Index + """ + # TODO: handle index names! + indexes = _get_distinct_objs(indexes) + if len(indexes) == 0: + index = Index([]) + elif len(indexes) == 1: + index = indexes[0] + elif intersect: + index = indexes[0] + for other in indexes[1:]: + index = index.intersection(other) + else: + index = union_indexes(indexes, sort=False) + index = ensure_index(index) + + if sort: + index = safe_sort_index(index) + # GH 29879 + if copy: + index = index.copy() + + return index + + +def safe_sort_index(index: Index) -> Index: + """ + Returns the sorted index + + We keep the dtypes and the name attributes. + + Parameters + ---------- + index : an Index + + Returns + ------- + Index + """ + if index.is_monotonic_increasing: + return index + + try: + array_sorted = safe_sort(index) + except TypeError: + pass + else: + if isinstance(array_sorted, Index): + return array_sorted + + array_sorted = cast(np.ndarray, array_sorted) + if isinstance(index, MultiIndex): + index = MultiIndex.from_tuples(array_sorted, names=index.names) + else: + index = Index(array_sorted, name=index.name, dtype=index.dtype) + + return index + + +def union_indexes(indexes, sort: bool | None = True) -> Index: + """ + Return the union of indexes. + + The behavior of sort and names is not consistent. + + Parameters + ---------- + indexes : list of Index or list objects + sort : bool, default True + Whether the result index should come out sorted or not. + + Returns + ------- + Index + """ + if len(indexes) == 0: + raise AssertionError("Must have at least 1 Index to union") + if len(indexes) == 1: + result = indexes[0] + if isinstance(result, list): + if not sort: + result = Index(result) + else: + result = Index(sorted(result)) + return result + + indexes, kind = _sanitize_and_check(indexes) + + def _unique_indices(inds, dtype) -> Index: + """ + Concatenate indices and remove duplicates. + + Parameters + ---------- + inds : list of Index or list objects + dtype : dtype to set for the resulting Index + + Returns + ------- + Index + """ + if all(isinstance(ind, Index) for ind in inds): + inds = [ind.astype(dtype, copy=False) for ind in inds] + result = inds[0].unique() + other = inds[1].append(inds[2:]) + diff = other[result.get_indexer_for(other) == -1] + if len(diff): + result = result.append(diff.unique()) + if sort: + result = result.sort_values() + return result + + def conv(i): + if isinstance(i, Index): + i = i.tolist() + return i + + return Index( + lib.fast_unique_multiple_list([conv(i) for i in inds], sort=sort), + dtype=dtype, + ) + + def _find_common_index_dtype(inds): + """ + Finds a common type for the indexes to pass through to resulting index. + + Parameters + ---------- + inds: list of Index or list objects + + Returns + ------- + The common type or None if no indexes were given + """ + dtypes = [idx.dtype for idx in indexes if isinstance(idx, Index)] + if dtypes: + dtype = find_common_type(dtypes) + else: + dtype = None + + return dtype + + if kind == "special": + result = indexes[0] + + dtis = [x for x in indexes if isinstance(x, DatetimeIndex)] + dti_tzs = [x for x in dtis if x.tz is not None] + if len(dti_tzs) not in [0, len(dtis)]: + # TODO: this behavior is not tested (so may not be desired), + # but is kept in order to keep behavior the same when + # deprecating union_many + # test_frame_from_dict_with_mixed_indexes + raise TypeError("Cannot join tz-naive with tz-aware DatetimeIndex") + + if len(dtis) == len(indexes): + sort = True + result = indexes[0] + + elif len(dtis) > 1: + # If we have mixed timezones, our casting behavior may depend on + # the order of indexes, which we don't want. + sort = False + + # TODO: what about Categorical[dt64]? + # test_frame_from_dict_with_mixed_indexes + indexes = [x.astype(object, copy=False) for x in indexes] + result = indexes[0] + + for other in indexes[1:]: + result = result.union(other, sort=None if sort else False) + return result + + elif kind == "array": + dtype = _find_common_index_dtype(indexes) + index = indexes[0] + if not all(index.equals(other) for other in indexes[1:]): + index = _unique_indices(indexes, dtype) + + name = get_unanimous_names(*indexes)[0] + if name != index.name: + index = index.rename(name) + return index + else: # kind='list' + dtype = _find_common_index_dtype(indexes) + return _unique_indices(indexes, dtype) + + +def _sanitize_and_check(indexes): + """ + Verify the type of indexes and convert lists to Index. + + Cases: + + - [list, list, ...]: Return ([list, list, ...], 'list') + - [list, Index, ...]: Return _sanitize_and_check([Index, Index, ...]) + Lists are sorted and converted to Index. + - [Index, Index, ...]: Return ([Index, Index, ...], TYPE) + TYPE = 'special' if at least one special type, 'array' otherwise. + + Parameters + ---------- + indexes : list of Index or list objects + + Returns + ------- + sanitized_indexes : list of Index or list objects + type : {'list', 'array', 'special'} + """ + kinds = list({type(index) for index in indexes}) + + if list in kinds: + if len(kinds) > 1: + indexes = [ + Index(list(x)) if not isinstance(x, Index) else x for x in indexes + ] + kinds.remove(list) + else: + return indexes, "list" + + if len(kinds) > 1 or Index not in kinds: + return indexes, "special" + else: + return indexes, "array" + + +def all_indexes_same(indexes) -> bool: + """ + Determine if all indexes contain the same elements. + + Parameters + ---------- + indexes : iterable of Index objects + + Returns + ------- + bool + True if all indexes contain the same elements, False otherwise. + """ + itr = iter(indexes) + first = next(itr) + return all(first.equals(index) for index in itr) + + +def default_index(n: int) -> RangeIndex: + rng = range(n) + return RangeIndex._simple_new(rng, name=None) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/indexes/base.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/indexes/base.py new file mode 100644 index 0000000000000000000000000000000000000000..6822c2c99427eb5b4d3c7080047ccba538b4db6a --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/indexes/base.py @@ -0,0 +1,7877 @@ +from __future__ import annotations + +from collections import abc +from datetime import datetime +import functools +from itertools import zip_longest +import operator +from typing import ( + TYPE_CHECKING, + Any, + Callable, + ClassVar, + Literal, + NoReturn, + cast, + final, + overload, +) +import warnings + +import numpy as np + +from pandas._config import ( + get_option, + using_copy_on_write, + using_pyarrow_string_dtype, +) + +from pandas._libs import ( + NaT, + algos as libalgos, + index as libindex, + lib, + writers, +) +from pandas._libs.internals import BlockValuesRefs +import pandas._libs.join as libjoin +from pandas._libs.lib import ( + is_datetime_array, + no_default, +) +from pandas._libs.tslibs import ( + IncompatibleFrequency, + OutOfBoundsDatetime, + Timestamp, + tz_compare, +) +from pandas._typing import ( + AnyAll, + ArrayLike, + Axes, + Axis, + DropKeep, + DtypeObj, + F, + IgnoreRaise, + IndexLabel, + JoinHow, + Level, + NaPosition, + ReindexMethod, + Self, + Shape, + npt, +) +from pandas.compat.numpy import function as nv +from pandas.errors import ( + DuplicateLabelError, + InvalidIndexError, +) +from pandas.util._decorators import ( + Appender, + cache_readonly, + deprecate_nonkeyword_arguments, + doc, +) +from pandas.util._exceptions import ( + find_stack_level, + rewrite_exception, +) + +from pandas.core.dtypes.astype import ( + astype_array, + astype_is_view, +) +from pandas.core.dtypes.cast import ( + LossySetitemError, + can_hold_element, + common_dtype_categorical_compat, + find_result_type, + infer_dtype_from, + maybe_cast_pointwise_result, + np_can_hold_element, +) +from pandas.core.dtypes.common import ( + ensure_int64, + ensure_object, + ensure_platform_int, + is_any_real_numeric_dtype, + is_bool_dtype, + is_ea_or_datetimelike_dtype, + is_float, + is_hashable, + is_integer, + is_iterator, + is_list_like, + is_numeric_dtype, + is_object_dtype, + is_scalar, + is_signed_integer_dtype, + is_string_dtype, + needs_i8_conversion, + pandas_dtype, + validate_all_hashable, +) +from pandas.core.dtypes.concat import concat_compat +from pandas.core.dtypes.dtypes import ( + ArrowDtype, + CategoricalDtype, + DatetimeTZDtype, + ExtensionDtype, + IntervalDtype, + PeriodDtype, + SparseDtype, +) +from pandas.core.dtypes.generic import ( + ABCCategoricalIndex, + ABCDataFrame, + ABCDatetimeIndex, + ABCIntervalIndex, + ABCMultiIndex, + ABCPeriodIndex, + ABCRangeIndex, + ABCSeries, + ABCTimedeltaIndex, +) +from pandas.core.dtypes.inference import is_dict_like +from pandas.core.dtypes.missing import ( + array_equivalent, + is_valid_na_for_dtype, + isna, +) + +from pandas.core import ( + arraylike, + nanops, + ops, +) +from pandas.core.accessor import CachedAccessor +import pandas.core.algorithms as algos +from pandas.core.array_algos.putmask import ( + setitem_datetimelike_compat, + validate_putmask, +) +from pandas.core.arrays import ( + ArrowExtensionArray, + BaseMaskedArray, + Categorical, + DatetimeArray, + ExtensionArray, + TimedeltaArray, +) +from pandas.core.arrays.string_ import ( + StringArray, + StringDtype, +) +from pandas.core.base import ( + IndexOpsMixin, + PandasObject, +) +import pandas.core.common as com +from pandas.core.construction import ( + ensure_wrapped_if_datetimelike, + extract_array, + sanitize_array, +) +from pandas.core.indexers import ( + disallow_ndim_indexing, + is_valid_positional_slice, +) +from pandas.core.indexes.frozen import FrozenList +from pandas.core.missing import clean_reindex_fill_method +from pandas.core.ops import get_op_result_name +from pandas.core.ops.invalid import make_invalid_op +from pandas.core.sorting import ( + ensure_key_mapped, + get_group_index_sorter, + nargsort, +) +from pandas.core.strings.accessor import StringMethods + +from pandas.io.formats.printing import ( + PrettyDict, + default_pprint, + format_object_summary, + pprint_thing, +) + +if TYPE_CHECKING: + from collections.abc import ( + Hashable, + Iterable, + Sequence, + ) + + from pandas import ( + CategoricalIndex, + DataFrame, + MultiIndex, + Series, + ) + from pandas.core.arrays import ( + IntervalArray, + PeriodArray, + ) + +__all__ = ["Index"] + +_unsortable_types = frozenset(("mixed", "mixed-integer")) + +_index_doc_kwargs: dict[str, str] = { + "klass": "Index", + "inplace": "", + "target_klass": "Index", + "raises_section": "", + "unique": "Index", + "duplicated": "np.ndarray", +} +_index_shared_docs: dict[str, str] = {} +str_t = str + +_dtype_obj = np.dtype("object") + +_masked_engines = { + "Complex128": libindex.MaskedComplex128Engine, + "Complex64": libindex.MaskedComplex64Engine, + "Float64": libindex.MaskedFloat64Engine, + "Float32": libindex.MaskedFloat32Engine, + "UInt64": libindex.MaskedUInt64Engine, + "UInt32": libindex.MaskedUInt32Engine, + "UInt16": libindex.MaskedUInt16Engine, + "UInt8": libindex.MaskedUInt8Engine, + "Int64": libindex.MaskedInt64Engine, + "Int32": libindex.MaskedInt32Engine, + "Int16": libindex.MaskedInt16Engine, + "Int8": libindex.MaskedInt8Engine, + "boolean": libindex.MaskedBoolEngine, + "double[pyarrow]": libindex.MaskedFloat64Engine, + "float64[pyarrow]": libindex.MaskedFloat64Engine, + "float32[pyarrow]": libindex.MaskedFloat32Engine, + "float[pyarrow]": libindex.MaskedFloat32Engine, + "uint64[pyarrow]": libindex.MaskedUInt64Engine, + "uint32[pyarrow]": libindex.MaskedUInt32Engine, + "uint16[pyarrow]": libindex.MaskedUInt16Engine, + "uint8[pyarrow]": libindex.MaskedUInt8Engine, + "int64[pyarrow]": libindex.MaskedInt64Engine, + "int32[pyarrow]": libindex.MaskedInt32Engine, + "int16[pyarrow]": libindex.MaskedInt16Engine, + "int8[pyarrow]": libindex.MaskedInt8Engine, + "bool[pyarrow]": libindex.MaskedBoolEngine, +} + + +def _maybe_return_indexers(meth: F) -> F: + """ + Decorator to simplify 'return_indexers' checks in Index.join. + """ + + @functools.wraps(meth) + def join( + self, + other: Index, + *, + how: JoinHow = "left", + level=None, + return_indexers: bool = False, + sort: bool = False, + ): + join_index, lidx, ridx = meth(self, other, how=how, level=level, sort=sort) + if not return_indexers: + return join_index + + if lidx is not None: + lidx = ensure_platform_int(lidx) + if ridx is not None: + ridx = ensure_platform_int(ridx) + return join_index, lidx, ridx + + return cast(F, join) + + +def _new_Index(cls, d): + """ + This is called upon unpickling, rather than the default which doesn't + have arguments and breaks __new__. + """ + # required for backward compat, because PI can't be instantiated with + # ordinals through __new__ GH #13277 + if issubclass(cls, ABCPeriodIndex): + from pandas.core.indexes.period import _new_PeriodIndex + + return _new_PeriodIndex(cls, **d) + + if issubclass(cls, ABCMultiIndex): + if "labels" in d and "codes" not in d: + # GH#23752 "labels" kwarg has been replaced with "codes" + d["codes"] = d.pop("labels") + + # Since this was a valid MultiIndex at pickle-time, we don't need to + # check validty at un-pickle time. + d["verify_integrity"] = False + + elif "dtype" not in d and "data" in d: + # Prevent Index.__new__ from conducting inference; + # "data" key not in RangeIndex + d["dtype"] = d["data"].dtype + return cls.__new__(cls, **d) + + +class Index(IndexOpsMixin, PandasObject): + """ + Immutable sequence used for indexing and alignment. + + The basic object storing axis labels for all pandas objects. + + .. versionchanged:: 2.0.0 + + Index can hold all numpy numeric dtypes (except float16). Previously only + int64/uint64/float64 dtypes were accepted. + + Parameters + ---------- + data : array-like (1-dimensional) + dtype : str, numpy.dtype, or ExtensionDtype, optional + Data type for the output Index. If not specified, this will be + inferred from `data`. + See the :ref:`user guide ` for more usages. + copy : bool, default False + Copy input data. + name : object + Name to be stored in the index. + tupleize_cols : bool (default: True) + When True, attempt to create a MultiIndex if possible. + + See Also + -------- + RangeIndex : Index implementing a monotonic integer range. + CategoricalIndex : Index of :class:`Categorical` s. + MultiIndex : A multi-level, or hierarchical Index. + IntervalIndex : An Index of :class:`Interval` s. + DatetimeIndex : Index of datetime64 data. + TimedeltaIndex : Index of timedelta64 data. + PeriodIndex : Index of Period data. + + Notes + ----- + An Index instance can **only** contain hashable objects. + An Index instance *can not* hold numpy float16 dtype. + + Examples + -------- + >>> pd.Index([1, 2, 3]) + Index([1, 2, 3], dtype='int64') + + >>> pd.Index(list('abc')) + Index(['a', 'b', 'c'], dtype='object') + + >>> pd.Index([1, 2, 3], dtype="uint8") + Index([1, 2, 3], dtype='uint8') + """ + + # similar to __array_priority__, positions Index after Series and DataFrame + # but before ExtensionArray. Should NOT be overridden by subclasses. + __pandas_priority__ = 2000 + + # Cython methods; see github.com/cython/cython/issues/2647 + # for why we need to wrap these instead of making them class attributes + # Moreover, cython will choose the appropriate-dtyped sub-function + # given the dtypes of the passed arguments + + @final + def _left_indexer_unique(self, other: Self) -> npt.NDArray[np.intp]: + # Caller is responsible for ensuring other.dtype == self.dtype + sv = self._get_join_target() + ov = other._get_join_target() + # similar but not identical to ov.searchsorted(sv) + return libjoin.left_join_indexer_unique(sv, ov) + + @final + def _left_indexer( + self, other: Self + ) -> tuple[ArrayLike, npt.NDArray[np.intp], npt.NDArray[np.intp]]: + # Caller is responsible for ensuring other.dtype == self.dtype + sv = self._get_join_target() + ov = other._get_join_target() + joined_ndarray, lidx, ridx = libjoin.left_join_indexer(sv, ov) + joined = self._from_join_target(joined_ndarray) + return joined, lidx, ridx + + @final + def _inner_indexer( + self, other: Self + ) -> tuple[ArrayLike, npt.NDArray[np.intp], npt.NDArray[np.intp]]: + # Caller is responsible for ensuring other.dtype == self.dtype + sv = self._get_join_target() + ov = other._get_join_target() + joined_ndarray, lidx, ridx = libjoin.inner_join_indexer(sv, ov) + joined = self._from_join_target(joined_ndarray) + return joined, lidx, ridx + + @final + def _outer_indexer( + self, other: Self + ) -> tuple[ArrayLike, npt.NDArray[np.intp], npt.NDArray[np.intp]]: + # Caller is responsible for ensuring other.dtype == self.dtype + sv = self._get_join_target() + ov = other._get_join_target() + joined_ndarray, lidx, ridx = libjoin.outer_join_indexer(sv, ov) + joined = self._from_join_target(joined_ndarray) + return joined, lidx, ridx + + _typ: str = "index" + _data: ExtensionArray | np.ndarray + _data_cls: type[ExtensionArray] | tuple[type[np.ndarray], type[ExtensionArray]] = ( + np.ndarray, + ExtensionArray, + ) + _id: object | None = None + _name: Hashable = None + # MultiIndex.levels previously allowed setting the index name. We + # don't allow this anymore, and raise if it happens rather than + # failing silently. + _no_setting_name: bool = False + _comparables: list[str] = ["name"] + _attributes: list[str] = ["name"] + + @cache_readonly + def _can_hold_strings(self) -> bool: + return not is_numeric_dtype(self.dtype) + + _engine_types: dict[np.dtype | ExtensionDtype, type[libindex.IndexEngine]] = { + np.dtype(np.int8): libindex.Int8Engine, + np.dtype(np.int16): libindex.Int16Engine, + np.dtype(np.int32): libindex.Int32Engine, + np.dtype(np.int64): libindex.Int64Engine, + np.dtype(np.uint8): libindex.UInt8Engine, + np.dtype(np.uint16): libindex.UInt16Engine, + np.dtype(np.uint32): libindex.UInt32Engine, + np.dtype(np.uint64): libindex.UInt64Engine, + np.dtype(np.float32): libindex.Float32Engine, + np.dtype(np.float64): libindex.Float64Engine, + np.dtype(np.complex64): libindex.Complex64Engine, + np.dtype(np.complex128): libindex.Complex128Engine, + } + + @property + def _engine_type( + self, + ) -> type[libindex.IndexEngine | libindex.ExtensionEngine]: + return self._engine_types.get(self.dtype, libindex.ObjectEngine) + + # whether we support partial string indexing. Overridden + # in DatetimeIndex and PeriodIndex + _supports_partial_string_indexing = False + + _accessors = {"str"} + + str = CachedAccessor("str", StringMethods) + + _references = None + + # -------------------------------------------------------------------- + # Constructors + + def __new__( + cls, + data=None, + dtype=None, + copy: bool = False, + name=None, + tupleize_cols: bool = True, + ) -> Self: + from pandas.core.indexes.range import RangeIndex + + name = maybe_extract_name(name, data, cls) + + if dtype is not None: + dtype = pandas_dtype(dtype) + + data_dtype = getattr(data, "dtype", None) + + refs = None + if not copy and isinstance(data, (ABCSeries, Index)): + refs = data._references + + is_pandas_object = isinstance(data, (ABCSeries, Index, ExtensionArray)) + + # range + if isinstance(data, (range, RangeIndex)): + result = RangeIndex(start=data, copy=copy, name=name) + if dtype is not None: + return result.astype(dtype, copy=False) + # error: Incompatible return value type (got "MultiIndex", + # expected "Self") + return result # type: ignore[return-value] + + elif is_ea_or_datetimelike_dtype(dtype): + # non-EA dtype indexes have special casting logic, so we punt here + pass + + elif is_ea_or_datetimelike_dtype(data_dtype): + pass + + elif isinstance(data, (np.ndarray, Index, ABCSeries)): + if isinstance(data, ABCMultiIndex): + data = data._values + + if data.dtype.kind not in "iufcbmM": + # GH#11836 we need to avoid having numpy coerce + # things that look like ints/floats to ints unless + # they are actually ints, e.g. '0' and 0.0 + # should not be coerced + data = com.asarray_tuplesafe(data, dtype=_dtype_obj) + + elif is_scalar(data): + raise cls._raise_scalar_data_error(data) + elif hasattr(data, "__array__"): + return cls(np.asarray(data), dtype=dtype, copy=copy, name=name) + elif not is_list_like(data) and not isinstance(data, memoryview): + # 2022-11-16 the memoryview check is only necessary on some CI + # builds, not clear why + raise cls._raise_scalar_data_error(data) + + else: + if tupleize_cols: + # GH21470: convert iterable to list before determining if empty + if is_iterator(data): + data = list(data) + + if data and all(isinstance(e, tuple) for e in data): + # we must be all tuples, otherwise don't construct + # 10697 + from pandas.core.indexes.multi import MultiIndex + + # error: Incompatible return value type (got "MultiIndex", + # expected "Self") + return MultiIndex.from_tuples( # type: ignore[return-value] + data, names=name + ) + # other iterable of some kind + + if not isinstance(data, (list, tuple)): + # we allow set/frozenset, which Series/sanitize_array does not, so + # cast to list here + data = list(data) + if len(data) == 0: + # unlike Series, we default to object dtype: + data = np.array(data, dtype=object) + + if len(data) and isinstance(data[0], tuple): + # Ensure we get 1-D array of tuples instead of 2D array. + data = com.asarray_tuplesafe(data, dtype=_dtype_obj) + + try: + arr = sanitize_array(data, None, dtype=dtype, copy=copy) + except ValueError as err: + if "index must be specified when data is not list-like" in str(err): + raise cls._raise_scalar_data_error(data) from err + if "Data must be 1-dimensional" in str(err): + raise ValueError("Index data must be 1-dimensional") from err + raise + arr = ensure_wrapped_if_datetimelike(arr) + + klass = cls._dtype_to_subclass(arr.dtype) + + arr = klass._ensure_array(arr, arr.dtype, copy=False) + result = klass._simple_new(arr, name, refs=refs) + if dtype is None and is_pandas_object and data_dtype == np.object_: + if result.dtype != data_dtype: + warnings.warn( + "Dtype inference on a pandas object " + "(Series, Index, ExtensionArray) is deprecated. The Index " + "constructor will keep the original dtype in the future. " + "Call `infer_objects` on the result to get the old " + "behavior.", + FutureWarning, + stacklevel=2, + ) + return result # type: ignore[return-value] + + @classmethod + def _ensure_array(cls, data, dtype, copy: bool): + """ + Ensure we have a valid array to pass to _simple_new. + """ + if data.ndim > 1: + # GH#13601, GH#20285, GH#27125 + raise ValueError("Index data must be 1-dimensional") + elif dtype == np.float16: + # float16 not supported (no indexing engine) + raise NotImplementedError("float16 indexes are not supported") + + if copy: + # asarray_tuplesafe does not always copy underlying data, + # so need to make sure that this happens + data = data.copy() + return data + + @final + @classmethod + def _dtype_to_subclass(cls, dtype: DtypeObj): + # Delay import for perf. https://github.com/pandas-dev/pandas/pull/31423 + + if isinstance(dtype, ExtensionDtype): + return dtype.index_class + + if dtype.kind == "M": + from pandas import DatetimeIndex + + return DatetimeIndex + + elif dtype.kind == "m": + from pandas import TimedeltaIndex + + return TimedeltaIndex + + elif dtype.kind == "O": + # NB: assuming away MultiIndex + return Index + + elif issubclass(dtype.type, str) or is_numeric_dtype(dtype): + return Index + + raise NotImplementedError(dtype) + + # NOTE for new Index creation: + + # - _simple_new: It returns new Index with the same type as the caller. + # All metadata (such as name) must be provided by caller's responsibility. + # Using _shallow_copy is recommended because it fills these metadata + # otherwise specified. + + # - _shallow_copy: It returns new Index with the same type (using + # _simple_new), but fills caller's metadata otherwise specified. Passed + # kwargs will overwrite corresponding metadata. + + # See each method's docstring. + + @classmethod + def _simple_new( + cls, values: ArrayLike, name: Hashable | None = None, refs=None + ) -> Self: + """ + We require that we have a dtype compat for the values. If we are passed + a non-dtype compat, then coerce using the constructor. + + Must be careful not to recurse. + """ + assert isinstance(values, cls._data_cls), type(values) + + result = object.__new__(cls) + result._data = values + result._name = name + result._cache = {} + result._reset_identity() + if refs is not None: + result._references = refs + else: + result._references = BlockValuesRefs() + result._references.add_index_reference(result) + + return result + + @classmethod + def _with_infer(cls, *args, **kwargs): + """ + Constructor that uses the 1.0.x behavior inferring numeric dtypes + for ndarray[object] inputs. + """ + result = cls(*args, **kwargs) + + if result.dtype == _dtype_obj and not result._is_multi: + # error: Argument 1 to "maybe_convert_objects" has incompatible type + # "Union[ExtensionArray, ndarray[Any, Any]]"; expected + # "ndarray[Any, Any]" + values = lib.maybe_convert_objects(result._values) # type: ignore[arg-type] + if values.dtype.kind in "iufb": + return Index(values, name=result.name) + + return result + + @cache_readonly + def _constructor(self) -> type[Self]: + return type(self) + + @final + def _maybe_check_unique(self) -> None: + """ + Check that an Index has no duplicates. + + This is typically only called via + `NDFrame.flags.allows_duplicate_labels.setter` when it's set to + True (duplicates aren't allowed). + + Raises + ------ + DuplicateLabelError + When the index is not unique. + """ + if not self.is_unique: + msg = """Index has duplicates.""" + duplicates = self._format_duplicate_message() + msg += f"\n{duplicates}" + + raise DuplicateLabelError(msg) + + @final + def _format_duplicate_message(self) -> DataFrame: + """ + Construct the DataFrame for a DuplicateLabelError. + + This returns a DataFrame indicating the labels and positions + of duplicates in an index. This should only be called when it's + already known that duplicates are present. + + Examples + -------- + >>> idx = pd.Index(['a', 'b', 'a']) + >>> idx._format_duplicate_message() + positions + label + a [0, 2] + """ + from pandas import Series + + duplicates = self[self.duplicated(keep="first")].unique() + assert len(duplicates) + + out = ( + Series(np.arange(len(self)), copy=False) + .groupby(self, observed=False) + .agg(list)[duplicates] + ) + if self._is_multi: + # test_format_duplicate_labels_message_multi + # error: "Type[Index]" has no attribute "from_tuples" [attr-defined] + out.index = type(self).from_tuples(out.index) # type: ignore[attr-defined] + + if self.nlevels == 1: + out = out.rename_axis("label") + return out.to_frame(name="positions") + + # -------------------------------------------------------------------- + # Index Internals Methods + + def _shallow_copy(self, values, name: Hashable = no_default) -> Self: + """ + Create a new Index with the same class as the caller, don't copy the + data, use the same object attributes with passed in attributes taking + precedence. + + *this is an internal non-public method* + + Parameters + ---------- + values : the values to create the new Index, optional + name : Label, defaults to self.name + """ + name = self._name if name is no_default else name + + return self._simple_new(values, name=name, refs=self._references) + + def _view(self) -> Self: + """ + fastpath to make a shallow copy, i.e. new object with same data. + """ + result = self._simple_new(self._values, name=self._name, refs=self._references) + + result._cache = self._cache + return result + + @final + def _rename(self, name: Hashable) -> Self: + """ + fastpath for rename if new name is already validated. + """ + result = self._view() + result._name = name + return result + + @final + def is_(self, other) -> bool: + """ + More flexible, faster check like ``is`` but that works through views. + + Note: this is *not* the same as ``Index.identical()``, which checks + that metadata is also the same. + + Parameters + ---------- + other : object + Other object to compare against. + + Returns + ------- + bool + True if both have same underlying data, False otherwise. + + See Also + -------- + Index.identical : Works like ``Index.is_`` but also checks metadata. + + Examples + -------- + >>> idx1 = pd.Index(['1', '2', '3']) + >>> idx1.is_(idx1.view()) + True + + >>> idx1.is_(idx1.copy()) + False + """ + if self is other: + return True + elif not hasattr(other, "_id"): + return False + elif self._id is None or other._id is None: + return False + else: + return self._id is other._id + + @final + def _reset_identity(self) -> None: + """ + Initializes or resets ``_id`` attribute with new object. + """ + self._id = object() + + @final + def _cleanup(self) -> None: + self._engine.clear_mapping() + + @cache_readonly + def _engine( + self, + ) -> libindex.IndexEngine | libindex.ExtensionEngine | libindex.MaskedIndexEngine: + # For base class (object dtype) we get ObjectEngine + target_values = self._get_engine_target() + + if isinstance(self._values, ArrowExtensionArray) and self.dtype.kind in "Mm": + import pyarrow as pa + + pa_type = self._values._pa_array.type + if pa.types.is_timestamp(pa_type): + target_values = self._values._to_datetimearray() + return libindex.DatetimeEngine(target_values._ndarray) + elif pa.types.is_duration(pa_type): + target_values = self._values._to_timedeltaarray() + return libindex.TimedeltaEngine(target_values._ndarray) + + if isinstance(target_values, ExtensionArray): + if isinstance(target_values, (BaseMaskedArray, ArrowExtensionArray)): + try: + return _masked_engines[target_values.dtype.name](target_values) + except KeyError: + # Not supported yet e.g. decimal + pass + elif self._engine_type is libindex.ObjectEngine: + return libindex.ExtensionEngine(target_values) + + target_values = cast(np.ndarray, target_values) + # to avoid a reference cycle, bind `target_values` to a local variable, so + # `self` is not passed into the lambda. + if target_values.dtype == bool: + return libindex.BoolEngine(target_values) + elif target_values.dtype == np.complex64: + return libindex.Complex64Engine(target_values) + elif target_values.dtype == np.complex128: + return libindex.Complex128Engine(target_values) + elif needs_i8_conversion(self.dtype): + # We need to keep M8/m8 dtype when initializing the Engine, + # but don't want to change _get_engine_target bc it is used + # elsewhere + # error: Item "ExtensionArray" of "Union[ExtensionArray, + # ndarray[Any, Any]]" has no attribute "_ndarray" [union-attr] + target_values = self._data._ndarray # type: ignore[union-attr] + + # error: Argument 1 to "ExtensionEngine" has incompatible type + # "ndarray[Any, Any]"; expected "ExtensionArray" + return self._engine_type(target_values) # type: ignore[arg-type] + + @final + @cache_readonly + def _dir_additions_for_owner(self) -> set[str_t]: + """ + Add the string-like labels to the owner dataframe/series dir output. + + If this is a MultiIndex, it's first level values are used. + """ + return { + c + for c in self.unique(level=0)[: get_option("display.max_dir_items")] + if isinstance(c, str) and c.isidentifier() + } + + # -------------------------------------------------------------------- + # Array-Like Methods + + # ndarray compat + def __len__(self) -> int: + """ + Return the length of the Index. + """ + return len(self._data) + + def __array__(self, dtype=None, copy=None) -> np.ndarray: + """ + The array interface, return my values. + """ + return np.asarray(self._data, dtype=dtype) + + def __array_ufunc__(self, ufunc: np.ufunc, method: str_t, *inputs, **kwargs): + if any(isinstance(other, (ABCSeries, ABCDataFrame)) for other in inputs): + return NotImplemented + + result = arraylike.maybe_dispatch_ufunc_to_dunder_op( + self, ufunc, method, *inputs, **kwargs + ) + if result is not NotImplemented: + return result + + if "out" in kwargs: + # e.g. test_dti_isub_tdi + return arraylike.dispatch_ufunc_with_out( + self, ufunc, method, *inputs, **kwargs + ) + + if method == "reduce": + result = arraylike.dispatch_reduction_ufunc( + self, ufunc, method, *inputs, **kwargs + ) + if result is not NotImplemented: + return result + + new_inputs = [x if x is not self else x._values for x in inputs] + result = getattr(ufunc, method)(*new_inputs, **kwargs) + if ufunc.nout == 2: + # i.e. np.divmod, np.modf, np.frexp + return tuple(self.__array_wrap__(x) for x in result) + elif method == "reduce": + result = lib.item_from_zerodim(result) + return result + + if result.dtype == np.float16: + result = result.astype(np.float32) + + return self.__array_wrap__(result) + + @final + def __array_wrap__(self, result, context=None, return_scalar=False): + """ + Gets called after a ufunc and other functions e.g. np.split. + """ + result = lib.item_from_zerodim(result) + if (not isinstance(result, Index) and is_bool_dtype(result.dtype)) or np.ndim( + result + ) > 1: + # exclude Index to avoid warning from is_bool_dtype deprecation; + # in the Index case it doesn't matter which path we go down. + # reached in plotting tests with e.g. np.nonzero(index) + return result + + return Index(result, name=self.name) + + @cache_readonly + def dtype(self) -> DtypeObj: + """ + Return the dtype object of the underlying data. + + Examples + -------- + >>> idx = pd.Index([1, 2, 3]) + >>> idx + Index([1, 2, 3], dtype='int64') + >>> idx.dtype + dtype('int64') + """ + return self._data.dtype + + @final + def ravel(self, order: str_t = "C") -> Self: + """ + Return a view on self. + + Returns + ------- + Index + + See Also + -------- + numpy.ndarray.ravel : Return a flattened array. + + Examples + -------- + >>> s = pd.Series([1, 2, 3], index=['a', 'b', 'c']) + >>> s.index.ravel() + Index(['a', 'b', 'c'], dtype='object') + """ + return self[:] + + def view(self, cls=None): + # we need to see if we are subclassing an + # index type here + if cls is not None and not hasattr(cls, "_typ"): + dtype = cls + if isinstance(cls, str): + dtype = pandas_dtype(cls) + + if needs_i8_conversion(dtype): + idx_cls = self._dtype_to_subclass(dtype) + arr = self.array.view(dtype) + if isinstance(arr, ExtensionArray): + # here we exclude non-supported dt64/td64 dtypes + return idx_cls._simple_new( + arr, name=self.name, refs=self._references + ) + return arr + + result = self._data.view(cls) + else: + if cls is not None: + warnings.warn( + # GH#55709 + f"Passing a type in {type(self).__name__}.view is deprecated " + "and will raise in a future version. " + "Call view without any argument to retain the old behavior.", + FutureWarning, + stacklevel=find_stack_level(), + ) + + result = self._view() + if isinstance(result, Index): + result._id = self._id + return result + + def astype(self, dtype, copy: bool = True): + """ + Create an Index with values cast to dtypes. + + The class of a new Index is determined by dtype. When conversion is + impossible, a TypeError exception is raised. + + Parameters + ---------- + dtype : numpy dtype or pandas type + Note that any signed integer `dtype` is treated as ``'int64'``, + and any unsigned integer `dtype` is treated as ``'uint64'``, + regardless of the size. + copy : bool, default True + By default, astype always returns a newly allocated object. + If copy is set to False and internal requirements on dtype are + satisfied, the original data is used to create a new Index + or the original Index is returned. + + Returns + ------- + Index + Index with values cast to specified dtype. + + Examples + -------- + >>> idx = pd.Index([1, 2, 3]) + >>> idx + Index([1, 2, 3], dtype='int64') + >>> idx.astype('float') + Index([1.0, 2.0, 3.0], dtype='float64') + """ + if dtype is not None: + dtype = pandas_dtype(dtype) + + if self.dtype == dtype: + # Ensure that self.astype(self.dtype) is self + return self.copy() if copy else self + + values = self._data + if isinstance(values, ExtensionArray): + with rewrite_exception(type(values).__name__, type(self).__name__): + new_values = values.astype(dtype, copy=copy) + + elif isinstance(dtype, ExtensionDtype): + cls = dtype.construct_array_type() + # Note: for RangeIndex and CategoricalDtype self vs self._values + # behaves differently here. + new_values = cls._from_sequence(self, dtype=dtype, copy=copy) + + else: + # GH#13149 specifically use astype_array instead of astype + new_values = astype_array(values, dtype=dtype, copy=copy) + + # pass copy=False because any copying will be done in the astype above + result = Index(new_values, name=self.name, dtype=new_values.dtype, copy=False) + if ( + not copy + and self._references is not None + and astype_is_view(self.dtype, dtype) + ): + result._references = self._references + result._references.add_index_reference(result) + return result + + _index_shared_docs[ + "take" + ] = """ + Return a new %(klass)s of the values selected by the indices. + + For internal compatibility with numpy arrays. + + Parameters + ---------- + indices : array-like + Indices to be taken. + axis : int, optional + The axis over which to select values, always 0. + allow_fill : bool, default True + fill_value : scalar, default None + If allow_fill=True and fill_value is not None, indices specified by + -1 are regarded as NA. If Index doesn't hold NA, raise ValueError. + + Returns + ------- + Index + An index formed of elements at the given indices. Will be the same + type as self, except for RangeIndex. + + See Also + -------- + numpy.ndarray.take: Return an array formed from the + elements of a at the given indices. + + Examples + -------- + >>> idx = pd.Index(['a', 'b', 'c']) + >>> idx.take([2, 2, 1, 2]) + Index(['c', 'c', 'b', 'c'], dtype='object') + """ + + @Appender(_index_shared_docs["take"] % _index_doc_kwargs) + def take( + self, + indices, + axis: Axis = 0, + allow_fill: bool = True, + fill_value=None, + **kwargs, + ) -> Self: + if kwargs: + nv.validate_take((), kwargs) + if is_scalar(indices): + raise TypeError("Expected indices to be array-like") + indices = ensure_platform_int(indices) + allow_fill = self._maybe_disallow_fill(allow_fill, fill_value, indices) + + # Note: we discard fill_value and use self._na_value, only relevant + # in the case where allow_fill is True and fill_value is not None + values = self._values + if isinstance(values, np.ndarray): + taken = algos.take( + values, indices, allow_fill=allow_fill, fill_value=self._na_value + ) + else: + # algos.take passes 'axis' keyword which not all EAs accept + taken = values.take( + indices, allow_fill=allow_fill, fill_value=self._na_value + ) + return self._constructor._simple_new(taken, name=self.name) + + @final + def _maybe_disallow_fill(self, allow_fill: bool, fill_value, indices) -> bool: + """ + We only use pandas-style take when allow_fill is True _and_ + fill_value is not None. + """ + if allow_fill and fill_value is not None: + # only fill if we are passing a non-None fill_value + if self._can_hold_na: + if (indices < -1).any(): + raise ValueError( + "When allow_fill=True and fill_value is not None, " + "all indices must be >= -1" + ) + else: + cls_name = type(self).__name__ + raise ValueError( + f"Unable to fill values because {cls_name} cannot contain NA" + ) + else: + allow_fill = False + return allow_fill + + _index_shared_docs[ + "repeat" + ] = """ + Repeat elements of a %(klass)s. + + Returns a new %(klass)s where each element of the current %(klass)s + is repeated consecutively a given number of times. + + Parameters + ---------- + repeats : int or array of ints + The number of repetitions for each element. This should be a + non-negative integer. Repeating 0 times will return an empty + %(klass)s. + axis : None + Must be ``None``. Has no effect but is accepted for compatibility + with numpy. + + Returns + ------- + %(klass)s + Newly created %(klass)s with repeated elements. + + See Also + -------- + Series.repeat : Equivalent function for Series. + numpy.repeat : Similar method for :class:`numpy.ndarray`. + + Examples + -------- + >>> idx = pd.Index(['a', 'b', 'c']) + >>> idx + Index(['a', 'b', 'c'], dtype='object') + >>> idx.repeat(2) + Index(['a', 'a', 'b', 'b', 'c', 'c'], dtype='object') + >>> idx.repeat([1, 2, 3]) + Index(['a', 'b', 'b', 'c', 'c', 'c'], dtype='object') + """ + + @Appender(_index_shared_docs["repeat"] % _index_doc_kwargs) + def repeat(self, repeats, axis: None = None) -> Self: + repeats = ensure_platform_int(repeats) + nv.validate_repeat((), {"axis": axis}) + res_values = self._values.repeat(repeats) + + # _constructor so RangeIndex-> Index with an int64 dtype + return self._constructor._simple_new(res_values, name=self.name) + + # -------------------------------------------------------------------- + # Copying Methods + + def copy( + self, + name: Hashable | None = None, + deep: bool = False, + ) -> Self: + """ + Make a copy of this object. + + Name is set on the new object. + + Parameters + ---------- + name : Label, optional + Set name for new object. + deep : bool, default False + + Returns + ------- + Index + Index refer to new object which is a copy of this object. + + Notes + ----- + In most cases, there should be no functional difference from using + ``deep``, but if ``deep`` is passed it will attempt to deepcopy. + + Examples + -------- + >>> idx = pd.Index(['a', 'b', 'c']) + >>> new_idx = idx.copy() + >>> idx is new_idx + False + """ + + name = self._validate_names(name=name, deep=deep)[0] + if deep: + new_data = self._data.copy() + new_index = type(self)._simple_new(new_data, name=name) + else: + new_index = self._rename(name=name) + return new_index + + @final + def __copy__(self, **kwargs) -> Self: + return self.copy(**kwargs) + + @final + def __deepcopy__(self, memo=None) -> Self: + """ + Parameters + ---------- + memo, default None + Standard signature. Unused + """ + return self.copy(deep=True) + + # -------------------------------------------------------------------- + # Rendering Methods + + @final + def __repr__(self) -> str_t: + """ + Return a string representation for this object. + """ + klass_name = type(self).__name__ + data = self._format_data() + attrs = self._format_attrs() + attrs_str = [f"{k}={v}" for k, v in attrs] + prepr = ", ".join(attrs_str) + + return f"{klass_name}({data}{prepr})" + + @property + def _formatter_func(self): + """ + Return the formatter function. + """ + return default_pprint + + @final + def _format_data(self, name=None) -> str_t: + """ + Return the formatted data as a unicode string. + """ + # do we want to justify (only do so for non-objects) + is_justify = True + + if self.inferred_type == "string": + is_justify = False + elif isinstance(self.dtype, CategoricalDtype): + self = cast("CategoricalIndex", self) + if is_object_dtype(self.categories.dtype): + is_justify = False + elif isinstance(self, ABCRangeIndex): + # We will do the relevant formatting via attrs + return "" + + return format_object_summary( + self, + self._formatter_func, + is_justify=is_justify, + name=name, + line_break_each_value=self._is_multi, + ) + + def _format_attrs(self) -> list[tuple[str_t, str_t | int | bool | None]]: + """ + Return a list of tuples of the (attr,formatted_value). + """ + attrs: list[tuple[str_t, str_t | int | bool | None]] = [] + + if not self._is_multi: + attrs.append(("dtype", f"'{self.dtype}'")) + + if self.name is not None: + attrs.append(("name", default_pprint(self.name))) + elif self._is_multi and any(x is not None for x in self.names): + attrs.append(("names", default_pprint(self.names))) + + max_seq_items = get_option("display.max_seq_items") or len(self) + if len(self) > max_seq_items: + attrs.append(("length", len(self))) + return attrs + + @final + def _get_level_names(self) -> Hashable | Sequence[Hashable]: + """ + Return a name or list of names with None replaced by the level number. + """ + if self._is_multi: + return [ + level if name is None else name for level, name in enumerate(self.names) + ] + else: + return 0 if self.name is None else self.name + + @final + def _mpl_repr(self) -> np.ndarray: + # how to represent ourselves to matplotlib + if isinstance(self.dtype, np.dtype) and self.dtype.kind != "M": + return cast(np.ndarray, self.values) + return self.astype(object, copy=False)._values + + def format( + self, + name: bool = False, + formatter: Callable | None = None, + na_rep: str_t = "NaN", + ) -> list[str_t]: + """ + Render a string representation of the Index. + """ + warnings.warn( + # GH#55413 + f"{type(self).__name__}.format is deprecated and will be removed " + "in a future version. Convert using index.astype(str) or " + "index.map(formatter) instead.", + FutureWarning, + stacklevel=find_stack_level(), + ) + header = [] + if name: + header.append( + pprint_thing(self.name, escape_chars=("\t", "\r", "\n")) + if self.name is not None + else "" + ) + + if formatter is not None: + return header + list(self.map(formatter)) + + return self._format_with_header(header=header, na_rep=na_rep) + + _default_na_rep = "NaN" + + @final + def _format_flat( + self, + *, + include_name: bool, + formatter: Callable | None = None, + ) -> list[str_t]: + """ + Render a string representation of the Index. + """ + header = [] + if include_name: + header.append( + pprint_thing(self.name, escape_chars=("\t", "\r", "\n")) + if self.name is not None + else "" + ) + + if formatter is not None: + return header + list(self.map(formatter)) + + return self._format_with_header(header=header, na_rep=self._default_na_rep) + + def _format_with_header(self, *, header: list[str_t], na_rep: str_t) -> list[str_t]: + from pandas.io.formats.format import format_array + + values = self._values + + if ( + is_object_dtype(values.dtype) + or is_string_dtype(values.dtype) + or isinstance(self.dtype, (IntervalDtype, CategoricalDtype)) + ): + # TODO: why do we need different justify for these cases? + justify = "all" + else: + justify = "left" + # passing leading_space=False breaks test_format_missing, + # test_index_repr_in_frame_with_nan, but would otherwise make + # trim_front unnecessary + formatted = format_array(values, None, justify=justify) + result = trim_front(formatted) + return header + result + + def _get_values_for_csv( + self, + *, + na_rep: str_t = "", + decimal: str_t = ".", + float_format=None, + date_format=None, + quoting=None, + ) -> npt.NDArray[np.object_]: + return get_values_for_csv( + self._values, + na_rep=na_rep, + decimal=decimal, + float_format=float_format, + date_format=date_format, + quoting=quoting, + ) + + def _summary(self, name=None) -> str_t: + """ + Return a summarized representation. + + Parameters + ---------- + name : str + name to use in the summary representation + + Returns + ------- + String with a summarized representation of the index + """ + if len(self) > 0: + head = self[0] + if hasattr(head, "format") and not isinstance(head, str): + head = head.format() + elif needs_i8_conversion(self.dtype): + # e.g. Timedelta, display as values, not quoted + head = self._formatter_func(head).replace("'", "") + tail = self[-1] + if hasattr(tail, "format") and not isinstance(tail, str): + tail = tail.format() + elif needs_i8_conversion(self.dtype): + # e.g. Timedelta, display as values, not quoted + tail = self._formatter_func(tail).replace("'", "") + + index_summary = f", {head} to {tail}" + else: + index_summary = "" + + if name is None: + name = type(self).__name__ + return f"{name}: {len(self)} entries{index_summary}" + + # -------------------------------------------------------------------- + # Conversion Methods + + def to_flat_index(self) -> Self: + """ + Identity method. + + This is implemented for compatibility with subclass implementations + when chaining. + + Returns + ------- + pd.Index + Caller. + + See Also + -------- + MultiIndex.to_flat_index : Subclass implementation. + """ + return self + + @final + def to_series(self, index=None, name: Hashable | None = None) -> Series: + """ + Create a Series with both index and values equal to the index keys. + + Useful with map for returning an indexer based on an index. + + Parameters + ---------- + index : Index, optional + Index of resulting Series. If None, defaults to original index. + name : str, optional + Name of resulting Series. If None, defaults to name of original + index. + + Returns + ------- + Series + The dtype will be based on the type of the Index values. + + See Also + -------- + Index.to_frame : Convert an Index to a DataFrame. + Series.to_frame : Convert Series to DataFrame. + + Examples + -------- + >>> idx = pd.Index(['Ant', 'Bear', 'Cow'], name='animal') + + By default, the original index and original name is reused. + + >>> idx.to_series() + animal + Ant Ant + Bear Bear + Cow Cow + Name: animal, dtype: object + + To enforce a new index, specify new labels to ``index``: + + >>> idx.to_series(index=[0, 1, 2]) + 0 Ant + 1 Bear + 2 Cow + Name: animal, dtype: object + + To override the name of the resulting column, specify ``name``: + + >>> idx.to_series(name='zoo') + animal + Ant Ant + Bear Bear + Cow Cow + Name: zoo, dtype: object + """ + from pandas import Series + + if index is None: + index = self._view() + if name is None: + name = self.name + + return Series(self._values.copy(), index=index, name=name) + + def to_frame( + self, index: bool = True, name: Hashable = lib.no_default + ) -> DataFrame: + """ + Create a DataFrame with a column containing the Index. + + Parameters + ---------- + index : bool, default True + Set the index of the returned DataFrame as the original Index. + + name : object, defaults to index.name + The passed name should substitute for the index name (if it has + one). + + Returns + ------- + DataFrame + DataFrame containing the original Index data. + + See Also + -------- + Index.to_series : Convert an Index to a Series. + Series.to_frame : Convert Series to DataFrame. + + Examples + -------- + >>> idx = pd.Index(['Ant', 'Bear', 'Cow'], name='animal') + >>> idx.to_frame() + animal + animal + Ant Ant + Bear Bear + Cow Cow + + By default, the original Index is reused. To enforce a new Index: + + >>> idx.to_frame(index=False) + animal + 0 Ant + 1 Bear + 2 Cow + + To override the name of the resulting column, specify `name`: + + >>> idx.to_frame(index=False, name='zoo') + zoo + 0 Ant + 1 Bear + 2 Cow + """ + from pandas import DataFrame + + if name is lib.no_default: + name = self._get_level_names() + result = DataFrame({name: self}, copy=not using_copy_on_write()) + + if index: + result.index = self + return result + + # -------------------------------------------------------------------- + # Name-Centric Methods + + @property + def name(self) -> Hashable: + """ + Return Index or MultiIndex name. + + Examples + -------- + >>> idx = pd.Index([1, 2, 3], name='x') + >>> idx + Index([1, 2, 3], dtype='int64', name='x') + >>> idx.name + 'x' + """ + return self._name + + @name.setter + def name(self, value: Hashable) -> None: + if self._no_setting_name: + # Used in MultiIndex.levels to avoid silently ignoring name updates. + raise RuntimeError( + "Cannot set name on a level of a MultiIndex. Use " + "'MultiIndex.set_names' instead." + ) + maybe_extract_name(value, None, type(self)) + self._name = value + + @final + def _validate_names( + self, name=None, names=None, deep: bool = False + ) -> list[Hashable]: + """ + Handles the quirks of having a singular 'name' parameter for general + Index and plural 'names' parameter for MultiIndex. + """ + from copy import deepcopy + + if names is not None and name is not None: + raise TypeError("Can only provide one of `names` and `name`") + if names is None and name is None: + new_names = deepcopy(self.names) if deep else self.names + elif names is not None: + if not is_list_like(names): + raise TypeError("Must pass list-like as `names`.") + new_names = names + elif not is_list_like(name): + new_names = [name] + else: + new_names = name + + if len(new_names) != len(self.names): + raise ValueError( + f"Length of new names must be {len(self.names)}, got {len(new_names)}" + ) + + # All items in 'new_names' need to be hashable + validate_all_hashable(*new_names, error_name=f"{type(self).__name__}.name") + + return new_names + + def _get_default_index_names( + self, names: Hashable | Sequence[Hashable] | None = None, default=None + ) -> list[Hashable]: + """ + Get names of index. + + Parameters + ---------- + names : int, str or 1-dimensional list, default None + Index names to set. + default : str + Default name of index. + + Raises + ------ + TypeError + if names not str or list-like + """ + from pandas.core.indexes.multi import MultiIndex + + if names is not None: + if isinstance(names, (int, str)): + names = [names] + + if not isinstance(names, list) and names is not None: + raise ValueError("Index names must be str or 1-dimensional list") + + if not names: + if isinstance(self, MultiIndex): + names = com.fill_missing_names(self.names) + else: + names = [default] if self.name is None else [self.name] + + return names + + def _get_names(self) -> FrozenList: + return FrozenList((self.name,)) + + def _set_names(self, values, *, level=None) -> None: + """ + Set new names on index. Each name has to be a hashable type. + + Parameters + ---------- + values : str or sequence + name(s) to set + level : int, level name, or sequence of int/level names (default None) + If the index is a MultiIndex (hierarchical), level(s) to set (None + for all levels). Otherwise level must be None + + Raises + ------ + TypeError if each name is not hashable. + """ + if not is_list_like(values): + raise ValueError("Names must be a list-like") + if len(values) != 1: + raise ValueError(f"Length of new names must be 1, got {len(values)}") + + # GH 20527 + # All items in 'name' need to be hashable: + validate_all_hashable(*values, error_name=f"{type(self).__name__}.name") + + self._name = values[0] + + names = property(fset=_set_names, fget=_get_names) + + @overload + def set_names(self, names, *, level=..., inplace: Literal[False] = ...) -> Self: + ... + + @overload + def set_names(self, names, *, level=..., inplace: Literal[True]) -> None: + ... + + @overload + def set_names(self, names, *, level=..., inplace: bool = ...) -> Self | None: + ... + + def set_names(self, names, *, level=None, inplace: bool = False) -> Self | None: + """ + Set Index or MultiIndex name. + + Able to set new names partially and by level. + + Parameters + ---------- + + names : label or list of label or dict-like for MultiIndex + Name(s) to set. + + .. versionchanged:: 1.3.0 + + level : int, label or list of int or label, optional + If the index is a MultiIndex and names is not dict-like, level(s) to set + (None for all levels). Otherwise level must be None. + + .. versionchanged:: 1.3.0 + + inplace : bool, default False + Modifies the object directly, instead of creating a new Index or + MultiIndex. + + Returns + ------- + Index or None + The same type as the caller or None if ``inplace=True``. + + See Also + -------- + Index.rename : Able to set new names without level. + + Examples + -------- + >>> idx = pd.Index([1, 2, 3, 4]) + >>> idx + Index([1, 2, 3, 4], dtype='int64') + >>> idx.set_names('quarter') + Index([1, 2, 3, 4], dtype='int64', name='quarter') + + >>> idx = pd.MultiIndex.from_product([['python', 'cobra'], + ... [2018, 2019]]) + >>> idx + MultiIndex([('python', 2018), + ('python', 2019), + ( 'cobra', 2018), + ( 'cobra', 2019)], + ) + >>> idx = idx.set_names(['kind', 'year']) + >>> idx.set_names('species', level=0) + MultiIndex([('python', 2018), + ('python', 2019), + ( 'cobra', 2018), + ( 'cobra', 2019)], + names=['species', 'year']) + + When renaming levels with a dict, levels can not be passed. + + >>> idx.set_names({'kind': 'snake'}) + MultiIndex([('python', 2018), + ('python', 2019), + ( 'cobra', 2018), + ( 'cobra', 2019)], + names=['snake', 'year']) + """ + if level is not None and not isinstance(self, ABCMultiIndex): + raise ValueError("Level must be None for non-MultiIndex") + + if level is not None and not is_list_like(level) and is_list_like(names): + raise TypeError("Names must be a string when a single level is provided.") + + if not is_list_like(names) and level is None and self.nlevels > 1: + raise TypeError("Must pass list-like as `names`.") + + if is_dict_like(names) and not isinstance(self, ABCMultiIndex): + raise TypeError("Can only pass dict-like as `names` for MultiIndex.") + + if is_dict_like(names) and level is not None: + raise TypeError("Can not pass level for dictlike `names`.") + + if isinstance(self, ABCMultiIndex) and is_dict_like(names) and level is None: + # Transform dict to list of new names and corresponding levels + level, names_adjusted = [], [] + for i, name in enumerate(self.names): + if name in names.keys(): + level.append(i) + names_adjusted.append(names[name]) + names = names_adjusted + + if not is_list_like(names): + names = [names] + if level is not None and not is_list_like(level): + level = [level] + + if inplace: + idx = self + else: + idx = self._view() + + idx._set_names(names, level=level) + if not inplace: + return idx + return None + + @overload + def rename(self, name, *, inplace: Literal[False] = ...) -> Self: + ... + + @overload + def rename(self, name, *, inplace: Literal[True]) -> None: + ... + + @deprecate_nonkeyword_arguments( + version="3.0", allowed_args=["self", "name"], name="rename" + ) + def rename(self, name, inplace: bool = False) -> Self | None: + """ + Alter Index or MultiIndex name. + + Able to set new names without level. Defaults to returning new index. + Length of names must match number of levels in MultiIndex. + + Parameters + ---------- + name : label or list of labels + Name(s) to set. + inplace : bool, default False + Modifies the object directly, instead of creating a new Index or + MultiIndex. + + Returns + ------- + Index or None + The same type as the caller or None if ``inplace=True``. + + See Also + -------- + Index.set_names : Able to set new names partially and by level. + + Examples + -------- + >>> idx = pd.Index(['A', 'C', 'A', 'B'], name='score') + >>> idx.rename('grade') + Index(['A', 'C', 'A', 'B'], dtype='object', name='grade') + + >>> idx = pd.MultiIndex.from_product([['python', 'cobra'], + ... [2018, 2019]], + ... names=['kind', 'year']) + >>> idx + MultiIndex([('python', 2018), + ('python', 2019), + ( 'cobra', 2018), + ( 'cobra', 2019)], + names=['kind', 'year']) + >>> idx.rename(['species', 'year']) + MultiIndex([('python', 2018), + ('python', 2019), + ( 'cobra', 2018), + ( 'cobra', 2019)], + names=['species', 'year']) + >>> idx.rename('species') + Traceback (most recent call last): + TypeError: Must pass list-like as `names`. + """ + return self.set_names([name], inplace=inplace) + + # -------------------------------------------------------------------- + # Level-Centric Methods + + @property + def nlevels(self) -> int: + """ + Number of levels. + """ + return 1 + + def _sort_levels_monotonic(self) -> Self: + """ + Compat with MultiIndex. + """ + return self + + @final + def _validate_index_level(self, level) -> None: + """ + Validate index level. + + For single-level Index getting level number is a no-op, but some + verification must be done like in MultiIndex. + + """ + if isinstance(level, int): + if level < 0 and level != -1: + raise IndexError( + "Too many levels: Index has only 1 level, " + f"{level} is not a valid level number" + ) + if level > 0: + raise IndexError( + f"Too many levels: Index has only 1 level, not {level + 1}" + ) + elif level != self.name: + raise KeyError( + f"Requested level ({level}) does not match index name ({self.name})" + ) + + def _get_level_number(self, level) -> int: + self._validate_index_level(level) + return 0 + + def sortlevel( + self, + level=None, + ascending: bool | list[bool] = True, + sort_remaining=None, + na_position: NaPosition = "first", + ): + """ + For internal compatibility with the Index API. + + Sort the Index. This is for compat with MultiIndex + + Parameters + ---------- + ascending : bool, default True + False to sort in descending order + na_position : {'first' or 'last'}, default 'first' + Argument 'first' puts NaNs at the beginning, 'last' puts NaNs at + the end. + + .. versionadded:: 2.1.0 + + level, sort_remaining are compat parameters + + Returns + ------- + Index + """ + if not isinstance(ascending, (list, bool)): + raise TypeError( + "ascending must be a single bool value or" + "a list of bool values of length 1" + ) + + if isinstance(ascending, list): + if len(ascending) != 1: + raise TypeError("ascending must be a list of bool values of length 1") + ascending = ascending[0] + + if not isinstance(ascending, bool): + raise TypeError("ascending must be a bool value") + + return self.sort_values( + return_indexer=True, ascending=ascending, na_position=na_position + ) + + def _get_level_values(self, level) -> Index: + """ + Return an Index of values for requested level. + + This is primarily useful to get an individual level of values from a + MultiIndex, but is provided on Index as well for compatibility. + + Parameters + ---------- + level : int or str + It is either the integer position or the name of the level. + + Returns + ------- + Index + Calling object, as there is only one level in the Index. + + See Also + -------- + MultiIndex.get_level_values : Get values for a level of a MultiIndex. + + Notes + ----- + For Index, level should be 0, since there are no multiple levels. + + Examples + -------- + >>> idx = pd.Index(list('abc')) + >>> idx + Index(['a', 'b', 'c'], dtype='object') + + Get level values by supplying `level` as integer: + + >>> idx.get_level_values(0) + Index(['a', 'b', 'c'], dtype='object') + """ + self._validate_index_level(level) + return self + + get_level_values = _get_level_values + + @final + def droplevel(self, level: IndexLabel = 0): + """ + Return index with requested level(s) removed. + + If resulting index has only 1 level left, the result will be + of Index type, not MultiIndex. The original index is not modified inplace. + + Parameters + ---------- + level : int, str, or list-like, default 0 + If a string is given, must be the name of a level + If list-like, elements must be names or indexes of levels. + + Returns + ------- + Index or MultiIndex + + Examples + -------- + >>> mi = pd.MultiIndex.from_arrays( + ... [[1, 2], [3, 4], [5, 6]], names=['x', 'y', 'z']) + >>> mi + MultiIndex([(1, 3, 5), + (2, 4, 6)], + names=['x', 'y', 'z']) + + >>> mi.droplevel() + MultiIndex([(3, 5), + (4, 6)], + names=['y', 'z']) + + >>> mi.droplevel(2) + MultiIndex([(1, 3), + (2, 4)], + names=['x', 'y']) + + >>> mi.droplevel('z') + MultiIndex([(1, 3), + (2, 4)], + names=['x', 'y']) + + >>> mi.droplevel(['x', 'y']) + Index([5, 6], dtype='int64', name='z') + """ + if not isinstance(level, (tuple, list)): + level = [level] + + levnums = sorted(self._get_level_number(lev) for lev in level)[::-1] + + return self._drop_level_numbers(levnums) + + @final + def _drop_level_numbers(self, levnums: list[int]): + """ + Drop MultiIndex levels by level _number_, not name. + """ + + if not levnums and not isinstance(self, ABCMultiIndex): + return self + if len(levnums) >= self.nlevels: + raise ValueError( + f"Cannot remove {len(levnums)} levels from an index with " + f"{self.nlevels} levels: at least one level must be left." + ) + # The two checks above guarantee that here self is a MultiIndex + self = cast("MultiIndex", self) + + new_levels = list(self.levels) + new_codes = list(self.codes) + new_names = list(self.names) + + for i in levnums: + new_levels.pop(i) + new_codes.pop(i) + new_names.pop(i) + + if len(new_levels) == 1: + lev = new_levels[0] + + if len(lev) == 0: + # If lev is empty, lev.take will fail GH#42055 + if len(new_codes[0]) == 0: + # GH#45230 preserve RangeIndex here + # see test_reset_index_empty_rangeindex + result = lev[:0] + else: + res_values = algos.take(lev._values, new_codes[0], allow_fill=True) + # _constructor instead of type(lev) for RangeIndex compat GH#35230 + result = lev._constructor._simple_new(res_values, name=new_names[0]) + else: + # set nan if needed + mask = new_codes[0] == -1 + result = new_levels[0].take(new_codes[0]) + if mask.any(): + result = result.putmask(mask, np.nan) + + result._name = new_names[0] + + return result + else: + from pandas.core.indexes.multi import MultiIndex + + return MultiIndex( + levels=new_levels, + codes=new_codes, + names=new_names, + verify_integrity=False, + ) + + # -------------------------------------------------------------------- + # Introspection Methods + + @cache_readonly + @final + def _can_hold_na(self) -> bool: + if isinstance(self.dtype, ExtensionDtype): + return self.dtype._can_hold_na + if self.dtype.kind in "iub": + return False + return True + + @property + def is_monotonic_increasing(self) -> bool: + """ + Return a boolean if the values are equal or increasing. + + Returns + ------- + bool + + See Also + -------- + Index.is_monotonic_decreasing : Check if the values are equal or decreasing. + + Examples + -------- + >>> pd.Index([1, 2, 3]).is_monotonic_increasing + True + >>> pd.Index([1, 2, 2]).is_monotonic_increasing + True + >>> pd.Index([1, 3, 2]).is_monotonic_increasing + False + """ + return self._engine.is_monotonic_increasing + + @property + def is_monotonic_decreasing(self) -> bool: + """ + Return a boolean if the values are equal or decreasing. + + Returns + ------- + bool + + See Also + -------- + Index.is_monotonic_increasing : Check if the values are equal or increasing. + + Examples + -------- + >>> pd.Index([3, 2, 1]).is_monotonic_decreasing + True + >>> pd.Index([3, 2, 2]).is_monotonic_decreasing + True + >>> pd.Index([3, 1, 2]).is_monotonic_decreasing + False + """ + return self._engine.is_monotonic_decreasing + + @final + @property + def _is_strictly_monotonic_increasing(self) -> bool: + """ + Return if the index is strictly monotonic increasing + (only increasing) values. + + Examples + -------- + >>> Index([1, 2, 3])._is_strictly_monotonic_increasing + True + >>> Index([1, 2, 2])._is_strictly_monotonic_increasing + False + >>> Index([1, 3, 2])._is_strictly_monotonic_increasing + False + """ + return self.is_unique and self.is_monotonic_increasing + + @final + @property + def _is_strictly_monotonic_decreasing(self) -> bool: + """ + Return if the index is strictly monotonic decreasing + (only decreasing) values. + + Examples + -------- + >>> Index([3, 2, 1])._is_strictly_monotonic_decreasing + True + >>> Index([3, 2, 2])._is_strictly_monotonic_decreasing + False + >>> Index([3, 1, 2])._is_strictly_monotonic_decreasing + False + """ + return self.is_unique and self.is_monotonic_decreasing + + @cache_readonly + def is_unique(self) -> bool: + """ + Return if the index has unique values. + + Returns + ------- + bool + + See Also + -------- + Index.has_duplicates : Inverse method that checks if it has duplicate values. + + Examples + -------- + >>> idx = pd.Index([1, 5, 7, 7]) + >>> idx.is_unique + False + + >>> idx = pd.Index([1, 5, 7]) + >>> idx.is_unique + True + + >>> idx = pd.Index(["Watermelon", "Orange", "Apple", + ... "Watermelon"]).astype("category") + >>> idx.is_unique + False + + >>> idx = pd.Index(["Orange", "Apple", + ... "Watermelon"]).astype("category") + >>> idx.is_unique + True + """ + return self._engine.is_unique + + @final + @property + def has_duplicates(self) -> bool: + """ + Check if the Index has duplicate values. + + Returns + ------- + bool + Whether or not the Index has duplicate values. + + See Also + -------- + Index.is_unique : Inverse method that checks if it has unique values. + + Examples + -------- + >>> idx = pd.Index([1, 5, 7, 7]) + >>> idx.has_duplicates + True + + >>> idx = pd.Index([1, 5, 7]) + >>> idx.has_duplicates + False + + >>> idx = pd.Index(["Watermelon", "Orange", "Apple", + ... "Watermelon"]).astype("category") + >>> idx.has_duplicates + True + + >>> idx = pd.Index(["Orange", "Apple", + ... "Watermelon"]).astype("category") + >>> idx.has_duplicates + False + """ + return not self.is_unique + + @final + def is_boolean(self) -> bool: + """ + Check if the Index only consists of booleans. + + .. deprecated:: 2.0.0 + Use `pandas.api.types.is_bool_dtype` instead. + + Returns + ------- + bool + Whether or not the Index only consists of booleans. + + See Also + -------- + is_integer : Check if the Index only consists of integers (deprecated). + is_floating : Check if the Index is a floating type (deprecated). + is_numeric : Check if the Index only consists of numeric data (deprecated). + is_object : Check if the Index is of the object dtype (deprecated). + is_categorical : Check if the Index holds categorical data. + is_interval : Check if the Index holds Interval objects (deprecated). + + Examples + -------- + >>> idx = pd.Index([True, False, True]) + >>> idx.is_boolean() # doctest: +SKIP + True + + >>> idx = pd.Index(["True", "False", "True"]) + >>> idx.is_boolean() # doctest: +SKIP + False + + >>> idx = pd.Index([True, False, "True"]) + >>> idx.is_boolean() # doctest: +SKIP + False + """ + warnings.warn( + f"{type(self).__name__}.is_boolean is deprecated. " + "Use pandas.api.types.is_bool_type instead.", + FutureWarning, + stacklevel=find_stack_level(), + ) + return self.inferred_type in ["boolean"] + + @final + def is_integer(self) -> bool: + """ + Check if the Index only consists of integers. + + .. deprecated:: 2.0.0 + Use `pandas.api.types.is_integer_dtype` instead. + + Returns + ------- + bool + Whether or not the Index only consists of integers. + + See Also + -------- + is_boolean : Check if the Index only consists of booleans (deprecated). + is_floating : Check if the Index is a floating type (deprecated). + is_numeric : Check if the Index only consists of numeric data (deprecated). + is_object : Check if the Index is of the object dtype. (deprecated). + is_categorical : Check if the Index holds categorical data (deprecated). + is_interval : Check if the Index holds Interval objects (deprecated). + + Examples + -------- + >>> idx = pd.Index([1, 2, 3, 4]) + >>> idx.is_integer() # doctest: +SKIP + True + + >>> idx = pd.Index([1.0, 2.0, 3.0, 4.0]) + >>> idx.is_integer() # doctest: +SKIP + False + + >>> idx = pd.Index(["Apple", "Mango", "Watermelon"]) + >>> idx.is_integer() # doctest: +SKIP + False + """ + warnings.warn( + f"{type(self).__name__}.is_integer is deprecated. " + "Use pandas.api.types.is_integer_dtype instead.", + FutureWarning, + stacklevel=find_stack_level(), + ) + return self.inferred_type in ["integer"] + + @final + def is_floating(self) -> bool: + """ + Check if the Index is a floating type. + + .. deprecated:: 2.0.0 + Use `pandas.api.types.is_float_dtype` instead + + The Index may consist of only floats, NaNs, or a mix of floats, + integers, or NaNs. + + Returns + ------- + bool + Whether or not the Index only consists of only consists of floats, NaNs, or + a mix of floats, integers, or NaNs. + + See Also + -------- + is_boolean : Check if the Index only consists of booleans (deprecated). + is_integer : Check if the Index only consists of integers (deprecated). + is_numeric : Check if the Index only consists of numeric data (deprecated). + is_object : Check if the Index is of the object dtype. (deprecated). + is_categorical : Check if the Index holds categorical data (deprecated). + is_interval : Check if the Index holds Interval objects (deprecated). + + Examples + -------- + >>> idx = pd.Index([1.0, 2.0, 3.0, 4.0]) + >>> idx.is_floating() # doctest: +SKIP + True + + >>> idx = pd.Index([1.0, 2.0, np.nan, 4.0]) + >>> idx.is_floating() # doctest: +SKIP + True + + >>> idx = pd.Index([1, 2, 3, 4, np.nan]) + >>> idx.is_floating() # doctest: +SKIP + True + + >>> idx = pd.Index([1, 2, 3, 4]) + >>> idx.is_floating() # doctest: +SKIP + False + """ + warnings.warn( + f"{type(self).__name__}.is_floating is deprecated. " + "Use pandas.api.types.is_float_dtype instead.", + FutureWarning, + stacklevel=find_stack_level(), + ) + return self.inferred_type in ["floating", "mixed-integer-float", "integer-na"] + + @final + def is_numeric(self) -> bool: + """ + Check if the Index only consists of numeric data. + + .. deprecated:: 2.0.0 + Use `pandas.api.types.is_numeric_dtype` instead. + + Returns + ------- + bool + Whether or not the Index only consists of numeric data. + + See Also + -------- + is_boolean : Check if the Index only consists of booleans (deprecated). + is_integer : Check if the Index only consists of integers (deprecated). + is_floating : Check if the Index is a floating type (deprecated). + is_object : Check if the Index is of the object dtype. (deprecated). + is_categorical : Check if the Index holds categorical data (deprecated). + is_interval : Check if the Index holds Interval objects (deprecated). + + Examples + -------- + >>> idx = pd.Index([1.0, 2.0, 3.0, 4.0]) + >>> idx.is_numeric() # doctest: +SKIP + True + + >>> idx = pd.Index([1, 2, 3, 4.0]) + >>> idx.is_numeric() # doctest: +SKIP + True + + >>> idx = pd.Index([1, 2, 3, 4]) + >>> idx.is_numeric() # doctest: +SKIP + True + + >>> idx = pd.Index([1, 2, 3, 4.0, np.nan]) + >>> idx.is_numeric() # doctest: +SKIP + True + + >>> idx = pd.Index([1, 2, 3, 4.0, np.nan, "Apple"]) + >>> idx.is_numeric() # doctest: +SKIP + False + """ + warnings.warn( + f"{type(self).__name__}.is_numeric is deprecated. " + "Use pandas.api.types.is_any_real_numeric_dtype instead", + FutureWarning, + stacklevel=find_stack_level(), + ) + return self.inferred_type in ["integer", "floating"] + + @final + def is_object(self) -> bool: + """ + Check if the Index is of the object dtype. + + .. deprecated:: 2.0.0 + Use `pandas.api.types.is_object_dtype` instead. + + Returns + ------- + bool + Whether or not the Index is of the object dtype. + + See Also + -------- + is_boolean : Check if the Index only consists of booleans (deprecated). + is_integer : Check if the Index only consists of integers (deprecated). + is_floating : Check if the Index is a floating type (deprecated). + is_numeric : Check if the Index only consists of numeric data (deprecated). + is_categorical : Check if the Index holds categorical data (deprecated). + is_interval : Check if the Index holds Interval objects (deprecated). + + Examples + -------- + >>> idx = pd.Index(["Apple", "Mango", "Watermelon"]) + >>> idx.is_object() # doctest: +SKIP + True + + >>> idx = pd.Index(["Apple", "Mango", 2.0]) + >>> idx.is_object() # doctest: +SKIP + True + + >>> idx = pd.Index(["Watermelon", "Orange", "Apple", + ... "Watermelon"]).astype("category") + >>> idx.is_object() # doctest: +SKIP + False + + >>> idx = pd.Index([1.0, 2.0, 3.0, 4.0]) + >>> idx.is_object() # doctest: +SKIP + False + """ + warnings.warn( + f"{type(self).__name__}.is_object is deprecated." + "Use pandas.api.types.is_object_dtype instead", + FutureWarning, + stacklevel=find_stack_level(), + ) + return is_object_dtype(self.dtype) + + @final + def is_categorical(self) -> bool: + """ + Check if the Index holds categorical data. + + .. deprecated:: 2.0.0 + Use `isinstance(index.dtype, pd.CategoricalDtype)` instead. + + Returns + ------- + bool + True if the Index is categorical. + + See Also + -------- + CategoricalIndex : Index for categorical data. + is_boolean : Check if the Index only consists of booleans (deprecated). + is_integer : Check if the Index only consists of integers (deprecated). + is_floating : Check if the Index is a floating type (deprecated). + is_numeric : Check if the Index only consists of numeric data (deprecated). + is_object : Check if the Index is of the object dtype. (deprecated). + is_interval : Check if the Index holds Interval objects (deprecated). + + Examples + -------- + >>> idx = pd.Index(["Watermelon", "Orange", "Apple", + ... "Watermelon"]).astype("category") + >>> idx.is_categorical() # doctest: +SKIP + True + + >>> idx = pd.Index([1, 3, 5, 7]) + >>> idx.is_categorical() # doctest: +SKIP + False + + >>> s = pd.Series(["Peter", "Victor", "Elisabeth", "Mar"]) + >>> s + 0 Peter + 1 Victor + 2 Elisabeth + 3 Mar + dtype: object + >>> s.index.is_categorical() # doctest: +SKIP + False + """ + warnings.warn( + f"{type(self).__name__}.is_categorical is deprecated." + "Use pandas.api.types.is_categorical_dtype instead", + FutureWarning, + stacklevel=find_stack_level(), + ) + + return self.inferred_type in ["categorical"] + + @final + def is_interval(self) -> bool: + """ + Check if the Index holds Interval objects. + + .. deprecated:: 2.0.0 + Use `isinstance(index.dtype, pd.IntervalDtype)` instead. + + Returns + ------- + bool + Whether or not the Index holds Interval objects. + + See Also + -------- + IntervalIndex : Index for Interval objects. + is_boolean : Check if the Index only consists of booleans (deprecated). + is_integer : Check if the Index only consists of integers (deprecated). + is_floating : Check if the Index is a floating type (deprecated). + is_numeric : Check if the Index only consists of numeric data (deprecated). + is_object : Check if the Index is of the object dtype. (deprecated). + is_categorical : Check if the Index holds categorical data (deprecated). + + Examples + -------- + >>> idx = pd.Index([pd.Interval(left=0, right=5), + ... pd.Interval(left=5, right=10)]) + >>> idx.is_interval() # doctest: +SKIP + True + + >>> idx = pd.Index([1, 3, 5, 7]) + >>> idx.is_interval() # doctest: +SKIP + False + """ + warnings.warn( + f"{type(self).__name__}.is_interval is deprecated." + "Use pandas.api.types.is_interval_dtype instead", + FutureWarning, + stacklevel=find_stack_level(), + ) + return self.inferred_type in ["interval"] + + @final + def _holds_integer(self) -> bool: + """ + Whether the type is an integer type. + """ + return self.inferred_type in ["integer", "mixed-integer"] + + @final + def holds_integer(self) -> bool: + """ + Whether the type is an integer type. + + .. deprecated:: 2.0.0 + Use `pandas.api.types.infer_dtype` instead + """ + warnings.warn( + f"{type(self).__name__}.holds_integer is deprecated. " + "Use pandas.api.types.infer_dtype instead.", + FutureWarning, + stacklevel=find_stack_level(), + ) + return self._holds_integer() + + @cache_readonly + def inferred_type(self) -> str_t: + """ + Return a string of the type inferred from the values. + + Examples + -------- + >>> idx = pd.Index([1, 2, 3]) + >>> idx + Index([1, 2, 3], dtype='int64') + >>> idx.inferred_type + 'integer' + """ + return lib.infer_dtype(self._values, skipna=False) + + @cache_readonly + @final + def _is_all_dates(self) -> bool: + """ + Whether or not the index values only consist of dates. + """ + if needs_i8_conversion(self.dtype): + return True + elif self.dtype != _dtype_obj: + # TODO(ExtensionIndex): 3rd party EA might override? + # Note: this includes IntervalIndex, even when the left/right + # contain datetime-like objects. + return False + elif self._is_multi: + return False + return is_datetime_array(ensure_object(self._values)) + + @final + @cache_readonly + def _is_multi(self) -> bool: + """ + Cached check equivalent to isinstance(self, MultiIndex) + """ + return isinstance(self, ABCMultiIndex) + + # -------------------------------------------------------------------- + # Pickle Methods + + def __reduce__(self): + d = {"data": self._data, "name": self.name} + return _new_Index, (type(self), d), None + + # -------------------------------------------------------------------- + # Null Handling Methods + + @cache_readonly + def _na_value(self): + """The expected NA value to use with this index.""" + dtype = self.dtype + if isinstance(dtype, np.dtype): + if dtype.kind in "mM": + return NaT + return np.nan + return dtype.na_value + + @cache_readonly + def _isnan(self) -> npt.NDArray[np.bool_]: + """ + Return if each value is NaN. + """ + if self._can_hold_na: + return isna(self) + else: + # shouldn't reach to this condition by checking hasnans beforehand + values = np.empty(len(self), dtype=np.bool_) + values.fill(False) + return values + + @cache_readonly + def hasnans(self) -> bool: + """ + Return True if there are any NaNs. + + Enables various performance speedups. + + Returns + ------- + bool + + Examples + -------- + >>> s = pd.Series([1, 2, 3], index=['a', 'b', None]) + >>> s + a 1 + b 2 + None 3 + dtype: int64 + >>> s.index.hasnans + True + """ + if self._can_hold_na: + return bool(self._isnan.any()) + else: + return False + + @final + def isna(self) -> npt.NDArray[np.bool_]: + """ + Detect missing values. + + Return a boolean same-sized object indicating if the values are NA. + NA values, such as ``None``, :attr:`numpy.NaN` or :attr:`pd.NaT`, get + mapped to ``True`` values. + Everything else get mapped to ``False`` values. Characters such as + empty strings `''` or :attr:`numpy.inf` are not considered NA values. + + Returns + ------- + numpy.ndarray[bool] + A boolean array of whether my values are NA. + + See Also + -------- + Index.notna : Boolean inverse of isna. + Index.dropna : Omit entries with missing values. + isna : Top-level isna. + Series.isna : Detect missing values in Series object. + + Examples + -------- + Show which entries in a pandas.Index are NA. The result is an + array. + + >>> idx = pd.Index([5.2, 6.0, np.nan]) + >>> idx + Index([5.2, 6.0, nan], dtype='float64') + >>> idx.isna() + array([False, False, True]) + + Empty strings are not considered NA values. None is considered an NA + value. + + >>> idx = pd.Index(['black', '', 'red', None]) + >>> idx + Index(['black', '', 'red', None], dtype='object') + >>> idx.isna() + array([False, False, False, True]) + + For datetimes, `NaT` (Not a Time) is considered as an NA value. + + >>> idx = pd.DatetimeIndex([pd.Timestamp('1940-04-25'), + ... pd.Timestamp(''), None, pd.NaT]) + >>> idx + DatetimeIndex(['1940-04-25', 'NaT', 'NaT', 'NaT'], + dtype='datetime64[ns]', freq=None) + >>> idx.isna() + array([False, True, True, True]) + """ + return self._isnan + + isnull = isna + + @final + def notna(self) -> npt.NDArray[np.bool_]: + """ + Detect existing (non-missing) values. + + Return a boolean same-sized object indicating if the values are not NA. + Non-missing values get mapped to ``True``. Characters such as empty + strings ``''`` or :attr:`numpy.inf` are not considered NA values. + NA values, such as None or :attr:`numpy.NaN`, get mapped to ``False`` + values. + + Returns + ------- + numpy.ndarray[bool] + Boolean array to indicate which entries are not NA. + + See Also + -------- + Index.notnull : Alias of notna. + Index.isna: Inverse of notna. + notna : Top-level notna. + + Examples + -------- + Show which entries in an Index are not NA. The result is an + array. + + >>> idx = pd.Index([5.2, 6.0, np.nan]) + >>> idx + Index([5.2, 6.0, nan], dtype='float64') + >>> idx.notna() + array([ True, True, False]) + + Empty strings are not considered NA values. None is considered a NA + value. + + >>> idx = pd.Index(['black', '', 'red', None]) + >>> idx + Index(['black', '', 'red', None], dtype='object') + >>> idx.notna() + array([ True, True, True, False]) + """ + return ~self.isna() + + notnull = notna + + def fillna(self, value=None, downcast=lib.no_default): + """ + Fill NA/NaN values with the specified value. + + Parameters + ---------- + value : scalar + Scalar value to use to fill holes (e.g. 0). + This value cannot be a list-likes. + downcast : dict, default is None + A dict of item->dtype of what to downcast if possible, + or the string 'infer' which will try to downcast to an appropriate + equal type (e.g. float64 to int64 if possible). + + .. deprecated:: 2.1.0 + + Returns + ------- + Index + + See Also + -------- + DataFrame.fillna : Fill NaN values of a DataFrame. + Series.fillna : Fill NaN Values of a Series. + + Examples + -------- + >>> idx = pd.Index([np.nan, np.nan, 3]) + >>> idx.fillna(0) + Index([0.0, 0.0, 3.0], dtype='float64') + """ + if not is_scalar(value): + raise TypeError(f"'value' must be a scalar, passed: {type(value).__name__}") + if downcast is not lib.no_default: + warnings.warn( + f"The 'downcast' keyword in {type(self).__name__}.fillna is " + "deprecated and will be removed in a future version. " + "It was previously silently ignored.", + FutureWarning, + stacklevel=find_stack_level(), + ) + else: + downcast = None + + if self.hasnans: + result = self.putmask(self._isnan, value) + if downcast is None: + # no need to care metadata other than name + # because it can't have freq if it has NaTs + # _with_infer needed for test_fillna_categorical + return Index._with_infer(result, name=self.name) + raise NotImplementedError( + f"{type(self).__name__}.fillna does not support 'downcast' " + "argument values other than 'None'." + ) + return self._view() + + def dropna(self, how: AnyAll = "any") -> Self: + """ + Return Index without NA/NaN values. + + Parameters + ---------- + how : {'any', 'all'}, default 'any' + If the Index is a MultiIndex, drop the value when any or all levels + are NaN. + + Returns + ------- + Index + + Examples + -------- + >>> idx = pd.Index([1, np.nan, 3]) + >>> idx.dropna() + Index([1.0, 3.0], dtype='float64') + """ + if how not in ("any", "all"): + raise ValueError(f"invalid how option: {how}") + + if self.hasnans: + res_values = self._values[~self._isnan] + return type(self)._simple_new(res_values, name=self.name) + return self._view() + + # -------------------------------------------------------------------- + # Uniqueness Methods + + def unique(self, level: Hashable | None = None) -> Self: + """ + Return unique values in the index. + + Unique values are returned in order of appearance, this does NOT sort. + + Parameters + ---------- + level : int or hashable, optional + Only return values from specified level (for MultiIndex). + If int, gets the level by integer position, else by level name. + + Returns + ------- + Index + + See Also + -------- + unique : Numpy array of unique values in that column. + Series.unique : Return unique values of Series object. + + Examples + -------- + >>> idx = pd.Index([1, 1, 2, 3, 3]) + >>> idx.unique() + Index([1, 2, 3], dtype='int64') + """ + if level is not None: + self._validate_index_level(level) + + if self.is_unique: + return self._view() + + result = super().unique() + return self._shallow_copy(result) + + def drop_duplicates(self, *, keep: DropKeep = "first") -> Self: + """ + Return Index with duplicate values removed. + + Parameters + ---------- + keep : {'first', 'last', ``False``}, default 'first' + - 'first' : Drop duplicates except for the first occurrence. + - 'last' : Drop duplicates except for the last occurrence. + - ``False`` : Drop all duplicates. + + Returns + ------- + Index + + See Also + -------- + Series.drop_duplicates : Equivalent method on Series. + DataFrame.drop_duplicates : Equivalent method on DataFrame. + Index.duplicated : Related method on Index, indicating duplicate + Index values. + + Examples + -------- + Generate an pandas.Index with duplicate values. + + >>> idx = pd.Index(['lama', 'cow', 'lama', 'beetle', 'lama', 'hippo']) + + The `keep` parameter controls which duplicate values are removed. + The value 'first' keeps the first occurrence for each + set of duplicated entries. The default value of keep is 'first'. + + >>> idx.drop_duplicates(keep='first') + Index(['lama', 'cow', 'beetle', 'hippo'], dtype='object') + + The value 'last' keeps the last occurrence for each set of duplicated + entries. + + >>> idx.drop_duplicates(keep='last') + Index(['cow', 'beetle', 'lama', 'hippo'], dtype='object') + + The value ``False`` discards all sets of duplicated entries. + + >>> idx.drop_duplicates(keep=False) + Index(['cow', 'beetle', 'hippo'], dtype='object') + """ + if self.is_unique: + return self._view() + + return super().drop_duplicates(keep=keep) + + def duplicated(self, keep: DropKeep = "first") -> npt.NDArray[np.bool_]: + """ + Indicate duplicate index values. + + Duplicated values are indicated as ``True`` values in the resulting + array. Either all duplicates, all except the first, or all except the + last occurrence of duplicates can be indicated. + + Parameters + ---------- + keep : {'first', 'last', False}, default 'first' + The value or values in a set of duplicates to mark as missing. + + - 'first' : Mark duplicates as ``True`` except for the first + occurrence. + - 'last' : Mark duplicates as ``True`` except for the last + occurrence. + - ``False`` : Mark all duplicates as ``True``. + + Returns + ------- + np.ndarray[bool] + + See Also + -------- + Series.duplicated : Equivalent method on pandas.Series. + DataFrame.duplicated : Equivalent method on pandas.DataFrame. + Index.drop_duplicates : Remove duplicate values from Index. + + Examples + -------- + By default, for each set of duplicated values, the first occurrence is + set to False and all others to True: + + >>> idx = pd.Index(['lama', 'cow', 'lama', 'beetle', 'lama']) + >>> idx.duplicated() + array([False, False, True, False, True]) + + which is equivalent to + + >>> idx.duplicated(keep='first') + array([False, False, True, False, True]) + + By using 'last', the last occurrence of each set of duplicated values + is set on False and all others on True: + + >>> idx.duplicated(keep='last') + array([ True, False, True, False, False]) + + By setting keep on ``False``, all duplicates are True: + + >>> idx.duplicated(keep=False) + array([ True, False, True, False, True]) + """ + if self.is_unique: + # fastpath available bc we are immutable + return np.zeros(len(self), dtype=bool) + return self._duplicated(keep=keep) + + # -------------------------------------------------------------------- + # Arithmetic & Logical Methods + + def __iadd__(self, other): + # alias for __add__ + return self + other + + @final + def __nonzero__(self) -> NoReturn: + raise ValueError( + f"The truth value of a {type(self).__name__} is ambiguous. " + "Use a.empty, a.bool(), a.item(), a.any() or a.all()." + ) + + __bool__ = __nonzero__ + + # -------------------------------------------------------------------- + # Set Operation Methods + + def _get_reconciled_name_object(self, other): + """ + If the result of a set operation will be self, + return self, unless the name changes, in which + case make a shallow copy of self. + """ + name = get_op_result_name(self, other) + if self.name is not name: + return self.rename(name) + return self + + @final + def _validate_sort_keyword(self, sort): + if sort not in [None, False, True]: + raise ValueError( + "The 'sort' keyword only takes the values of " + f"None, True, or False; {sort} was passed." + ) + + @final + def _dti_setop_align_tzs(self, other: Index, setop: str_t) -> tuple[Index, Index]: + """ + With mismatched timezones, cast both to UTC. + """ + # Caller is responsibelf or checking + # `self.dtype != other.dtype` + if ( + isinstance(self, ABCDatetimeIndex) + and isinstance(other, ABCDatetimeIndex) + and self.tz is not None + and other.tz is not None + ): + # GH#39328, GH#45357 + left = self.tz_convert("UTC") + right = other.tz_convert("UTC") + return left, right + return self, other + + @final + def union(self, other, sort=None): + """ + Form the union of two Index objects. + + If the Index objects are incompatible, both Index objects will be + cast to dtype('object') first. + + Parameters + ---------- + other : Index or array-like + sort : bool or None, default None + Whether to sort the resulting Index. + + * None : Sort the result, except when + + 1. `self` and `other` are equal. + 2. `self` or `other` has length 0. + 3. Some values in `self` or `other` cannot be compared. + A RuntimeWarning is issued in this case. + + * False : do not sort the result. + * True : Sort the result (which may raise TypeError). + + Returns + ------- + Index + + Examples + -------- + Union matching dtypes + + >>> idx1 = pd.Index([1, 2, 3, 4]) + >>> idx2 = pd.Index([3, 4, 5, 6]) + >>> idx1.union(idx2) + Index([1, 2, 3, 4, 5, 6], dtype='int64') + + Union mismatched dtypes + + >>> idx1 = pd.Index(['a', 'b', 'c', 'd']) + >>> idx2 = pd.Index([1, 2, 3, 4]) + >>> idx1.union(idx2) + Index(['a', 'b', 'c', 'd', 1, 2, 3, 4], dtype='object') + + MultiIndex case + + >>> idx1 = pd.MultiIndex.from_arrays( + ... [[1, 1, 2, 2], ["Red", "Blue", "Red", "Blue"]] + ... ) + >>> idx1 + MultiIndex([(1, 'Red'), + (1, 'Blue'), + (2, 'Red'), + (2, 'Blue')], + ) + >>> idx2 = pd.MultiIndex.from_arrays( + ... [[3, 3, 2, 2], ["Red", "Green", "Red", "Green"]] + ... ) + >>> idx2 + MultiIndex([(3, 'Red'), + (3, 'Green'), + (2, 'Red'), + (2, 'Green')], + ) + >>> idx1.union(idx2) + MultiIndex([(1, 'Blue'), + (1, 'Red'), + (2, 'Blue'), + (2, 'Green'), + (2, 'Red'), + (3, 'Green'), + (3, 'Red')], + ) + >>> idx1.union(idx2, sort=False) + MultiIndex([(1, 'Red'), + (1, 'Blue'), + (2, 'Red'), + (2, 'Blue'), + (3, 'Red'), + (3, 'Green'), + (2, 'Green')], + ) + """ + self._validate_sort_keyword(sort) + self._assert_can_do_setop(other) + other, result_name = self._convert_can_do_setop(other) + + if self.dtype != other.dtype: + if ( + isinstance(self, ABCMultiIndex) + and not is_object_dtype(_unpack_nested_dtype(other)) + and len(other) > 0 + ): + raise NotImplementedError( + "Can only union MultiIndex with MultiIndex or Index of tuples, " + "try mi.to_flat_index().union(other) instead." + ) + self, other = self._dti_setop_align_tzs(other, "union") + + dtype = self._find_common_type_compat(other) + left = self.astype(dtype, copy=False) + right = other.astype(dtype, copy=False) + return left.union(right, sort=sort) + + elif not len(other) or self.equals(other): + # NB: whether this (and the `if not len(self)` check below) come before + # or after the dtype equality check above affects the returned dtype + result = self._get_reconciled_name_object(other) + if sort is True: + return result.sort_values() + return result + + elif not len(self): + result = other._get_reconciled_name_object(self) + if sort is True: + return result.sort_values() + return result + + result = self._union(other, sort=sort) + + return self._wrap_setop_result(other, result) + + def _union(self, other: Index, sort: bool | None): + """ + Specific union logic should go here. In subclasses, union behavior + should be overwritten here rather than in `self.union`. + + Parameters + ---------- + other : Index or array-like + sort : False or None, default False + Whether to sort the resulting index. + + * True : sort the result + * False : do not sort the result. + * None : sort the result, except when `self` and `other` are equal + or when the values cannot be compared. + + Returns + ------- + Index + """ + lvals = self._values + rvals = other._values + + if ( + sort in (None, True) + and self.is_monotonic_increasing + and other.is_monotonic_increasing + and not (self.has_duplicates and other.has_duplicates) + and self._can_use_libjoin + and other._can_use_libjoin + ): + # Both are monotonic and at least one is unique, so can use outer join + # (actually don't need either unique, but without this restriction + # test_union_same_value_duplicated_in_both fails) + try: + return self._outer_indexer(other)[0] + except (TypeError, IncompatibleFrequency): + # incomparable objects; should only be for object dtype + value_list = list(lvals) + + # worth making this faster? a very unusual case + value_set = set(lvals) + value_list.extend([x for x in rvals if x not in value_set]) + # If objects are unorderable, we must have object dtype. + return np.array(value_list, dtype=object) + + elif not other.is_unique: + # other has duplicates + result_dups = algos.union_with_duplicates(self, other) + return _maybe_try_sort(result_dups, sort) + + # The rest of this method is analogous to Index._intersection_via_get_indexer + + # Self may have duplicates; other already checked as unique + # find indexes of things in "other" that are not in "self" + if self._index_as_unique: + indexer = self.get_indexer(other) + missing = (indexer == -1).nonzero()[0] + else: + missing = algos.unique1d(self.get_indexer_non_unique(other)[1]) + + result: Index | MultiIndex | ArrayLike + if self._is_multi: + # Preserve MultiIndex to avoid losing dtypes + result = self.append(other.take(missing)) + + else: + if len(missing) > 0: + other_diff = rvals.take(missing) + result = concat_compat((lvals, other_diff)) + else: + result = lvals + + if not self.is_monotonic_increasing or not other.is_monotonic_increasing: + # if both are monotonic then result should already be sorted + result = _maybe_try_sort(result, sort) + + return result + + @final + def _wrap_setop_result(self, other: Index, result) -> Index: + name = get_op_result_name(self, other) + if isinstance(result, Index): + if result.name != name: + result = result.rename(name) + else: + result = self._shallow_copy(result, name=name) + return result + + @final + def intersection(self, other, sort: bool = False): + # default sort keyword is different here from other setops intentionally + # done in GH#25063 + """ + Form the intersection of two Index objects. + + This returns a new Index with elements common to the index and `other`. + + Parameters + ---------- + other : Index or array-like + sort : True, False or None, default False + Whether to sort the resulting index. + + * None : sort the result, except when `self` and `other` are equal + or when the values cannot be compared. + * False : do not sort the result. + * True : Sort the result (which may raise TypeError). + + Returns + ------- + Index + + Examples + -------- + >>> idx1 = pd.Index([1, 2, 3, 4]) + >>> idx2 = pd.Index([3, 4, 5, 6]) + >>> idx1.intersection(idx2) + Index([3, 4], dtype='int64') + """ + self._validate_sort_keyword(sort) + self._assert_can_do_setop(other) + other, result_name = self._convert_can_do_setop(other) + + if self.dtype != other.dtype: + self, other = self._dti_setop_align_tzs(other, "intersection") + + if self.equals(other): + if not self.is_unique: + result = self.unique()._get_reconciled_name_object(other) + else: + result = self._get_reconciled_name_object(other) + if sort is True: + result = result.sort_values() + return result + + if len(self) == 0 or len(other) == 0: + # fastpath; we need to be careful about having commutativity + + if self._is_multi or other._is_multi: + # _convert_can_do_setop ensures that we have both or neither + # We retain self.levels + return self[:0].rename(result_name) + + dtype = self._find_common_type_compat(other) + if self.dtype == dtype: + # Slicing allows us to retain DTI/TDI.freq, RangeIndex + + # Note: self[:0] vs other[:0] affects + # 1) which index's `freq` we get in DTI/TDI cases + # This may be a historical artifact, i.e. no documented + # reason for this choice. + # 2) The `step` we get in RangeIndex cases + if len(self) == 0: + return self[:0].rename(result_name) + else: + return other[:0].rename(result_name) + + return Index([], dtype=dtype, name=result_name) + + elif not self._should_compare(other): + # We can infer that the intersection is empty. + if isinstance(self, ABCMultiIndex): + return self[:0].rename(result_name) + return Index([], name=result_name) + + elif self.dtype != other.dtype: + dtype = self._find_common_type_compat(other) + this = self.astype(dtype, copy=False) + other = other.astype(dtype, copy=False) + return this.intersection(other, sort=sort) + + result = self._intersection(other, sort=sort) + return self._wrap_intersection_result(other, result) + + def _intersection(self, other: Index, sort: bool = False): + """ + intersection specialized to the case with matching dtypes. + """ + if ( + self.is_monotonic_increasing + and other.is_monotonic_increasing + and self._can_use_libjoin + and other._can_use_libjoin + ): + try: + res_indexer, indexer, _ = self._inner_indexer(other) + except TypeError: + # non-comparable; should only be for object dtype + pass + else: + # TODO: algos.unique1d should preserve DTA/TDA + if is_numeric_dtype(self.dtype): + # This is faster, because Index.unique() checks for uniqueness + # before calculating the unique values. + res = algos.unique1d(res_indexer) + else: + result = self.take(indexer) + res = result.drop_duplicates() + return ensure_wrapped_if_datetimelike(res) + + res_values = self._intersection_via_get_indexer(other, sort=sort) + res_values = _maybe_try_sort(res_values, sort) + return res_values + + def _wrap_intersection_result(self, other, result): + # We will override for MultiIndex to handle empty results + return self._wrap_setop_result(other, result) + + @final + def _intersection_via_get_indexer( + self, other: Index | MultiIndex, sort + ) -> ArrayLike | MultiIndex: + """ + Find the intersection of two Indexes using get_indexer. + + Returns + ------- + np.ndarray or ExtensionArray or MultiIndex + The returned array will be unique. + """ + left_unique = self.unique() + right_unique = other.unique() + + # even though we are unique, we need get_indexer_for for IntervalIndex + indexer = left_unique.get_indexer_for(right_unique) + + mask = indexer != -1 + + taker = indexer.take(mask.nonzero()[0]) + if sort is False: + # sort bc we want the elements in the same order they are in self + # unnecessary in the case with sort=None bc we will sort later + taker = np.sort(taker) + + result: MultiIndex | ExtensionArray | np.ndarray + if isinstance(left_unique, ABCMultiIndex): + result = left_unique.take(taker) + else: + result = left_unique.take(taker)._values + return result + + @final + def difference(self, other, sort=None): + """ + Return a new Index with elements of index not in `other`. + + This is the set difference of two Index objects. + + Parameters + ---------- + other : Index or array-like + sort : bool or None, default None + Whether to sort the resulting index. By default, the + values are attempted to be sorted, but any TypeError from + incomparable elements is caught by pandas. + + * None : Attempt to sort the result, but catch any TypeErrors + from comparing incomparable elements. + * False : Do not sort the result. + * True : Sort the result (which may raise TypeError). + + Returns + ------- + Index + + Examples + -------- + >>> idx1 = pd.Index([2, 1, 3, 4]) + >>> idx2 = pd.Index([3, 4, 5, 6]) + >>> idx1.difference(idx2) + Index([1, 2], dtype='int64') + >>> idx1.difference(idx2, sort=False) + Index([2, 1], dtype='int64') + """ + self._validate_sort_keyword(sort) + self._assert_can_do_setop(other) + other, result_name = self._convert_can_do_setop(other) + + # Note: we do NOT call _dti_setop_align_tzs here, as there + # is no requirement that .difference be commutative, so it does + # not cast to object. + + if self.equals(other): + # Note: we do not (yet) sort even if sort=None GH#24959 + return self[:0].rename(result_name) + + if len(other) == 0: + # Note: we do not (yet) sort even if sort=None GH#24959 + result = self.unique().rename(result_name) + if sort is True: + return result.sort_values() + return result + + if not self._should_compare(other): + # Nothing matches -> difference is everything + result = self.unique().rename(result_name) + if sort is True: + return result.sort_values() + return result + + result = self._difference(other, sort=sort) + return self._wrap_difference_result(other, result) + + def _difference(self, other, sort): + # overridden by RangeIndex + this = self + if isinstance(self, ABCCategoricalIndex) and self.hasnans and other.hasnans: + this = this.dropna() + other = other.unique() + the_diff = this[other.get_indexer_for(this) == -1] + the_diff = the_diff if this.is_unique else the_diff.unique() + the_diff = _maybe_try_sort(the_diff, sort) + return the_diff + + def _wrap_difference_result(self, other, result): + # We will override for MultiIndex to handle empty results + return self._wrap_setop_result(other, result) + + def symmetric_difference(self, other, result_name=None, sort=None): + """ + Compute the symmetric difference of two Index objects. + + Parameters + ---------- + other : Index or array-like + result_name : str + sort : bool or None, default None + Whether to sort the resulting index. By default, the + values are attempted to be sorted, but any TypeError from + incomparable elements is caught by pandas. + + * None : Attempt to sort the result, but catch any TypeErrors + from comparing incomparable elements. + * False : Do not sort the result. + * True : Sort the result (which may raise TypeError). + + Returns + ------- + Index + + Notes + ----- + ``symmetric_difference`` contains elements that appear in either + ``idx1`` or ``idx2`` but not both. Equivalent to the Index created by + ``idx1.difference(idx2) | idx2.difference(idx1)`` with duplicates + dropped. + + Examples + -------- + >>> idx1 = pd.Index([1, 2, 3, 4]) + >>> idx2 = pd.Index([2, 3, 4, 5]) + >>> idx1.symmetric_difference(idx2) + Index([1, 5], dtype='int64') + """ + self._validate_sort_keyword(sort) + self._assert_can_do_setop(other) + other, result_name_update = self._convert_can_do_setop(other) + if result_name is None: + result_name = result_name_update + + if self.dtype != other.dtype: + self, other = self._dti_setop_align_tzs(other, "symmetric_difference") + + if not self._should_compare(other): + return self.union(other, sort=sort).rename(result_name) + + elif self.dtype != other.dtype: + dtype = self._find_common_type_compat(other) + this = self.astype(dtype, copy=False) + that = other.astype(dtype, copy=False) + return this.symmetric_difference(that, sort=sort).rename(result_name) + + this = self.unique() + other = other.unique() + indexer = this.get_indexer_for(other) + + # {this} minus {other} + common_indexer = indexer.take((indexer != -1).nonzero()[0]) + left_indexer = np.setdiff1d( + np.arange(this.size), common_indexer, assume_unique=True + ) + left_diff = this.take(left_indexer) + + # {other} minus {this} + right_indexer = (indexer == -1).nonzero()[0] + right_diff = other.take(right_indexer) + + res_values = left_diff.append(right_diff) + result = _maybe_try_sort(res_values, sort) + + if not self._is_multi: + return Index(result, name=result_name, dtype=res_values.dtype) + else: + left_diff = cast("MultiIndex", left_diff) + if len(result) == 0: + # result might be an Index, if other was an Index + return left_diff.remove_unused_levels().set_names(result_name) + return result.set_names(result_name) + + @final + def _assert_can_do_setop(self, other) -> bool: + if not is_list_like(other): + raise TypeError("Input must be Index or array-like") + return True + + def _convert_can_do_setop(self, other) -> tuple[Index, Hashable]: + if not isinstance(other, Index): + other = Index(other, name=self.name) + result_name = self.name + else: + result_name = get_op_result_name(self, other) + return other, result_name + + # -------------------------------------------------------------------- + # Indexing Methods + + def get_loc(self, key): + """ + Get integer location, slice or boolean mask for requested label. + + Parameters + ---------- + key : label + + Returns + ------- + int if unique index, slice if monotonic index, else mask + + Examples + -------- + >>> unique_index = pd.Index(list('abc')) + >>> unique_index.get_loc('b') + 1 + + >>> monotonic_index = pd.Index(list('abbc')) + >>> monotonic_index.get_loc('b') + slice(1, 3, None) + + >>> non_monotonic_index = pd.Index(list('abcb')) + >>> non_monotonic_index.get_loc('b') + array([False, True, False, True]) + """ + casted_key = self._maybe_cast_indexer(key) + try: + return self._engine.get_loc(casted_key) + except KeyError as err: + if isinstance(casted_key, slice) or ( + isinstance(casted_key, abc.Iterable) + and any(isinstance(x, slice) for x in casted_key) + ): + raise InvalidIndexError(key) + raise KeyError(key) from err + except TypeError: + # If we have a listlike key, _check_indexing_error will raise + # InvalidIndexError. Otherwise we fall through and re-raise + # the TypeError. + self._check_indexing_error(key) + raise + + @final + def get_indexer( + self, + target, + method: ReindexMethod | None = None, + limit: int | None = None, + tolerance=None, + ) -> npt.NDArray[np.intp]: + """ + Compute indexer and mask for new index given the current index. + + The indexer should be then used as an input to ndarray.take to align the + current data to the new index. + + Parameters + ---------- + target : Index + method : {None, 'pad'/'ffill', 'backfill'/'bfill', 'nearest'}, optional + * default: exact matches only. + * pad / ffill: find the PREVIOUS index value if no exact match. + * backfill / bfill: use NEXT index value if no exact match + * nearest: use the NEAREST index value if no exact match. Tied + distances are broken by preferring the larger index value. + limit : int, optional + Maximum number of consecutive labels in ``target`` to match for + inexact matches. + tolerance : optional + Maximum distance between original and new labels for inexact + matches. The values of the index at the matching locations must + satisfy the equation ``abs(index[indexer] - target) <= tolerance``. + + Tolerance may be a scalar value, which applies the same tolerance + to all values, or list-like, which applies variable tolerance per + element. List-like includes list, tuple, array, Series, and must be + the same size as the index and its dtype must exactly match the + index's type. + + Returns + ------- + np.ndarray[np.intp] + Integers from 0 to n - 1 indicating that the index at these + positions matches the corresponding target values. Missing values + in the target are marked by -1. + + Notes + ----- + Returns -1 for unmatched values, for further explanation see the + example below. + + Examples + -------- + >>> index = pd.Index(['c', 'a', 'b']) + >>> index.get_indexer(['a', 'b', 'x']) + array([ 1, 2, -1]) + + Notice that the return value is an array of locations in ``index`` + and ``x`` is marked by -1, as it is not in ``index``. + """ + method = clean_reindex_fill_method(method) + orig_target = target + target = self._maybe_cast_listlike_indexer(target) + + self._check_indexing_method(method, limit, tolerance) + + if not self._index_as_unique: + raise InvalidIndexError(self._requires_unique_msg) + + if len(target) == 0: + return np.array([], dtype=np.intp) + + if not self._should_compare(target) and not self._should_partial_index(target): + # IntervalIndex get special treatment bc numeric scalars can be + # matched to Interval scalars + return self._get_indexer_non_comparable(target, method=method, unique=True) + + if isinstance(self.dtype, CategoricalDtype): + # _maybe_cast_listlike_indexer ensures target has our dtype + # (could improve perf by doing _should_compare check earlier?) + assert self.dtype == target.dtype + + indexer = self._engine.get_indexer(target.codes) + if self.hasnans and target.hasnans: + # After _maybe_cast_listlike_indexer, target elements which do not + # belong to some category are changed to NaNs + # Mask to track actual NaN values compared to inserted NaN values + # GH#45361 + target_nans = isna(orig_target) + loc = self.get_loc(np.nan) + mask = target.isna() + indexer[target_nans] = loc + indexer[mask & ~target_nans] = -1 + return indexer + + if isinstance(target.dtype, CategoricalDtype): + # potential fastpath + # get an indexer for unique categories then propagate to codes via take_nd + # get_indexer instead of _get_indexer needed for MultiIndex cases + # e.g. test_append_different_columns_types + categories_indexer = self.get_indexer(target.categories) + + indexer = algos.take_nd(categories_indexer, target.codes, fill_value=-1) + + if (not self._is_multi and self.hasnans) and target.hasnans: + # Exclude MultiIndex because hasnans raises NotImplementedError + # we should only get here if we are unique, so loc is an integer + # GH#41934 + loc = self.get_loc(np.nan) + mask = target.isna() + indexer[mask] = loc + + return ensure_platform_int(indexer) + + pself, ptarget = self._maybe_downcast_for_indexing(target) + if pself is not self or ptarget is not target: + return pself.get_indexer( + ptarget, method=method, limit=limit, tolerance=tolerance + ) + + if self.dtype == target.dtype and self.equals(target): + # Only call equals if we have same dtype to avoid inference/casting + return np.arange(len(target), dtype=np.intp) + + if self.dtype != target.dtype and not self._should_partial_index(target): + # _should_partial_index e.g. IntervalIndex with numeric scalars + # that can be matched to Interval scalars. + dtype = self._find_common_type_compat(target) + + this = self.astype(dtype, copy=False) + target = target.astype(dtype, copy=False) + return this._get_indexer( + target, method=method, limit=limit, tolerance=tolerance + ) + + return self._get_indexer(target, method, limit, tolerance) + + def _get_indexer( + self, + target: Index, + method: str_t | None = None, + limit: int | None = None, + tolerance=None, + ) -> npt.NDArray[np.intp]: + if tolerance is not None: + tolerance = self._convert_tolerance(tolerance, target) + + if method in ["pad", "backfill"]: + indexer = self._get_fill_indexer(target, method, limit, tolerance) + elif method == "nearest": + indexer = self._get_nearest_indexer(target, limit, tolerance) + else: + if target._is_multi and self._is_multi: + engine = self._engine + # error: Item "IndexEngine" of "Union[IndexEngine, ExtensionEngine]" + # has no attribute "_extract_level_codes" + tgt_values = engine._extract_level_codes( # type: ignore[union-attr] + target + ) + else: + tgt_values = target._get_engine_target() + + indexer = self._engine.get_indexer(tgt_values) + + return ensure_platform_int(indexer) + + @final + def _should_partial_index(self, target: Index) -> bool: + """ + Should we attempt partial-matching indexing? + """ + if isinstance(self.dtype, IntervalDtype): + if isinstance(target.dtype, IntervalDtype): + return False + # "Index" has no attribute "left" + return self.left._should_compare(target) # type: ignore[attr-defined] + return False + + @final + def _check_indexing_method( + self, + method: str_t | None, + limit: int | None = None, + tolerance=None, + ) -> None: + """ + Raise if we have a get_indexer `method` that is not supported or valid. + """ + if method not in [None, "bfill", "backfill", "pad", "ffill", "nearest"]: + # in practice the clean_reindex_fill_method call would raise + # before we get here + raise ValueError("Invalid fill method") # pragma: no cover + + if self._is_multi: + if method == "nearest": + raise NotImplementedError( + "method='nearest' not implemented yet " + "for MultiIndex; see GitHub issue 9365" + ) + if method in ("pad", "backfill"): + if tolerance is not None: + raise NotImplementedError( + "tolerance not implemented yet for MultiIndex" + ) + + if isinstance(self.dtype, (IntervalDtype, CategoricalDtype)): + # GH#37871 for now this is only for IntervalIndex and CategoricalIndex + if method is not None: + raise NotImplementedError( + f"method {method} not yet implemented for {type(self).__name__}" + ) + + if method is None: + if tolerance is not None: + raise ValueError( + "tolerance argument only valid if doing pad, " + "backfill or nearest reindexing" + ) + if limit is not None: + raise ValueError( + "limit argument only valid if doing pad, " + "backfill or nearest reindexing" + ) + + def _convert_tolerance(self, tolerance, target: np.ndarray | Index) -> np.ndarray: + # override this method on subclasses + tolerance = np.asarray(tolerance) + if target.size != tolerance.size and tolerance.size > 1: + raise ValueError("list-like tolerance size must match target index size") + elif is_numeric_dtype(self) and not np.issubdtype(tolerance.dtype, np.number): + if tolerance.ndim > 0: + raise ValueError( + f"tolerance argument for {type(self).__name__} with dtype " + f"{self.dtype} must contain numeric elements if it is list type" + ) + + raise ValueError( + f"tolerance argument for {type(self).__name__} with dtype {self.dtype} " + f"must be numeric if it is a scalar: {repr(tolerance)}" + ) + return tolerance + + @final + def _get_fill_indexer( + self, target: Index, method: str_t, limit: int | None = None, tolerance=None + ) -> npt.NDArray[np.intp]: + if self._is_multi: + if not (self.is_monotonic_increasing or self.is_monotonic_decreasing): + raise ValueError("index must be monotonic increasing or decreasing") + encoded = self.append(target)._engine.values # type: ignore[union-attr] + self_encoded = Index(encoded[: len(self)]) + target_encoded = Index(encoded[len(self) :]) + return self_encoded._get_fill_indexer( + target_encoded, method, limit, tolerance + ) + + if self.is_monotonic_increasing and target.is_monotonic_increasing: + target_values = target._get_engine_target() + own_values = self._get_engine_target() + if not isinstance(target_values, np.ndarray) or not isinstance( + own_values, np.ndarray + ): + raise NotImplementedError + + if method == "pad": + indexer = libalgos.pad(own_values, target_values, limit=limit) + else: + # i.e. "backfill" + indexer = libalgos.backfill(own_values, target_values, limit=limit) + else: + indexer = self._get_fill_indexer_searchsorted(target, method, limit) + if tolerance is not None and len(self): + indexer = self._filter_indexer_tolerance(target, indexer, tolerance) + return indexer + + @final + def _get_fill_indexer_searchsorted( + self, target: Index, method: str_t, limit: int | None = None + ) -> npt.NDArray[np.intp]: + """ + Fallback pad/backfill get_indexer that works for monotonic decreasing + indexes and non-monotonic targets. + """ + if limit is not None: + raise ValueError( + f"limit argument for {repr(method)} method only well-defined " + "if index and target are monotonic" + ) + + side: Literal["left", "right"] = "left" if method == "pad" else "right" + + # find exact matches first (this simplifies the algorithm) + indexer = self.get_indexer(target) + nonexact = indexer == -1 + indexer[nonexact] = self._searchsorted_monotonic(target[nonexact], side) + if side == "left": + # searchsorted returns "indices into a sorted array such that, + # if the corresponding elements in v were inserted before the + # indices, the order of a would be preserved". + # Thus, we need to subtract 1 to find values to the left. + indexer[nonexact] -= 1 + # This also mapped not found values (values of 0 from + # np.searchsorted) to -1, which conveniently is also our + # sentinel for missing values + else: + # Mark indices to the right of the largest value as not found + indexer[indexer == len(self)] = -1 + return indexer + + @final + def _get_nearest_indexer( + self, target: Index, limit: int | None, tolerance + ) -> npt.NDArray[np.intp]: + """ + Get the indexer for the nearest index labels; requires an index with + values that can be subtracted from each other (e.g., not strings or + tuples). + """ + if not len(self): + return self._get_fill_indexer(target, "pad") + + left_indexer = self.get_indexer(target, "pad", limit=limit) + right_indexer = self.get_indexer(target, "backfill", limit=limit) + + left_distances = self._difference_compat(target, left_indexer) + right_distances = self._difference_compat(target, right_indexer) + + op = operator.lt if self.is_monotonic_increasing else operator.le + indexer = np.where( + # error: Argument 1&2 has incompatible type "Union[ExtensionArray, + # ndarray[Any, Any]]"; expected "Union[SupportsDunderLE, + # SupportsDunderGE, SupportsDunderGT, SupportsDunderLT]" + op(left_distances, right_distances) # type: ignore[arg-type] + | (right_indexer == -1), + left_indexer, + right_indexer, + ) + if tolerance is not None: + indexer = self._filter_indexer_tolerance(target, indexer, tolerance) + return indexer + + @final + def _filter_indexer_tolerance( + self, + target: Index, + indexer: npt.NDArray[np.intp], + tolerance, + ) -> npt.NDArray[np.intp]: + distance = self._difference_compat(target, indexer) + + return np.where(distance <= tolerance, indexer, -1) + + @final + def _difference_compat( + self, target: Index, indexer: npt.NDArray[np.intp] + ) -> ArrayLike: + # Compatibility for PeriodArray, for which __sub__ returns an ndarray[object] + # of DateOffset objects, which do not support __abs__ (and would be slow + # if they did) + + if isinstance(self.dtype, PeriodDtype): + # Note: we only get here with matching dtypes + own_values = cast("PeriodArray", self._data)._ndarray + target_values = cast("PeriodArray", target._data)._ndarray + diff = own_values[indexer] - target_values + else: + # error: Unsupported left operand type for - ("ExtensionArray") + diff = self._values[indexer] - target._values # type: ignore[operator] + return abs(diff) + + # -------------------------------------------------------------------- + # Indexer Conversion Methods + + @final + def _validate_positional_slice(self, key: slice) -> None: + """ + For positional indexing, a slice must have either int or None + for each of start, stop, and step. + """ + self._validate_indexer("positional", key.start, "iloc") + self._validate_indexer("positional", key.stop, "iloc") + self._validate_indexer("positional", key.step, "iloc") + + def _convert_slice_indexer(self, key: slice, kind: Literal["loc", "getitem"]): + """ + Convert a slice indexer. + + By definition, these are labels unless 'iloc' is passed in. + Floats are not allowed as the start, step, or stop of the slice. + + Parameters + ---------- + key : label of the slice bound + kind : {'loc', 'getitem'} + """ + + # potentially cast the bounds to integers + start, stop, step = key.start, key.stop, key.step + + # figure out if this is a positional indexer + is_index_slice = is_valid_positional_slice(key) + + # TODO(GH#50617): once Series.__[gs]etitem__ is removed we should be able + # to simplify this. + if lib.is_np_dtype(self.dtype, "f"): + # We always treat __getitem__ slicing as label-based + # translate to locations + if kind == "getitem" and is_index_slice and not start == stop and step != 0: + # exclude step=0 from the warning because it will raise anyway + # start/stop both None e.g. [:] or [::-1] won't change. + # exclude start==stop since it will be empty either way, or + # will be [:] or [::-1] which won't change + warnings.warn( + # GH#49612 + "The behavior of obj[i:j] with a float-dtype index is " + "deprecated. In a future version, this will be treated as " + "positional instead of label-based. For label-based slicing, " + "use obj.loc[i:j] instead", + FutureWarning, + stacklevel=find_stack_level(), + ) + return self.slice_indexer(start, stop, step) + + if kind == "getitem": + # called from the getitem slicers, validate that we are in fact integers + if is_index_slice: + # In this case the _validate_indexer checks below are redundant + return key + elif self.dtype.kind in "iu": + # Note: these checks are redundant if we know is_index_slice + self._validate_indexer("slice", key.start, "getitem") + self._validate_indexer("slice", key.stop, "getitem") + self._validate_indexer("slice", key.step, "getitem") + return key + + # convert the slice to an indexer here; checking that the user didn't + # pass a positional slice to loc + is_positional = is_index_slice and self._should_fallback_to_positional + + # if we are mixed and have integers + if is_positional: + try: + # Validate start & stop + if start is not None: + self.get_loc(start) + if stop is not None: + self.get_loc(stop) + is_positional = False + except KeyError: + pass + + if com.is_null_slice(key): + # It doesn't matter if we are positional or label based + indexer = key + elif is_positional: + if kind == "loc": + # GH#16121, GH#24612, GH#31810 + raise TypeError( + "Slicing a positional slice with .loc is not allowed, " + "Use .loc with labels or .iloc with positions instead.", + ) + indexer = key + else: + indexer = self.slice_indexer(start, stop, step) + + return indexer + + @final + def _raise_invalid_indexer( + self, + form: Literal["slice", "positional"], + key, + reraise: lib.NoDefault | None | Exception = lib.no_default, + ) -> None: + """ + Raise consistent invalid indexer message. + """ + msg = ( + f"cannot do {form} indexing on {type(self).__name__} with these " + f"indexers [{key}] of type {type(key).__name__}" + ) + if reraise is not lib.no_default: + raise TypeError(msg) from reraise + raise TypeError(msg) + + # -------------------------------------------------------------------- + # Reindex Methods + + @final + def _validate_can_reindex(self, indexer: np.ndarray) -> None: + """ + Check if we are allowing reindexing with this particular indexer. + + Parameters + ---------- + indexer : an integer ndarray + + Raises + ------ + ValueError if its a duplicate axis + """ + # trying to reindex on an axis with duplicates + if not self._index_as_unique and len(indexer): + raise ValueError("cannot reindex on an axis with duplicate labels") + + def reindex( + self, + target, + method: ReindexMethod | None = None, + level=None, + limit: int | None = None, + tolerance: float | None = None, + ) -> tuple[Index, npt.NDArray[np.intp] | None]: + """ + Create index with target's values. + + Parameters + ---------- + target : an iterable + method : {None, 'pad'/'ffill', 'backfill'/'bfill', 'nearest'}, optional + * default: exact matches only. + * pad / ffill: find the PREVIOUS index value if no exact match. + * backfill / bfill: use NEXT index value if no exact match + * nearest: use the NEAREST index value if no exact match. Tied + distances are broken by preferring the larger index value. + level : int, optional + Level of multiindex. + limit : int, optional + Maximum number of consecutive labels in ``target`` to match for + inexact matches. + tolerance : int or float, optional + Maximum distance between original and new labels for inexact + matches. The values of the index at the matching locations must + satisfy the equation ``abs(index[indexer] - target) <= tolerance``. + + Tolerance may be a scalar value, which applies the same tolerance + to all values, or list-like, which applies variable tolerance per + element. List-like includes list, tuple, array, Series, and must be + the same size as the index and its dtype must exactly match the + index's type. + + Returns + ------- + new_index : pd.Index + Resulting index. + indexer : np.ndarray[np.intp] or None + Indices of output values in original index. + + Raises + ------ + TypeError + If ``method`` passed along with ``level``. + ValueError + If non-unique multi-index + ValueError + If non-unique index and ``method`` or ``limit`` passed. + + See Also + -------- + Series.reindex : Conform Series to new index with optional filling logic. + DataFrame.reindex : Conform DataFrame to new index with optional filling logic. + + Examples + -------- + >>> idx = pd.Index(['car', 'bike', 'train', 'tractor']) + >>> idx + Index(['car', 'bike', 'train', 'tractor'], dtype='object') + >>> idx.reindex(['car', 'bike']) + (Index(['car', 'bike'], dtype='object'), array([0, 1])) + """ + # GH6552: preserve names when reindexing to non-named target + # (i.e. neither Index nor Series). + preserve_names = not hasattr(target, "name") + + # GH7774: preserve dtype/tz if target is empty and not an Index. + target = ensure_has_len(target) # target may be an iterator + + if not isinstance(target, Index) and len(target) == 0: + if level is not None and self._is_multi: + # "Index" has no attribute "levels"; maybe "nlevels"? + idx = self.levels[level] # type: ignore[attr-defined] + else: + idx = self + target = idx[:0] + else: + target = ensure_index(target) + + if level is not None and ( + isinstance(self, ABCMultiIndex) or isinstance(target, ABCMultiIndex) + ): + if method is not None: + raise TypeError("Fill method not supported if level passed") + + # TODO: tests where passing `keep_order=not self._is_multi` + # makes a difference for non-MultiIndex case + target, indexer, _ = self._join_level( + target, level, how="right", keep_order=not self._is_multi + ) + + else: + if self.equals(target): + indexer = None + else: + if self._index_as_unique: + indexer = self.get_indexer( + target, method=method, limit=limit, tolerance=tolerance + ) + elif self._is_multi: + raise ValueError("cannot handle a non-unique multi-index!") + elif not self.is_unique: + # GH#42568 + raise ValueError("cannot reindex on an axis with duplicate labels") + else: + indexer, _ = self.get_indexer_non_unique(target) + + target = self._wrap_reindex_result(target, indexer, preserve_names) + return target, indexer + + def _wrap_reindex_result(self, target, indexer, preserve_names: bool): + target = self._maybe_preserve_names(target, preserve_names) + return target + + def _maybe_preserve_names(self, target: Index, preserve_names: bool): + if preserve_names and target.nlevels == 1 and target.name != self.name: + target = target.copy(deep=False) + target.name = self.name + return target + + @final + def _reindex_non_unique( + self, target: Index + ) -> tuple[Index, npt.NDArray[np.intp], npt.NDArray[np.intp] | None]: + """ + Create a new index with target's values (move/add/delete values as + necessary) use with non-unique Index and a possibly non-unique target. + + Parameters + ---------- + target : an iterable + + Returns + ------- + new_index : pd.Index + Resulting index. + indexer : np.ndarray[np.intp] + Indices of output values in original index. + new_indexer : np.ndarray[np.intp] or None + + """ + target = ensure_index(target) + if len(target) == 0: + # GH#13691 + return self[:0], np.array([], dtype=np.intp), None + + indexer, missing = self.get_indexer_non_unique(target) + check = indexer != -1 + new_labels: Index | np.ndarray = self.take(indexer[check]) + new_indexer = None + + if len(missing): + length = np.arange(len(indexer), dtype=np.intp) + + missing = ensure_platform_int(missing) + missing_labels = target.take(missing) + missing_indexer = length[~check] + cur_labels = self.take(indexer[check]).values + cur_indexer = length[check] + + # Index constructor below will do inference + new_labels = np.empty((len(indexer),), dtype=object) + new_labels[cur_indexer] = cur_labels + new_labels[missing_indexer] = missing_labels + + # GH#38906 + if not len(self): + new_indexer = np.arange(0, dtype=np.intp) + + # a unique indexer + elif target.is_unique: + # see GH5553, make sure we use the right indexer + new_indexer = np.arange(len(indexer), dtype=np.intp) + new_indexer[cur_indexer] = np.arange(len(cur_labels)) + new_indexer[missing_indexer] = -1 + + # we have a non_unique selector, need to use the original + # indexer here + else: + # need to retake to have the same size as the indexer + indexer[~check] = -1 + + # reset the new indexer to account for the new size + new_indexer = np.arange(len(self.take(indexer)), dtype=np.intp) + new_indexer[~check] = -1 + + if not isinstance(self, ABCMultiIndex): + new_index = Index(new_labels, name=self.name) + else: + new_index = type(self).from_tuples(new_labels, names=self.names) + return new_index, indexer, new_indexer + + # -------------------------------------------------------------------- + # Join Methods + + @overload + def join( + self, + other: Index, + *, + how: JoinHow = ..., + level: Level = ..., + return_indexers: Literal[True], + sort: bool = ..., + ) -> tuple[Index, npt.NDArray[np.intp] | None, npt.NDArray[np.intp] | None]: + ... + + @overload + def join( + self, + other: Index, + *, + how: JoinHow = ..., + level: Level = ..., + return_indexers: Literal[False] = ..., + sort: bool = ..., + ) -> Index: + ... + + @overload + def join( + self, + other: Index, + *, + how: JoinHow = ..., + level: Level = ..., + return_indexers: bool = ..., + sort: bool = ..., + ) -> Index | tuple[Index, npt.NDArray[np.intp] | None, npt.NDArray[np.intp] | None]: + ... + + @final + @_maybe_return_indexers + def join( + self, + other: Index, + *, + how: JoinHow = "left", + level: Level | None = None, + return_indexers: bool = False, + sort: bool = False, + ) -> Index | tuple[Index, npt.NDArray[np.intp] | None, npt.NDArray[np.intp] | None]: + """ + Compute join_index and indexers to conform data structures to the new index. + + Parameters + ---------- + other : Index + how : {'left', 'right', 'inner', 'outer'} + level : int or level name, default None + return_indexers : bool, default False + sort : bool, default False + Sort the join keys lexicographically in the result Index. If False, + the order of the join keys depends on the join type (how keyword). + + Returns + ------- + join_index, (left_indexer, right_indexer) + + Examples + -------- + >>> idx1 = pd.Index([1, 2, 3]) + >>> idx2 = pd.Index([4, 5, 6]) + >>> idx1.join(idx2, how='outer') + Index([1, 2, 3, 4, 5, 6], dtype='int64') + """ + other = ensure_index(other) + sort = sort or how == "outer" + + if isinstance(self, ABCDatetimeIndex) and isinstance(other, ABCDatetimeIndex): + if (self.tz is None) ^ (other.tz is None): + # Raise instead of casting to object below. + raise TypeError("Cannot join tz-naive with tz-aware DatetimeIndex") + + if not self._is_multi and not other._is_multi: + # We have specific handling for MultiIndex below + pself, pother = self._maybe_downcast_for_indexing(other) + if pself is not self or pother is not other: + return pself.join( + pother, how=how, level=level, return_indexers=True, sort=sort + ) + + # try to figure out the join level + # GH3662 + if level is None and (self._is_multi or other._is_multi): + # have the same levels/names so a simple join + if self.names == other.names: + pass + else: + return self._join_multi(other, how=how) + + # join on the level + if level is not None and (self._is_multi or other._is_multi): + return self._join_level(other, level, how=how) + + if len(self) == 0 or len(other) == 0: + try: + return self._join_empty(other, how, sort) + except TypeError: + # object dtype; non-comparable objects + pass + + if self.dtype != other.dtype: + dtype = self._find_common_type_compat(other) + this = self.astype(dtype, copy=False) + other = other.astype(dtype, copy=False) + return this.join(other, how=how, return_indexers=True) + elif ( + isinstance(self, ABCCategoricalIndex) + and isinstance(other, ABCCategoricalIndex) + and not self.ordered + and not self.categories.equals(other.categories) + ): + # dtypes are "equal" but categories are in different order + other = Index(other._values.reorder_categories(self.categories)) + + _validate_join_method(how) + + if ( + self.is_monotonic_increasing + and other.is_monotonic_increasing + and self._can_use_libjoin + and other._can_use_libjoin + and (self.is_unique or other.is_unique) + ): + try: + return self._join_monotonic(other, how=how) + except TypeError: + # object dtype; non-comparable objects + pass + elif not self.is_unique or not other.is_unique: + return self._join_non_unique(other, how=how, sort=sort) + + return self._join_via_get_indexer(other, how, sort) + + @final + def _join_empty( + self, other: Index, how: JoinHow, sort: bool + ) -> tuple[Index, npt.NDArray[np.intp] | None, npt.NDArray[np.intp] | None]: + assert len(self) == 0 or len(other) == 0 + _validate_join_method(how) + + lidx: np.ndarray | None + ridx: np.ndarray | None + + if len(other): + how = cast(JoinHow, {"left": "right", "right": "left"}.get(how, how)) + join_index, ridx, lidx = other._join_empty(self, how, sort) + elif how in ["left", "outer"]: + if sort and not self.is_monotonic_increasing: + lidx = self.argsort() + join_index = self.take(lidx) + else: + lidx = None + join_index = self._view() + ridx = np.broadcast_to(np.intp(-1), len(join_index)) + else: + join_index = other._view() + lidx = np.array([], dtype=np.intp) + ridx = None + return join_index, lidx, ridx + + @final + def _join_via_get_indexer( + self, other: Index, how: JoinHow, sort: bool + ) -> tuple[Index, npt.NDArray[np.intp] | None, npt.NDArray[np.intp] | None]: + # Fallback if we do not have any fastpaths available based on + # uniqueness/monotonicity + + # Note: at this point we have checked matching dtypes + + if how == "left": + join_index = self.sort_values() if sort else self + elif how == "right": + join_index = other.sort_values() if sort else other + elif how == "inner": + join_index = self.intersection(other, sort=sort) + elif how == "outer": + try: + join_index = self.union(other, sort=sort) + except TypeError: + join_index = self.union(other) + try: + join_index = _maybe_try_sort(join_index, sort) + except TypeError: + pass + + if join_index is self: + lindexer = None + else: + lindexer = self.get_indexer_for(join_index) + if join_index is other: + rindexer = None + else: + rindexer = other.get_indexer_for(join_index) + return join_index, lindexer, rindexer + + @final + def _join_multi(self, other: Index, how: JoinHow): + from pandas.core.indexes.multi import MultiIndex + from pandas.core.reshape.merge import restore_dropped_levels_multijoin + + # figure out join names + self_names_list = list(com.not_none(*self.names)) + other_names_list = list(com.not_none(*other.names)) + self_names_order = self_names_list.index + other_names_order = other_names_list.index + self_names = set(self_names_list) + other_names = set(other_names_list) + overlap = self_names & other_names + + # need at least 1 in common + if not overlap: + raise ValueError("cannot join with no overlapping index names") + + if isinstance(self, MultiIndex) and isinstance(other, MultiIndex): + # Drop the non-matching levels from left and right respectively + ldrop_names = sorted(self_names - overlap, key=self_names_order) + rdrop_names = sorted(other_names - overlap, key=other_names_order) + + # if only the order differs + if not len(ldrop_names + rdrop_names): + self_jnlevels = self + other_jnlevels = other.reorder_levels(self.names) + else: + self_jnlevels = self.droplevel(ldrop_names) + other_jnlevels = other.droplevel(rdrop_names) + + # Join left and right + # Join on same leveled multi-index frames is supported + join_idx, lidx, ridx = self_jnlevels.join( + other_jnlevels, how=how, return_indexers=True + ) + + # Restore the dropped levels + # Returned index level order is + # common levels, ldrop_names, rdrop_names + dropped_names = ldrop_names + rdrop_names + + # error: Argument 5/6 to "restore_dropped_levels_multijoin" has + # incompatible type "Optional[ndarray[Any, dtype[signedinteger[Any + # ]]]]"; expected "ndarray[Any, dtype[signedinteger[Any]]]" + levels, codes, names = restore_dropped_levels_multijoin( + self, + other, + dropped_names, + join_idx, + lidx, # type: ignore[arg-type] + ridx, # type: ignore[arg-type] + ) + + # Re-create the multi-index + multi_join_idx = MultiIndex( + levels=levels, codes=codes, names=names, verify_integrity=False + ) + + multi_join_idx = multi_join_idx.remove_unused_levels() + + # maintain the order of the index levels + if how == "right": + level_order = other_names_list + ldrop_names + else: + level_order = self_names_list + rdrop_names + multi_join_idx = multi_join_idx.reorder_levels(level_order) + + return multi_join_idx, lidx, ridx + + jl = next(iter(overlap)) + + # Case where only one index is multi + # make the indices into mi's that match + flip_order = False + if isinstance(self, MultiIndex): + self, other = other, self + flip_order = True + # flip if join method is right or left + flip: dict[JoinHow, JoinHow] = {"right": "left", "left": "right"} + how = flip.get(how, how) + + level = other.names.index(jl) + result = self._join_level(other, level, how=how) + + if flip_order: + return result[0], result[2], result[1] + return result + + @final + def _join_non_unique( + self, other: Index, how: JoinHow = "left", sort: bool = False + ) -> tuple[Index, npt.NDArray[np.intp], npt.NDArray[np.intp]]: + from pandas.core.reshape.merge import get_join_indexers_non_unique + + # We only get here if dtypes match + assert self.dtype == other.dtype + + left_idx, right_idx = get_join_indexers_non_unique( + self._values, other._values, how=how, sort=sort + ) + mask = left_idx == -1 + + join_idx = self.take(left_idx) + right = other.take(right_idx) + join_index = join_idx.putmask(mask, right) + if isinstance(join_index, ABCMultiIndex) and how == "outer": + # test_join_index_levels + join_index = join_index._sort_levels_monotonic() + return join_index, left_idx, right_idx + + @final + def _join_level( + self, other: Index, level, how: JoinHow = "left", keep_order: bool = True + ) -> tuple[MultiIndex, npt.NDArray[np.intp] | None, npt.NDArray[np.intp] | None]: + """ + The join method *only* affects the level of the resulting + MultiIndex. Otherwise it just exactly aligns the Index data to the + labels of the level in the MultiIndex. + + If ```keep_order == True```, the order of the data indexed by the + MultiIndex will not be changed; otherwise, it will tie out + with `other`. + """ + from pandas.core.indexes.multi import MultiIndex + + def _get_leaf_sorter(labels: list[np.ndarray]) -> npt.NDArray[np.intp]: + """ + Returns sorter for the inner most level while preserving the + order of higher levels. + + Parameters + ---------- + labels : list[np.ndarray] + Each ndarray has signed integer dtype, not necessarily identical. + + Returns + ------- + np.ndarray[np.intp] + """ + if labels[0].size == 0: + return np.empty(0, dtype=np.intp) + + if len(labels) == 1: + return get_group_index_sorter(ensure_platform_int(labels[0])) + + # find indexers of beginning of each set of + # same-key labels w.r.t all but last level + tic = labels[0][:-1] != labels[0][1:] + for lab in labels[1:-1]: + tic |= lab[:-1] != lab[1:] + + starts = np.hstack(([True], tic, [True])).nonzero()[0] + lab = ensure_int64(labels[-1]) + return lib.get_level_sorter(lab, ensure_platform_int(starts)) + + if isinstance(self, MultiIndex) and isinstance(other, MultiIndex): + raise TypeError("Join on level between two MultiIndex objects is ambiguous") + + left, right = self, other + + flip_order = not isinstance(self, MultiIndex) + if flip_order: + left, right = right, left + flip: dict[JoinHow, JoinHow] = {"right": "left", "left": "right"} + how = flip.get(how, how) + + assert isinstance(left, MultiIndex) + + level = left._get_level_number(level) + old_level = left.levels[level] + + if not right.is_unique: + raise NotImplementedError( + "Index._join_level on non-unique index is not implemented" + ) + + new_level, left_lev_indexer, right_lev_indexer = old_level.join( + right, how=how, return_indexers=True + ) + + if left_lev_indexer is None: + if keep_order or len(left) == 0: + left_indexer = None + join_index = left + else: # sort the leaves + left_indexer = _get_leaf_sorter(left.codes[: level + 1]) + join_index = left[left_indexer] + + else: + left_lev_indexer = ensure_platform_int(left_lev_indexer) + rev_indexer = lib.get_reverse_indexer(left_lev_indexer, len(old_level)) + old_codes = left.codes[level] + + taker = old_codes[old_codes != -1] + new_lev_codes = rev_indexer.take(taker) + + new_codes = list(left.codes) + new_codes[level] = new_lev_codes + + new_levels = list(left.levels) + new_levels[level] = new_level + + if keep_order: # just drop missing values. o.w. keep order + left_indexer = np.arange(len(left), dtype=np.intp) + left_indexer = cast(np.ndarray, left_indexer) + mask = new_lev_codes != -1 + if not mask.all(): + new_codes = [lab[mask] for lab in new_codes] + left_indexer = left_indexer[mask] + + else: # tie out the order with other + if level == 0: # outer most level, take the fast route + max_new_lev = 0 if len(new_lev_codes) == 0 else new_lev_codes.max() + ngroups = 1 + max_new_lev + left_indexer, counts = libalgos.groupsort_indexer( + new_lev_codes, ngroups + ) + + # missing values are placed first; drop them! + left_indexer = left_indexer[counts[0] :] + new_codes = [lab[left_indexer] for lab in new_codes] + + else: # sort the leaves + mask = new_lev_codes != -1 + mask_all = mask.all() + if not mask_all: + new_codes = [lab[mask] for lab in new_codes] + + left_indexer = _get_leaf_sorter(new_codes[: level + 1]) + new_codes = [lab[left_indexer] for lab in new_codes] + + # left_indexers are w.r.t masked frame. + # reverse to original frame! + if not mask_all: + left_indexer = mask.nonzero()[0][left_indexer] + + join_index = MultiIndex( + levels=new_levels, + codes=new_codes, + names=left.names, + verify_integrity=False, + ) + + if right_lev_indexer is not None: + right_indexer = right_lev_indexer.take(join_index.codes[level]) + else: + right_indexer = join_index.codes[level] + + if flip_order: + left_indexer, right_indexer = right_indexer, left_indexer + + left_indexer = ( + None if left_indexer is None else ensure_platform_int(left_indexer) + ) + right_indexer = ( + None if right_indexer is None else ensure_platform_int(right_indexer) + ) + return join_index, left_indexer, right_indexer + + @final + def _join_monotonic( + self, other: Index, how: JoinHow = "left" + ) -> tuple[Index, npt.NDArray[np.intp] | None, npt.NDArray[np.intp] | None]: + # We only get here with matching dtypes and both monotonic increasing + assert other.dtype == self.dtype + assert self._can_use_libjoin and other._can_use_libjoin + + if self.equals(other): + # This is a convenient place for this check, but its correctness + # does not depend on monotonicity, so it could go earlier + # in the calling method. + ret_index = other if how == "right" else self + return ret_index, None, None + + ridx: npt.NDArray[np.intp] | None + lidx: npt.NDArray[np.intp] | None + + if self.is_unique and other.is_unique: + # We can perform much better than the general case + if how == "left": + join_index = self + lidx = None + ridx = self._left_indexer_unique(other) + elif how == "right": + join_index = other + lidx = other._left_indexer_unique(self) + ridx = None + elif how == "inner": + join_array, lidx, ridx = self._inner_indexer(other) + join_index = self._wrap_joined_index(join_array, other, lidx, ridx) + elif how == "outer": + join_array, lidx, ridx = self._outer_indexer(other) + join_index = self._wrap_joined_index(join_array, other, lidx, ridx) + else: + if how == "left": + join_array, lidx, ridx = self._left_indexer(other) + elif how == "right": + join_array, ridx, lidx = other._left_indexer(self) + elif how == "inner": + join_array, lidx, ridx = self._inner_indexer(other) + elif how == "outer": + join_array, lidx, ridx = self._outer_indexer(other) + + assert lidx is not None + assert ridx is not None + + join_index = self._wrap_joined_index(join_array, other, lidx, ridx) + + lidx = None if lidx is None else ensure_platform_int(lidx) + ridx = None if ridx is None else ensure_platform_int(ridx) + return join_index, lidx, ridx + + def _wrap_joined_index( + self, + joined: ArrayLike, + other: Self, + lidx: npt.NDArray[np.intp], + ridx: npt.NDArray[np.intp], + ) -> Self: + assert other.dtype == self.dtype + + if isinstance(self, ABCMultiIndex): + name = self.names if self.names == other.names else None + # error: Incompatible return value type (got "MultiIndex", + # expected "Self") + mask = lidx == -1 + join_idx = self.take(lidx) + right = cast("MultiIndex", other.take(ridx)) + join_index = join_idx.putmask(mask, right)._sort_levels_monotonic() + return join_index.set_names(name) # type: ignore[return-value] + else: + name = get_op_result_name(self, other) + return self._constructor._with_infer(joined, name=name, dtype=self.dtype) + + @final + @cache_readonly + def _can_use_libjoin(self) -> bool: + """ + Whether we can use the fastpaths implemented in _libs.join. + + This is driven by whether (in monotonic increasing cases that are + guaranteed not to have NAs) we can convert to a np.ndarray without + making a copy. If we cannot, this negates the performance benefit + of using libjoin. + """ + if type(self) is Index: + # excludes EAs, but include masks, we get here with monotonic + # values only, meaning no NA + return ( + isinstance(self.dtype, np.dtype) + or isinstance(self._values, (ArrowExtensionArray, BaseMaskedArray)) + or self.dtype == "string[python]" + ) + # Exclude index types where the conversion to numpy converts to object dtype, + # which negates the performance benefit of libjoin + # Subclasses should override to return False if _get_join_target is + # not zero-copy. + # TODO: exclude RangeIndex (which allocates memory)? + # Doing so seems to break test_concat_datetime_timezone + return not isinstance(self, (ABCIntervalIndex, ABCMultiIndex)) + + # -------------------------------------------------------------------- + # Uncategorized Methods + + @property + def values(self) -> ArrayLike: + """ + Return an array representing the data in the Index. + + .. warning:: + + We recommend using :attr:`Index.array` or + :meth:`Index.to_numpy`, depending on whether you need + a reference to the underlying data or a NumPy array. + + Returns + ------- + array: numpy.ndarray or ExtensionArray + + See Also + -------- + Index.array : Reference to the underlying data. + Index.to_numpy : A NumPy array representing the underlying data. + + Examples + -------- + For :class:`pandas.Index`: + + >>> idx = pd.Index([1, 2, 3]) + >>> idx + Index([1, 2, 3], dtype='int64') + >>> idx.values + array([1, 2, 3]) + + For :class:`pandas.IntervalIndex`: + + >>> idx = pd.interval_range(start=0, end=5) + >>> idx.values + + [(0, 1], (1, 2], (2, 3], (3, 4], (4, 5]] + Length: 5, dtype: interval[int64, right] + """ + if using_copy_on_write(): + data = self._data + if isinstance(data, np.ndarray): + data = data.view() + data.flags.writeable = False + return data + return self._data + + @cache_readonly + @doc(IndexOpsMixin.array) + def array(self) -> ExtensionArray: + array = self._data + if isinstance(array, np.ndarray): + from pandas.core.arrays.numpy_ import NumpyExtensionArray + + array = NumpyExtensionArray(array) + return array + + @property + def _values(self) -> ExtensionArray | np.ndarray: + """ + The best array representation. + + This is an ndarray or ExtensionArray. + + ``_values`` are consistent between ``Series`` and ``Index``. + + It may differ from the public '.values' method. + + index | values | _values | + ----------------- | --------------- | ------------- | + Index | ndarray | ndarray | + CategoricalIndex | Categorical | Categorical | + DatetimeIndex | ndarray[M8ns] | DatetimeArray | + DatetimeIndex[tz] | ndarray[M8ns] | DatetimeArray | + PeriodIndex | ndarray[object] | PeriodArray | + IntervalIndex | IntervalArray | IntervalArray | + + See Also + -------- + values : Values + """ + return self._data + + def _get_engine_target(self) -> ArrayLike: + """ + Get the ndarray or ExtensionArray that we can pass to the IndexEngine + constructor. + """ + vals = self._values + if isinstance(vals, StringArray): + # GH#45652 much more performant than ExtensionEngine + return vals._ndarray + if isinstance(vals, ArrowExtensionArray) and self.dtype.kind in "Mm": + import pyarrow as pa + + pa_type = vals._pa_array.type + if pa.types.is_timestamp(pa_type): + vals = vals._to_datetimearray() + return vals._ndarray.view("i8") + elif pa.types.is_duration(pa_type): + vals = vals._to_timedeltaarray() + return vals._ndarray.view("i8") + if ( + type(self) is Index + and isinstance(self._values, ExtensionArray) + and not isinstance(self._values, BaseMaskedArray) + and not ( + isinstance(self._values, ArrowExtensionArray) + and is_numeric_dtype(self.dtype) + # Exclude decimal + and self.dtype.kind != "O" + ) + ): + # TODO(ExtensionIndex): remove special-case, just use self._values + return self._values.astype(object) + return vals + + @final + def _get_join_target(self) -> np.ndarray: + """ + Get the ndarray or ExtensionArray that we can pass to the join + functions. + """ + if isinstance(self._values, BaseMaskedArray): + # This is only used if our array is monotonic, so no NAs present + return self._values._data + elif isinstance(self._values, ArrowExtensionArray): + # This is only used if our array is monotonic, so no missing values + # present + return self._values.to_numpy() + + # TODO: exclude ABCRangeIndex case here as it copies + target = self._get_engine_target() + if not isinstance(target, np.ndarray): + raise ValueError("_can_use_libjoin should return False.") + return target + + def _from_join_target(self, result: np.ndarray) -> ArrayLike: + """ + Cast the ndarray returned from one of the libjoin.foo_indexer functions + back to type(self._data). + """ + if isinstance(self.values, BaseMaskedArray): + return type(self.values)(result, np.zeros(result.shape, dtype=np.bool_)) + elif isinstance(self.values, (ArrowExtensionArray, StringArray)): + return type(self.values)._from_sequence(result, dtype=self.dtype) + return result + + @doc(IndexOpsMixin._memory_usage) + def memory_usage(self, deep: bool = False) -> int: + result = self._memory_usage(deep=deep) + + # include our engine hashtable + result += self._engine.sizeof(deep=deep) + return result + + @final + def where(self, cond, other=None) -> Index: + """ + Replace values where the condition is False. + + The replacement is taken from other. + + Parameters + ---------- + cond : bool array-like with the same length as self + Condition to select the values on. + other : scalar, or array-like, default None + Replacement if the condition is False. + + Returns + ------- + pandas.Index + A copy of self with values replaced from other + where the condition is False. + + See Also + -------- + Series.where : Same method for Series. + DataFrame.where : Same method for DataFrame. + + Examples + -------- + >>> idx = pd.Index(['car', 'bike', 'train', 'tractor']) + >>> idx + Index(['car', 'bike', 'train', 'tractor'], dtype='object') + >>> idx.where(idx.isin(['car', 'train']), 'other') + Index(['car', 'other', 'train', 'other'], dtype='object') + """ + if isinstance(self, ABCMultiIndex): + raise NotImplementedError( + ".where is not supported for MultiIndex operations" + ) + cond = np.asarray(cond, dtype=bool) + return self.putmask(~cond, other) + + # construction helpers + @final + @classmethod + def _raise_scalar_data_error(cls, data): + # We return the TypeError so that we can raise it from the constructor + # in order to keep mypy happy + raise TypeError( + f"{cls.__name__}(...) must be called with a collection of some " + f"kind, {repr(data) if not isinstance(data, np.generic) else str(data)} " + "was passed" + ) + + def _validate_fill_value(self, value): + """ + Check if the value can be inserted into our array without casting, + and convert it to an appropriate native type if necessary. + + Raises + ------ + TypeError + If the value cannot be inserted into an array of this dtype. + """ + dtype = self.dtype + if isinstance(dtype, np.dtype) and dtype.kind not in "mM": + # return np_can_hold_element(dtype, value) + try: + return np_can_hold_element(dtype, value) + except LossySetitemError as err: + # re-raise as TypeError for consistency + raise TypeError from err + elif not can_hold_element(self._values, value): + raise TypeError + return value + + def _is_memory_usage_qualified(self) -> bool: + """ + Return a boolean if we need a qualified .info display. + """ + return is_object_dtype(self.dtype) + + def __contains__(self, key: Any) -> bool: + """ + Return a boolean indicating whether the provided key is in the index. + + Parameters + ---------- + key : label + The key to check if it is present in the index. + + Returns + ------- + bool + Whether the key search is in the index. + + Raises + ------ + TypeError + If the key is not hashable. + + See Also + -------- + Index.isin : Returns an ndarray of boolean dtype indicating whether the + list-like key is in the index. + + Examples + -------- + >>> idx = pd.Index([1, 2, 3, 4]) + >>> idx + Index([1, 2, 3, 4], dtype='int64') + + >>> 2 in idx + True + >>> 6 in idx + False + """ + hash(key) + try: + return key in self._engine + except (OverflowError, TypeError, ValueError): + return False + + # https://github.com/python/typeshed/issues/2148#issuecomment-520783318 + # Incompatible types in assignment (expression has type "None", base class + # "object" defined the type as "Callable[[object], int]") + __hash__: ClassVar[None] # type: ignore[assignment] + + @final + def __setitem__(self, key, value) -> None: + raise TypeError("Index does not support mutable operations") + + def __getitem__(self, key): + """ + Override numpy.ndarray's __getitem__ method to work as desired. + + This function adds lists and Series as valid boolean indexers + (ndarrays only supports ndarray with dtype=bool). + + If resulting ndim != 1, plain ndarray is returned instead of + corresponding `Index` subclass. + + """ + getitem = self._data.__getitem__ + + if is_integer(key) or is_float(key): + # GH#44051 exclude bool, which would return a 2d ndarray + key = com.cast_scalar_indexer(key) + return getitem(key) + + if isinstance(key, slice): + # This case is separated from the conditional above to avoid + # pessimization com.is_bool_indexer and ndim checks. + return self._getitem_slice(key) + + if com.is_bool_indexer(key): + # if we have list[bools, length=1e5] then doing this check+convert + # takes 166 µs + 2.1 ms and cuts the ndarray.__getitem__ + # time below from 3.8 ms to 496 µs + # if we already have ndarray[bool], the overhead is 1.4 µs or .25% + if isinstance(getattr(key, "dtype", None), ExtensionDtype): + key = key.to_numpy(dtype=bool, na_value=False) + else: + key = np.asarray(key, dtype=bool) + + if not isinstance(self.dtype, ExtensionDtype): + if len(key) == 0 and len(key) != len(self): + warnings.warn( + "Using a boolean indexer with length 0 on an Index with " + "length greater than 0 is deprecated and will raise in a " + "future version.", + FutureWarning, + stacklevel=find_stack_level(), + ) + + result = getitem(key) + # Because we ruled out integer above, we always get an arraylike here + if result.ndim > 1: + disallow_ndim_indexing(result) + + # NB: Using _constructor._simple_new would break if MultiIndex + # didn't override __getitem__ + return self._constructor._simple_new(result, name=self._name) + + def _getitem_slice(self, slobj: slice) -> Self: + """ + Fastpath for __getitem__ when we know we have a slice. + """ + res = self._data[slobj] + result = type(self)._simple_new(res, name=self._name, refs=self._references) + if "_engine" in self._cache: + reverse = slobj.step is not None and slobj.step < 0 + result._engine._update_from_sliced(self._engine, reverse=reverse) # type: ignore[union-attr] + + return result + + @final + def _can_hold_identifiers_and_holds_name(self, name) -> bool: + """ + Faster check for ``name in self`` when we know `name` is a Python + identifier (e.g. in NDFrame.__getattr__, which hits this to support + . key lookup). For indexes that can't hold identifiers (everything + but object & categorical) we just return False. + + https://github.com/pandas-dev/pandas/issues/19764 + """ + if ( + is_object_dtype(self.dtype) + or is_string_dtype(self.dtype) + or isinstance(self.dtype, CategoricalDtype) + ): + return name in self + return False + + def append(self, other: Index | Sequence[Index]) -> Index: + """ + Append a collection of Index options together. + + Parameters + ---------- + other : Index or list/tuple of indices + + Returns + ------- + Index + + Examples + -------- + >>> idx = pd.Index([1, 2, 3]) + >>> idx.append(pd.Index([4])) + Index([1, 2, 3, 4], dtype='int64') + """ + to_concat = [self] + + if isinstance(other, (list, tuple)): + to_concat += list(other) + else: + # error: Argument 1 to "append" of "list" has incompatible type + # "Union[Index, Sequence[Index]]"; expected "Index" + to_concat.append(other) # type: ignore[arg-type] + + for obj in to_concat: + if not isinstance(obj, Index): + raise TypeError("all inputs must be Index") + + names = {obj.name for obj in to_concat} + name = None if len(names) > 1 else self.name + + return self._concat(to_concat, name) + + def _concat(self, to_concat: list[Index], name: Hashable) -> Index: + """ + Concatenate multiple Index objects. + """ + to_concat_vals = [x._values for x in to_concat] + + result = concat_compat(to_concat_vals) + + return Index._with_infer(result, name=name) + + def putmask(self, mask, value) -> Index: + """ + Return a new Index of the values set with the mask. + + Returns + ------- + Index + + See Also + -------- + numpy.ndarray.putmask : Changes elements of an array + based on conditional and input values. + + Examples + -------- + >>> idx1 = pd.Index([1, 2, 3]) + >>> idx2 = pd.Index([5, 6, 7]) + >>> idx1.putmask([True, False, False], idx2) + Index([5, 2, 3], dtype='int64') + """ + mask, noop = validate_putmask(self._values, mask) + if noop: + return self.copy() + + if self.dtype != object and is_valid_na_for_dtype(value, self.dtype): + # e.g. None -> np.nan, see also Block._standardize_fill_value + value = self._na_value + + try: + converted = self._validate_fill_value(value) + except (LossySetitemError, ValueError, TypeError) as err: + if is_object_dtype(self.dtype): # pragma: no cover + raise err + + # See also: Block.coerce_to_target_dtype + dtype = self._find_common_type_compat(value) + return self.astype(dtype).putmask(mask, value) + + values = self._values.copy() + + if isinstance(values, np.ndarray): + converted = setitem_datetimelike_compat(values, mask.sum(), converted) + np.putmask(values, mask, converted) + + else: + # Note: we use the original value here, not converted, as + # _validate_fill_value is not idempotent + values._putmask(mask, value) + + return self._shallow_copy(values) + + def equals(self, other: Any) -> bool: + """ + Determine if two Index object are equal. + + The things that are being compared are: + + * The elements inside the Index object. + * The order of the elements inside the Index object. + + Parameters + ---------- + other : Any + The other object to compare against. + + Returns + ------- + bool + True if "other" is an Index and it has the same elements and order + as the calling index; False otherwise. + + Examples + -------- + >>> idx1 = pd.Index([1, 2, 3]) + >>> idx1 + Index([1, 2, 3], dtype='int64') + >>> idx1.equals(pd.Index([1, 2, 3])) + True + + The elements inside are compared + + >>> idx2 = pd.Index(["1", "2", "3"]) + >>> idx2 + Index(['1', '2', '3'], dtype='object') + + >>> idx1.equals(idx2) + False + + The order is compared + + >>> ascending_idx = pd.Index([1, 2, 3]) + >>> ascending_idx + Index([1, 2, 3], dtype='int64') + >>> descending_idx = pd.Index([3, 2, 1]) + >>> descending_idx + Index([3, 2, 1], dtype='int64') + >>> ascending_idx.equals(descending_idx) + False + + The dtype is *not* compared + + >>> int64_idx = pd.Index([1, 2, 3], dtype='int64') + >>> int64_idx + Index([1, 2, 3], dtype='int64') + >>> uint64_idx = pd.Index([1, 2, 3], dtype='uint64') + >>> uint64_idx + Index([1, 2, 3], dtype='uint64') + >>> int64_idx.equals(uint64_idx) + True + """ + if self.is_(other): + return True + + if not isinstance(other, Index): + return False + + if len(self) != len(other): + # quickly return if the lengths are different + return False + + if ( + isinstance(self.dtype, StringDtype) + and self.dtype.storage == "pyarrow_numpy" + and other.dtype != self.dtype + ): + # special case for object behavior + return other.equals(self.astype(object)) + + if is_object_dtype(self.dtype) and not is_object_dtype(other.dtype): + # if other is not object, use other's logic for coercion + return other.equals(self) + + if isinstance(other, ABCMultiIndex): + # d-level MultiIndex can equal d-tuple Index + return other.equals(self) + + if isinstance(self._values, ExtensionArray): + # Dispatch to the ExtensionArray's .equals method. + if not isinstance(other, type(self)): + return False + + earr = cast(ExtensionArray, self._data) + return earr.equals(other._data) + + if isinstance(other.dtype, ExtensionDtype): + # All EA-backed Index subclasses override equals + return other.equals(self) + + return array_equivalent(self._values, other._values) + + @final + def identical(self, other) -> bool: + """ + Similar to equals, but checks that object attributes and types are also equal. + + Returns + ------- + bool + If two Index objects have equal elements and same type True, + otherwise False. + + Examples + -------- + >>> idx1 = pd.Index(['1', '2', '3']) + >>> idx2 = pd.Index(['1', '2', '3']) + >>> idx2.identical(idx1) + True + + >>> idx1 = pd.Index(['1', '2', '3'], name="A") + >>> idx2 = pd.Index(['1', '2', '3'], name="B") + >>> idx2.identical(idx1) + False + """ + return ( + self.equals(other) + and all( + getattr(self, c, None) == getattr(other, c, None) + for c in self._comparables + ) + and type(self) == type(other) + and self.dtype == other.dtype + ) + + @final + def asof(self, label): + """ + Return the label from the index, or, if not present, the previous one. + + Assuming that the index is sorted, return the passed index label if it + is in the index, or return the previous index label if the passed one + is not in the index. + + Parameters + ---------- + label : object + The label up to which the method returns the latest index label. + + Returns + ------- + object + The passed label if it is in the index. The previous label if the + passed label is not in the sorted index or `NaN` if there is no + such label. + + See Also + -------- + Series.asof : Return the latest value in a Series up to the + passed index. + merge_asof : Perform an asof merge (similar to left join but it + matches on nearest key rather than equal key). + Index.get_loc : An `asof` is a thin wrapper around `get_loc` + with method='pad'. + + Examples + -------- + `Index.asof` returns the latest index label up to the passed label. + + >>> idx = pd.Index(['2013-12-31', '2014-01-02', '2014-01-03']) + >>> idx.asof('2014-01-01') + '2013-12-31' + + If the label is in the index, the method returns the passed label. + + >>> idx.asof('2014-01-02') + '2014-01-02' + + If all of the labels in the index are later than the passed label, + NaN is returned. + + >>> idx.asof('1999-01-02') + nan + + If the index is not sorted, an error is raised. + + >>> idx_not_sorted = pd.Index(['2013-12-31', '2015-01-02', + ... '2014-01-03']) + >>> idx_not_sorted.asof('2013-12-31') + Traceback (most recent call last): + ValueError: index must be monotonic increasing or decreasing + """ + self._searchsorted_monotonic(label) # validate sortedness + try: + loc = self.get_loc(label) + except (KeyError, TypeError): + # KeyError -> No exact match, try for padded + # TypeError -> passed e.g. non-hashable, fall through to get + # the tested exception message + indexer = self.get_indexer([label], method="pad") + if indexer.ndim > 1 or indexer.size > 1: + raise TypeError("asof requires scalar valued input") + loc = indexer.item() + if loc == -1: + return self._na_value + else: + if isinstance(loc, slice): + loc = loc.indices(len(self))[-1] + + return self[loc] + + def asof_locs( + self, where: Index, mask: npt.NDArray[np.bool_] + ) -> npt.NDArray[np.intp]: + """ + Return the locations (indices) of labels in the index. + + As in the :meth:`pandas.Index.asof`, if the label (a particular entry in + ``where``) is not in the index, the latest index label up to the + passed label is chosen and its index returned. + + If all of the labels in the index are later than a label in ``where``, + -1 is returned. + + ``mask`` is used to ignore ``NA`` values in the index during calculation. + + Parameters + ---------- + where : Index + An Index consisting of an array of timestamps. + mask : np.ndarray[bool] + Array of booleans denoting where values in the original + data are not ``NA``. + + Returns + ------- + np.ndarray[np.intp] + An array of locations (indices) of the labels from the index + which correspond to the return values of :meth:`pandas.Index.asof` + for every element in ``where``. + + See Also + -------- + Index.asof : Return the label from the index, or, if not present, the + previous one. + + Examples + -------- + >>> idx = pd.date_range('2023-06-01', periods=3, freq='D') + >>> where = pd.DatetimeIndex(['2023-05-30 00:12:00', '2023-06-01 00:00:00', + ... '2023-06-02 23:59:59']) + >>> mask = np.ones(3, dtype=bool) + >>> idx.asof_locs(where, mask) + array([-1, 0, 1]) + + We can use ``mask`` to ignore certain values in the index during calculation. + + >>> mask[1] = False + >>> idx.asof_locs(where, mask) + array([-1, 0, 0]) + """ + # error: No overload variant of "searchsorted" of "ndarray" matches argument + # types "Union[ExtensionArray, ndarray[Any, Any]]", "str" + # TODO: will be fixed when ExtensionArray.searchsorted() is fixed + locs = self._values[mask].searchsorted( + where._values, side="right" # type: ignore[call-overload] + ) + locs = np.where(locs > 0, locs - 1, 0) + + result = np.arange(len(self), dtype=np.intp)[mask].take(locs) + + first_value = self._values[mask.argmax()] + result[(locs == 0) & (where._values < first_value)] = -1 + + return result + + @overload + def sort_values( + self, + *, + return_indexer: Literal[False] = ..., + ascending: bool = ..., + na_position: NaPosition = ..., + key: Callable | None = ..., + ) -> Self: + ... + + @overload + def sort_values( + self, + *, + return_indexer: Literal[True], + ascending: bool = ..., + na_position: NaPosition = ..., + key: Callable | None = ..., + ) -> tuple[Self, np.ndarray]: + ... + + @overload + def sort_values( + self, + *, + return_indexer: bool = ..., + ascending: bool = ..., + na_position: NaPosition = ..., + key: Callable | None = ..., + ) -> Self | tuple[Self, np.ndarray]: + ... + + @deprecate_nonkeyword_arguments( + version="3.0", allowed_args=["self"], name="sort_values" + ) + def sort_values( + self, + return_indexer: bool = False, + ascending: bool = True, + na_position: NaPosition = "last", + key: Callable | None = None, + ) -> Self | tuple[Self, np.ndarray]: + """ + Return a sorted copy of the index. + + Return a sorted copy of the index, and optionally return the indices + that sorted the index itself. + + Parameters + ---------- + return_indexer : bool, default False + Should the indices that would sort the index be returned. + ascending : bool, default True + Should the index values be sorted in an ascending order. + na_position : {'first' or 'last'}, default 'last' + Argument 'first' puts NaNs at the beginning, 'last' puts NaNs at + the end. + key : callable, optional + If not None, apply the key function to the index values + before sorting. This is similar to the `key` argument in the + builtin :meth:`sorted` function, with the notable difference that + this `key` function should be *vectorized*. It should expect an + ``Index`` and return an ``Index`` of the same shape. + + Returns + ------- + sorted_index : pandas.Index + Sorted copy of the index. + indexer : numpy.ndarray, optional + The indices that the index itself was sorted by. + + See Also + -------- + Series.sort_values : Sort values of a Series. + DataFrame.sort_values : Sort values in a DataFrame. + + Examples + -------- + >>> idx = pd.Index([10, 100, 1, 1000]) + >>> idx + Index([10, 100, 1, 1000], dtype='int64') + + Sort values in ascending order (default behavior). + + >>> idx.sort_values() + Index([1, 10, 100, 1000], dtype='int64') + + Sort values in descending order, and also get the indices `idx` was + sorted by. + + >>> idx.sort_values(ascending=False, return_indexer=True) + (Index([1000, 100, 10, 1], dtype='int64'), array([3, 1, 0, 2])) + """ + if key is None and ( + (ascending and self.is_monotonic_increasing) + or (not ascending and self.is_monotonic_decreasing) + ): + if return_indexer: + indexer = np.arange(len(self), dtype=np.intp) + return self.copy(), indexer + else: + return self.copy() + + # GH 35584. Sort missing values according to na_position kwarg + # ignore na_position for MultiIndex + if not isinstance(self, ABCMultiIndex): + _as = nargsort( + items=self, ascending=ascending, na_position=na_position, key=key + ) + else: + idx = cast(Index, ensure_key_mapped(self, key)) + _as = idx.argsort(na_position=na_position) + if not ascending: + _as = _as[::-1] + + sorted_index = self.take(_as) + + if return_indexer: + return sorted_index, _as + else: + return sorted_index + + @final + def sort(self, *args, **kwargs): + """ + Use sort_values instead. + """ + raise TypeError("cannot sort an Index object in-place, use sort_values instead") + + def shift(self, periods: int = 1, freq=None): + """ + Shift index by desired number of time frequency increments. + + This method is for shifting the values of datetime-like indexes + by a specified time increment a given number of times. + + Parameters + ---------- + periods : int, default 1 + Number of periods (or increments) to shift by, + can be positive or negative. + freq : pandas.DateOffset, pandas.Timedelta or str, optional + Frequency increment to shift by. + If None, the index is shifted by its own `freq` attribute. + Offset aliases are valid strings, e.g., 'D', 'W', 'M' etc. + + Returns + ------- + pandas.Index + Shifted index. + + See Also + -------- + Series.shift : Shift values of Series. + + Notes + ----- + This method is only implemented for datetime-like index classes, + i.e., DatetimeIndex, PeriodIndex and TimedeltaIndex. + + Examples + -------- + Put the first 5 month starts of 2011 into an index. + + >>> month_starts = pd.date_range('1/1/2011', periods=5, freq='MS') + >>> month_starts + DatetimeIndex(['2011-01-01', '2011-02-01', '2011-03-01', '2011-04-01', + '2011-05-01'], + dtype='datetime64[ns]', freq='MS') + + Shift the index by 10 days. + + >>> month_starts.shift(10, freq='D') + DatetimeIndex(['2011-01-11', '2011-02-11', '2011-03-11', '2011-04-11', + '2011-05-11'], + dtype='datetime64[ns]', freq=None) + + The default value of `freq` is the `freq` attribute of the index, + which is 'MS' (month start) in this example. + + >>> month_starts.shift(10) + DatetimeIndex(['2011-11-01', '2011-12-01', '2012-01-01', '2012-02-01', + '2012-03-01'], + dtype='datetime64[ns]', freq='MS') + """ + raise NotImplementedError( + f"This method is only implemented for DatetimeIndex, PeriodIndex and " + f"TimedeltaIndex; Got type {type(self).__name__}" + ) + + def argsort(self, *args, **kwargs) -> npt.NDArray[np.intp]: + """ + Return the integer indices that would sort the index. + + Parameters + ---------- + *args + Passed to `numpy.ndarray.argsort`. + **kwargs + Passed to `numpy.ndarray.argsort`. + + Returns + ------- + np.ndarray[np.intp] + Integer indices that would sort the index if used as + an indexer. + + See Also + -------- + numpy.argsort : Similar method for NumPy arrays. + Index.sort_values : Return sorted copy of Index. + + Examples + -------- + >>> idx = pd.Index(['b', 'a', 'd', 'c']) + >>> idx + Index(['b', 'a', 'd', 'c'], dtype='object') + + >>> order = idx.argsort() + >>> order + array([1, 0, 3, 2]) + + >>> idx[order] + Index(['a', 'b', 'c', 'd'], dtype='object') + """ + # This works for either ndarray or EA, is overridden + # by RangeIndex, MultIIndex + return self._data.argsort(*args, **kwargs) + + def _check_indexing_error(self, key): + if not is_scalar(key): + # if key is not a scalar, directly raise an error (the code below + # would convert to numpy arrays and raise later any way) - GH29926 + raise InvalidIndexError(key) + + @cache_readonly + def _should_fallback_to_positional(self) -> bool: + """ + Should an integer key be treated as positional? + """ + return self.inferred_type not in { + "integer", + "mixed-integer", + "floating", + "complex", + } + + _index_shared_docs[ + "get_indexer_non_unique" + ] = """ + Compute indexer and mask for new index given the current index. + + The indexer should be then used as an input to ndarray.take to align the + current data to the new index. + + Parameters + ---------- + target : %(target_klass)s + + Returns + ------- + indexer : np.ndarray[np.intp] + Integers from 0 to n - 1 indicating that the index at these + positions matches the corresponding target values. Missing values + in the target are marked by -1. + missing : np.ndarray[np.intp] + An indexer into the target of the values not found. + These correspond to the -1 in the indexer array. + + Examples + -------- + >>> index = pd.Index(['c', 'b', 'a', 'b', 'b']) + >>> index.get_indexer_non_unique(['b', 'b']) + (array([1, 3, 4, 1, 3, 4]), array([], dtype=int64)) + + In the example below there are no matched values. + + >>> index = pd.Index(['c', 'b', 'a', 'b', 'b']) + >>> index.get_indexer_non_unique(['q', 'r', 't']) + (array([-1, -1, -1]), array([0, 1, 2])) + + For this reason, the returned ``indexer`` contains only integers equal to -1. + It demonstrates that there's no match between the index and the ``target`` + values at these positions. The mask [0, 1, 2] in the return value shows that + the first, second, and third elements are missing. + + Notice that the return value is a tuple contains two items. In the example + below the first item is an array of locations in ``index``. The second + item is a mask shows that the first and third elements are missing. + + >>> index = pd.Index(['c', 'b', 'a', 'b', 'b']) + >>> index.get_indexer_non_unique(['f', 'b', 's']) + (array([-1, 1, 3, 4, -1]), array([0, 2])) + """ + + @Appender(_index_shared_docs["get_indexer_non_unique"] % _index_doc_kwargs) + def get_indexer_non_unique( + self, target + ) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]]: + target = ensure_index(target) + target = self._maybe_cast_listlike_indexer(target) + + if not self._should_compare(target) and not self._should_partial_index(target): + # _should_partial_index e.g. IntervalIndex with numeric scalars + # that can be matched to Interval scalars. + return self._get_indexer_non_comparable(target, method=None, unique=False) + + pself, ptarget = self._maybe_downcast_for_indexing(target) + if pself is not self or ptarget is not target: + return pself.get_indexer_non_unique(ptarget) + + if self.dtype != target.dtype: + # TODO: if object, could use infer_dtype to preempt costly + # conversion if still non-comparable? + dtype = self._find_common_type_compat(target) + + this = self.astype(dtype, copy=False) + that = target.astype(dtype, copy=False) + return this.get_indexer_non_unique(that) + + # TODO: get_indexer has fastpaths for both Categorical-self and + # Categorical-target. Can we do something similar here? + + # Note: _maybe_downcast_for_indexing ensures we never get here + # with MultiIndex self and non-Multi target + if self._is_multi and target._is_multi: + engine = self._engine + # Item "IndexEngine" of "Union[IndexEngine, ExtensionEngine]" has + # no attribute "_extract_level_codes" + tgt_values = engine._extract_level_codes(target) # type: ignore[union-attr] + else: + tgt_values = target._get_engine_target() + + indexer, missing = self._engine.get_indexer_non_unique(tgt_values) + return ensure_platform_int(indexer), ensure_platform_int(missing) + + @final + def get_indexer_for(self, target) -> npt.NDArray[np.intp]: + """ + Guaranteed return of an indexer even when non-unique. + + This dispatches to get_indexer or get_indexer_non_unique + as appropriate. + + Returns + ------- + np.ndarray[np.intp] + List of indices. + + Examples + -------- + >>> idx = pd.Index([np.nan, 'var1', np.nan]) + >>> idx.get_indexer_for([np.nan]) + array([0, 2]) + """ + if self._index_as_unique: + return self.get_indexer(target) + indexer, _ = self.get_indexer_non_unique(target) + return indexer + + def _get_indexer_strict(self, key, axis_name: str_t) -> tuple[Index, np.ndarray]: + """ + Analogue to get_indexer that raises if any elements are missing. + """ + keyarr = key + if not isinstance(keyarr, Index): + keyarr = com.asarray_tuplesafe(keyarr) + + if self._index_as_unique: + indexer = self.get_indexer_for(keyarr) + keyarr = self.reindex(keyarr)[0] + else: + keyarr, indexer, new_indexer = self._reindex_non_unique(keyarr) + + self._raise_if_missing(keyarr, indexer, axis_name) + + keyarr = self.take(indexer) + if isinstance(key, Index): + # GH 42790 - Preserve name from an Index + keyarr.name = key.name + if lib.is_np_dtype(keyarr.dtype, "mM") or isinstance( + keyarr.dtype, DatetimeTZDtype + ): + # DTI/TDI.take can infer a freq in some cases when we dont want one + if isinstance(key, list) or ( + isinstance(key, type(self)) + # "Index" has no attribute "freq" + and key.freq is None # type: ignore[attr-defined] + ): + keyarr = keyarr._with_freq(None) + + return keyarr, indexer + + def _raise_if_missing(self, key, indexer, axis_name: str_t) -> None: + """ + Check that indexer can be used to return a result. + + e.g. at least one element was found, + unless the list of keys was actually empty. + + Parameters + ---------- + key : list-like + Targeted labels (only used to show correct error message). + indexer: array-like of booleans + Indices corresponding to the key, + (with -1 indicating not found). + axis_name : str + + Raises + ------ + KeyError + If at least one key was requested but none was found. + """ + if len(key) == 0: + return + + # Count missing values + missing_mask = indexer < 0 + nmissing = missing_mask.sum() + + if nmissing: + if nmissing == len(indexer): + raise KeyError(f"None of [{key}] are in the [{axis_name}]") + + not_found = list(ensure_index(key)[missing_mask.nonzero()[0]].unique()) + raise KeyError(f"{not_found} not in index") + + @overload + def _get_indexer_non_comparable( + self, target: Index, method, unique: Literal[True] = ... + ) -> npt.NDArray[np.intp]: + ... + + @overload + def _get_indexer_non_comparable( + self, target: Index, method, unique: Literal[False] + ) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]]: + ... + + @overload + def _get_indexer_non_comparable( + self, target: Index, method, unique: bool = True + ) -> npt.NDArray[np.intp] | tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]]: + ... + + @final + def _get_indexer_non_comparable( + self, target: Index, method, unique: bool = True + ) -> npt.NDArray[np.intp] | tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]]: + """ + Called from get_indexer or get_indexer_non_unique when the target + is of a non-comparable dtype. + + For get_indexer lookups with method=None, get_indexer is an _equality_ + check, so non-comparable dtypes mean we will always have no matches. + + For get_indexer lookups with a method, get_indexer is an _inequality_ + check, so non-comparable dtypes mean we will always raise TypeError. + + Parameters + ---------- + target : Index + method : str or None + unique : bool, default True + * True if called from get_indexer. + * False if called from get_indexer_non_unique. + + Raises + ------ + TypeError + If doing an inequality check, i.e. method is not None. + """ + if method is not None: + other_dtype = _unpack_nested_dtype(target) + raise TypeError(f"Cannot compare dtypes {self.dtype} and {other_dtype}") + + no_matches = -1 * np.ones(target.shape, dtype=np.intp) + if unique: + # This is for get_indexer + return no_matches + else: + # This is for get_indexer_non_unique + missing = np.arange(len(target), dtype=np.intp) + return no_matches, missing + + @property + def _index_as_unique(self) -> bool: + """ + Whether we should treat this as unique for the sake of + get_indexer vs get_indexer_non_unique. + + For IntervalIndex compat. + """ + return self.is_unique + + _requires_unique_msg = "Reindexing only valid with uniquely valued Index objects" + + @final + def _maybe_downcast_for_indexing(self, other: Index) -> tuple[Index, Index]: + """ + When dealing with an object-dtype Index and a non-object Index, see + if we can upcast the object-dtype one to improve performance. + """ + + if isinstance(self, ABCDatetimeIndex) and isinstance(other, ABCDatetimeIndex): + if ( + self.tz is not None + and other.tz is not None + and not tz_compare(self.tz, other.tz) + ): + # standardize on UTC + return self.tz_convert("UTC"), other.tz_convert("UTC") + + elif self.inferred_type == "date" and isinstance(other, ABCDatetimeIndex): + try: + return type(other)(self), other + except OutOfBoundsDatetime: + return self, other + elif self.inferred_type == "timedelta" and isinstance(other, ABCTimedeltaIndex): + # TODO: we dont have tests that get here + return type(other)(self), other + + elif self.dtype.kind == "u" and other.dtype.kind == "i": + # GH#41873 + if other.min() >= 0: + # lookup min as it may be cached + # TODO: may need itemsize check if we have non-64-bit Indexes + return self, other.astype(self.dtype) + + elif self._is_multi and not other._is_multi: + try: + # "Type[Index]" has no attribute "from_tuples" + other = type(self).from_tuples(other) # type: ignore[attr-defined] + except (TypeError, ValueError): + # let's instead try with a straight Index + self = Index(self._values) + + if not is_object_dtype(self.dtype) and is_object_dtype(other.dtype): + # Reverse op so we dont need to re-implement on the subclasses + other, self = other._maybe_downcast_for_indexing(self) + + return self, other + + @final + def _find_common_type_compat(self, target) -> DtypeObj: + """ + Implementation of find_common_type that adjusts for Index-specific + special cases. + """ + target_dtype, _ = infer_dtype_from(target) + + # special case: if one dtype is uint64 and the other a signed int, return object + # See https://github.com/pandas-dev/pandas/issues/26778 for discussion + # Now it's: + # * float | [u]int -> float + # * uint64 | signed int -> object + # We may change union(float | [u]int) to go to object. + if self.dtype == "uint64" or target_dtype == "uint64": + if is_signed_integer_dtype(self.dtype) or is_signed_integer_dtype( + target_dtype + ): + return _dtype_obj + + dtype = find_result_type(self.dtype, target) + dtype = common_dtype_categorical_compat([self, target], dtype) + return dtype + + @final + def _should_compare(self, other: Index) -> bool: + """ + Check if `self == other` can ever have non-False entries. + """ + + # NB: we use inferred_type rather than is_bool_dtype to catch + # object_dtype_of_bool and categorical[object_dtype_of_bool] cases + if ( + other.inferred_type == "boolean" and is_any_real_numeric_dtype(self.dtype) + ) or ( + self.inferred_type == "boolean" and is_any_real_numeric_dtype(other.dtype) + ): + # GH#16877 Treat boolean labels passed to a numeric index as not + # found. Without this fix False and True would be treated as 0 and 1 + # respectively. + return False + + dtype = _unpack_nested_dtype(other) + return self._is_comparable_dtype(dtype) or is_object_dtype(dtype) + + def _is_comparable_dtype(self, dtype: DtypeObj) -> bool: + """ + Can we compare values of the given dtype to our own? + """ + if self.dtype.kind == "b": + return dtype.kind == "b" + elif is_numeric_dtype(self.dtype): + return is_numeric_dtype(dtype) + # TODO: this was written assuming we only get here with object-dtype, + # which is no longer correct. Can we specialize for EA? + return True + + @final + def groupby(self, values) -> PrettyDict[Hashable, np.ndarray]: + """ + Group the index labels by a given array of values. + + Parameters + ---------- + values : array + Values used to determine the groups. + + Returns + ------- + dict + {group name -> group labels} + """ + # TODO: if we are a MultiIndex, we can do better + # that converting to tuples + if isinstance(values, ABCMultiIndex): + values = values._values + values = Categorical(values) + result = values._reverse_indexer() + + # map to the label + result = {k: self.take(v) for k, v in result.items()} + + return PrettyDict(result) + + def map(self, mapper, na_action: Literal["ignore"] | None = None): + """ + Map values using an input mapping or function. + + Parameters + ---------- + mapper : function, dict, or Series + Mapping correspondence. + na_action : {None, 'ignore'} + If 'ignore', propagate NA values, without passing them to the + mapping correspondence. + + Returns + ------- + Union[Index, MultiIndex] + The output of the mapping function applied to the index. + If the function returns a tuple with more than one element + a MultiIndex will be returned. + + Examples + -------- + >>> idx = pd.Index([1, 2, 3]) + >>> idx.map({1: 'a', 2: 'b', 3: 'c'}) + Index(['a', 'b', 'c'], dtype='object') + + Using `map` with a function: + + >>> idx = pd.Index([1, 2, 3]) + >>> idx.map('I am a {}'.format) + Index(['I am a 1', 'I am a 2', 'I am a 3'], dtype='object') + + >>> idx = pd.Index(['a', 'b', 'c']) + >>> idx.map(lambda x: x.upper()) + Index(['A', 'B', 'C'], dtype='object') + """ + from pandas.core.indexes.multi import MultiIndex + + new_values = self._map_values(mapper, na_action=na_action) + + # we can return a MultiIndex + if new_values.size and isinstance(new_values[0], tuple): + if isinstance(self, MultiIndex): + names = self.names + elif self.name: + names = [self.name] * len(new_values[0]) + else: + names = None + return MultiIndex.from_tuples(new_values, names=names) + + dtype = None + if not new_values.size: + # empty + dtype = self.dtype + + # e.g. if we are floating and new_values is all ints, then we + # don't want to cast back to floating. But if we are UInt64 + # and new_values is all ints, we want to try. + same_dtype = lib.infer_dtype(new_values, skipna=False) == self.inferred_type + if same_dtype: + new_values = maybe_cast_pointwise_result( + new_values, self.dtype, same_dtype=same_dtype + ) + + return Index._with_infer(new_values, dtype=dtype, copy=False, name=self.name) + + # TODO: De-duplicate with map, xref GH#32349 + @final + def _transform_index(self, func, *, level=None) -> Index: + """ + Apply function to all values found in index. + + This includes transforming multiindex entries separately. + Only apply function to one level of the MultiIndex if level is specified. + """ + if isinstance(self, ABCMultiIndex): + values = [ + self.get_level_values(i).map(func) + if i == level or level is None + else self.get_level_values(i) + for i in range(self.nlevels) + ] + return type(self).from_arrays(values) + else: + items = [func(x) for x in self] + return Index(items, name=self.name, tupleize_cols=False) + + def isin(self, values, level=None) -> npt.NDArray[np.bool_]: + """ + Return a boolean array where the index values are in `values`. + + Compute boolean array of whether each index value is found in the + passed set of values. The length of the returned boolean array matches + the length of the index. + + Parameters + ---------- + values : set or list-like + Sought values. + level : str or int, optional + Name or position of the index level to use (if the index is a + `MultiIndex`). + + Returns + ------- + np.ndarray[bool] + NumPy array of boolean values. + + See Also + -------- + Series.isin : Same for Series. + DataFrame.isin : Same method for DataFrames. + + Notes + ----- + In the case of `MultiIndex` you must either specify `values` as a + list-like object containing tuples that are the same length as the + number of levels, or specify `level`. Otherwise it will raise a + ``ValueError``. + + If `level` is specified: + + - if it is the name of one *and only one* index level, use that level; + - otherwise it should be a number indicating level position. + + Examples + -------- + >>> idx = pd.Index([1,2,3]) + >>> idx + Index([1, 2, 3], dtype='int64') + + Check whether each index value in a list of values. + + >>> idx.isin([1, 4]) + array([ True, False, False]) + + >>> midx = pd.MultiIndex.from_arrays([[1,2,3], + ... ['red', 'blue', 'green']], + ... names=('number', 'color')) + >>> midx + MultiIndex([(1, 'red'), + (2, 'blue'), + (3, 'green')], + names=['number', 'color']) + + Check whether the strings in the 'color' level of the MultiIndex + are in a list of colors. + + >>> midx.isin(['red', 'orange', 'yellow'], level='color') + array([ True, False, False]) + + To check across the levels of a MultiIndex, pass a list of tuples: + + >>> midx.isin([(1, 'red'), (3, 'red')]) + array([ True, False, False]) + """ + if level is not None: + self._validate_index_level(level) + return algos.isin(self._values, values) + + def _get_string_slice(self, key: str_t): + # this is for partial string indexing, + # overridden in DatetimeIndex, TimedeltaIndex and PeriodIndex + raise NotImplementedError + + def slice_indexer( + self, + start: Hashable | None = None, + end: Hashable | None = None, + step: int | None = None, + ) -> slice: + """ + Compute the slice indexer for input labels and step. + + Index needs to be ordered and unique. + + Parameters + ---------- + start : label, default None + If None, defaults to the beginning. + end : label, default None + If None, defaults to the end. + step : int, default None + + Returns + ------- + slice + + Raises + ------ + KeyError : If key does not exist, or key is not unique and index is + not ordered. + + Notes + ----- + This function assumes that the data is sorted, so use at your own peril + + Examples + -------- + This is a method on all index types. For example you can do: + + >>> idx = pd.Index(list('abcd')) + >>> idx.slice_indexer(start='b', end='c') + slice(1, 3, None) + + >>> idx = pd.MultiIndex.from_arrays([list('abcd'), list('efgh')]) + >>> idx.slice_indexer(start='b', end=('c', 'g')) + slice(1, 3, None) + """ + start_slice, end_slice = self.slice_locs(start, end, step=step) + + # return a slice + if not is_scalar(start_slice): + raise AssertionError("Start slice bound is non-scalar") + if not is_scalar(end_slice): + raise AssertionError("End slice bound is non-scalar") + + return slice(start_slice, end_slice, step) + + def _maybe_cast_indexer(self, key): + """ + If we have a float key and are not a floating index, then try to cast + to an int if equivalent. + """ + return key + + def _maybe_cast_listlike_indexer(self, target) -> Index: + """ + Analogue to maybe_cast_indexer for get_indexer instead of get_loc. + """ + return ensure_index(target) + + @final + def _validate_indexer( + self, + form: Literal["positional", "slice"], + key, + kind: Literal["getitem", "iloc"], + ) -> None: + """ + If we are positional indexer, validate that we have appropriate + typed bounds must be an integer. + """ + if not lib.is_int_or_none(key): + self._raise_invalid_indexer(form, key) + + def _maybe_cast_slice_bound(self, label, side: str_t): + """ + This function should be overloaded in subclasses that allow non-trivial + casting on label-slice bounds, e.g. datetime-like indices allowing + strings containing formatted datetimes. + + Parameters + ---------- + label : object + side : {'left', 'right'} + + Returns + ------- + label : object + + Notes + ----- + Value of `side` parameter should be validated in caller. + """ + + # We are a plain index here (sub-class override this method if they + # wish to have special treatment for floats/ints, e.g. datetimelike Indexes + + if is_numeric_dtype(self.dtype): + return self._maybe_cast_indexer(label) + + # reject them, if index does not contain label + if (is_float(label) or is_integer(label)) and label not in self: + self._raise_invalid_indexer("slice", label) + + return label + + def _searchsorted_monotonic(self, label, side: Literal["left", "right"] = "left"): + if self.is_monotonic_increasing: + return self.searchsorted(label, side=side) + elif self.is_monotonic_decreasing: + # np.searchsorted expects ascending sort order, have to reverse + # everything for it to work (element ordering, search side and + # resulting value). + pos = self[::-1].searchsorted( + label, side="right" if side == "left" else "left" + ) + return len(self) - pos + + raise ValueError("index must be monotonic increasing or decreasing") + + def get_slice_bound(self, label, side: Literal["left", "right"]) -> int: + """ + Calculate slice bound that corresponds to given label. + + Returns leftmost (one-past-the-rightmost if ``side=='right'``) position + of given label. + + Parameters + ---------- + label : object + side : {'left', 'right'} + + Returns + ------- + int + Index of label. + + See Also + -------- + Index.get_loc : Get integer location, slice or boolean mask for requested + label. + + Examples + -------- + >>> idx = pd.RangeIndex(5) + >>> idx.get_slice_bound(3, 'left') + 3 + + >>> idx.get_slice_bound(3, 'right') + 4 + + If ``label`` is non-unique in the index, an error will be raised. + + >>> idx_duplicate = pd.Index(['a', 'b', 'a', 'c', 'd']) + >>> idx_duplicate.get_slice_bound('a', 'left') + Traceback (most recent call last): + KeyError: Cannot get left slice bound for non-unique label: 'a' + """ + + if side not in ("left", "right"): + raise ValueError( + "Invalid value for side kwarg, must be either " + f"'left' or 'right': {side}" + ) + + original_label = label + + # For datetime indices label may be a string that has to be converted + # to datetime boundary according to its resolution. + label = self._maybe_cast_slice_bound(label, side) + + # we need to look up the label + try: + slc = self.get_loc(label) + except KeyError as err: + try: + return self._searchsorted_monotonic(label, side) + except ValueError: + # raise the original KeyError + raise err + + if isinstance(slc, np.ndarray): + # get_loc may return a boolean array, which + # is OK as long as they are representable by a slice. + assert is_bool_dtype(slc.dtype) + slc = lib.maybe_booleans_to_slice(slc.view("u1")) + if isinstance(slc, np.ndarray): + raise KeyError( + f"Cannot get {side} slice bound for non-unique " + f"label: {repr(original_label)}" + ) + + if isinstance(slc, slice): + if side == "left": + return slc.start + else: + return slc.stop + else: + if side == "right": + return slc + 1 + else: + return slc + + def slice_locs(self, start=None, end=None, step=None) -> tuple[int, int]: + """ + Compute slice locations for input labels. + + Parameters + ---------- + start : label, default None + If None, defaults to the beginning. + end : label, default None + If None, defaults to the end. + step : int, defaults None + If None, defaults to 1. + + Returns + ------- + tuple[int, int] + + See Also + -------- + Index.get_loc : Get location for a single label. + + Notes + ----- + This method only works if the index is monotonic or unique. + + Examples + -------- + >>> idx = pd.Index(list('abcd')) + >>> idx.slice_locs(start='b', end='c') + (1, 3) + """ + inc = step is None or step >= 0 + + if not inc: + # If it's a reverse slice, temporarily swap bounds. + start, end = end, start + + # GH 16785: If start and end happen to be date strings with UTC offsets + # attempt to parse and check that the offsets are the same + if isinstance(start, (str, datetime)) and isinstance(end, (str, datetime)): + try: + ts_start = Timestamp(start) + ts_end = Timestamp(end) + except (ValueError, TypeError): + pass + else: + if not tz_compare(ts_start.tzinfo, ts_end.tzinfo): + raise ValueError("Both dates must have the same UTC offset") + + start_slice = None + if start is not None: + start_slice = self.get_slice_bound(start, "left") + if start_slice is None: + start_slice = 0 + + end_slice = None + if end is not None: + end_slice = self.get_slice_bound(end, "right") + if end_slice is None: + end_slice = len(self) + + if not inc: + # Bounds at this moment are swapped, swap them back and shift by 1. + # + # slice_locs('B', 'A', step=-1): s='B', e='A' + # + # s='A' e='B' + # AFTER SWAP: | | + # v ------------------> V + # ----------------------------------- + # | | |A|A|A|A| | | | | |B|B| | | | | + # ----------------------------------- + # ^ <------------------ ^ + # SHOULD BE: | | + # end=s-1 start=e-1 + # + end_slice, start_slice = start_slice - 1, end_slice - 1 + + # i == -1 triggers ``len(self) + i`` selection that points to the + # last element, not before-the-first one, subtracting len(self) + # compensates that. + if end_slice == -1: + end_slice -= len(self) + if start_slice == -1: + start_slice -= len(self) + + return start_slice, end_slice + + def delete(self, loc) -> Self: + """ + Make new Index with passed location(-s) deleted. + + Parameters + ---------- + loc : int or list of int + Location of item(-s) which will be deleted. + Use a list of locations to delete more than one value at the same time. + + Returns + ------- + Index + Will be same type as self, except for RangeIndex. + + See Also + -------- + numpy.delete : Delete any rows and column from NumPy array (ndarray). + + Examples + -------- + >>> idx = pd.Index(['a', 'b', 'c']) + >>> idx.delete(1) + Index(['a', 'c'], dtype='object') + + >>> idx = pd.Index(['a', 'b', 'c']) + >>> idx.delete([0, 2]) + Index(['b'], dtype='object') + """ + values = self._values + res_values: ArrayLike + if isinstance(values, np.ndarray): + # TODO(__array_function__): special casing will be unnecessary + res_values = np.delete(values, loc) + else: + res_values = values.delete(loc) + + # _constructor so RangeIndex-> Index with an int64 dtype + return self._constructor._simple_new(res_values, name=self.name) + + def insert(self, loc: int, item) -> Index: + """ + Make new Index inserting new item at location. + + Follows Python numpy.insert semantics for negative values. + + Parameters + ---------- + loc : int + item : object + + Returns + ------- + Index + + Examples + -------- + >>> idx = pd.Index(['a', 'b', 'c']) + >>> idx.insert(1, 'x') + Index(['a', 'x', 'b', 'c'], dtype='object') + """ + item = lib.item_from_zerodim(item) + if is_valid_na_for_dtype(item, self.dtype) and self.dtype != object: + item = self._na_value + + arr = self._values + + try: + if isinstance(arr, ExtensionArray): + res_values = arr.insert(loc, item) + return type(self)._simple_new(res_values, name=self.name) + else: + item = self._validate_fill_value(item) + except (TypeError, ValueError, LossySetitemError): + # e.g. trying to insert an integer into a DatetimeIndex + # We cannot keep the same dtype, so cast to the (often object) + # minimal shared dtype before doing the insert. + dtype = self._find_common_type_compat(item) + return self.astype(dtype).insert(loc, item) + + if arr.dtype != object or not isinstance( + item, (tuple, np.datetime64, np.timedelta64) + ): + # with object-dtype we need to worry about numpy incorrectly casting + # dt64/td64 to integer, also about treating tuples as sequences + # special-casing dt64/td64 https://github.com/numpy/numpy/issues/12550 + casted = arr.dtype.type(item) + new_values = np.insert(arr, loc, casted) + + else: + # error: No overload variant of "insert" matches argument types + # "ndarray[Any, Any]", "int", "None" + new_values = np.insert(arr, loc, None) # type: ignore[call-overload] + loc = loc if loc >= 0 else loc - 1 + new_values[loc] = item + + out = Index._with_infer(new_values, name=self.name) + if ( + using_pyarrow_string_dtype() + and is_string_dtype(out.dtype) + and new_values.dtype == object + ): + out = out.astype(new_values.dtype) + if self.dtype == object and out.dtype != object: + # GH#51363 + warnings.warn( + "The behavior of Index.insert with object-dtype is deprecated, " + "in a future version this will return an object-dtype Index " + "instead of inferring a non-object dtype. To retain the old " + "behavior, do `idx.insert(loc, item).infer_objects(copy=False)`", + FutureWarning, + stacklevel=find_stack_level(), + ) + return out + + def drop( + self, + labels: Index | np.ndarray | Iterable[Hashable], + errors: IgnoreRaise = "raise", + ) -> Index: + """ + Make new Index with passed list of labels deleted. + + Parameters + ---------- + labels : array-like or scalar + errors : {'ignore', 'raise'}, default 'raise' + If 'ignore', suppress error and existing labels are dropped. + + Returns + ------- + Index + Will be same type as self, except for RangeIndex. + + Raises + ------ + KeyError + If not all of the labels are found in the selected axis + + Examples + -------- + >>> idx = pd.Index(['a', 'b', 'c']) + >>> idx.drop(['a']) + Index(['b', 'c'], dtype='object') + """ + if not isinstance(labels, Index): + # avoid materializing e.g. RangeIndex + arr_dtype = "object" if self.dtype == "object" else None + labels = com.index_labels_to_array(labels, dtype=arr_dtype) + + indexer = self.get_indexer_for(labels) + mask = indexer == -1 + if mask.any(): + if errors != "ignore": + raise KeyError(f"{labels[mask].tolist()} not found in axis") + indexer = indexer[~mask] + return self.delete(indexer) + + @final + def infer_objects(self, copy: bool = True) -> Index: + """ + If we have an object dtype, try to infer a non-object dtype. + + Parameters + ---------- + copy : bool, default True + Whether to make a copy in cases where no inference occurs. + """ + if self._is_multi: + raise NotImplementedError( + "infer_objects is not implemented for MultiIndex. " + "Use index.to_frame().infer_objects() instead." + ) + if self.dtype != object: + return self.copy() if copy else self + + values = self._values + values = cast("npt.NDArray[np.object_]", values) + res_values = lib.maybe_convert_objects( + values, + convert_non_numeric=True, + ) + if copy and res_values is values: + return self.copy() + result = Index(res_values, name=self.name) + if not copy and res_values is values and self._references is not None: + result._references = self._references + result._references.add_index_reference(result) + return result + + @final + def diff(self, periods: int = 1) -> Index: + """ + Computes the difference between consecutive values in the Index object. + + If periods is greater than 1, computes the difference between values that + are `periods` number of positions apart. + + Parameters + ---------- + periods : int, optional + The number of positions between the current and previous + value to compute the difference with. Default is 1. + + Returns + ------- + Index + A new Index object with the computed differences. + + Examples + -------- + >>> import pandas as pd + >>> idx = pd.Index([10, 20, 30, 40, 50]) + >>> idx.diff() + Index([nan, 10.0, 10.0, 10.0, 10.0], dtype='float64') + + """ + return Index(self.to_series().diff(periods)) + + @final + def round(self, decimals: int = 0) -> Self: + """ + Round each value in the Index to the given number of decimals. + + Parameters + ---------- + decimals : int, optional + Number of decimal places to round to. If decimals is negative, + it specifies the number of positions to the left of the decimal point. + + Returns + ------- + Index + A new Index with the rounded values. + + Examples + -------- + >>> import pandas as pd + >>> idx = pd.Index([10.1234, 20.5678, 30.9123, 40.4567, 50.7890]) + >>> idx.round(decimals=2) + Index([10.12, 20.57, 30.91, 40.46, 50.79], dtype='float64') + + """ + return self._constructor(self.to_series().round(decimals)) + + # -------------------------------------------------------------------- + # Generated Arithmetic, Comparison, and Unary Methods + + def _cmp_method(self, other, op): + """ + Wrapper used to dispatch comparison operations. + """ + if self.is_(other): + # fastpath + if op in {operator.eq, operator.le, operator.ge}: + arr = np.ones(len(self), dtype=bool) + if self._can_hold_na and not isinstance(self, ABCMultiIndex): + # TODO: should set MultiIndex._can_hold_na = False? + arr[self.isna()] = False + return arr + elif op is operator.ne: + arr = np.zeros(len(self), dtype=bool) + if self._can_hold_na and not isinstance(self, ABCMultiIndex): + arr[self.isna()] = True + return arr + + if isinstance(other, (np.ndarray, Index, ABCSeries, ExtensionArray)) and len( + self + ) != len(other): + raise ValueError("Lengths must match to compare") + + if not isinstance(other, ABCMultiIndex): + other = extract_array(other, extract_numpy=True) + else: + other = np.asarray(other) + + if is_object_dtype(self.dtype) and isinstance(other, ExtensionArray): + # e.g. PeriodArray, Categorical + result = op(self._values, other) + + elif isinstance(self._values, ExtensionArray): + result = op(self._values, other) + + elif is_object_dtype(self.dtype) and not isinstance(self, ABCMultiIndex): + # don't pass MultiIndex + result = ops.comp_method_OBJECT_ARRAY(op, self._values, other) + + else: + result = ops.comparison_op(self._values, other, op) + + return result + + @final + def _logical_method(self, other, op): + res_name = ops.get_op_result_name(self, other) + + lvalues = self._values + rvalues = extract_array(other, extract_numpy=True, extract_range=True) + + res_values = ops.logical_op(lvalues, rvalues, op) + return self._construct_result(res_values, name=res_name) + + @final + def _construct_result(self, result, name): + if isinstance(result, tuple): + return ( + Index(result[0], name=name, dtype=result[0].dtype), + Index(result[1], name=name, dtype=result[1].dtype), + ) + return Index(result, name=name, dtype=result.dtype) + + def _arith_method(self, other, op): + if ( + isinstance(other, Index) + and is_object_dtype(other.dtype) + and type(other) is not Index + ): + # We return NotImplemented for object-dtype index *subclasses* so they have + # a chance to implement ops before we unwrap them. + # See https://github.com/pandas-dev/pandas/issues/31109 + return NotImplemented + + return super()._arith_method(other, op) + + @final + def _unary_method(self, op): + result = op(self._values) + return Index(result, name=self.name) + + def __abs__(self) -> Index: + return self._unary_method(operator.abs) + + def __neg__(self) -> Index: + return self._unary_method(operator.neg) + + def __pos__(self) -> Index: + return self._unary_method(operator.pos) + + def __invert__(self) -> Index: + # GH#8875 + return self._unary_method(operator.inv) + + # -------------------------------------------------------------------- + # Reductions + + def any(self, *args, **kwargs): + """ + Return whether any element is Truthy. + + Parameters + ---------- + *args + Required for compatibility with numpy. + **kwargs + Required for compatibility with numpy. + + Returns + ------- + bool or array-like (if axis is specified) + A single element array-like may be converted to bool. + + See Also + -------- + Index.all : Return whether all elements are True. + Series.all : Return whether all elements are True. + + Notes + ----- + Not a Number (NaN), positive infinity and negative infinity + evaluate to True because these are not equal to zero. + + Examples + -------- + >>> index = pd.Index([0, 1, 2]) + >>> index.any() + True + + >>> index = pd.Index([0, 0, 0]) + >>> index.any() + False + """ + nv.validate_any(args, kwargs) + self._maybe_disable_logical_methods("any") + vals = self._values + if not isinstance(vals, np.ndarray): + # i.e. EA, call _reduce instead of "any" to get TypeError instead + # of AttributeError + return vals._reduce("any") + return np.any(vals) + + def all(self, *args, **kwargs): + """ + Return whether all elements are Truthy. + + Parameters + ---------- + *args + Required for compatibility with numpy. + **kwargs + Required for compatibility with numpy. + + Returns + ------- + bool or array-like (if axis is specified) + A single element array-like may be converted to bool. + + See Also + -------- + Index.any : Return whether any element in an Index is True. + Series.any : Return whether any element in a Series is True. + Series.all : Return whether all elements in a Series are True. + + Notes + ----- + Not a Number (NaN), positive infinity and negative infinity + evaluate to True because these are not equal to zero. + + Examples + -------- + True, because nonzero integers are considered True. + + >>> pd.Index([1, 2, 3]).all() + True + + False, because ``0`` is considered False. + + >>> pd.Index([0, 1, 2]).all() + False + """ + nv.validate_all(args, kwargs) + self._maybe_disable_logical_methods("all") + vals = self._values + if not isinstance(vals, np.ndarray): + # i.e. EA, call _reduce instead of "all" to get TypeError instead + # of AttributeError + return vals._reduce("all") + return np.all(vals) + + @final + def _maybe_disable_logical_methods(self, opname: str_t) -> None: + """ + raise if this Index subclass does not support any or all. + """ + if ( + isinstance(self, ABCMultiIndex) + # TODO(3.0): PeriodArray and DatetimeArray any/all will raise, + # so checking needs_i8_conversion will be unnecessary + or (needs_i8_conversion(self.dtype) and self.dtype.kind != "m") + ): + # This call will raise + make_invalid_op(opname)(self) + + @Appender(IndexOpsMixin.argmin.__doc__) + def argmin(self, axis=None, skipna: bool = True, *args, **kwargs) -> int: + nv.validate_argmin(args, kwargs) + nv.validate_minmax_axis(axis) + + if not self._is_multi and self.hasnans: + # Take advantage of cache + mask = self._isnan + if not skipna or mask.all(): + warnings.warn( + f"The behavior of {type(self).__name__}.argmax/argmin " + "with skipna=False and NAs, or with all-NAs is deprecated. " + "In a future version this will raise ValueError.", + FutureWarning, + stacklevel=find_stack_level(), + ) + return -1 + return super().argmin(skipna=skipna) + + @Appender(IndexOpsMixin.argmax.__doc__) + def argmax(self, axis=None, skipna: bool = True, *args, **kwargs) -> int: + nv.validate_argmax(args, kwargs) + nv.validate_minmax_axis(axis) + + if not self._is_multi and self.hasnans: + # Take advantage of cache + mask = self._isnan + if not skipna or mask.all(): + warnings.warn( + f"The behavior of {type(self).__name__}.argmax/argmin " + "with skipna=False and NAs, or with all-NAs is deprecated. " + "In a future version this will raise ValueError.", + FutureWarning, + stacklevel=find_stack_level(), + ) + return -1 + return super().argmax(skipna=skipna) + + def min(self, axis=None, skipna: bool = True, *args, **kwargs): + """ + Return the minimum value of the Index. + + Parameters + ---------- + axis : {None} + Dummy argument for consistency with Series. + skipna : bool, default True + Exclude NA/null values when showing the result. + *args, **kwargs + Additional arguments and keywords for compatibility with NumPy. + + Returns + ------- + scalar + Minimum value. + + See Also + -------- + Index.max : Return the maximum value of the object. + Series.min : Return the minimum value in a Series. + DataFrame.min : Return the minimum values in a DataFrame. + + Examples + -------- + >>> idx = pd.Index([3, 2, 1]) + >>> idx.min() + 1 + + >>> idx = pd.Index(['c', 'b', 'a']) + >>> idx.min() + 'a' + + For a MultiIndex, the minimum is determined lexicographically. + + >>> idx = pd.MultiIndex.from_product([('a', 'b'), (2, 1)]) + >>> idx.min() + ('a', 1) + """ + nv.validate_min(args, kwargs) + nv.validate_minmax_axis(axis) + + if not len(self): + return self._na_value + + if len(self) and self.is_monotonic_increasing: + # quick check + first = self[0] + if not isna(first): + return first + + if not self._is_multi and self.hasnans: + # Take advantage of cache + mask = self._isnan + if not skipna or mask.all(): + return self._na_value + + if not self._is_multi and not isinstance(self._values, np.ndarray): + return self._values._reduce(name="min", skipna=skipna) + + return nanops.nanmin(self._values, skipna=skipna) + + def max(self, axis=None, skipna: bool = True, *args, **kwargs): + """ + Return the maximum value of the Index. + + Parameters + ---------- + axis : int, optional + For compatibility with NumPy. Only 0 or None are allowed. + skipna : bool, default True + Exclude NA/null values when showing the result. + *args, **kwargs + Additional arguments and keywords for compatibility with NumPy. + + Returns + ------- + scalar + Maximum value. + + See Also + -------- + Index.min : Return the minimum value in an Index. + Series.max : Return the maximum value in a Series. + DataFrame.max : Return the maximum values in a DataFrame. + + Examples + -------- + >>> idx = pd.Index([3, 2, 1]) + >>> idx.max() + 3 + + >>> idx = pd.Index(['c', 'b', 'a']) + >>> idx.max() + 'c' + + For a MultiIndex, the maximum is determined lexicographically. + + >>> idx = pd.MultiIndex.from_product([('a', 'b'), (2, 1)]) + >>> idx.max() + ('b', 2) + """ + + nv.validate_max(args, kwargs) + nv.validate_minmax_axis(axis) + + if not len(self): + return self._na_value + + if len(self) and self.is_monotonic_increasing: + # quick check + last = self[-1] + if not isna(last): + return last + + if not self._is_multi and self.hasnans: + # Take advantage of cache + mask = self._isnan + if not skipna or mask.all(): + return self._na_value + + if not self._is_multi and not isinstance(self._values, np.ndarray): + return self._values._reduce(name="max", skipna=skipna) + + return nanops.nanmax(self._values, skipna=skipna) + + # -------------------------------------------------------------------- + + @final + @property + def shape(self) -> Shape: + """ + Return a tuple of the shape of the underlying data. + + Examples + -------- + >>> idx = pd.Index([1, 2, 3]) + >>> idx + Index([1, 2, 3], dtype='int64') + >>> idx.shape + (3,) + """ + # See GH#27775, GH#27384 for history/reasoning in how this is defined. + return (len(self),) + + +def ensure_index_from_sequences(sequences, names=None) -> Index: + """ + Construct an index from sequences of data. + + A single sequence returns an Index. Many sequences returns a + MultiIndex. + + Parameters + ---------- + sequences : sequence of sequences + names : sequence of str + + Returns + ------- + index : Index or MultiIndex + + Examples + -------- + >>> ensure_index_from_sequences([[1, 2, 3]], names=["name"]) + Index([1, 2, 3], dtype='int64', name='name') + + >>> ensure_index_from_sequences([["a", "a"], ["a", "b"]], names=["L1", "L2"]) + MultiIndex([('a', 'a'), + ('a', 'b')], + names=['L1', 'L2']) + + See Also + -------- + ensure_index + """ + from pandas.core.indexes.multi import MultiIndex + + if len(sequences) == 1: + if names is not None: + names = names[0] + return Index(sequences[0], name=names) + else: + return MultiIndex.from_arrays(sequences, names=names) + + +def ensure_index(index_like: Axes, copy: bool = False) -> Index: + """ + Ensure that we have an index from some index-like object. + + Parameters + ---------- + index_like : sequence + An Index or other sequence + copy : bool, default False + + Returns + ------- + index : Index or MultiIndex + + See Also + -------- + ensure_index_from_sequences + + Examples + -------- + >>> ensure_index(['a', 'b']) + Index(['a', 'b'], dtype='object') + + >>> ensure_index([('a', 'a'), ('b', 'c')]) + Index([('a', 'a'), ('b', 'c')], dtype='object') + + >>> ensure_index([['a', 'a'], ['b', 'c']]) + MultiIndex([('a', 'b'), + ('a', 'c')], + ) + """ + if isinstance(index_like, Index): + if copy: + index_like = index_like.copy() + return index_like + + if isinstance(index_like, ABCSeries): + name = index_like.name + return Index(index_like, name=name, copy=copy) + + if is_iterator(index_like): + index_like = list(index_like) + + if isinstance(index_like, list): + if type(index_like) is not list: # noqa: E721 + # must check for exactly list here because of strict type + # check in clean_index_list + index_like = list(index_like) + + if len(index_like) and lib.is_all_arraylike(index_like): + from pandas.core.indexes.multi import MultiIndex + + return MultiIndex.from_arrays(index_like) + else: + return Index(index_like, copy=copy, tupleize_cols=False) + else: + return Index(index_like, copy=copy) + + +def ensure_has_len(seq): + """ + If seq is an iterator, put its values into a list. + """ + try: + len(seq) + except TypeError: + return list(seq) + else: + return seq + + +def trim_front(strings: list[str]) -> list[str]: + """ + Trims zeros and decimal points. + + Examples + -------- + >>> trim_front([" a", " b"]) + ['a', 'b'] + + >>> trim_front([" a", " "]) + ['a', ''] + """ + if not strings: + return strings + while all(strings) and all(x[0] == " " for x in strings): + strings = [x[1:] for x in strings] + return strings + + +def _validate_join_method(method: str) -> None: + if method not in ["left", "right", "inner", "outer"]: + raise ValueError(f"do not recognize join method {method}") + + +def maybe_extract_name(name, obj, cls) -> Hashable: + """ + If no name is passed, then extract it from data, validating hashability. + """ + if name is None and isinstance(obj, (Index, ABCSeries)): + # Note we don't just check for "name" attribute since that would + # pick up e.g. dtype.name + name = obj.name + + # GH#29069 + if not is_hashable(name): + raise TypeError(f"{cls.__name__}.name must be a hashable type") + + return name + + +def get_unanimous_names(*indexes: Index) -> tuple[Hashable, ...]: + """ + Return common name if all indices agree, otherwise None (level-by-level). + + Parameters + ---------- + indexes : list of Index objects + + Returns + ------- + list + A list representing the unanimous 'names' found. + """ + name_tups = [tuple(i.names) for i in indexes] + name_sets = [{*ns} for ns in zip_longest(*name_tups)] + names = tuple(ns.pop() if len(ns) == 1 else None for ns in name_sets) + return names + + +def _unpack_nested_dtype(other: Index) -> DtypeObj: + """ + When checking if our dtype is comparable with another, we need + to unpack CategoricalDtype to look at its categories.dtype. + + Parameters + ---------- + other : Index + + Returns + ------- + np.dtype or ExtensionDtype + """ + dtype = other.dtype + if isinstance(dtype, CategoricalDtype): + # If there is ever a SparseIndex, this could get dispatched + # here too. + return dtype.categories.dtype + elif isinstance(dtype, ArrowDtype): + # GH 53617 + import pyarrow as pa + + if pa.types.is_dictionary(dtype.pyarrow_dtype): + other = other[:0].astype(ArrowDtype(dtype.pyarrow_dtype.value_type)) + return other.dtype + + +def _maybe_try_sort(result: Index | ArrayLike, sort: bool | None): + if sort is not False: + try: + # error: Incompatible types in assignment (expression has type + # "Union[ExtensionArray, ndarray[Any, Any], Index, Series, + # Tuple[Union[Union[ExtensionArray, ndarray[Any, Any]], Index, Series], + # ndarray[Any, Any]]]", variable has type "Union[Index, + # Union[ExtensionArray, ndarray[Any, Any]]]") + result = algos.safe_sort(result) # type: ignore[assignment] + except TypeError as err: + if sort is True: + raise + warnings.warn( + f"{err}, sort order is undefined for incomparable objects.", + RuntimeWarning, + stacklevel=find_stack_level(), + ) + return result + + +def get_values_for_csv( + values: ArrayLike, + *, + date_format, + na_rep: str = "nan", + quoting=None, + float_format=None, + decimal: str = ".", +) -> npt.NDArray[np.object_]: + """ + Convert to types which can be consumed by the standard library's + csv.writer.writerows. + """ + if isinstance(values, Categorical) and values.categories.dtype.kind in "Mm": + # GH#40754 Convert categorical datetimes to datetime array + values = algos.take_nd( + values.categories._values, + ensure_platform_int(values._codes), + fill_value=na_rep, + ) + + values = ensure_wrapped_if_datetimelike(values) + + if isinstance(values, (DatetimeArray, TimedeltaArray)): + if values.ndim == 1: + result = values._format_native_types(na_rep=na_rep, date_format=date_format) + result = result.astype(object, copy=False) + return result + + # GH#21734 Process every column separately, they might have different formats + results_converted = [] + for i in range(len(values)): + result = values[i, :]._format_native_types( + na_rep=na_rep, date_format=date_format + ) + results_converted.append(result.astype(object, copy=False)) + return np.vstack(results_converted) + + elif isinstance(values.dtype, PeriodDtype): + # TODO: tests that get here in column path + values = cast("PeriodArray", values) + res = values._format_native_types(na_rep=na_rep, date_format=date_format) + return res + + elif isinstance(values.dtype, IntervalDtype): + # TODO: tests that get here in column path + values = cast("IntervalArray", values) + mask = values.isna() + if not quoting: + result = np.asarray(values).astype(str) + else: + result = np.array(values, dtype=object, copy=True) + + result[mask] = na_rep + return result + + elif values.dtype.kind == "f" and not isinstance(values.dtype, SparseDtype): + # see GH#13418: no special formatting is desired at the + # output (important for appropriate 'quoting' behaviour), + # so do not pass it through the FloatArrayFormatter + if float_format is None and decimal == ".": + mask = isna(values) + + if not quoting: + values = values.astype(str) + else: + values = np.array(values, dtype="object") + + values[mask] = na_rep + values = values.astype(object, copy=False) + return values + + from pandas.io.formats.format import FloatArrayFormatter + + formatter = FloatArrayFormatter( + values, + na_rep=na_rep, + float_format=float_format, + decimal=decimal, + quoting=quoting, + fixed_width=False, + ) + res = formatter.get_result_as_array() + res = res.astype(object, copy=False) + return res + + elif isinstance(values, ExtensionArray): + mask = isna(values) + + new_values = np.asarray(values.astype(object)) + new_values[mask] = na_rep + return new_values + + else: + mask = isna(values) + itemsize = writers.word_len(na_rep) + + if values.dtype != _dtype_obj and not quoting and itemsize: + values = values.astype(str) + if values.dtype.itemsize / np.dtype("U1").itemsize < itemsize: + # enlarge for the na_rep + values = values.astype(f"`__ + for more. + + Examples + -------- + >>> pd.CategoricalIndex(["a", "b", "c", "a", "b", "c"]) + CategoricalIndex(['a', 'b', 'c', 'a', 'b', 'c'], + categories=['a', 'b', 'c'], ordered=False, dtype='category') + + ``CategoricalIndex`` can also be instantiated from a ``Categorical``: + + >>> c = pd.Categorical(["a", "b", "c", "a", "b", "c"]) + >>> pd.CategoricalIndex(c) + CategoricalIndex(['a', 'b', 'c', 'a', 'b', 'c'], + categories=['a', 'b', 'c'], ordered=False, dtype='category') + + Ordered ``CategoricalIndex`` can have a min and max value. + + >>> ci = pd.CategoricalIndex( + ... ["a", "b", "c", "a", "b", "c"], ordered=True, categories=["c", "b", "a"] + ... ) + >>> ci + CategoricalIndex(['a', 'b', 'c', 'a', 'b', 'c'], + categories=['c', 'b', 'a'], ordered=True, dtype='category') + >>> ci.min() + 'c' + """ + + _typ = "categoricalindex" + _data_cls = Categorical + + @property + def _can_hold_strings(self): + return self.categories._can_hold_strings + + @cache_readonly + def _should_fallback_to_positional(self) -> bool: + return self.categories._should_fallback_to_positional + + codes: np.ndarray + categories: Index + ordered: bool | None + _data: Categorical + _values: Categorical + + @property + def _engine_type(self) -> type[libindex.IndexEngine]: + # self.codes can have dtype int8, int16, int32 or int64, so we need + # to return the corresponding engine type (libindex.Int8Engine, etc.). + return { + np.int8: libindex.Int8Engine, + np.int16: libindex.Int16Engine, + np.int32: libindex.Int32Engine, + np.int64: libindex.Int64Engine, + }[self.codes.dtype.type] + + # -------------------------------------------------------------------- + # Constructors + + def __new__( + cls, + data=None, + categories=None, + ordered=None, + dtype: Dtype | None = None, + copy: bool = False, + name: Hashable | None = None, + ) -> Self: + name = maybe_extract_name(name, data, cls) + + if is_scalar(data): + # GH#38944 include None here, which pre-2.0 subbed in [] + cls._raise_scalar_data_error(data) + + data = Categorical( + data, categories=categories, ordered=ordered, dtype=dtype, copy=copy + ) + + return cls._simple_new(data, name=name) + + # -------------------------------------------------------------------- + + def _is_dtype_compat(self, other: Index) -> Categorical: + """ + *this is an internal non-public method* + + provide a comparison between the dtype of self and other (coercing if + needed) + + Parameters + ---------- + other : Index + + Returns + ------- + Categorical + + Raises + ------ + TypeError if the dtypes are not compatible + """ + if isinstance(other.dtype, CategoricalDtype): + cat = extract_array(other) + cat = cast(Categorical, cat) + if not cat._categories_match_up_to_permutation(self._values): + raise TypeError( + "categories must match existing categories when appending" + ) + + elif other._is_multi: + # preempt raising NotImplementedError in isna call + raise TypeError("MultiIndex is not dtype-compatible with CategoricalIndex") + else: + values = other + + cat = Categorical(other, dtype=self.dtype) + other = CategoricalIndex(cat) + if not other.isin(values).all(): + raise TypeError( + "cannot append a non-category item to a CategoricalIndex" + ) + cat = other._values + + if not ((cat == values) | (isna(cat) & isna(values))).all(): + # GH#37667 see test_equals_non_category + raise TypeError( + "categories must match existing categories when appending" + ) + + return cat + + def equals(self, other: object) -> bool: + """ + Determine if two CategoricalIndex objects contain the same elements. + + Returns + ------- + bool + ``True`` if two :class:`pandas.CategoricalIndex` objects have equal + elements, ``False`` otherwise. + + Examples + -------- + >>> ci = pd.CategoricalIndex(['a', 'b', 'c', 'a', 'b', 'c']) + >>> ci2 = pd.CategoricalIndex(pd.Categorical(['a', 'b', 'c', 'a', 'b', 'c'])) + >>> ci.equals(ci2) + True + + The order of elements matters. + + >>> ci3 = pd.CategoricalIndex(['c', 'b', 'a', 'a', 'b', 'c']) + >>> ci.equals(ci3) + False + + The orderedness also matters. + + >>> ci4 = ci.as_ordered() + >>> ci.equals(ci4) + False + + The categories matter, but the order of the categories matters only when + ``ordered=True``. + + >>> ci5 = ci.set_categories(['a', 'b', 'c', 'd']) + >>> ci.equals(ci5) + False + + >>> ci6 = ci.set_categories(['b', 'c', 'a']) + >>> ci.equals(ci6) + True + >>> ci_ordered = pd.CategoricalIndex(['a', 'b', 'c', 'a', 'b', 'c'], + ... ordered=True) + >>> ci2_ordered = ci_ordered.set_categories(['b', 'c', 'a']) + >>> ci_ordered.equals(ci2_ordered) + False + """ + if self.is_(other): + return True + + if not isinstance(other, Index): + return False + + try: + other = self._is_dtype_compat(other) + except (TypeError, ValueError): + return False + + return self._data.equals(other) + + # -------------------------------------------------------------------- + # Rendering Methods + + @property + def _formatter_func(self): + return self.categories._formatter_func + + def _format_attrs(self): + """ + Return a list of tuples of the (attr,formatted_value) + """ + attrs: list[tuple[str, str | int | bool | None]] + + attrs = [ + ( + "categories", + f"[{', '.join(self._data._repr_categories())}]", + ), + ("ordered", self.ordered), + ] + extra = super()._format_attrs() + return attrs + extra + + # -------------------------------------------------------------------- + + @property + def inferred_type(self) -> str: + return "categorical" + + @doc(Index.__contains__) + def __contains__(self, key: Any) -> bool: + # if key is a NaN, check if any NaN is in self. + if is_valid_na_for_dtype(key, self.categories.dtype): + return self.hasnans + + return contains(self, key, container=self._engine) + + def reindex( + self, target, method=None, level=None, limit: int | None = None, tolerance=None + ) -> tuple[Index, npt.NDArray[np.intp] | None]: + """ + Create index with target's values (move/add/delete values as necessary) + + Returns + ------- + new_index : pd.Index + Resulting index + indexer : np.ndarray[np.intp] or None + Indices of output values in original index + + """ + if method is not None: + raise NotImplementedError( + "argument method is not implemented for CategoricalIndex.reindex" + ) + if level is not None: + raise NotImplementedError( + "argument level is not implemented for CategoricalIndex.reindex" + ) + if limit is not None: + raise NotImplementedError( + "argument limit is not implemented for CategoricalIndex.reindex" + ) + return super().reindex(target) + + # -------------------------------------------------------------------- + # Indexing Methods + + def _maybe_cast_indexer(self, key) -> int: + # GH#41933: we have to do this instead of self._data._validate_scalar + # because this will correctly get partial-indexing on Interval categories + try: + return self._data._unbox_scalar(key) + except KeyError: + if is_valid_na_for_dtype(key, self.categories.dtype): + return -1 + raise + + def _maybe_cast_listlike_indexer(self, values) -> CategoricalIndex: + if isinstance(values, CategoricalIndex): + values = values._data + if isinstance(values, Categorical): + # Indexing on codes is more efficient if categories are the same, + # so we can apply some optimizations based on the degree of + # dtype-matching. + cat = self._data._encode_with_my_categories(values) + codes = cat._codes + else: + codes = self.categories.get_indexer(values) + codes = codes.astype(self.codes.dtype, copy=False) + cat = self._data._from_backing_data(codes) + return type(self)._simple_new(cat) + + # -------------------------------------------------------------------- + + def _is_comparable_dtype(self, dtype: DtypeObj) -> bool: + return self.categories._is_comparable_dtype(dtype) + + def map(self, mapper, na_action: Literal["ignore"] | None = None): + """ + Map values using input an input mapping or function. + + Maps the values (their categories, not the codes) of the index to new + categories. If the mapping correspondence is one-to-one the result is a + :class:`~pandas.CategoricalIndex` which has the same order property as + the original, otherwise an :class:`~pandas.Index` is returned. + + If a `dict` or :class:`~pandas.Series` is used any unmapped category is + mapped to `NaN`. Note that if this happens an :class:`~pandas.Index` + will be returned. + + Parameters + ---------- + mapper : function, dict, or Series + Mapping correspondence. + + Returns + ------- + pandas.CategoricalIndex or pandas.Index + Mapped index. + + See Also + -------- + Index.map : Apply a mapping correspondence on an + :class:`~pandas.Index`. + Series.map : Apply a mapping correspondence on a + :class:`~pandas.Series`. + Series.apply : Apply more complex functions on a + :class:`~pandas.Series`. + + Examples + -------- + >>> idx = pd.CategoricalIndex(['a', 'b', 'c']) + >>> idx + CategoricalIndex(['a', 'b', 'c'], categories=['a', 'b', 'c'], + ordered=False, dtype='category') + >>> idx.map(lambda x: x.upper()) + CategoricalIndex(['A', 'B', 'C'], categories=['A', 'B', 'C'], + ordered=False, dtype='category') + >>> idx.map({'a': 'first', 'b': 'second', 'c': 'third'}) + CategoricalIndex(['first', 'second', 'third'], categories=['first', + 'second', 'third'], ordered=False, dtype='category') + + If the mapping is one-to-one the ordering of the categories is + preserved: + + >>> idx = pd.CategoricalIndex(['a', 'b', 'c'], ordered=True) + >>> idx + CategoricalIndex(['a', 'b', 'c'], categories=['a', 'b', 'c'], + ordered=True, dtype='category') + >>> idx.map({'a': 3, 'b': 2, 'c': 1}) + CategoricalIndex([3, 2, 1], categories=[3, 2, 1], ordered=True, + dtype='category') + + If the mapping is not one-to-one an :class:`~pandas.Index` is returned: + + >>> idx.map({'a': 'first', 'b': 'second', 'c': 'first'}) + Index(['first', 'second', 'first'], dtype='object') + + If a `dict` is used, all unmapped categories are mapped to `NaN` and + the result is an :class:`~pandas.Index`: + + >>> idx.map({'a': 'first', 'b': 'second'}) + Index(['first', 'second', nan], dtype='object') + """ + mapped = self._values.map(mapper, na_action=na_action) + return Index(mapped, name=self.name) + + def _concat(self, to_concat: list[Index], name: Hashable) -> Index: + # if calling index is category, don't check dtype of others + try: + cat = Categorical._concat_same_type( + [self._is_dtype_compat(c) for c in to_concat] + ) + except TypeError: + # not all to_concat elements are among our categories (or NA) + + res = concat_compat([x._values for x in to_concat]) + return Index(res, name=name) + else: + return type(self)._simple_new(cat, name=name) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/indexes/datetimelike.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/indexes/datetimelike.py new file mode 100644 index 0000000000000000000000000000000000000000..cad8737a987d44f23518a8b6fa88e9a686755c65 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/indexes/datetimelike.py @@ -0,0 +1,843 @@ +""" +Base and utility classes for tseries type pandas objects. +""" +from __future__ import annotations + +from abc import ( + ABC, + abstractmethod, +) +from typing import ( + TYPE_CHECKING, + Any, + Callable, + cast, + final, +) +import warnings + +import numpy as np + +from pandas._config import using_copy_on_write + +from pandas._libs import ( + NaT, + Timedelta, + lib, +) +from pandas._libs.tslibs import ( + BaseOffset, + Resolution, + Tick, + parsing, + to_offset, +) +from pandas._libs.tslibs.dtypes import freq_to_period_freqstr +from pandas.compat.numpy import function as nv +from pandas.errors import ( + InvalidIndexError, + NullFrequencyError, +) +from pandas.util._decorators import ( + Appender, + cache_readonly, + doc, +) +from pandas.util._exceptions import find_stack_level + +from pandas.core.dtypes.common import ( + is_integer, + is_list_like, +) +from pandas.core.dtypes.concat import concat_compat +from pandas.core.dtypes.dtypes import CategoricalDtype + +from pandas.core.arrays import ( + DatetimeArray, + ExtensionArray, + PeriodArray, + TimedeltaArray, +) +from pandas.core.arrays.datetimelike import DatetimeLikeArrayMixin +import pandas.core.common as com +import pandas.core.indexes.base as ibase +from pandas.core.indexes.base import ( + Index, + _index_shared_docs, +) +from pandas.core.indexes.extension import NDArrayBackedExtensionIndex +from pandas.core.indexes.range import RangeIndex +from pandas.core.tools.timedeltas import to_timedelta + +if TYPE_CHECKING: + from collections.abc import Sequence + from datetime import datetime + + from pandas._typing import ( + Axis, + Self, + npt, + ) + + from pandas import CategoricalIndex + +_index_doc_kwargs = dict(ibase._index_doc_kwargs) + + +class DatetimeIndexOpsMixin(NDArrayBackedExtensionIndex, ABC): + """ + Common ops mixin to support a unified interface datetimelike Index. + """ + + _can_hold_strings = False + _data: DatetimeArray | TimedeltaArray | PeriodArray + + @doc(DatetimeLikeArrayMixin.mean) + def mean(self, *, skipna: bool = True, axis: int | None = 0): + return self._data.mean(skipna=skipna, axis=axis) + + @property + def freq(self) -> BaseOffset | None: + return self._data.freq + + @freq.setter + def freq(self, value) -> None: + # error: Property "freq" defined in "PeriodArray" is read-only [misc] + self._data.freq = value # type: ignore[misc] + + @property + def asi8(self) -> npt.NDArray[np.int64]: + return self._data.asi8 + + @property + @doc(DatetimeLikeArrayMixin.freqstr) + def freqstr(self) -> str: + from pandas import PeriodIndex + + if self._data.freqstr is not None and isinstance( + self._data, (PeriodArray, PeriodIndex) + ): + freq = freq_to_period_freqstr(self._data.freq.n, self._data.freq.name) + return freq + else: + return self._data.freqstr # type: ignore[return-value] + + @cache_readonly + @abstractmethod + def _resolution_obj(self) -> Resolution: + ... + + @cache_readonly + @doc(DatetimeLikeArrayMixin.resolution) + def resolution(self) -> str: + return self._data.resolution + + # ------------------------------------------------------------------------ + + @cache_readonly + def hasnans(self) -> bool: + return self._data._hasna + + def equals(self, other: Any) -> bool: + """ + Determines if two Index objects contain the same elements. + """ + if self.is_(other): + return True + + if not isinstance(other, Index): + return False + elif other.dtype.kind in "iufc": + return False + elif not isinstance(other, type(self)): + should_try = False + inferable = self._data._infer_matches + if other.dtype == object: + should_try = other.inferred_type in inferable + elif isinstance(other.dtype, CategoricalDtype): + other = cast("CategoricalIndex", other) + should_try = other.categories.inferred_type in inferable + + if should_try: + try: + other = type(self)(other) + except (ValueError, TypeError, OverflowError): + # e.g. + # ValueError -> cannot parse str entry, or OutOfBoundsDatetime + # TypeError -> trying to convert IntervalIndex to DatetimeIndex + # OverflowError -> Index([very_large_timedeltas]) + return False + + if self.dtype != other.dtype: + # have different timezone + return False + + return np.array_equal(self.asi8, other.asi8) + + @Appender(Index.__contains__.__doc__) + def __contains__(self, key: Any) -> bool: + hash(key) + try: + self.get_loc(key) + except (KeyError, TypeError, ValueError, InvalidIndexError): + return False + return True + + def _convert_tolerance(self, tolerance, target): + tolerance = np.asarray(to_timedelta(tolerance).to_numpy()) + return super()._convert_tolerance(tolerance, target) + + # -------------------------------------------------------------------- + # Rendering Methods + _default_na_rep = "NaT" + + def format( + self, + name: bool = False, + formatter: Callable | None = None, + na_rep: str = "NaT", + date_format: str | None = None, + ) -> list[str]: + """ + Render a string representation of the Index. + """ + warnings.warn( + # GH#55413 + f"{type(self).__name__}.format is deprecated and will be removed " + "in a future version. Convert using index.astype(str) or " + "index.map(formatter) instead.", + FutureWarning, + stacklevel=find_stack_level(), + ) + header = [] + if name: + header.append( + ibase.pprint_thing(self.name, escape_chars=("\t", "\r", "\n")) + if self.name is not None + else "" + ) + + if formatter is not None: + return header + list(self.map(formatter)) + + return self._format_with_header( + header=header, na_rep=na_rep, date_format=date_format + ) + + def _format_with_header( + self, *, header: list[str], na_rep: str, date_format: str | None = None + ) -> list[str]: + # TODO: not reached in tests 2023-10-11 + # matches base class except for whitespace padding and date_format + return header + list( + self._get_values_for_csv(na_rep=na_rep, date_format=date_format) + ) + + @property + def _formatter_func(self): + return self._data._formatter() + + def _format_attrs(self): + """ + Return a list of tuples of the (attr,formatted_value). + """ + attrs = super()._format_attrs() + for attrib in self._attributes: + # iterating over _attributes prevents us from doing this for PeriodIndex + if attrib == "freq": + freq = self.freqstr + if freq is not None: + freq = repr(freq) # e.g. D -> 'D' + attrs.append(("freq", freq)) + return attrs + + @Appender(Index._summary.__doc__) + def _summary(self, name=None) -> str: + result = super()._summary(name=name) + if self.freq: + result += f"\nFreq: {self.freqstr}" + + return result + + # -------------------------------------------------------------------- + # Indexing Methods + + @final + def _can_partial_date_slice(self, reso: Resolution) -> bool: + # e.g. test_getitem_setitem_periodindex + # History of conversation GH#3452, GH#3931, GH#2369, GH#14826 + return reso > self._resolution_obj + # NB: for DTI/PI, not TDI + + def _parsed_string_to_bounds(self, reso: Resolution, parsed): + raise NotImplementedError + + def _parse_with_reso(self, label: str): + # overridden by TimedeltaIndex + try: + if self.freq is None or hasattr(self.freq, "rule_code"): + freq = self.freq + except NotImplementedError: + freq = getattr(self, "freqstr", getattr(self, "inferred_freq", None)) + + freqstr: str | None + if freq is not None and not isinstance(freq, str): + freqstr = freq.rule_code + else: + freqstr = freq + + if isinstance(label, np.str_): + # GH#45580 + label = str(label) + + parsed, reso_str = parsing.parse_datetime_string_with_reso(label, freqstr) + reso = Resolution.from_attrname(reso_str) + return parsed, reso + + def _get_string_slice(self, key: str): + # overridden by TimedeltaIndex + parsed, reso = self._parse_with_reso(key) + try: + return self._partial_date_slice(reso, parsed) + except KeyError as err: + raise KeyError(key) from err + + @final + def _partial_date_slice( + self, + reso: Resolution, + parsed: datetime, + ) -> slice | npt.NDArray[np.intp]: + """ + Parameters + ---------- + reso : Resolution + parsed : datetime + + Returns + ------- + slice or ndarray[intp] + """ + if not self._can_partial_date_slice(reso): + raise ValueError + + t1, t2 = self._parsed_string_to_bounds(reso, parsed) + vals = self._data._ndarray + unbox = self._data._unbox + + if self.is_monotonic_increasing: + if len(self) and ( + (t1 < self[0] and t2 < self[0]) or (t1 > self[-1] and t2 > self[-1]) + ): + # we are out of range + raise KeyError + + # TODO: does this depend on being monotonic _increasing_? + + # a monotonic (sorted) series can be sliced + left = vals.searchsorted(unbox(t1), side="left") + right = vals.searchsorted(unbox(t2), side="right") + return slice(left, right) + + else: + lhs_mask = vals >= unbox(t1) + rhs_mask = vals <= unbox(t2) + + # try to find the dates + return (lhs_mask & rhs_mask).nonzero()[0] + + def _maybe_cast_slice_bound(self, label, side: str): + """ + If label is a string, cast it to scalar type according to resolution. + + Parameters + ---------- + label : object + side : {'left', 'right'} + + Returns + ------- + label : object + + Notes + ----- + Value of `side` parameter should be validated in caller. + """ + if isinstance(label, str): + try: + parsed, reso = self._parse_with_reso(label) + except ValueError as err: + # DTI -> parsing.DateParseError + # TDI -> 'unit abbreviation w/o a number' + # PI -> string cannot be parsed as datetime-like + self._raise_invalid_indexer("slice", label, err) + + lower, upper = self._parsed_string_to_bounds(reso, parsed) + return lower if side == "left" else upper + elif not isinstance(label, self._data._recognized_scalars): + self._raise_invalid_indexer("slice", label) + + return label + + # -------------------------------------------------------------------- + # Arithmetic Methods + + def shift(self, periods: int = 1, freq=None) -> Self: + """ + Shift index by desired number of time frequency increments. + + This method is for shifting the values of datetime-like indexes + by a specified time increment a given number of times. + + Parameters + ---------- + periods : int, default 1 + Number of periods (or increments) to shift by, + can be positive or negative. + freq : pandas.DateOffset, pandas.Timedelta or string, optional + Frequency increment to shift by. + If None, the index is shifted by its own `freq` attribute. + Offset aliases are valid strings, e.g., 'D', 'W', 'M' etc. + + Returns + ------- + pandas.DatetimeIndex + Shifted index. + + See Also + -------- + Index.shift : Shift values of Index. + PeriodIndex.shift : Shift values of PeriodIndex. + """ + raise NotImplementedError + + # -------------------------------------------------------------------- + + @doc(Index._maybe_cast_listlike_indexer) + def _maybe_cast_listlike_indexer(self, keyarr): + try: + res = self._data._validate_listlike(keyarr, allow_object=True) + except (ValueError, TypeError): + if not isinstance(keyarr, ExtensionArray): + # e.g. we don't want to cast DTA to ndarray[object] + res = com.asarray_tuplesafe(keyarr) + # TODO: com.asarray_tuplesafe shouldn't cast e.g. DatetimeArray + else: + res = keyarr + return Index(res, dtype=res.dtype) + + +class DatetimeTimedeltaMixin(DatetimeIndexOpsMixin, ABC): + """ + Mixin class for methods shared by DatetimeIndex and TimedeltaIndex, + but not PeriodIndex + """ + + _data: DatetimeArray | TimedeltaArray + _comparables = ["name", "freq"] + _attributes = ["name", "freq"] + + # Compat for frequency inference, see GH#23789 + _is_monotonic_increasing = Index.is_monotonic_increasing + _is_monotonic_decreasing = Index.is_monotonic_decreasing + _is_unique = Index.is_unique + + @property + def unit(self) -> str: + return self._data.unit + + def as_unit(self, unit: str) -> Self: + """ + Convert to a dtype with the given unit resolution. + + Parameters + ---------- + unit : {'s', 'ms', 'us', 'ns'} + + Returns + ------- + same type as self + + Examples + -------- + For :class:`pandas.DatetimeIndex`: + + >>> idx = pd.DatetimeIndex(['2020-01-02 01:02:03.004005006']) + >>> idx + DatetimeIndex(['2020-01-02 01:02:03.004005006'], + dtype='datetime64[ns]', freq=None) + >>> idx.as_unit('s') + DatetimeIndex(['2020-01-02 01:02:03'], dtype='datetime64[s]', freq=None) + + For :class:`pandas.TimedeltaIndex`: + + >>> tdelta_idx = pd.to_timedelta(['1 day 3 min 2 us 42 ns']) + >>> tdelta_idx + TimedeltaIndex(['1 days 00:03:00.000002042'], + dtype='timedelta64[ns]', freq=None) + >>> tdelta_idx.as_unit('s') + TimedeltaIndex(['1 days 00:03:00'], dtype='timedelta64[s]', freq=None) + """ + arr = self._data.as_unit(unit) + return type(self)._simple_new(arr, name=self.name) + + def _with_freq(self, freq): + arr = self._data._with_freq(freq) + return type(self)._simple_new(arr, name=self._name) + + @property + def values(self) -> np.ndarray: + # NB: For Datetime64TZ this is lossy + data = self._data._ndarray + if using_copy_on_write(): + data = data.view() + data.flags.writeable = False + return data + + @doc(DatetimeIndexOpsMixin.shift) + def shift(self, periods: int = 1, freq=None) -> Self: + if freq is not None and freq != self.freq: + if isinstance(freq, str): + freq = to_offset(freq) + offset = periods * freq + return self + offset + + if periods == 0 or len(self) == 0: + # GH#14811 empty case + return self.copy() + + if self.freq is None: + raise NullFrequencyError("Cannot shift with no freq") + + start = self[0] + periods * self.freq + end = self[-1] + periods * self.freq + + # Note: in the DatetimeTZ case, _generate_range will infer the + # appropriate timezone from `start` and `end`, so tz does not need + # to be passed explicitly. + result = self._data._generate_range( + start=start, end=end, periods=None, freq=self.freq, unit=self.unit + ) + return type(self)._simple_new(result, name=self.name) + + @cache_readonly + @doc(DatetimeLikeArrayMixin.inferred_freq) + def inferred_freq(self) -> str | None: + return self._data.inferred_freq + + # -------------------------------------------------------------------- + # Set Operation Methods + + @cache_readonly + def _as_range_index(self) -> RangeIndex: + # Convert our i8 representations to RangeIndex + # Caller is responsible for checking isinstance(self.freq, Tick) + freq = cast(Tick, self.freq) + tick = Timedelta(freq).as_unit("ns")._value + rng = range(self[0]._value, self[-1]._value + tick, tick) + return RangeIndex(rng) + + def _can_range_setop(self, other) -> bool: + return isinstance(self.freq, Tick) and isinstance(other.freq, Tick) + + def _wrap_range_setop(self, other, res_i8) -> Self: + new_freq = None + if not len(res_i8): + # RangeIndex defaults to step=1, which we don't want. + new_freq = self.freq + elif isinstance(res_i8, RangeIndex): + new_freq = to_offset(Timedelta(res_i8.step)) + + # TODO(GH#41493): we cannot just do + # type(self._data)(res_i8.values, dtype=self.dtype, freq=new_freq) + # because test_setops_preserve_freq fails with _validate_frequency raising. + # This raising is incorrect, as 'on_freq' is incorrect. This will + # be fixed by GH#41493 + res_values = res_i8.values.view(self._data._ndarray.dtype) + result = type(self._data)._simple_new( + # error: Argument "dtype" to "_simple_new" of "DatetimeArray" has + # incompatible type "Union[dtype[Any], ExtensionDtype]"; expected + # "Union[dtype[datetime64], DatetimeTZDtype]" + res_values, + dtype=self.dtype, # type: ignore[arg-type] + freq=new_freq, # type: ignore[arg-type] + ) + return cast("Self", self._wrap_setop_result(other, result)) + + def _range_intersect(self, other, sort) -> Self: + # Dispatch to RangeIndex intersection logic. + left = self._as_range_index + right = other._as_range_index + res_i8 = left.intersection(right, sort=sort) + return self._wrap_range_setop(other, res_i8) + + def _range_union(self, other, sort) -> Self: + # Dispatch to RangeIndex union logic. + left = self._as_range_index + right = other._as_range_index + res_i8 = left.union(right, sort=sort) + return self._wrap_range_setop(other, res_i8) + + def _intersection(self, other: Index, sort: bool = False) -> Index: + """ + intersection specialized to the case with matching dtypes and both non-empty. + """ + other = cast("DatetimeTimedeltaMixin", other) + + if self._can_range_setop(other): + return self._range_intersect(other, sort=sort) + + if not self._can_fast_intersect(other): + result = Index._intersection(self, other, sort=sort) + # We need to invalidate the freq because Index._intersection + # uses _shallow_copy on a view of self._data, which will preserve + # self.freq if we're not careful. + # At this point we should have result.dtype == self.dtype + # and type(result) is type(self._data) + result = self._wrap_setop_result(other, result) + return result._with_freq(None)._with_freq("infer") + + else: + return self._fast_intersect(other, sort) + + def _fast_intersect(self, other, sort): + # to make our life easier, "sort" the two ranges + if self[0] <= other[0]: + left, right = self, other + else: + left, right = other, self + + # after sorting, the intersection always starts with the right index + # and ends with the index of which the last elements is smallest + end = min(left[-1], right[-1]) + start = right[0] + + if end < start: + result = self[:0] + else: + lslice = slice(*left.slice_locs(start, end)) + result = left._values[lslice] + + return result + + def _can_fast_intersect(self, other: Self) -> bool: + # Note: we only get here with len(self) > 0 and len(other) > 0 + if self.freq is None: + return False + + elif other.freq != self.freq: + return False + + elif not self.is_monotonic_increasing: + # Because freq is not None, we must then be monotonic decreasing + return False + + # this along with matching freqs ensure that we "line up", + # so intersection will preserve freq + # Note we are assuming away Ticks, as those go through _range_intersect + # GH#42104 + return self.freq.n == 1 + + def _can_fast_union(self, other: Self) -> bool: + # Assumes that type(self) == type(other), as per the annotation + # The ability to fast_union also implies that `freq` should be + # retained on union. + freq = self.freq + + if freq is None or freq != other.freq: + return False + + if not self.is_monotonic_increasing: + # Because freq is not None, we must then be monotonic decreasing + # TODO: do union on the reversed indexes? + return False + + if len(self) == 0 or len(other) == 0: + # only reached via union_many + return True + + # to make our life easier, "sort" the two ranges + if self[0] <= other[0]: + left, right = self, other + else: + left, right = other, self + + right_start = right[0] + left_end = left[-1] + + # Only need to "adjoin", not overlap + return (right_start == left_end + freq) or right_start in left + + def _fast_union(self, other: Self, sort=None) -> Self: + # Caller is responsible for ensuring self and other are non-empty + + # to make our life easier, "sort" the two ranges + if self[0] <= other[0]: + left, right = self, other + elif sort is False: + # TDIs are not in the "correct" order and we don't want + # to sort but want to remove overlaps + left, right = self, other + left_start = left[0] + loc = right.searchsorted(left_start, side="left") + right_chunk = right._values[:loc] + dates = concat_compat((left._values, right_chunk)) + result = type(self)._simple_new(dates, name=self.name) + return result + else: + left, right = other, self + + left_end = left[-1] + right_end = right[-1] + + # concatenate + if left_end < right_end: + loc = right.searchsorted(left_end, side="right") + right_chunk = right._values[loc:] + dates = concat_compat([left._values, right_chunk]) + # The can_fast_union check ensures that the result.freq + # should match self.freq + assert isinstance(dates, type(self._data)) + # error: Item "ExtensionArray" of "ExtensionArray | + # ndarray[Any, Any]" has no attribute "_freq" + assert dates._freq == self.freq # type: ignore[union-attr] + result = type(self)._simple_new(dates) + return result + else: + return left + + def _union(self, other, sort): + # We are called by `union`, which is responsible for this validation + assert isinstance(other, type(self)) + assert self.dtype == other.dtype + + if self._can_range_setop(other): + return self._range_union(other, sort=sort) + + if self._can_fast_union(other): + result = self._fast_union(other, sort=sort) + # in the case with sort=None, the _can_fast_union check ensures + # that result.freq == self.freq + return result + else: + return super()._union(other, sort)._with_freq("infer") + + # -------------------------------------------------------------------- + # Join Methods + + def _get_join_freq(self, other): + """ + Get the freq to attach to the result of a join operation. + """ + freq = None + if self._can_fast_union(other): + freq = self.freq + return freq + + def _wrap_joined_index( + self, joined, other, lidx: npt.NDArray[np.intp], ridx: npt.NDArray[np.intp] + ): + assert other.dtype == self.dtype, (other.dtype, self.dtype) + result = super()._wrap_joined_index(joined, other, lidx, ridx) + result._data._freq = self._get_join_freq(other) + return result + + def _get_engine_target(self) -> np.ndarray: + # engine methods and libjoin methods need dt64/td64 values cast to i8 + return self._data._ndarray.view("i8") + + def _from_join_target(self, result: np.ndarray): + # view e.g. i8 back to M8[ns] + result = result.view(self._data._ndarray.dtype) + return self._data._from_backing_data(result) + + # -------------------------------------------------------------------- + # List-like Methods + + def _get_delete_freq(self, loc: int | slice | Sequence[int]): + """ + Find the `freq` for self.delete(loc). + """ + freq = None + if self.freq is not None: + if is_integer(loc): + if loc in (0, -len(self), -1, len(self) - 1): + freq = self.freq + else: + if is_list_like(loc): + # error: Incompatible types in assignment (expression has + # type "Union[slice, ndarray]", variable has type + # "Union[int, slice, Sequence[int]]") + loc = lib.maybe_indices_to_slice( # type: ignore[assignment] + np.asarray(loc, dtype=np.intp), len(self) + ) + if isinstance(loc, slice) and loc.step in (1, None): + if loc.start in (0, None) or loc.stop in (len(self), None): + freq = self.freq + return freq + + def _get_insert_freq(self, loc: int, item): + """ + Find the `freq` for self.insert(loc, item). + """ + value = self._data._validate_scalar(item) + item = self._data._box_func(value) + + freq = None + if self.freq is not None: + # freq can be preserved on edge cases + if self.size: + if item is NaT: + pass + elif loc in (0, -len(self)) and item + self.freq == self[0]: + freq = self.freq + elif (loc == len(self)) and item - self.freq == self[-1]: + freq = self.freq + else: + # Adding a single item to an empty index may preserve freq + if isinstance(self.freq, Tick): + # all TimedeltaIndex cases go through here; is_on_offset + # would raise TypeError + freq = self.freq + elif self.freq.is_on_offset(item): + freq = self.freq + return freq + + @doc(NDArrayBackedExtensionIndex.delete) + def delete(self, loc) -> Self: + result = super().delete(loc) + result._data._freq = self._get_delete_freq(loc) + return result + + @doc(NDArrayBackedExtensionIndex.insert) + def insert(self, loc: int, item): + result = super().insert(loc, item) + if isinstance(result, type(self)): + # i.e. parent class method did not cast + result._data._freq = self._get_insert_freq(loc, item) + return result + + # -------------------------------------------------------------------- + # NDArray-Like Methods + + @Appender(_index_shared_docs["take"] % _index_doc_kwargs) + def take( + self, + indices, + axis: Axis = 0, + allow_fill: bool = True, + fill_value=None, + **kwargs, + ) -> Self: + nv.validate_take((), kwargs) + indices = np.asarray(indices, dtype=np.intp) + + result = NDArrayBackedExtensionIndex.take( + self, indices, axis, allow_fill, fill_value, **kwargs + ) + + maybe_slice = lib.maybe_indices_to_slice(indices, len(self)) + if isinstance(maybe_slice, slice): + freq = self._data._get_getitem_freq(maybe_slice) + result._data._freq = freq + return result diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/indexes/datetimes.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/indexes/datetimes.py new file mode 100644 index 0000000000000000000000000000000000000000..c978abd8c2427f03022980bb62ddf24666d9e7c2 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/indexes/datetimes.py @@ -0,0 +1,1127 @@ +from __future__ import annotations + +import datetime as dt +import operator +from typing import TYPE_CHECKING +import warnings + +import numpy as np +import pytz + +from pandas._libs import ( + NaT, + Period, + Timestamp, + index as libindex, + lib, +) +from pandas._libs.tslibs import ( + Resolution, + Tick, + Timedelta, + periods_per_day, + timezones, + to_offset, +) +from pandas._libs.tslibs.offsets import prefix_mapping +from pandas.util._decorators import ( + cache_readonly, + doc, +) +from pandas.util._exceptions import find_stack_level + +from pandas.core.dtypes.common import is_scalar +from pandas.core.dtypes.dtypes import DatetimeTZDtype +from pandas.core.dtypes.generic import ABCSeries +from pandas.core.dtypes.missing import is_valid_na_for_dtype + +from pandas.core.arrays.datetimes import ( + DatetimeArray, + tz_to_dtype, +) +import pandas.core.common as com +from pandas.core.indexes.base import ( + Index, + maybe_extract_name, +) +from pandas.core.indexes.datetimelike import DatetimeTimedeltaMixin +from pandas.core.indexes.extension import inherit_names +from pandas.core.tools.times import to_time + +if TYPE_CHECKING: + from collections.abc import Hashable + + from pandas._typing import ( + Dtype, + DtypeObj, + Frequency, + IntervalClosedType, + Self, + TimeAmbiguous, + TimeNonexistent, + npt, + ) + + from pandas.core.api import ( + DataFrame, + PeriodIndex, + ) + +from pandas._libs.tslibs.dtypes import OFFSET_TO_PERIOD_FREQSTR + + +def _new_DatetimeIndex(cls, d): + """ + This is called upon unpickling, rather than the default which doesn't + have arguments and breaks __new__ + """ + if "data" in d and not isinstance(d["data"], DatetimeIndex): + # Avoid need to verify integrity by calling simple_new directly + data = d.pop("data") + if not isinstance(data, DatetimeArray): + # For backward compat with older pickles, we may need to construct + # a DatetimeArray to adapt to the newer _simple_new signature + tz = d.pop("tz") + freq = d.pop("freq") + dta = DatetimeArray._simple_new(data, dtype=tz_to_dtype(tz), freq=freq) + else: + dta = data + for key in ["tz", "freq"]: + # These are already stored in our DatetimeArray; if they are + # also in the pickle and don't match, we have a problem. + if key in d: + assert d[key] == getattr(dta, key) + d.pop(key) + result = cls._simple_new(dta, **d) + else: + with warnings.catch_warnings(): + # TODO: If we knew what was going in to **d, we might be able to + # go through _simple_new instead + warnings.simplefilter("ignore") + result = cls.__new__(cls, **d) + + return result + + +@inherit_names( + DatetimeArray._field_ops + + [ + method + for method in DatetimeArray._datetimelike_methods + if method not in ("tz_localize", "tz_convert", "strftime") + ], + DatetimeArray, + wrap=True, +) +@inherit_names(["is_normalized"], DatetimeArray, cache=True) +@inherit_names( + [ + "tz", + "tzinfo", + "dtype", + "to_pydatetime", + "date", + "time", + "timetz", + "std", + ] + + DatetimeArray._bool_ops, + DatetimeArray, +) +class DatetimeIndex(DatetimeTimedeltaMixin): + """ + Immutable ndarray-like of datetime64 data. + + Represented internally as int64, and which can be boxed to Timestamp objects + that are subclasses of datetime and carry metadata. + + .. versionchanged:: 2.0.0 + The various numeric date/time attributes (:attr:`~DatetimeIndex.day`, + :attr:`~DatetimeIndex.month`, :attr:`~DatetimeIndex.year` etc.) now have dtype + ``int32``. Previously they had dtype ``int64``. + + Parameters + ---------- + data : array-like (1-dimensional) + Datetime-like data to construct index with. + freq : str or pandas offset object, optional + One of pandas date offset strings or corresponding objects. The string + 'infer' can be passed in order to set the frequency of the index as the + inferred frequency upon creation. + tz : pytz.timezone or dateutil.tz.tzfile or datetime.tzinfo or str + Set the Timezone of the data. + normalize : bool, default False + Normalize start/end dates to midnight before generating date range. + + .. deprecated:: 2.1.0 + + closed : {'left', 'right'}, optional + Set whether to include `start` and `end` that are on the + boundary. The default includes boundary points on either end. + + .. deprecated:: 2.1.0 + + ambiguous : 'infer', bool-ndarray, 'NaT', default 'raise' + When clocks moved backward due to DST, ambiguous times may arise. + For example in Central European Time (UTC+01), when going from 03:00 + DST to 02:00 non-DST, 02:30:00 local time occurs both at 00:30:00 UTC + and at 01:30:00 UTC. In such a situation, the `ambiguous` parameter + dictates how ambiguous times should be handled. + + - 'infer' will attempt to infer fall dst-transition hours based on + order + - bool-ndarray where True signifies a DST time, False signifies a + non-DST time (note that this flag is only applicable for ambiguous + times) + - 'NaT' will return NaT where there are ambiguous times + - 'raise' will raise an AmbiguousTimeError if there are ambiguous times. + dayfirst : bool, default False + If True, parse dates in `data` with the day first order. + yearfirst : bool, default False + If True parse dates in `data` with the year first order. + dtype : numpy.dtype or DatetimeTZDtype or str, default None + Note that the only NumPy dtype allowed is `datetime64[ns]`. + copy : bool, default False + Make a copy of input ndarray. + name : label, default None + Name to be stored in the index. + + Attributes + ---------- + year + month + day + hour + minute + second + microsecond + nanosecond + date + time + timetz + dayofyear + day_of_year + dayofweek + day_of_week + weekday + quarter + tz + freq + freqstr + is_month_start + is_month_end + is_quarter_start + is_quarter_end + is_year_start + is_year_end + is_leap_year + inferred_freq + + Methods + ------- + normalize + strftime + snap + tz_convert + tz_localize + round + floor + ceil + to_period + to_pydatetime + to_series + to_frame + month_name + day_name + mean + std + + See Also + -------- + Index : The base pandas Index type. + TimedeltaIndex : Index of timedelta64 data. + PeriodIndex : Index of Period data. + to_datetime : Convert argument to datetime. + date_range : Create a fixed-frequency DatetimeIndex. + + Notes + ----- + To learn more about the frequency strings, please see `this link + `__. + + Examples + -------- + >>> idx = pd.DatetimeIndex(["1/1/2020 10:00:00+00:00", "2/1/2020 11:00:00+00:00"]) + >>> idx + DatetimeIndex(['2020-01-01 10:00:00+00:00', '2020-02-01 11:00:00+00:00'], + dtype='datetime64[ns, UTC]', freq=None) + """ + + _typ = "datetimeindex" + + _data_cls = DatetimeArray + _supports_partial_string_indexing = True + + @property + def _engine_type(self) -> type[libindex.DatetimeEngine]: + return libindex.DatetimeEngine + + _data: DatetimeArray + _values: DatetimeArray + tz: dt.tzinfo | None + + # -------------------------------------------------------------------- + # methods that dispatch to DatetimeArray and wrap result + + @doc(DatetimeArray.strftime) + def strftime(self, date_format) -> Index: + arr = self._data.strftime(date_format) + return Index(arr, name=self.name, dtype=object) + + @doc(DatetimeArray.tz_convert) + def tz_convert(self, tz) -> Self: + arr = self._data.tz_convert(tz) + return type(self)._simple_new(arr, name=self.name, refs=self._references) + + @doc(DatetimeArray.tz_localize) + def tz_localize( + self, + tz, + ambiguous: TimeAmbiguous = "raise", + nonexistent: TimeNonexistent = "raise", + ) -> Self: + arr = self._data.tz_localize(tz, ambiguous, nonexistent) + return type(self)._simple_new(arr, name=self.name) + + @doc(DatetimeArray.to_period) + def to_period(self, freq=None) -> PeriodIndex: + from pandas.core.indexes.api import PeriodIndex + + arr = self._data.to_period(freq) + return PeriodIndex._simple_new(arr, name=self.name) + + @doc(DatetimeArray.to_julian_date) + def to_julian_date(self) -> Index: + arr = self._data.to_julian_date() + return Index._simple_new(arr, name=self.name) + + @doc(DatetimeArray.isocalendar) + def isocalendar(self) -> DataFrame: + df = self._data.isocalendar() + return df.set_index(self) + + @cache_readonly + def _resolution_obj(self) -> Resolution: + return self._data._resolution_obj + + # -------------------------------------------------------------------- + # Constructors + + def __new__( + cls, + data=None, + freq: Frequency | lib.NoDefault = lib.no_default, + tz=lib.no_default, + normalize: bool | lib.NoDefault = lib.no_default, + closed=lib.no_default, + ambiguous: TimeAmbiguous = "raise", + dayfirst: bool = False, + yearfirst: bool = False, + dtype: Dtype | None = None, + copy: bool = False, + name: Hashable | None = None, + ) -> Self: + if closed is not lib.no_default: + # GH#52628 + warnings.warn( + f"The 'closed' keyword in {cls.__name__} construction is " + "deprecated and will be removed in a future version.", + FutureWarning, + stacklevel=find_stack_level(), + ) + if normalize is not lib.no_default: + # GH#52628 + warnings.warn( + f"The 'normalize' keyword in {cls.__name__} construction is " + "deprecated and will be removed in a future version.", + FutureWarning, + stacklevel=find_stack_level(), + ) + + if is_scalar(data): + cls._raise_scalar_data_error(data) + + # - Cases checked above all return/raise before reaching here - # + + name = maybe_extract_name(name, data, cls) + + if ( + isinstance(data, DatetimeArray) + and freq is lib.no_default + and tz is lib.no_default + and dtype is None + ): + # fastpath, similar logic in TimedeltaIndex.__new__; + # Note in this particular case we retain non-nano. + if copy: + data = data.copy() + return cls._simple_new(data, name=name) + + dtarr = DatetimeArray._from_sequence_not_strict( + data, + dtype=dtype, + copy=copy, + tz=tz, + freq=freq, + dayfirst=dayfirst, + yearfirst=yearfirst, + ambiguous=ambiguous, + ) + refs = None + if not copy and isinstance(data, (Index, ABCSeries)): + refs = data._references + + subarr = cls._simple_new(dtarr, name=name, refs=refs) + return subarr + + # -------------------------------------------------------------------- + + @cache_readonly + def _is_dates_only(self) -> bool: + """ + Return a boolean if we are only dates (and don't have a timezone) + + Returns + ------- + bool + """ + if isinstance(self.freq, Tick): + delta = Timedelta(self.freq) + + if delta % dt.timedelta(days=1) != dt.timedelta(days=0): + return False + + return self._values._is_dates_only + + def __reduce__(self): + d = {"data": self._data, "name": self.name} + return _new_DatetimeIndex, (type(self), d), None + + def _is_comparable_dtype(self, dtype: DtypeObj) -> bool: + """ + Can we compare values of the given dtype to our own? + """ + if self.tz is not None: + # If we have tz, we can compare to tzaware + return isinstance(dtype, DatetimeTZDtype) + # if we dont have tz, we can only compare to tznaive + return lib.is_np_dtype(dtype, "M") + + # -------------------------------------------------------------------- + # Rendering Methods + + @cache_readonly + def _formatter_func(self): + # Note this is equivalent to the DatetimeIndexOpsMixin method but + # uses the maybe-cached self._is_dates_only instead of re-computing it. + from pandas.io.formats.format import get_format_datetime64 + + formatter = get_format_datetime64(is_dates_only=self._is_dates_only) + return lambda x: f"'{formatter(x)}'" + + # -------------------------------------------------------------------- + # Set Operation Methods + + def _can_range_setop(self, other) -> bool: + # GH 46702: If self or other have non-UTC tzs, DST transitions prevent + # range representation due to no singular step + if ( + self.tz is not None + and not timezones.is_utc(self.tz) + and not timezones.is_fixed_offset(self.tz) + ): + return False + if ( + other.tz is not None + and not timezones.is_utc(other.tz) + and not timezones.is_fixed_offset(other.tz) + ): + return False + return super()._can_range_setop(other) + + # -------------------------------------------------------------------- + + def _get_time_micros(self) -> npt.NDArray[np.int64]: + """ + Return the number of microseconds since midnight. + + Returns + ------- + ndarray[int64_t] + """ + values = self._data._local_timestamps() + + ppd = periods_per_day(self._data._creso) + + frac = values % ppd + if self.unit == "ns": + micros = frac // 1000 + elif self.unit == "us": + micros = frac + elif self.unit == "ms": + micros = frac * 1000 + elif self.unit == "s": + micros = frac * 1_000_000 + else: # pragma: no cover + raise NotImplementedError(self.unit) + + micros[self._isnan] = -1 + return micros + + def snap(self, freq: Frequency = "S") -> DatetimeIndex: + """ + Snap time stamps to nearest occurring frequency. + + Returns + ------- + DatetimeIndex + + Examples + -------- + >>> idx = pd.DatetimeIndex(['2023-01-01', '2023-01-02', + ... '2023-02-01', '2023-02-02']) + >>> idx + DatetimeIndex(['2023-01-01', '2023-01-02', '2023-02-01', '2023-02-02'], + dtype='datetime64[ns]', freq=None) + >>> idx.snap('MS') + DatetimeIndex(['2023-01-01', '2023-01-01', '2023-02-01', '2023-02-01'], + dtype='datetime64[ns]', freq=None) + """ + # Superdumb, punting on any optimizing + freq = to_offset(freq) + + dta = self._data.copy() + + for i, v in enumerate(self): + s = v + if not freq.is_on_offset(s): + t0 = freq.rollback(s) + t1 = freq.rollforward(s) + if abs(s - t0) < abs(t1 - s): + s = t0 + else: + s = t1 + dta[i] = s + + return DatetimeIndex._simple_new(dta, name=self.name) + + # -------------------------------------------------------------------- + # Indexing Methods + + def _parsed_string_to_bounds(self, reso: Resolution, parsed: dt.datetime): + """ + Calculate datetime bounds for parsed time string and its resolution. + + Parameters + ---------- + reso : Resolution + Resolution provided by parsed string. + parsed : datetime + Datetime from parsed string. + + Returns + ------- + lower, upper: pd.Timestamp + """ + freq = OFFSET_TO_PERIOD_FREQSTR.get(reso.attr_abbrev, reso.attr_abbrev) + per = Period(parsed, freq=freq) + start, end = per.start_time, per.end_time + + # GH 24076 + # If an incoming date string contained a UTC offset, need to localize + # the parsed date to this offset first before aligning with the index's + # timezone + start = start.tz_localize(parsed.tzinfo) + end = end.tz_localize(parsed.tzinfo) + + if parsed.tzinfo is not None: + if self.tz is None: + raise ValueError( + "The index must be timezone aware when indexing " + "with a date string with a UTC offset" + ) + # The flipped case with parsed.tz is None and self.tz is not None + # is ruled out bc parsed and reso are produced by _parse_with_reso, + # which localizes parsed. + return start, end + + def _parse_with_reso(self, label: str): + parsed, reso = super()._parse_with_reso(label) + + parsed = Timestamp(parsed) + + if self.tz is not None and parsed.tzinfo is None: + # we special-case timezone-naive strings and timezone-aware + # DatetimeIndex + # https://github.com/pandas-dev/pandas/pull/36148#issuecomment-687883081 + parsed = parsed.tz_localize(self.tz) + + return parsed, reso + + def _disallow_mismatched_indexing(self, key) -> None: + """ + Check for mismatched-tzawareness indexing and re-raise as KeyError. + """ + # we get here with isinstance(key, self._data._recognized_scalars) + try: + # GH#36148 + self._data._assert_tzawareness_compat(key) + except TypeError as err: + raise KeyError(key) from err + + def get_loc(self, key): + """ + Get integer location for requested label + + Returns + ------- + loc : int + """ + self._check_indexing_error(key) + + orig_key = key + if is_valid_na_for_dtype(key, self.dtype): + key = NaT + + if isinstance(key, self._data._recognized_scalars): + # needed to localize naive datetimes + self._disallow_mismatched_indexing(key) + key = Timestamp(key) + + elif isinstance(key, str): + try: + parsed, reso = self._parse_with_reso(key) + except (ValueError, pytz.NonExistentTimeError) as err: + raise KeyError(key) from err + self._disallow_mismatched_indexing(parsed) + + if self._can_partial_date_slice(reso): + try: + return self._partial_date_slice(reso, parsed) + except KeyError as err: + raise KeyError(key) from err + + key = parsed + + elif isinstance(key, dt.timedelta): + # GH#20464 + raise TypeError( + f"Cannot index {type(self).__name__} with {type(key).__name__}" + ) + + elif isinstance(key, dt.time): + return self.indexer_at_time(key) + + else: + # unrecognized type + raise KeyError(key) + + try: + return Index.get_loc(self, key) + except KeyError as err: + raise KeyError(orig_key) from err + + @doc(DatetimeTimedeltaMixin._maybe_cast_slice_bound) + def _maybe_cast_slice_bound(self, label, side: str): + # GH#42855 handle date here instead of get_slice_bound + if isinstance(label, dt.date) and not isinstance(label, dt.datetime): + # Pandas supports slicing with dates, treated as datetimes at midnight. + # https://github.com/pandas-dev/pandas/issues/31501 + label = Timestamp(label).to_pydatetime() + + label = super()._maybe_cast_slice_bound(label, side) + self._data._assert_tzawareness_compat(label) + return Timestamp(label) + + def slice_indexer(self, start=None, end=None, step=None): + """ + Return indexer for specified label slice. + Index.slice_indexer, customized to handle time slicing. + + In addition to functionality provided by Index.slice_indexer, does the + following: + + - if both `start` and `end` are instances of `datetime.time`, it + invokes `indexer_between_time` + - if `start` and `end` are both either string or None perform + value-based selection in non-monotonic cases. + + """ + # For historical reasons DatetimeIndex supports slices between two + # instances of datetime.time as if it were applying a slice mask to + # an array of (self.hour, self.minute, self.seconds, self.microsecond). + if isinstance(start, dt.time) and isinstance(end, dt.time): + if step is not None and step != 1: + raise ValueError("Must have step size of 1 with time slices") + return self.indexer_between_time(start, end) + + if isinstance(start, dt.time) or isinstance(end, dt.time): + raise KeyError("Cannot mix time and non-time slice keys") + + def check_str_or_none(point) -> bool: + return point is not None and not isinstance(point, str) + + # GH#33146 if start and end are combinations of str and None and Index is not + # monotonic, we can not use Index.slice_indexer because it does not honor the + # actual elements, is only searching for start and end + if ( + check_str_or_none(start) + or check_str_or_none(end) + or self.is_monotonic_increasing + ): + return Index.slice_indexer(self, start, end, step) + + mask = np.array(True) + in_index = True + if start is not None: + start_casted = self._maybe_cast_slice_bound(start, "left") + mask = start_casted <= self + in_index &= (start_casted == self).any() + + if end is not None: + end_casted = self._maybe_cast_slice_bound(end, "right") + mask = (self <= end_casted) & mask + in_index &= (end_casted == self).any() + + if not in_index: + raise KeyError( + "Value based partial slicing on non-monotonic DatetimeIndexes " + "with non-existing keys is not allowed.", + ) + indexer = mask.nonzero()[0][::step] + if len(indexer) == len(self): + return slice(None) + else: + return indexer + + # -------------------------------------------------------------------- + + @property + def inferred_type(self) -> str: + # b/c datetime is represented as microseconds since the epoch, make + # sure we can't have ambiguous indexing + return "datetime64" + + def indexer_at_time(self, time, asof: bool = False) -> npt.NDArray[np.intp]: + """ + Return index locations of values at particular time of day. + + Parameters + ---------- + time : datetime.time or str + Time passed in either as object (datetime.time) or as string in + appropriate format ("%H:%M", "%H%M", "%I:%M%p", "%I%M%p", + "%H:%M:%S", "%H%M%S", "%I:%M:%S%p", "%I%M%S%p"). + + Returns + ------- + np.ndarray[np.intp] + + See Also + -------- + indexer_between_time : Get index locations of values between particular + times of day. + DataFrame.at_time : Select values at particular time of day. + + Examples + -------- + >>> idx = pd.DatetimeIndex(["1/1/2020 10:00", "2/1/2020 11:00", + ... "3/1/2020 10:00"]) + >>> idx.indexer_at_time("10:00") + array([0, 2]) + """ + if asof: + raise NotImplementedError("'asof' argument is not supported") + + if isinstance(time, str): + from dateutil.parser import parse + + time = parse(time).time() + + if time.tzinfo: + if self.tz is None: + raise ValueError("Index must be timezone aware.") + time_micros = self.tz_convert(time.tzinfo)._get_time_micros() + else: + time_micros = self._get_time_micros() + micros = _time_to_micros(time) + return (time_micros == micros).nonzero()[0] + + def indexer_between_time( + self, start_time, end_time, include_start: bool = True, include_end: bool = True + ) -> npt.NDArray[np.intp]: + """ + Return index locations of values between particular times of day. + + Parameters + ---------- + start_time, end_time : datetime.time, str + Time passed either as object (datetime.time) or as string in + appropriate format ("%H:%M", "%H%M", "%I:%M%p", "%I%M%p", + "%H:%M:%S", "%H%M%S", "%I:%M:%S%p","%I%M%S%p"). + include_start : bool, default True + include_end : bool, default True + + Returns + ------- + np.ndarray[np.intp] + + See Also + -------- + indexer_at_time : Get index locations of values at particular time of day. + DataFrame.between_time : Select values between particular times of day. + + Examples + -------- + >>> idx = pd.date_range("2023-01-01", periods=4, freq="h") + >>> idx + DatetimeIndex(['2023-01-01 00:00:00', '2023-01-01 01:00:00', + '2023-01-01 02:00:00', '2023-01-01 03:00:00'], + dtype='datetime64[ns]', freq='h') + >>> idx.indexer_between_time("00:00", "2:00", include_end=False) + array([0, 1]) + """ + start_time = to_time(start_time) + end_time = to_time(end_time) + time_micros = self._get_time_micros() + start_micros = _time_to_micros(start_time) + end_micros = _time_to_micros(end_time) + + if include_start and include_end: + lop = rop = operator.le + elif include_start: + lop = operator.le + rop = operator.lt + elif include_end: + lop = operator.lt + rop = operator.le + else: + lop = rop = operator.lt + + if start_time <= end_time: + join_op = operator.and_ + else: + join_op = operator.or_ + + mask = join_op(lop(start_micros, time_micros), rop(time_micros, end_micros)) + + return mask.nonzero()[0] + + +def date_range( + start=None, + end=None, + periods=None, + freq=None, + tz=None, + normalize: bool = False, + name: Hashable | None = None, + inclusive: IntervalClosedType = "both", + *, + unit: str | None = None, + **kwargs, +) -> DatetimeIndex: + """ + Return a fixed frequency DatetimeIndex. + + Returns the range of equally spaced time points (where the difference between any + two adjacent points is specified by the given frequency) such that they all + satisfy `start <[=] x <[=] end`, where the first one and the last one are, resp., + the first and last time points in that range that fall on the boundary of ``freq`` + (if given as a frequency string) or that are valid for ``freq`` (if given as a + :class:`pandas.tseries.offsets.DateOffset`). (If exactly one of ``start``, + ``end``, or ``freq`` is *not* specified, this missing parameter can be computed + given ``periods``, the number of timesteps in the range. See the note below.) + + Parameters + ---------- + start : str or datetime-like, optional + Left bound for generating dates. + end : str or datetime-like, optional + Right bound for generating dates. + periods : int, optional + Number of periods to generate. + freq : str, Timedelta, datetime.timedelta, or DateOffset, default 'D' + Frequency strings can have multiples, e.g. '5h'. See + :ref:`here ` for a list of + frequency aliases. + tz : str or tzinfo, optional + Time zone name for returning localized DatetimeIndex, for example + 'Asia/Hong_Kong'. By default, the resulting DatetimeIndex is + timezone-naive unless timezone-aware datetime-likes are passed. + normalize : bool, default False + Normalize start/end dates to midnight before generating date range. + name : str, default None + Name of the resulting DatetimeIndex. + inclusive : {"both", "neither", "left", "right"}, default "both" + Include boundaries; Whether to set each bound as closed or open. + + .. versionadded:: 1.4.0 + unit : str, default None + Specify the desired resolution of the result. + + .. versionadded:: 2.0.0 + **kwargs + For compatibility. Has no effect on the result. + + Returns + ------- + DatetimeIndex + + See Also + -------- + DatetimeIndex : An immutable container for datetimes. + timedelta_range : Return a fixed frequency TimedeltaIndex. + period_range : Return a fixed frequency PeriodIndex. + interval_range : Return a fixed frequency IntervalIndex. + + Notes + ----- + Of the four parameters ``start``, ``end``, ``periods``, and ``freq``, + exactly three must be specified. If ``freq`` is omitted, the resulting + ``DatetimeIndex`` will have ``periods`` linearly spaced elements between + ``start`` and ``end`` (closed on both sides). + + To learn more about the frequency strings, please see `this link + `__. + + Examples + -------- + **Specifying the values** + + The next four examples generate the same `DatetimeIndex`, but vary + the combination of `start`, `end` and `periods`. + + Specify `start` and `end`, with the default daily frequency. + + >>> pd.date_range(start='1/1/2018', end='1/08/2018') + DatetimeIndex(['2018-01-01', '2018-01-02', '2018-01-03', '2018-01-04', + '2018-01-05', '2018-01-06', '2018-01-07', '2018-01-08'], + dtype='datetime64[ns]', freq='D') + + Specify timezone-aware `start` and `end`, with the default daily frequency. + + >>> pd.date_range( + ... start=pd.to_datetime("1/1/2018").tz_localize("Europe/Berlin"), + ... end=pd.to_datetime("1/08/2018").tz_localize("Europe/Berlin"), + ... ) + DatetimeIndex(['2018-01-01 00:00:00+01:00', '2018-01-02 00:00:00+01:00', + '2018-01-03 00:00:00+01:00', '2018-01-04 00:00:00+01:00', + '2018-01-05 00:00:00+01:00', '2018-01-06 00:00:00+01:00', + '2018-01-07 00:00:00+01:00', '2018-01-08 00:00:00+01:00'], + dtype='datetime64[ns, Europe/Berlin]', freq='D') + + Specify `start` and `periods`, the number of periods (days). + + >>> pd.date_range(start='1/1/2018', periods=8) + DatetimeIndex(['2018-01-01', '2018-01-02', '2018-01-03', '2018-01-04', + '2018-01-05', '2018-01-06', '2018-01-07', '2018-01-08'], + dtype='datetime64[ns]', freq='D') + + Specify `end` and `periods`, the number of periods (days). + + >>> pd.date_range(end='1/1/2018', periods=8) + DatetimeIndex(['2017-12-25', '2017-12-26', '2017-12-27', '2017-12-28', + '2017-12-29', '2017-12-30', '2017-12-31', '2018-01-01'], + dtype='datetime64[ns]', freq='D') + + Specify `start`, `end`, and `periods`; the frequency is generated + automatically (linearly spaced). + + >>> pd.date_range(start='2018-04-24', end='2018-04-27', periods=3) + DatetimeIndex(['2018-04-24 00:00:00', '2018-04-25 12:00:00', + '2018-04-27 00:00:00'], + dtype='datetime64[ns]', freq=None) + + **Other Parameters** + + Changed the `freq` (frequency) to ``'ME'`` (month end frequency). + + >>> pd.date_range(start='1/1/2018', periods=5, freq='ME') + DatetimeIndex(['2018-01-31', '2018-02-28', '2018-03-31', '2018-04-30', + '2018-05-31'], + dtype='datetime64[ns]', freq='ME') + + Multiples are allowed + + >>> pd.date_range(start='1/1/2018', periods=5, freq='3ME') + DatetimeIndex(['2018-01-31', '2018-04-30', '2018-07-31', '2018-10-31', + '2019-01-31'], + dtype='datetime64[ns]', freq='3ME') + + `freq` can also be specified as an Offset object. + + >>> pd.date_range(start='1/1/2018', periods=5, freq=pd.offsets.MonthEnd(3)) + DatetimeIndex(['2018-01-31', '2018-04-30', '2018-07-31', '2018-10-31', + '2019-01-31'], + dtype='datetime64[ns]', freq='3ME') + + Specify `tz` to set the timezone. + + >>> pd.date_range(start='1/1/2018', periods=5, tz='Asia/Tokyo') + DatetimeIndex(['2018-01-01 00:00:00+09:00', '2018-01-02 00:00:00+09:00', + '2018-01-03 00:00:00+09:00', '2018-01-04 00:00:00+09:00', + '2018-01-05 00:00:00+09:00'], + dtype='datetime64[ns, Asia/Tokyo]', freq='D') + + `inclusive` controls whether to include `start` and `end` that are on the + boundary. The default, "both", includes boundary points on either end. + + >>> pd.date_range(start='2017-01-01', end='2017-01-04', inclusive="both") + DatetimeIndex(['2017-01-01', '2017-01-02', '2017-01-03', '2017-01-04'], + dtype='datetime64[ns]', freq='D') + + Use ``inclusive='left'`` to exclude `end` if it falls on the boundary. + + >>> pd.date_range(start='2017-01-01', end='2017-01-04', inclusive='left') + DatetimeIndex(['2017-01-01', '2017-01-02', '2017-01-03'], + dtype='datetime64[ns]', freq='D') + + Use ``inclusive='right'`` to exclude `start` if it falls on the boundary, and + similarly ``inclusive='neither'`` will exclude both `start` and `end`. + + >>> pd.date_range(start='2017-01-01', end='2017-01-04', inclusive='right') + DatetimeIndex(['2017-01-02', '2017-01-03', '2017-01-04'], + dtype='datetime64[ns]', freq='D') + + **Specify a unit** + + >>> pd.date_range(start="2017-01-01", periods=10, freq="100YS", unit="s") + DatetimeIndex(['2017-01-01', '2117-01-01', '2217-01-01', '2317-01-01', + '2417-01-01', '2517-01-01', '2617-01-01', '2717-01-01', + '2817-01-01', '2917-01-01'], + dtype='datetime64[s]', freq='100YS-JAN') + """ + if freq is None and com.any_none(periods, start, end): + freq = "D" + + dtarr = DatetimeArray._generate_range( + start=start, + end=end, + periods=periods, + freq=freq, + tz=tz, + normalize=normalize, + inclusive=inclusive, + unit=unit, + **kwargs, + ) + return DatetimeIndex._simple_new(dtarr, name=name) + + +def bdate_range( + start=None, + end=None, + periods: int | None = None, + freq: Frequency | dt.timedelta = "B", + tz=None, + normalize: bool = True, + name: Hashable | None = None, + weekmask=None, + holidays=None, + inclusive: IntervalClosedType = "both", + **kwargs, +) -> DatetimeIndex: + """ + Return a fixed frequency DatetimeIndex with business day as the default. + + Parameters + ---------- + start : str or datetime-like, default None + Left bound for generating dates. + end : str or datetime-like, default None + Right bound for generating dates. + periods : int, default None + Number of periods to generate. + freq : str, Timedelta, datetime.timedelta, or DateOffset, default 'B' + Frequency strings can have multiples, e.g. '5h'. The default is + business daily ('B'). + tz : str or None + Time zone name for returning localized DatetimeIndex, for example + Asia/Beijing. + normalize : bool, default False + Normalize start/end dates to midnight before generating date range. + name : str, default None + Name of the resulting DatetimeIndex. + weekmask : str or None, default None + Weekmask of valid business days, passed to ``numpy.busdaycalendar``, + only used when custom frequency strings are passed. The default + value None is equivalent to 'Mon Tue Wed Thu Fri'. + holidays : list-like or None, default None + Dates to exclude from the set of valid business days, passed to + ``numpy.busdaycalendar``, only used when custom frequency strings + are passed. + inclusive : {"both", "neither", "left", "right"}, default "both" + Include boundaries; Whether to set each bound as closed or open. + + .. versionadded:: 1.4.0 + **kwargs + For compatibility. Has no effect on the result. + + Returns + ------- + DatetimeIndex + + Notes + ----- + Of the four parameters: ``start``, ``end``, ``periods``, and ``freq``, + exactly three must be specified. Specifying ``freq`` is a requirement + for ``bdate_range``. Use ``date_range`` if specifying ``freq`` is not + desired. + + To learn more about the frequency strings, please see `this link + `__. + + Examples + -------- + Note how the two weekend days are skipped in the result. + + >>> pd.bdate_range(start='1/1/2018', end='1/08/2018') + DatetimeIndex(['2018-01-01', '2018-01-02', '2018-01-03', '2018-01-04', + '2018-01-05', '2018-01-08'], + dtype='datetime64[ns]', freq='B') + """ + if freq is None: + msg = "freq must be specified for bdate_range; use date_range instead" + raise TypeError(msg) + + if isinstance(freq, str) and freq.startswith("C"): + try: + weekmask = weekmask or "Mon Tue Wed Thu Fri" + freq = prefix_mapping[freq](holidays=holidays, weekmask=weekmask) + except (KeyError, TypeError) as err: + msg = f"invalid custom frequency string: {freq}" + raise ValueError(msg) from err + elif holidays or weekmask: + msg = ( + "a custom frequency string is required when holidays or " + f"weekmask are passed, got frequency {freq}" + ) + raise ValueError(msg) + + return date_range( + start=start, + end=end, + periods=periods, + freq=freq, + tz=tz, + normalize=normalize, + name=name, + inclusive=inclusive, + **kwargs, + ) + + +def _time_to_micros(time_obj: dt.time) -> int: + seconds = time_obj.hour * 60 * 60 + 60 * time_obj.minute + time_obj.second + return 1_000_000 * seconds + time_obj.microsecond diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/indexes/extension.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/indexes/extension.py new file mode 100644 index 0000000000000000000000000000000000000000..61949531f37df38f74a37c00e66141313a4fd767 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/indexes/extension.py @@ -0,0 +1,172 @@ +""" +Shared methods for Index subclasses backed by ExtensionArray. +""" +from __future__ import annotations + +from typing import ( + TYPE_CHECKING, + Callable, + TypeVar, +) + +from pandas.util._decorators import cache_readonly + +from pandas.core.dtypes.generic import ABCDataFrame + +from pandas.core.indexes.base import Index + +if TYPE_CHECKING: + import numpy as np + + from pandas._typing import ( + ArrayLike, + npt, + ) + + from pandas.core.arrays import IntervalArray + from pandas.core.arrays._mixins import NDArrayBackedExtensionArray + +_ExtensionIndexT = TypeVar("_ExtensionIndexT", bound="ExtensionIndex") + + +def _inherit_from_data( + name: str, delegate: type, cache: bool = False, wrap: bool = False +): + """ + Make an alias for a method of the underlying ExtensionArray. + + Parameters + ---------- + name : str + Name of an attribute the class should inherit from its EA parent. + delegate : class + cache : bool, default False + Whether to convert wrapped properties into cache_readonly + wrap : bool, default False + Whether to wrap the inherited result in an Index. + + Returns + ------- + attribute, method, property, or cache_readonly + """ + attr = getattr(delegate, name) + + if isinstance(attr, property) or type(attr).__name__ == "getset_descriptor": + # getset_descriptor i.e. property defined in cython class + if cache: + + def cached(self): + return getattr(self._data, name) + + cached.__name__ = name + cached.__doc__ = attr.__doc__ + method = cache_readonly(cached) + + else: + + def fget(self): + result = getattr(self._data, name) + if wrap: + if isinstance(result, type(self._data)): + return type(self)._simple_new(result, name=self.name) + elif isinstance(result, ABCDataFrame): + return result.set_index(self) + return Index(result, name=self.name) + return result + + def fset(self, value) -> None: + setattr(self._data, name, value) + + fget.__name__ = name + fget.__doc__ = attr.__doc__ + + method = property(fget, fset) + + elif not callable(attr): + # just a normal attribute, no wrapping + method = attr + + else: + # error: Incompatible redefinition (redefinition with type "Callable[[Any, + # VarArg(Any), KwArg(Any)], Any]", original type "property") + def method(self, *args, **kwargs): # type: ignore[misc] + if "inplace" in kwargs: + raise ValueError(f"cannot use inplace with {type(self).__name__}") + result = attr(self._data, *args, **kwargs) + if wrap: + if isinstance(result, type(self._data)): + return type(self)._simple_new(result, name=self.name) + elif isinstance(result, ABCDataFrame): + return result.set_index(self) + return Index(result, name=self.name) + return result + + # error: "property" has no attribute "__name__" + method.__name__ = name # type: ignore[attr-defined] + method.__doc__ = attr.__doc__ + return method + + +def inherit_names( + names: list[str], delegate: type, cache: bool = False, wrap: bool = False +) -> Callable[[type[_ExtensionIndexT]], type[_ExtensionIndexT]]: + """ + Class decorator to pin attributes from an ExtensionArray to a Index subclass. + + Parameters + ---------- + names : List[str] + delegate : class + cache : bool, default False + wrap : bool, default False + Whether to wrap the inherited result in an Index. + """ + + def wrapper(cls: type[_ExtensionIndexT]) -> type[_ExtensionIndexT]: + for name in names: + meth = _inherit_from_data(name, delegate, cache=cache, wrap=wrap) + setattr(cls, name, meth) + + return cls + + return wrapper + + +class ExtensionIndex(Index): + """ + Index subclass for indexes backed by ExtensionArray. + """ + + # The base class already passes through to _data: + # size, __len__, dtype + + _data: IntervalArray | NDArrayBackedExtensionArray + + # --------------------------------------------------------------------- + + def _validate_fill_value(self, value): + """ + Convert value to be insertable to underlying array. + """ + return self._data._validate_setitem_value(value) + + @cache_readonly + def _isnan(self) -> npt.NDArray[np.bool_]: + # error: Incompatible return value type (got "ExtensionArray", expected + # "ndarray") + return self._data.isna() # type: ignore[return-value] + + +class NDArrayBackedExtensionIndex(ExtensionIndex): + """ + Index subclass for indexes backed by NDArrayBackedExtensionArray. + """ + + _data: NDArrayBackedExtensionArray + + def _get_engine_target(self) -> np.ndarray: + return self._data._ndarray + + def _from_join_target(self, result: np.ndarray) -> ArrayLike: + assert result.dtype == self._data._ndarray.dtype + return self._data._from_backing_data(result) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/indexes/frozen.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/indexes/frozen.py new file mode 100644 index 0000000000000000000000000000000000000000..9d528d34e36845efd44126c087cd15cd81e1e02e --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/indexes/frozen.py @@ -0,0 +1,120 @@ +""" +frozen (immutable) data structures to support MultiIndexing + +These are used for: + +- .names (FrozenList) + +""" +from __future__ import annotations + +from typing import ( + TYPE_CHECKING, + NoReturn, +) + +from pandas.core.base import PandasObject + +from pandas.io.formats.printing import pprint_thing + +if TYPE_CHECKING: + from pandas._typing import Self + + +class FrozenList(PandasObject, list): + """ + Container that doesn't allow setting item *but* + because it's technically hashable, will be used + for lookups, appropriately, etc. + """ + + # Side note: This has to be of type list. Otherwise, + # it messes up PyTables type checks. + + def union(self, other) -> FrozenList: + """ + Returns a FrozenList with other concatenated to the end of self. + + Parameters + ---------- + other : array-like + The array-like whose elements we are concatenating. + + Returns + ------- + FrozenList + The collection difference between self and other. + """ + if isinstance(other, tuple): + other = list(other) + return type(self)(super().__add__(other)) + + def difference(self, other) -> FrozenList: + """ + Returns a FrozenList with elements from other removed from self. + + Parameters + ---------- + other : array-like + The array-like whose elements we are removing self. + + Returns + ------- + FrozenList + The collection difference between self and other. + """ + other = set(other) + temp = [x for x in self if x not in other] + return type(self)(temp) + + # TODO: Consider deprecating these in favor of `union` (xref gh-15506) + # error: Incompatible types in assignment (expression has type + # "Callable[[FrozenList, Any], FrozenList]", base class "list" defined the + # type as overloaded function) + __add__ = __iadd__ = union # type: ignore[assignment] + + def __getitem__(self, n): + if isinstance(n, slice): + return type(self)(super().__getitem__(n)) + return super().__getitem__(n) + + def __radd__(self, other) -> Self: + if isinstance(other, tuple): + other = list(other) + return type(self)(other + list(self)) + + def __eq__(self, other: object) -> bool: + if isinstance(other, (tuple, FrozenList)): + other = list(other) + return super().__eq__(other) + + __req__ = __eq__ + + def __mul__(self, other) -> Self: + return type(self)(super().__mul__(other)) + + __imul__ = __mul__ + + def __reduce__(self): + return type(self), (list(self),) + + # error: Signature of "__hash__" incompatible with supertype "list" + def __hash__(self) -> int: # type: ignore[override] + return hash(tuple(self)) + + def _disabled(self, *args, **kwargs) -> NoReturn: + """ + This method will not function because object is immutable. + """ + raise TypeError(f"'{type(self).__name__}' does not support mutable operations.") + + def __str__(self) -> str: + return pprint_thing(self, quote_strings=True, escape_chars=("\t", "\r", "\n")) + + def __repr__(self) -> str: + return f"{type(self).__name__}({str(self)})" + + __setitem__ = __setslice__ = _disabled # type: ignore[assignment] + __delitem__ = __delslice__ = _disabled + pop = append = extend = _disabled + remove = sort = insert = _disabled # type: ignore[assignment] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/indexes/interval.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/indexes/interval.py new file mode 100644 index 0000000000000000000000000000000000000000..4fcdb879745112889dd510a00b39586a22ed3dea --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/indexes/interval.py @@ -0,0 +1,1136 @@ +""" define the IntervalIndex """ +from __future__ import annotations + +from operator import ( + le, + lt, +) +import textwrap +from typing import ( + TYPE_CHECKING, + Any, + Literal, +) + +import numpy as np + +from pandas._libs import lib +from pandas._libs.interval import ( + Interval, + IntervalMixin, + IntervalTree, +) +from pandas._libs.tslibs import ( + BaseOffset, + Period, + Timedelta, + Timestamp, + to_offset, +) +from pandas.errors import InvalidIndexError +from pandas.util._decorators import ( + Appender, + cache_readonly, +) +from pandas.util._exceptions import rewrite_exception + +from pandas.core.dtypes.cast import ( + find_common_type, + infer_dtype_from_scalar, + maybe_box_datetimelike, + maybe_downcast_numeric, + maybe_upcast_numeric_to_64bit, +) +from pandas.core.dtypes.common import ( + ensure_platform_int, + is_float_dtype, + is_integer, + is_integer_dtype, + is_list_like, + is_number, + is_object_dtype, + is_scalar, + pandas_dtype, +) +from pandas.core.dtypes.dtypes import ( + DatetimeTZDtype, + IntervalDtype, +) +from pandas.core.dtypes.missing import is_valid_na_for_dtype + +from pandas.core.algorithms import unique +from pandas.core.arrays.datetimelike import validate_periods +from pandas.core.arrays.interval import ( + IntervalArray, + _interval_shared_docs, +) +import pandas.core.common as com +from pandas.core.indexers import is_valid_positional_slice +import pandas.core.indexes.base as ibase +from pandas.core.indexes.base import ( + Index, + _index_shared_docs, + ensure_index, + maybe_extract_name, +) +from pandas.core.indexes.datetimes import ( + DatetimeIndex, + date_range, +) +from pandas.core.indexes.extension import ( + ExtensionIndex, + inherit_names, +) +from pandas.core.indexes.multi import MultiIndex +from pandas.core.indexes.timedeltas import ( + TimedeltaIndex, + timedelta_range, +) + +if TYPE_CHECKING: + from collections.abc import Hashable + + from pandas._typing import ( + Dtype, + DtypeObj, + IntervalClosedType, + Self, + npt, + ) +_index_doc_kwargs = dict(ibase._index_doc_kwargs) + +_index_doc_kwargs.update( + { + "klass": "IntervalIndex", + "qualname": "IntervalIndex", + "target_klass": "IntervalIndex or list of Intervals", + "name": textwrap.dedent( + """\ + name : object, optional + Name to be stored in the index. + """ + ), + } +) + + +def _get_next_label(label): + # see test_slice_locs_with_ints_and_floats_succeeds + dtype = getattr(label, "dtype", type(label)) + if isinstance(label, (Timestamp, Timedelta)): + dtype = "datetime64[ns]" + dtype = pandas_dtype(dtype) + + if lib.is_np_dtype(dtype, "mM") or isinstance(dtype, DatetimeTZDtype): + return label + np.timedelta64(1, "ns") + elif is_integer_dtype(dtype): + return label + 1 + elif is_float_dtype(dtype): + return np.nextafter(label, np.inf) + else: + raise TypeError(f"cannot determine next label for type {repr(type(label))}") + + +def _get_prev_label(label): + # see test_slice_locs_with_ints_and_floats_succeeds + dtype = getattr(label, "dtype", type(label)) + if isinstance(label, (Timestamp, Timedelta)): + dtype = "datetime64[ns]" + dtype = pandas_dtype(dtype) + + if lib.is_np_dtype(dtype, "mM") or isinstance(dtype, DatetimeTZDtype): + return label - np.timedelta64(1, "ns") + elif is_integer_dtype(dtype): + return label - 1 + elif is_float_dtype(dtype): + return np.nextafter(label, -np.inf) + else: + raise TypeError(f"cannot determine next label for type {repr(type(label))}") + + +def _new_IntervalIndex(cls, d): + """ + This is called upon unpickling, rather than the default which doesn't have + arguments and breaks __new__. + """ + return cls.from_arrays(**d) + + +@Appender( + _interval_shared_docs["class"] + % { + "klass": "IntervalIndex", + "summary": "Immutable index of intervals that are closed on the same side.", + "name": _index_doc_kwargs["name"], + "extra_attributes": "is_overlapping\nvalues\n", + "extra_methods": "", + "examples": textwrap.dedent( + """\ + Examples + -------- + A new ``IntervalIndex`` is typically constructed using + :func:`interval_range`: + + >>> pd.interval_range(start=0, end=5) + IntervalIndex([(0, 1], (1, 2], (2, 3], (3, 4], (4, 5]], + dtype='interval[int64, right]') + + It may also be constructed using one of the constructor + methods: :meth:`IntervalIndex.from_arrays`, + :meth:`IntervalIndex.from_breaks`, and :meth:`IntervalIndex.from_tuples`. + + See further examples in the doc strings of ``interval_range`` and the + mentioned constructor methods. + """ + ), + } +) +@inherit_names(["set_closed", "to_tuples"], IntervalArray, wrap=True) +@inherit_names( + [ + "__array__", + "overlaps", + "contains", + "closed_left", + "closed_right", + "open_left", + "open_right", + "is_empty", + ], + IntervalArray, +) +@inherit_names(["is_non_overlapping_monotonic", "closed"], IntervalArray, cache=True) +class IntervalIndex(ExtensionIndex): + _typ = "intervalindex" + + # annotate properties pinned via inherit_names + closed: IntervalClosedType + is_non_overlapping_monotonic: bool + closed_left: bool + closed_right: bool + open_left: bool + open_right: bool + + _data: IntervalArray + _values: IntervalArray + _can_hold_strings = False + _data_cls = IntervalArray + + # -------------------------------------------------------------------- + # Constructors + + def __new__( + cls, + data, + closed: IntervalClosedType | None = None, + dtype: Dtype | None = None, + copy: bool = False, + name: Hashable | None = None, + verify_integrity: bool = True, + ) -> Self: + name = maybe_extract_name(name, data, cls) + + with rewrite_exception("IntervalArray", cls.__name__): + array = IntervalArray( + data, + closed=closed, + copy=copy, + dtype=dtype, + verify_integrity=verify_integrity, + ) + + return cls._simple_new(array, name) + + @classmethod + @Appender( + _interval_shared_docs["from_breaks"] + % { + "klass": "IntervalIndex", + "name": textwrap.dedent( + """ + name : str, optional + Name of the resulting IntervalIndex.""" + ), + "examples": textwrap.dedent( + """\ + Examples + -------- + >>> pd.IntervalIndex.from_breaks([0, 1, 2, 3]) + IntervalIndex([(0, 1], (1, 2], (2, 3]], + dtype='interval[int64, right]') + """ + ), + } + ) + def from_breaks( + cls, + breaks, + closed: IntervalClosedType | None = "right", + name: Hashable | None = None, + copy: bool = False, + dtype: Dtype | None = None, + ) -> IntervalIndex: + with rewrite_exception("IntervalArray", cls.__name__): + array = IntervalArray.from_breaks( + breaks, closed=closed, copy=copy, dtype=dtype + ) + return cls._simple_new(array, name=name) + + @classmethod + @Appender( + _interval_shared_docs["from_arrays"] + % { + "klass": "IntervalIndex", + "name": textwrap.dedent( + """ + name : str, optional + Name of the resulting IntervalIndex.""" + ), + "examples": textwrap.dedent( + """\ + Examples + -------- + >>> pd.IntervalIndex.from_arrays([0, 1, 2], [1, 2, 3]) + IntervalIndex([(0, 1], (1, 2], (2, 3]], + dtype='interval[int64, right]') + """ + ), + } + ) + def from_arrays( + cls, + left, + right, + closed: IntervalClosedType = "right", + name: Hashable | None = None, + copy: bool = False, + dtype: Dtype | None = None, + ) -> IntervalIndex: + with rewrite_exception("IntervalArray", cls.__name__): + array = IntervalArray.from_arrays( + left, right, closed, copy=copy, dtype=dtype + ) + return cls._simple_new(array, name=name) + + @classmethod + @Appender( + _interval_shared_docs["from_tuples"] + % { + "klass": "IntervalIndex", + "name": textwrap.dedent( + """ + name : str, optional + Name of the resulting IntervalIndex.""" + ), + "examples": textwrap.dedent( + """\ + Examples + -------- + >>> pd.IntervalIndex.from_tuples([(0, 1), (1, 2)]) + IntervalIndex([(0, 1], (1, 2]], + dtype='interval[int64, right]') + """ + ), + } + ) + def from_tuples( + cls, + data, + closed: IntervalClosedType = "right", + name: Hashable | None = None, + copy: bool = False, + dtype: Dtype | None = None, + ) -> IntervalIndex: + with rewrite_exception("IntervalArray", cls.__name__): + arr = IntervalArray.from_tuples(data, closed=closed, copy=copy, dtype=dtype) + return cls._simple_new(arr, name=name) + + # -------------------------------------------------------------------- + # error: Return type "IntervalTree" of "_engine" incompatible with return type + # "Union[IndexEngine, ExtensionEngine]" in supertype "Index" + @cache_readonly + def _engine(self) -> IntervalTree: # type: ignore[override] + # IntervalTree does not supports numpy array unless they are 64 bit + left = self._maybe_convert_i8(self.left) + left = maybe_upcast_numeric_to_64bit(left) + right = self._maybe_convert_i8(self.right) + right = maybe_upcast_numeric_to_64bit(right) + return IntervalTree(left, right, closed=self.closed) + + def __contains__(self, key: Any) -> bool: + """ + return a boolean if this key is IN the index + We *only* accept an Interval + + Parameters + ---------- + key : Interval + + Returns + ------- + bool + """ + hash(key) + if not isinstance(key, Interval): + if is_valid_na_for_dtype(key, self.dtype): + return self.hasnans + return False + + try: + self.get_loc(key) + return True + except KeyError: + return False + + def _getitem_slice(self, slobj: slice) -> IntervalIndex: + """ + Fastpath for __getitem__ when we know we have a slice. + """ + res = self._data[slobj] + return type(self)._simple_new(res, name=self._name) + + @cache_readonly + def _multiindex(self) -> MultiIndex: + return MultiIndex.from_arrays([self.left, self.right], names=["left", "right"]) + + def __reduce__(self): + d = { + "left": self.left, + "right": self.right, + "closed": self.closed, + "name": self.name, + } + return _new_IntervalIndex, (type(self), d), None + + @property + def inferred_type(self) -> str: + """Return a string of the type inferred from the values""" + return "interval" + + # Cannot determine type of "memory_usage" + @Appender(Index.memory_usage.__doc__) # type: ignore[has-type] + def memory_usage(self, deep: bool = False) -> int: + # we don't use an explicit engine + # so return the bytes here + return self.left.memory_usage(deep=deep) + self.right.memory_usage(deep=deep) + + # IntervalTree doesn't have a is_monotonic_decreasing, so have to override + # the Index implementation + @cache_readonly + def is_monotonic_decreasing(self) -> bool: + """ + Return True if the IntervalIndex is monotonic decreasing (only equal or + decreasing values), else False + """ + return self[::-1].is_monotonic_increasing + + @cache_readonly + def is_unique(self) -> bool: + """ + Return True if the IntervalIndex contains unique elements, else False. + """ + left = self.left + right = self.right + + if self.isna().sum() > 1: + return False + + if left.is_unique or right.is_unique: + return True + + seen_pairs = set() + check_idx = np.where(left.duplicated(keep=False))[0] + for idx in check_idx: + pair = (left[idx], right[idx]) + if pair in seen_pairs: + return False + seen_pairs.add(pair) + + return True + + @property + def is_overlapping(self) -> bool: + """ + Return True if the IntervalIndex has overlapping intervals, else False. + + Two intervals overlap if they share a common point, including closed + endpoints. Intervals that only have an open endpoint in common do not + overlap. + + Returns + ------- + bool + Boolean indicating if the IntervalIndex has overlapping intervals. + + See Also + -------- + Interval.overlaps : Check whether two Interval objects overlap. + IntervalIndex.overlaps : Check an IntervalIndex elementwise for + overlaps. + + Examples + -------- + >>> index = pd.IntervalIndex.from_tuples([(0, 2), (1, 3), (4, 5)]) + >>> index + IntervalIndex([(0, 2], (1, 3], (4, 5]], + dtype='interval[int64, right]') + >>> index.is_overlapping + True + + Intervals that share closed endpoints overlap: + + >>> index = pd.interval_range(0, 3, closed='both') + >>> index + IntervalIndex([[0, 1], [1, 2], [2, 3]], + dtype='interval[int64, both]') + >>> index.is_overlapping + True + + Intervals that only have an open endpoint in common do not overlap: + + >>> index = pd.interval_range(0, 3, closed='left') + >>> index + IntervalIndex([[0, 1), [1, 2), [2, 3)], + dtype='interval[int64, left]') + >>> index.is_overlapping + False + """ + # GH 23309 + return self._engine.is_overlapping + + def _needs_i8_conversion(self, key) -> bool: + """ + Check if a given key needs i8 conversion. Conversion is necessary for + Timestamp, Timedelta, DatetimeIndex, and TimedeltaIndex keys. An + Interval-like requires conversion if its endpoints are one of the + aforementioned types. + + Assumes that any list-like data has already been cast to an Index. + + Parameters + ---------- + key : scalar or Index-like + The key that should be checked for i8 conversion + + Returns + ------- + bool + """ + key_dtype = getattr(key, "dtype", None) + if isinstance(key_dtype, IntervalDtype) or isinstance(key, Interval): + return self._needs_i8_conversion(key.left) + + i8_types = (Timestamp, Timedelta, DatetimeIndex, TimedeltaIndex) + return isinstance(key, i8_types) + + def _maybe_convert_i8(self, key): + """ + Maybe convert a given key to its equivalent i8 value(s). Used as a + preprocessing step prior to IntervalTree queries (self._engine), which + expects numeric data. + + Parameters + ---------- + key : scalar or list-like + The key that should maybe be converted to i8. + + Returns + ------- + scalar or list-like + The original key if no conversion occurred, int if converted scalar, + Index with an int64 dtype if converted list-like. + """ + if is_list_like(key): + key = ensure_index(key) + key = maybe_upcast_numeric_to_64bit(key) + + if not self._needs_i8_conversion(key): + return key + + scalar = is_scalar(key) + key_dtype = getattr(key, "dtype", None) + if isinstance(key_dtype, IntervalDtype) or isinstance(key, Interval): + # convert left/right and reconstruct + left = self._maybe_convert_i8(key.left) + right = self._maybe_convert_i8(key.right) + constructor = Interval if scalar else IntervalIndex.from_arrays + # error: "object" not callable + return constructor( + left, right, closed=self.closed + ) # type: ignore[operator] + + if scalar: + # Timestamp/Timedelta + key_dtype, key_i8 = infer_dtype_from_scalar(key) + if isinstance(key, Period): + key_i8 = key.ordinal + elif isinstance(key_i8, Timestamp): + key_i8 = key_i8._value + elif isinstance(key_i8, (np.datetime64, np.timedelta64)): + key_i8 = key_i8.view("i8") + else: + # DatetimeIndex/TimedeltaIndex + key_dtype, key_i8 = key.dtype, Index(key.asi8) + if key.hasnans: + # convert NaT from its i8 value to np.nan so it's not viewed + # as a valid value, maybe causing errors (e.g. is_overlapping) + key_i8 = key_i8.where(~key._isnan) + + # ensure consistency with IntervalIndex subtype + # error: Item "ExtensionDtype"/"dtype[Any]" of "Union[dtype[Any], + # ExtensionDtype]" has no attribute "subtype" + subtype = self.dtype.subtype # type: ignore[union-attr] + + if subtype != key_dtype: + raise ValueError( + f"Cannot index an IntervalIndex of subtype {subtype} with " + f"values of dtype {key_dtype}" + ) + + return key_i8 + + def _searchsorted_monotonic(self, label, side: Literal["left", "right"] = "left"): + if not self.is_non_overlapping_monotonic: + raise KeyError( + "can only get slices from an IntervalIndex if bounds are " + "non-overlapping and all monotonic increasing or decreasing" + ) + + if isinstance(label, (IntervalMixin, IntervalIndex)): + raise NotImplementedError("Interval objects are not currently supported") + + # GH 20921: "not is_monotonic_increasing" for the second condition + # instead of "is_monotonic_decreasing" to account for single element + # indexes being both increasing and decreasing + if (side == "left" and self.left.is_monotonic_increasing) or ( + side == "right" and not self.left.is_monotonic_increasing + ): + sub_idx = self.right + if self.open_right: + label = _get_next_label(label) + else: + sub_idx = self.left + if self.open_left: + label = _get_prev_label(label) + + return sub_idx._searchsorted_monotonic(label, side) + + # -------------------------------------------------------------------- + # Indexing Methods + + def get_loc(self, key) -> int | slice | np.ndarray: + """ + Get integer location, slice or boolean mask for requested label. + + Parameters + ---------- + key : label + + Returns + ------- + int if unique index, slice if monotonic index, else mask + + Examples + -------- + >>> i1, i2 = pd.Interval(0, 1), pd.Interval(1, 2) + >>> index = pd.IntervalIndex([i1, i2]) + >>> index.get_loc(1) + 0 + + You can also supply a point inside an interval. + + >>> index.get_loc(1.5) + 1 + + If a label is in several intervals, you get the locations of all the + relevant intervals. + + >>> i3 = pd.Interval(0, 2) + >>> overlapping_index = pd.IntervalIndex([i1, i2, i3]) + >>> overlapping_index.get_loc(0.5) + array([ True, False, True]) + + Only exact matches will be returned if an interval is provided. + + >>> index.get_loc(pd.Interval(0, 1)) + 0 + """ + self._check_indexing_error(key) + + if isinstance(key, Interval): + if self.closed != key.closed: + raise KeyError(key) + mask = (self.left == key.left) & (self.right == key.right) + elif is_valid_na_for_dtype(key, self.dtype): + mask = self.isna() + else: + # assume scalar + op_left = le if self.closed_left else lt + op_right = le if self.closed_right else lt + try: + mask = op_left(self.left, key) & op_right(key, self.right) + except TypeError as err: + # scalar is not comparable to II subtype --> invalid label + raise KeyError(key) from err + + matches = mask.sum() + if matches == 0: + raise KeyError(key) + if matches == 1: + return mask.argmax() + + res = lib.maybe_booleans_to_slice(mask.view("u1")) + if isinstance(res, slice) and res.stop is None: + # TODO: DO this in maybe_booleans_to_slice? + res = slice(res.start, len(self), res.step) + return res + + def _get_indexer( + self, + target: Index, + method: str | None = None, + limit: int | None = None, + tolerance: Any | None = None, + ) -> npt.NDArray[np.intp]: + if isinstance(target, IntervalIndex): + # We only get here with not self.is_overlapping + # -> at most one match per interval in target + # want exact matches -> need both left/right to match, so defer to + # left/right get_indexer, compare elementwise, equality -> match + indexer = self._get_indexer_unique_sides(target) + + elif not is_object_dtype(target.dtype): + # homogeneous scalar index: use IntervalTree + # we should always have self._should_partial_index(target) here + target = self._maybe_convert_i8(target) + indexer = self._engine.get_indexer(target.values) + else: + # heterogeneous scalar index: defer elementwise to get_loc + # we should always have self._should_partial_index(target) here + return self._get_indexer_pointwise(target)[0] + + return ensure_platform_int(indexer) + + @Appender(_index_shared_docs["get_indexer_non_unique"] % _index_doc_kwargs) + def get_indexer_non_unique( + self, target: Index + ) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]]: + target = ensure_index(target) + + if not self._should_compare(target) and not self._should_partial_index(target): + # e.g. IntervalIndex with different closed or incompatible subtype + # -> no matches + return self._get_indexer_non_comparable(target, None, unique=False) + + elif isinstance(target, IntervalIndex): + if self.left.is_unique and self.right.is_unique: + # fastpath available even if we don't have self._index_as_unique + indexer = self._get_indexer_unique_sides(target) + missing = (indexer == -1).nonzero()[0] + else: + return self._get_indexer_pointwise(target) + + elif is_object_dtype(target.dtype) or not self._should_partial_index(target): + # target might contain intervals: defer elementwise to get_loc + return self._get_indexer_pointwise(target) + + else: + # Note: this case behaves differently from other Index subclasses + # because IntervalIndex does partial-int indexing + target = self._maybe_convert_i8(target) + indexer, missing = self._engine.get_indexer_non_unique(target.values) + + return ensure_platform_int(indexer), ensure_platform_int(missing) + + def _get_indexer_unique_sides(self, target: IntervalIndex) -> npt.NDArray[np.intp]: + """ + _get_indexer specialized to the case where both of our sides are unique. + """ + # Caller is responsible for checking + # `self.left.is_unique and self.right.is_unique` + + left_indexer = self.left.get_indexer(target.left) + right_indexer = self.right.get_indexer(target.right) + indexer = np.where(left_indexer == right_indexer, left_indexer, -1) + return indexer + + def _get_indexer_pointwise( + self, target: Index + ) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]]: + """ + pointwise implementation for get_indexer and get_indexer_non_unique. + """ + indexer, missing = [], [] + for i, key in enumerate(target): + try: + locs = self.get_loc(key) + if isinstance(locs, slice): + # Only needed for get_indexer_non_unique + locs = np.arange(locs.start, locs.stop, locs.step, dtype="intp") + elif lib.is_integer(locs): + locs = np.array(locs, ndmin=1) + else: + # otherwise we have ndarray[bool] + locs = np.where(locs)[0] + except KeyError: + missing.append(i) + locs = np.array([-1]) + except InvalidIndexError: + # i.e. non-scalar key e.g. a tuple. + # see test_append_different_columns_types_raises + missing.append(i) + locs = np.array([-1]) + + indexer.append(locs) + + indexer = np.concatenate(indexer) + return ensure_platform_int(indexer), ensure_platform_int(missing) + + @cache_readonly + def _index_as_unique(self) -> bool: + return not self.is_overlapping and self._engine._na_count < 2 + + _requires_unique_msg = ( + "cannot handle overlapping indices; use IntervalIndex.get_indexer_non_unique" + ) + + def _convert_slice_indexer(self, key: slice, kind: Literal["loc", "getitem"]): + if not (key.step is None or key.step == 1): + # GH#31658 if label-based, we require step == 1, + # if positional, we disallow float start/stop + msg = "label-based slicing with step!=1 is not supported for IntervalIndex" + if kind == "loc": + raise ValueError(msg) + if kind == "getitem": + if not is_valid_positional_slice(key): + # i.e. this cannot be interpreted as a positional slice + raise ValueError(msg) + + return super()._convert_slice_indexer(key, kind) + + @cache_readonly + def _should_fallback_to_positional(self) -> bool: + # integer lookups in Series.__getitem__ are unambiguously + # positional in this case + # error: Item "ExtensionDtype"/"dtype[Any]" of "Union[dtype[Any], + # ExtensionDtype]" has no attribute "subtype" + return self.dtype.subtype.kind in "mM" # type: ignore[union-attr] + + def _maybe_cast_slice_bound(self, label, side: str): + return getattr(self, side)._maybe_cast_slice_bound(label, side) + + def _is_comparable_dtype(self, dtype: DtypeObj) -> bool: + if not isinstance(dtype, IntervalDtype): + return False + common_subtype = find_common_type([self.dtype, dtype]) + return not is_object_dtype(common_subtype) + + # -------------------------------------------------------------------- + + @cache_readonly + def left(self) -> Index: + return Index(self._data.left, copy=False) + + @cache_readonly + def right(self) -> Index: + return Index(self._data.right, copy=False) + + @cache_readonly + def mid(self) -> Index: + return Index(self._data.mid, copy=False) + + @property + def length(self) -> Index: + return Index(self._data.length, copy=False) + + # -------------------------------------------------------------------- + # Set Operations + + def _intersection(self, other, sort): + """ + intersection specialized to the case with matching dtypes. + """ + # For IntervalIndex we also know other.closed == self.closed + if self.left.is_unique and self.right.is_unique: + taken = self._intersection_unique(other) + elif other.left.is_unique and other.right.is_unique and self.isna().sum() <= 1: + # Swap other/self if other is unique and self does not have + # multiple NaNs + taken = other._intersection_unique(self) + else: + # duplicates + taken = self._intersection_non_unique(other) + + if sort is None: + taken = taken.sort_values() + + return taken + + def _intersection_unique(self, other: IntervalIndex) -> IntervalIndex: + """ + Used when the IntervalIndex does not have any common endpoint, + no matter left or right. + Return the intersection with another IntervalIndex. + Parameters + ---------- + other : IntervalIndex + Returns + ------- + IntervalIndex + """ + # Note: this is much more performant than super()._intersection(other) + lindexer = self.left.get_indexer(other.left) + rindexer = self.right.get_indexer(other.right) + + match = (lindexer == rindexer) & (lindexer != -1) + indexer = lindexer.take(match.nonzero()[0]) + indexer = unique(indexer) + + return self.take(indexer) + + def _intersection_non_unique(self, other: IntervalIndex) -> IntervalIndex: + """ + Used when the IntervalIndex does have some common endpoints, + on either sides. + Return the intersection with another IntervalIndex. + + Parameters + ---------- + other : IntervalIndex + + Returns + ------- + IntervalIndex + """ + # Note: this is about 3.25x faster than super()._intersection(other) + # in IntervalIndexMethod.time_intersection_both_duplicate(1000) + mask = np.zeros(len(self), dtype=bool) + + if self.hasnans and other.hasnans: + first_nan_loc = np.arange(len(self))[self.isna()][0] + mask[first_nan_loc] = True + + other_tups = set(zip(other.left, other.right)) + for i, tup in enumerate(zip(self.left, self.right)): + if tup in other_tups: + mask[i] = True + + return self[mask] + + # -------------------------------------------------------------------- + + def _get_engine_target(self) -> np.ndarray: + # Note: we _could_ use libjoin functions by either casting to object + # dtype or constructing tuples (faster than constructing Intervals) + # but the libjoin fastpaths are no longer fast in these cases. + raise NotImplementedError( + "IntervalIndex does not use libjoin fastpaths or pass values to " + "IndexEngine objects" + ) + + def _from_join_target(self, result): + raise NotImplementedError("IntervalIndex does not use libjoin fastpaths") + + # TODO: arithmetic operations + + +def _is_valid_endpoint(endpoint) -> bool: + """ + Helper for interval_range to check if start/end are valid types. + """ + return any( + [ + is_number(endpoint), + isinstance(endpoint, Timestamp), + isinstance(endpoint, Timedelta), + endpoint is None, + ] + ) + + +def _is_type_compatible(a, b) -> bool: + """ + Helper for interval_range to check type compat of start/end/freq. + """ + is_ts_compat = lambda x: isinstance(x, (Timestamp, BaseOffset)) + is_td_compat = lambda x: isinstance(x, (Timedelta, BaseOffset)) + return ( + (is_number(a) and is_number(b)) + or (is_ts_compat(a) and is_ts_compat(b)) + or (is_td_compat(a) and is_td_compat(b)) + or com.any_none(a, b) + ) + + +def interval_range( + start=None, + end=None, + periods=None, + freq=None, + name: Hashable | None = None, + closed: IntervalClosedType = "right", +) -> IntervalIndex: + """ + Return a fixed frequency IntervalIndex. + + Parameters + ---------- + start : numeric or datetime-like, default None + Left bound for generating intervals. + end : numeric or datetime-like, default None + Right bound for generating intervals. + periods : int, default None + Number of periods to generate. + freq : numeric, str, Timedelta, datetime.timedelta, or DateOffset, default None + The length of each interval. Must be consistent with the type of start + and end, e.g. 2 for numeric, or '5H' for datetime-like. Default is 1 + for numeric and 'D' for datetime-like. + name : str, default None + Name of the resulting IntervalIndex. + closed : {'left', 'right', 'both', 'neither'}, default 'right' + Whether the intervals are closed on the left-side, right-side, both + or neither. + + Returns + ------- + IntervalIndex + + See Also + -------- + IntervalIndex : An Index of intervals that are all closed on the same side. + + Notes + ----- + Of the four parameters ``start``, ``end``, ``periods``, and ``freq``, + exactly three must be specified. If ``freq`` is omitted, the resulting + ``IntervalIndex`` will have ``periods`` linearly spaced elements between + ``start`` and ``end``, inclusively. + + To learn more about datetime-like frequency strings, please see `this link + `__. + + Examples + -------- + Numeric ``start`` and ``end`` is supported. + + >>> pd.interval_range(start=0, end=5) + IntervalIndex([(0, 1], (1, 2], (2, 3], (3, 4], (4, 5]], + dtype='interval[int64, right]') + + Additionally, datetime-like input is also supported. + + >>> pd.interval_range(start=pd.Timestamp('2017-01-01'), + ... end=pd.Timestamp('2017-01-04')) + IntervalIndex([(2017-01-01 00:00:00, 2017-01-02 00:00:00], + (2017-01-02 00:00:00, 2017-01-03 00:00:00], + (2017-01-03 00:00:00, 2017-01-04 00:00:00]], + dtype='interval[datetime64[ns], right]') + + The ``freq`` parameter specifies the frequency between the left and right. + endpoints of the individual intervals within the ``IntervalIndex``. For + numeric ``start`` and ``end``, the frequency must also be numeric. + + >>> pd.interval_range(start=0, periods=4, freq=1.5) + IntervalIndex([(0.0, 1.5], (1.5, 3.0], (3.0, 4.5], (4.5, 6.0]], + dtype='interval[float64, right]') + + Similarly, for datetime-like ``start`` and ``end``, the frequency must be + convertible to a DateOffset. + + >>> pd.interval_range(start=pd.Timestamp('2017-01-01'), + ... periods=3, freq='MS') + IntervalIndex([(2017-01-01 00:00:00, 2017-02-01 00:00:00], + (2017-02-01 00:00:00, 2017-03-01 00:00:00], + (2017-03-01 00:00:00, 2017-04-01 00:00:00]], + dtype='interval[datetime64[ns], right]') + + Specify ``start``, ``end``, and ``periods``; the frequency is generated + automatically (linearly spaced). + + >>> pd.interval_range(start=0, end=6, periods=4) + IntervalIndex([(0.0, 1.5], (1.5, 3.0], (3.0, 4.5], (4.5, 6.0]], + dtype='interval[float64, right]') + + The ``closed`` parameter specifies which endpoints of the individual + intervals within the ``IntervalIndex`` are closed. + + >>> pd.interval_range(end=5, periods=4, closed='both') + IntervalIndex([[1, 2], [2, 3], [3, 4], [4, 5]], + dtype='interval[int64, both]') + """ + start = maybe_box_datetimelike(start) + end = maybe_box_datetimelike(end) + endpoint = start if start is not None else end + + if freq is None and com.any_none(periods, start, end): + freq = 1 if is_number(endpoint) else "D" + + if com.count_not_none(start, end, periods, freq) != 3: + raise ValueError( + "Of the four parameters: start, end, periods, and " + "freq, exactly three must be specified" + ) + + if not _is_valid_endpoint(start): + raise ValueError(f"start must be numeric or datetime-like, got {start}") + if not _is_valid_endpoint(end): + raise ValueError(f"end must be numeric or datetime-like, got {end}") + + periods = validate_periods(periods) + + if freq is not None and not is_number(freq): + try: + freq = to_offset(freq) + except ValueError as err: + raise ValueError( + f"freq must be numeric or convertible to DateOffset, got {freq}" + ) from err + + # verify type compatibility + if not all( + [ + _is_type_compatible(start, end), + _is_type_compatible(start, freq), + _is_type_compatible(end, freq), + ] + ): + raise TypeError("start, end, freq need to be type compatible") + + # +1 to convert interval count to breaks count (n breaks = n-1 intervals) + if periods is not None: + periods += 1 + + breaks: np.ndarray | TimedeltaIndex | DatetimeIndex + + if is_number(endpoint): + if com.all_not_none(start, end, freq): + # 0.1 ensures we capture end + breaks = np.arange(start, end + (freq * 0.1), freq) + else: + # compute the period/start/end if unspecified (at most one) + if periods is None: + periods = int((end - start) // freq) + 1 + elif start is None: + start = end - (periods - 1) * freq + elif end is None: + end = start + (periods - 1) * freq + + breaks = np.linspace(start, end, periods) + if all(is_integer(x) for x in com.not_none(start, end, freq)): + # np.linspace always produces float output + + # error: Argument 1 to "maybe_downcast_numeric" has incompatible type + # "Union[ndarray[Any, Any], TimedeltaIndex, DatetimeIndex]"; + # expected "ndarray[Any, Any]" [ + breaks = maybe_downcast_numeric( + breaks, # type: ignore[arg-type] + np.dtype("int64"), + ) + else: + # delegate to the appropriate range function + if isinstance(endpoint, Timestamp): + breaks = date_range(start=start, end=end, periods=periods, freq=freq) + else: + breaks = timedelta_range(start=start, end=end, periods=periods, freq=freq) + + return IntervalIndex.from_breaks(breaks, name=name, closed=closed) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/indexes/multi.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/indexes/multi.py new file mode 100644 index 0000000000000000000000000000000000000000..091ddbcc099be87e92926e18bdddea60b88ed68d --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/indexes/multi.py @@ -0,0 +1,4157 @@ +from __future__ import annotations + +from collections.abc import ( + Collection, + Generator, + Hashable, + Iterable, + Sequence, +) +from functools import wraps +from sys import getsizeof +from typing import ( + TYPE_CHECKING, + Any, + Callable, + Literal, + cast, +) +import warnings + +import numpy as np + +from pandas._config import get_option + +from pandas._libs import ( + algos as libalgos, + index as libindex, + lib, +) +from pandas._libs.hashtable import duplicated +from pandas._typing import ( + AnyAll, + AnyArrayLike, + Axis, + DropKeep, + DtypeObj, + F, + IgnoreRaise, + IndexLabel, + Scalar, + Self, + Shape, + npt, +) +from pandas.compat.numpy import function as nv +from pandas.errors import ( + InvalidIndexError, + PerformanceWarning, + UnsortedIndexError, +) +from pandas.util._decorators import ( + Appender, + cache_readonly, + doc, +) +from pandas.util._exceptions import find_stack_level + +from pandas.core.dtypes.cast import coerce_indexer_dtype +from pandas.core.dtypes.common import ( + ensure_int64, + ensure_platform_int, + is_hashable, + is_integer, + is_iterator, + is_list_like, + is_object_dtype, + is_scalar, + pandas_dtype, +) +from pandas.core.dtypes.dtypes import ( + CategoricalDtype, + ExtensionDtype, +) +from pandas.core.dtypes.generic import ( + ABCDataFrame, + ABCSeries, +) +from pandas.core.dtypes.inference import is_array_like +from pandas.core.dtypes.missing import ( + array_equivalent, + isna, +) + +import pandas.core.algorithms as algos +from pandas.core.array_algos.putmask import validate_putmask +from pandas.core.arrays import ( + Categorical, + ExtensionArray, +) +from pandas.core.arrays.categorical import ( + factorize_from_iterables, + recode_for_categories, +) +import pandas.core.common as com +from pandas.core.construction import sanitize_array +import pandas.core.indexes.base as ibase +from pandas.core.indexes.base import ( + Index, + _index_shared_docs, + ensure_index, + get_unanimous_names, +) +from pandas.core.indexes.frozen import FrozenList +from pandas.core.ops.invalid import make_invalid_op +from pandas.core.sorting import ( + get_group_index, + lexsort_indexer, +) + +from pandas.io.formats.printing import ( + get_adjustment, + pprint_thing, +) + +if TYPE_CHECKING: + from pandas import ( + CategoricalIndex, + DataFrame, + Series, + ) + +_index_doc_kwargs = dict(ibase._index_doc_kwargs) +_index_doc_kwargs.update( + {"klass": "MultiIndex", "target_klass": "MultiIndex or list of tuples"} +) + + +class MultiIndexUIntEngine(libindex.BaseMultiIndexCodesEngine, libindex.UInt64Engine): + """ + This class manages a MultiIndex by mapping label combinations to positive + integers. + """ + + _base = libindex.UInt64Engine + + def _codes_to_ints(self, codes): + """ + Transform combination(s) of uint64 in one uint64 (each), in a strictly + monotonic way (i.e. respecting the lexicographic order of integer + combinations): see BaseMultiIndexCodesEngine documentation. + + Parameters + ---------- + codes : 1- or 2-dimensional array of dtype uint64 + Combinations of integers (one per row) + + Returns + ------- + scalar or 1-dimensional array, of dtype uint64 + Integer(s) representing one combination (each). + """ + # Shift the representation of each level by the pre-calculated number + # of bits: + codes <<= self.offsets + + # Now sum and OR are in fact interchangeable. This is a simple + # composition of the (disjunct) significant bits of each level (i.e. + # each column in "codes") in a single positive integer: + if codes.ndim == 1: + # Single key + return np.bitwise_or.reduce(codes) + + # Multiple keys + return np.bitwise_or.reduce(codes, axis=1) + + +class MultiIndexPyIntEngine(libindex.BaseMultiIndexCodesEngine, libindex.ObjectEngine): + """ + This class manages those (extreme) cases in which the number of possible + label combinations overflows the 64 bits integers, and uses an ObjectEngine + containing Python integers. + """ + + _base = libindex.ObjectEngine + + def _codes_to_ints(self, codes): + """ + Transform combination(s) of uint64 in one Python integer (each), in a + strictly monotonic way (i.e. respecting the lexicographic order of + integer combinations): see BaseMultiIndexCodesEngine documentation. + + Parameters + ---------- + codes : 1- or 2-dimensional array of dtype uint64 + Combinations of integers (one per row) + + Returns + ------- + int, or 1-dimensional array of dtype object + Integer(s) representing one combination (each). + """ + # Shift the representation of each level by the pre-calculated number + # of bits. Since this can overflow uint64, first make sure we are + # working with Python integers: + codes = codes.astype("object") << self.offsets + + # Now sum and OR are in fact interchangeable. This is a simple + # composition of the (disjunct) significant bits of each level (i.e. + # each column in "codes") in a single positive integer (per row): + if codes.ndim == 1: + # Single key + return np.bitwise_or.reduce(codes) + + # Multiple keys + return np.bitwise_or.reduce(codes, axis=1) + + +def names_compat(meth: F) -> F: + """ + A decorator to allow either `name` or `names` keyword but not both. + + This makes it easier to share code with base class. + """ + + @wraps(meth) + def new_meth(self_or_cls, *args, **kwargs): + if "name" in kwargs and "names" in kwargs: + raise TypeError("Can only provide one of `names` and `name`") + if "name" in kwargs: + kwargs["names"] = kwargs.pop("name") + + return meth(self_or_cls, *args, **kwargs) + + return cast(F, new_meth) + + +class MultiIndex(Index): + """ + A multi-level, or hierarchical, index object for pandas objects. + + Parameters + ---------- + levels : sequence of arrays + The unique labels for each level. + codes : sequence of arrays + Integers for each level designating which label at each location. + sortorder : optional int + Level of sortedness (must be lexicographically sorted by that + level). + names : optional sequence of objects + Names for each of the index levels. (name is accepted for compat). + copy : bool, default False + Copy the meta-data. + verify_integrity : bool, default True + Check that the levels/codes are consistent and valid. + + Attributes + ---------- + names + levels + codes + nlevels + levshape + dtypes + + Methods + ------- + from_arrays + from_tuples + from_product + from_frame + set_levels + set_codes + to_frame + to_flat_index + sortlevel + droplevel + swaplevel + reorder_levels + remove_unused_levels + get_level_values + get_indexer + get_loc + get_locs + get_loc_level + drop + + See Also + -------- + MultiIndex.from_arrays : Convert list of arrays to MultiIndex. + MultiIndex.from_product : Create a MultiIndex from the cartesian product + of iterables. + MultiIndex.from_tuples : Convert list of tuples to a MultiIndex. + MultiIndex.from_frame : Make a MultiIndex from a DataFrame. + Index : The base pandas Index type. + + Notes + ----- + See the `user guide + `__ + for more. + + Examples + -------- + A new ``MultiIndex`` is typically constructed using one of the helper + methods :meth:`MultiIndex.from_arrays`, :meth:`MultiIndex.from_product` + and :meth:`MultiIndex.from_tuples`. For example (using ``.from_arrays``): + + >>> arrays = [[1, 1, 2, 2], ['red', 'blue', 'red', 'blue']] + >>> pd.MultiIndex.from_arrays(arrays, names=('number', 'color')) + MultiIndex([(1, 'red'), + (1, 'blue'), + (2, 'red'), + (2, 'blue')], + names=['number', 'color']) + + See further examples for how to construct a MultiIndex in the doc strings + of the mentioned helper methods. + """ + + _hidden_attrs = Index._hidden_attrs | frozenset() + + # initialize to zero-length tuples to make everything work + _typ = "multiindex" + _names: list[Hashable | None] = [] + _levels = FrozenList() + _codes = FrozenList() + _comparables = ["names"] + + sortorder: int | None + + # -------------------------------------------------------------------- + # Constructors + + def __new__( + cls, + levels=None, + codes=None, + sortorder=None, + names=None, + dtype=None, + copy: bool = False, + name=None, + verify_integrity: bool = True, + ) -> Self: + # compat with Index + if name is not None: + names = name + if levels is None or codes is None: + raise TypeError("Must pass both levels and codes") + if len(levels) != len(codes): + raise ValueError("Length of levels and codes must be the same.") + if len(levels) == 0: + raise ValueError("Must pass non-zero number of levels/codes") + + result = object.__new__(cls) + result._cache = {} + + # we've already validated levels and codes, so shortcut here + result._set_levels(levels, copy=copy, validate=False) + result._set_codes(codes, copy=copy, validate=False) + + result._names = [None] * len(levels) + if names is not None: + # handles name validation + result._set_names(names) + + if sortorder is not None: + result.sortorder = int(sortorder) + else: + result.sortorder = sortorder + + if verify_integrity: + new_codes = result._verify_integrity() + result._codes = new_codes + + result._reset_identity() + result._references = None + + return result + + def _validate_codes(self, level: list, code: list): + """ + Reassign code values as -1 if their corresponding levels are NaN. + + Parameters + ---------- + code : list + Code to reassign. + level : list + Level to check for missing values (NaN, NaT, None). + + Returns + ------- + new code where code value = -1 if it corresponds + to a level with missing values (NaN, NaT, None). + """ + null_mask = isna(level) + if np.any(null_mask): + # error: Incompatible types in assignment + # (expression has type "ndarray[Any, dtype[Any]]", + # variable has type "List[Any]") + code = np.where(null_mask[code], -1, code) # type: ignore[assignment] + return code + + def _verify_integrity( + self, + codes: list | None = None, + levels: list | None = None, + levels_to_verify: list[int] | range | None = None, + ): + """ + Parameters + ---------- + codes : optional list + Codes to check for validity. Defaults to current codes. + levels : optional list + Levels to check for validity. Defaults to current levels. + levels_to_validate: optional list + Specifies the levels to verify. + + Raises + ------ + ValueError + If length of levels and codes don't match, if the codes for any + level would exceed level bounds, or there are any duplicate levels. + + Returns + ------- + new codes where code value = -1 if it corresponds to a + NaN level. + """ + # NOTE: Currently does not check, among other things, that cached + # nlevels matches nor that sortorder matches actually sortorder. + codes = codes or self.codes + levels = levels or self.levels + if levels_to_verify is None: + levels_to_verify = range(len(levels)) + + if len(levels) != len(codes): + raise ValueError( + "Length of levels and codes must match. NOTE: " + "this index is in an inconsistent state." + ) + codes_length = len(codes[0]) + for i in levels_to_verify: + level = levels[i] + level_codes = codes[i] + + if len(level_codes) != codes_length: + raise ValueError( + f"Unequal code lengths: {[len(code_) for code_ in codes]}" + ) + if len(level_codes) and level_codes.max() >= len(level): + raise ValueError( + f"On level {i}, code max ({level_codes.max()}) >= length of " + f"level ({len(level)}). NOTE: this index is in an " + "inconsistent state" + ) + if len(level_codes) and level_codes.min() < -1: + raise ValueError(f"On level {i}, code value ({level_codes.min()}) < -1") + if not level.is_unique: + raise ValueError( + f"Level values must be unique: {list(level)} on level {i}" + ) + if self.sortorder is not None: + if self.sortorder > _lexsort_depth(self.codes, self.nlevels): + raise ValueError( + "Value for sortorder must be inferior or equal to actual " + f"lexsort_depth: sortorder {self.sortorder} " + f"with lexsort_depth {_lexsort_depth(self.codes, self.nlevels)}" + ) + + result_codes = [] + for i in range(len(levels)): + if i in levels_to_verify: + result_codes.append(self._validate_codes(levels[i], codes[i])) + else: + result_codes.append(codes[i]) + + new_codes = FrozenList(result_codes) + return new_codes + + @classmethod + def from_arrays( + cls, + arrays, + sortorder: int | None = None, + names: Sequence[Hashable] | Hashable | lib.NoDefault = lib.no_default, + ) -> MultiIndex: + """ + Convert arrays to MultiIndex. + + Parameters + ---------- + arrays : list / sequence of array-likes + Each array-like gives one level's value for each data point. + len(arrays) is the number of levels. + sortorder : int or None + Level of sortedness (must be lexicographically sorted by that + level). + names : list / sequence of str, optional + Names for the levels in the index. + + Returns + ------- + MultiIndex + + See Also + -------- + MultiIndex.from_tuples : Convert list of tuples to MultiIndex. + MultiIndex.from_product : Make a MultiIndex from cartesian product + of iterables. + MultiIndex.from_frame : Make a MultiIndex from a DataFrame. + + Examples + -------- + >>> arrays = [[1, 1, 2, 2], ['red', 'blue', 'red', 'blue']] + >>> pd.MultiIndex.from_arrays(arrays, names=('number', 'color')) + MultiIndex([(1, 'red'), + (1, 'blue'), + (2, 'red'), + (2, 'blue')], + names=['number', 'color']) + """ + error_msg = "Input must be a list / sequence of array-likes." + if not is_list_like(arrays): + raise TypeError(error_msg) + if is_iterator(arrays): + arrays = list(arrays) + + # Check if elements of array are list-like + for array in arrays: + if not is_list_like(array): + raise TypeError(error_msg) + + # Check if lengths of all arrays are equal or not, + # raise ValueError, if not + for i in range(1, len(arrays)): + if len(arrays[i]) != len(arrays[i - 1]): + raise ValueError("all arrays must be same length") + + codes, levels = factorize_from_iterables(arrays) + if names is lib.no_default: + names = [getattr(arr, "name", None) for arr in arrays] + + return cls( + levels=levels, + codes=codes, + sortorder=sortorder, + names=names, + verify_integrity=False, + ) + + @classmethod + @names_compat + def from_tuples( + cls, + tuples: Iterable[tuple[Hashable, ...]], + sortorder: int | None = None, + names: Sequence[Hashable] | Hashable | None = None, + ) -> MultiIndex: + """ + Convert list of tuples to MultiIndex. + + Parameters + ---------- + tuples : list / sequence of tuple-likes + Each tuple is the index of one row/column. + sortorder : int or None + Level of sortedness (must be lexicographically sorted by that + level). + names : list / sequence of str, optional + Names for the levels in the index. + + Returns + ------- + MultiIndex + + See Also + -------- + MultiIndex.from_arrays : Convert list of arrays to MultiIndex. + MultiIndex.from_product : Make a MultiIndex from cartesian product + of iterables. + MultiIndex.from_frame : Make a MultiIndex from a DataFrame. + + Examples + -------- + >>> tuples = [(1, 'red'), (1, 'blue'), + ... (2, 'red'), (2, 'blue')] + >>> pd.MultiIndex.from_tuples(tuples, names=('number', 'color')) + MultiIndex([(1, 'red'), + (1, 'blue'), + (2, 'red'), + (2, 'blue')], + names=['number', 'color']) + """ + if not is_list_like(tuples): + raise TypeError("Input must be a list / sequence of tuple-likes.") + if is_iterator(tuples): + tuples = list(tuples) + tuples = cast(Collection[tuple[Hashable, ...]], tuples) + + # handling the empty tuple cases + if len(tuples) and all(isinstance(e, tuple) and not e for e in tuples): + codes = [np.zeros(len(tuples))] + levels = [Index(com.asarray_tuplesafe(tuples, dtype=np.dtype("object")))] + return cls( + levels=levels, + codes=codes, + sortorder=sortorder, + names=names, + verify_integrity=False, + ) + + arrays: list[Sequence[Hashable]] + if len(tuples) == 0: + if names is None: + raise TypeError("Cannot infer number of levels from empty list") + # error: Argument 1 to "len" has incompatible type "Hashable"; + # expected "Sized" + arrays = [[]] * len(names) # type: ignore[arg-type] + elif isinstance(tuples, (np.ndarray, Index)): + if isinstance(tuples, Index): + tuples = np.asarray(tuples._values) + + arrays = list(lib.tuples_to_object_array(tuples).T) + elif isinstance(tuples, list): + arrays = list(lib.to_object_array_tuples(tuples).T) + else: + arrs = zip(*tuples) + arrays = cast(list[Sequence[Hashable]], arrs) + + return cls.from_arrays(arrays, sortorder=sortorder, names=names) + + @classmethod + def from_product( + cls, + iterables: Sequence[Iterable[Hashable]], + sortorder: int | None = None, + names: Sequence[Hashable] | Hashable | lib.NoDefault = lib.no_default, + ) -> MultiIndex: + """ + Make a MultiIndex from the cartesian product of multiple iterables. + + Parameters + ---------- + iterables : list / sequence of iterables + Each iterable has unique labels for each level of the index. + sortorder : int or None + Level of sortedness (must be lexicographically sorted by that + level). + names : list / sequence of str, optional + Names for the levels in the index. + If not explicitly provided, names will be inferred from the + elements of iterables if an element has a name attribute. + + Returns + ------- + MultiIndex + + See Also + -------- + MultiIndex.from_arrays : Convert list of arrays to MultiIndex. + MultiIndex.from_tuples : Convert list of tuples to MultiIndex. + MultiIndex.from_frame : Make a MultiIndex from a DataFrame. + + Examples + -------- + >>> numbers = [0, 1, 2] + >>> colors = ['green', 'purple'] + >>> pd.MultiIndex.from_product([numbers, colors], + ... names=['number', 'color']) + MultiIndex([(0, 'green'), + (0, 'purple'), + (1, 'green'), + (1, 'purple'), + (2, 'green'), + (2, 'purple')], + names=['number', 'color']) + """ + from pandas.core.reshape.util import cartesian_product + + if not is_list_like(iterables): + raise TypeError("Input must be a list / sequence of iterables.") + if is_iterator(iterables): + iterables = list(iterables) + + codes, levels = factorize_from_iterables(iterables) + if names is lib.no_default: + names = [getattr(it, "name", None) for it in iterables] + + # codes are all ndarrays, so cartesian_product is lossless + codes = cartesian_product(codes) + return cls(levels, codes, sortorder=sortorder, names=names) + + @classmethod + def from_frame( + cls, + df: DataFrame, + sortorder: int | None = None, + names: Sequence[Hashable] | Hashable | None = None, + ) -> MultiIndex: + """ + Make a MultiIndex from a DataFrame. + + Parameters + ---------- + df : DataFrame + DataFrame to be converted to MultiIndex. + sortorder : int, optional + Level of sortedness (must be lexicographically sorted by that + level). + names : list-like, optional + If no names are provided, use the column names, or tuple of column + names if the columns is a MultiIndex. If a sequence, overwrite + names with the given sequence. + + Returns + ------- + MultiIndex + The MultiIndex representation of the given DataFrame. + + See Also + -------- + MultiIndex.from_arrays : Convert list of arrays to MultiIndex. + MultiIndex.from_tuples : Convert list of tuples to MultiIndex. + MultiIndex.from_product : Make a MultiIndex from cartesian product + of iterables. + + Examples + -------- + >>> df = pd.DataFrame([['HI', 'Temp'], ['HI', 'Precip'], + ... ['NJ', 'Temp'], ['NJ', 'Precip']], + ... columns=['a', 'b']) + >>> df + a b + 0 HI Temp + 1 HI Precip + 2 NJ Temp + 3 NJ Precip + + >>> pd.MultiIndex.from_frame(df) + MultiIndex([('HI', 'Temp'), + ('HI', 'Precip'), + ('NJ', 'Temp'), + ('NJ', 'Precip')], + names=['a', 'b']) + + Using explicit names, instead of the column names + + >>> pd.MultiIndex.from_frame(df, names=['state', 'observation']) + MultiIndex([('HI', 'Temp'), + ('HI', 'Precip'), + ('NJ', 'Temp'), + ('NJ', 'Precip')], + names=['state', 'observation']) + """ + if not isinstance(df, ABCDataFrame): + raise TypeError("Input must be a DataFrame") + + column_names, columns = zip(*df.items()) + names = column_names if names is None else names + return cls.from_arrays(columns, sortorder=sortorder, names=names) + + # -------------------------------------------------------------------- + + @cache_readonly + def _values(self) -> np.ndarray: + # We override here, since our parent uses _data, which we don't use. + values = [] + + for i in range(self.nlevels): + index = self.levels[i] + codes = self.codes[i] + + vals = index + if isinstance(vals.dtype, CategoricalDtype): + vals = cast("CategoricalIndex", vals) + vals = vals._data._internal_get_values() + + if isinstance(vals.dtype, ExtensionDtype) or lib.is_np_dtype( + vals.dtype, "mM" + ): + vals = vals.astype(object) + + vals = np.asarray(vals) + vals = algos.take_nd(vals, codes, fill_value=index._na_value) + values.append(vals) + + arr = lib.fast_zip(values) + return arr + + @property + def values(self) -> np.ndarray: + return self._values + + @property + def array(self): + """ + Raises a ValueError for `MultiIndex` because there's no single + array backing a MultiIndex. + + Raises + ------ + ValueError + """ + raise ValueError( + "MultiIndex has no single backing array. Use " + "'MultiIndex.to_numpy()' to get a NumPy array of tuples." + ) + + @cache_readonly + def dtypes(self) -> Series: + """ + Return the dtypes as a Series for the underlying MultiIndex. + + Examples + -------- + >>> idx = pd.MultiIndex.from_product([(0, 1, 2), ('green', 'purple')], + ... names=['number', 'color']) + >>> idx + MultiIndex([(0, 'green'), + (0, 'purple'), + (1, 'green'), + (1, 'purple'), + (2, 'green'), + (2, 'purple')], + names=['number', 'color']) + >>> idx.dtypes + number int64 + color object + dtype: object + """ + from pandas import Series + + names = com.fill_missing_names([level.name for level in self.levels]) + return Series([level.dtype for level in self.levels], index=Index(names)) + + def __len__(self) -> int: + return len(self.codes[0]) + + @property + def size(self) -> int: + """ + Return the number of elements in the underlying data. + """ + # override Index.size to avoid materializing _values + return len(self) + + # -------------------------------------------------------------------- + # Levels Methods + + @cache_readonly + def levels(self) -> FrozenList: + """ + Levels of the MultiIndex. + + Levels refer to the different hierarchical levels or layers in a MultiIndex. + In a MultiIndex, each level represents a distinct dimension or category of + the index. + + To access the levels, you can use the levels attribute of the MultiIndex, + which returns a tuple of Index objects. Each Index object represents a + level in the MultiIndex and contains the unique values found in that + specific level. + + If a MultiIndex is created with levels A, B, C, and the DataFrame using + it filters out all rows of the level C, MultiIndex.levels will still + return A, B, C. + + Examples + -------- + >>> index = pd.MultiIndex.from_product([['mammal'], + ... ('goat', 'human', 'cat', 'dog')], + ... names=['Category', 'Animals']) + >>> leg_num = pd.DataFrame(data=(4, 2, 4, 4), index=index, columns=['Legs']) + >>> leg_num + Legs + Category Animals + mammal goat 4 + human 2 + cat 4 + dog 4 + + >>> leg_num.index.levels + FrozenList([['mammal'], ['cat', 'dog', 'goat', 'human']]) + + MultiIndex levels will not change even if the DataFrame using the MultiIndex + does not contain all them anymore. + See how "human" is not in the DataFrame, but it is still in levels: + + >>> large_leg_num = leg_num[leg_num.Legs > 2] + >>> large_leg_num + Legs + Category Animals + mammal goat 4 + cat 4 + dog 4 + + >>> large_leg_num.index.levels + FrozenList([['mammal'], ['cat', 'dog', 'goat', 'human']]) + """ + # Use cache_readonly to ensure that self.get_locs doesn't repeatedly + # create new IndexEngine + # https://github.com/pandas-dev/pandas/issues/31648 + result = [x._rename(name=name) for x, name in zip(self._levels, self._names)] + for level in result: + # disallow midx.levels[0].name = "foo" + level._no_setting_name = True + return FrozenList(result) + + def _set_levels( + self, + levels, + *, + level=None, + copy: bool = False, + validate: bool = True, + verify_integrity: bool = False, + ) -> None: + # This is NOT part of the levels property because it should be + # externally not allowed to set levels. User beware if you change + # _levels directly + if validate: + if len(levels) == 0: + raise ValueError("Must set non-zero number of levels.") + if level is None and len(levels) != self.nlevels: + raise ValueError("Length of levels must match number of levels.") + if level is not None and len(levels) != len(level): + raise ValueError("Length of levels must match length of level.") + + if level is None: + new_levels = FrozenList( + ensure_index(lev, copy=copy)._view() for lev in levels + ) + level_numbers = list(range(len(new_levels))) + else: + level_numbers = [self._get_level_number(lev) for lev in level] + new_levels_list = list(self._levels) + for lev_num, lev in zip(level_numbers, levels): + new_levels_list[lev_num] = ensure_index(lev, copy=copy)._view() + new_levels = FrozenList(new_levels_list) + + if verify_integrity: + new_codes = self._verify_integrity( + levels=new_levels, levels_to_verify=level_numbers + ) + self._codes = new_codes + + names = self.names + self._levels = new_levels + if any(names): + self._set_names(names) + + self._reset_cache() + + def set_levels( + self, levels, *, level=None, verify_integrity: bool = True + ) -> MultiIndex: + """ + Set new levels on MultiIndex. Defaults to returning new index. + + Parameters + ---------- + levels : sequence or list of sequence + New level(s) to apply. + level : int, level name, or sequence of int/level names (default None) + Level(s) to set (None for all levels). + verify_integrity : bool, default True + If True, checks that levels and codes are compatible. + + Returns + ------- + MultiIndex + + Examples + -------- + >>> idx = pd.MultiIndex.from_tuples( + ... [ + ... (1, "one"), + ... (1, "two"), + ... (2, "one"), + ... (2, "two"), + ... (3, "one"), + ... (3, "two") + ... ], + ... names=["foo", "bar"] + ... ) + >>> idx + MultiIndex([(1, 'one'), + (1, 'two'), + (2, 'one'), + (2, 'two'), + (3, 'one'), + (3, 'two')], + names=['foo', 'bar']) + + >>> idx.set_levels([['a', 'b', 'c'], [1, 2]]) + MultiIndex([('a', 1), + ('a', 2), + ('b', 1), + ('b', 2), + ('c', 1), + ('c', 2)], + names=['foo', 'bar']) + >>> idx.set_levels(['a', 'b', 'c'], level=0) + MultiIndex([('a', 'one'), + ('a', 'two'), + ('b', 'one'), + ('b', 'two'), + ('c', 'one'), + ('c', 'two')], + names=['foo', 'bar']) + >>> idx.set_levels(['a', 'b'], level='bar') + MultiIndex([(1, 'a'), + (1, 'b'), + (2, 'a'), + (2, 'b'), + (3, 'a'), + (3, 'b')], + names=['foo', 'bar']) + + If any of the levels passed to ``set_levels()`` exceeds the + existing length, all of the values from that argument will + be stored in the MultiIndex levels, though the values will + be truncated in the MultiIndex output. + + >>> idx.set_levels([['a', 'b', 'c'], [1, 2, 3, 4]], level=[0, 1]) + MultiIndex([('a', 1), + ('a', 2), + ('b', 1), + ('b', 2), + ('c', 1), + ('c', 2)], + names=['foo', 'bar']) + >>> idx.set_levels([['a', 'b', 'c'], [1, 2, 3, 4]], level=[0, 1]).levels + FrozenList([['a', 'b', 'c'], [1, 2, 3, 4]]) + """ + + if isinstance(levels, Index): + pass + elif is_array_like(levels): + levels = Index(levels) + elif is_list_like(levels): + levels = list(levels) + + level, levels = _require_listlike(level, levels, "Levels") + idx = self._view() + idx._reset_identity() + idx._set_levels( + levels, level=level, validate=True, verify_integrity=verify_integrity + ) + return idx + + @property + def nlevels(self) -> int: + """ + Integer number of levels in this MultiIndex. + + Examples + -------- + >>> mi = pd.MultiIndex.from_arrays([['a'], ['b'], ['c']]) + >>> mi + MultiIndex([('a', 'b', 'c')], + ) + >>> mi.nlevels + 3 + """ + return len(self._levels) + + @property + def levshape(self) -> Shape: + """ + A tuple with the length of each level. + + Examples + -------- + >>> mi = pd.MultiIndex.from_arrays([['a'], ['b'], ['c']]) + >>> mi + MultiIndex([('a', 'b', 'c')], + ) + >>> mi.levshape + (1, 1, 1) + """ + return tuple(len(x) for x in self.levels) + + # -------------------------------------------------------------------- + # Codes Methods + + @property + def codes(self) -> FrozenList: + return self._codes + + def _set_codes( + self, + codes, + *, + level=None, + copy: bool = False, + validate: bool = True, + verify_integrity: bool = False, + ) -> None: + if validate: + if level is None and len(codes) != self.nlevels: + raise ValueError("Length of codes must match number of levels") + if level is not None and len(codes) != len(level): + raise ValueError("Length of codes must match length of levels.") + + level_numbers: list[int] | range + if level is None: + new_codes = FrozenList( + _coerce_indexer_frozen(level_codes, lev, copy=copy).view() + for lev, level_codes in zip(self._levels, codes) + ) + level_numbers = range(len(new_codes)) + else: + level_numbers = [self._get_level_number(lev) for lev in level] + new_codes_list = list(self._codes) + for lev_num, level_codes in zip(level_numbers, codes): + lev = self.levels[lev_num] + new_codes_list[lev_num] = _coerce_indexer_frozen( + level_codes, lev, copy=copy + ) + new_codes = FrozenList(new_codes_list) + + if verify_integrity: + new_codes = self._verify_integrity( + codes=new_codes, levels_to_verify=level_numbers + ) + + self._codes = new_codes + + self._reset_cache() + + def set_codes( + self, codes, *, level=None, verify_integrity: bool = True + ) -> MultiIndex: + """ + Set new codes on MultiIndex. Defaults to returning new index. + + Parameters + ---------- + codes : sequence or list of sequence + New codes to apply. + level : int, level name, or sequence of int/level names (default None) + Level(s) to set (None for all levels). + verify_integrity : bool, default True + If True, checks that levels and codes are compatible. + + Returns + ------- + new index (of same type and class...etc) or None + The same type as the caller or None if ``inplace=True``. + + Examples + -------- + >>> idx = pd.MultiIndex.from_tuples( + ... [(1, "one"), (1, "two"), (2, "one"), (2, "two")], names=["foo", "bar"] + ... ) + >>> idx + MultiIndex([(1, 'one'), + (1, 'two'), + (2, 'one'), + (2, 'two')], + names=['foo', 'bar']) + + >>> idx.set_codes([[1, 0, 1, 0], [0, 0, 1, 1]]) + MultiIndex([(2, 'one'), + (1, 'one'), + (2, 'two'), + (1, 'two')], + names=['foo', 'bar']) + >>> idx.set_codes([1, 0, 1, 0], level=0) + MultiIndex([(2, 'one'), + (1, 'two'), + (2, 'one'), + (1, 'two')], + names=['foo', 'bar']) + >>> idx.set_codes([0, 0, 1, 1], level='bar') + MultiIndex([(1, 'one'), + (1, 'one'), + (2, 'two'), + (2, 'two')], + names=['foo', 'bar']) + >>> idx.set_codes([[1, 0, 1, 0], [0, 0, 1, 1]], level=[0, 1]) + MultiIndex([(2, 'one'), + (1, 'one'), + (2, 'two'), + (1, 'two')], + names=['foo', 'bar']) + """ + + level, codes = _require_listlike(level, codes, "Codes") + idx = self._view() + idx._reset_identity() + idx._set_codes(codes, level=level, verify_integrity=verify_integrity) + return idx + + # -------------------------------------------------------------------- + # Index Internals + + @cache_readonly + def _engine(self): + # Calculate the number of bits needed to represent labels in each + # level, as log2 of their sizes: + # NaN values are shifted to 1 and missing values in other while + # calculating the indexer are shifted to 0 + sizes = np.ceil( + np.log2( + [len(level) + libindex.multiindex_nulls_shift for level in self.levels] + ) + ) + + # Sum bit counts, starting from the _right_.... + lev_bits = np.cumsum(sizes[::-1])[::-1] + + # ... in order to obtain offsets such that sorting the combination of + # shifted codes (one for each level, resulting in a unique integer) is + # equivalent to sorting lexicographically the codes themselves. Notice + # that each level needs to be shifted by the number of bits needed to + # represent the _previous_ ones: + offsets = np.concatenate([lev_bits[1:], [0]]).astype("uint64") + + # Check the total number of bits needed for our representation: + if lev_bits[0] > 64: + # The levels would overflow a 64 bit uint - use Python integers: + return MultiIndexPyIntEngine(self.levels, self.codes, offsets) + return MultiIndexUIntEngine(self.levels, self.codes, offsets) + + # Return type "Callable[..., MultiIndex]" of "_constructor" incompatible with return + # type "Type[MultiIndex]" in supertype "Index" + @property + def _constructor(self) -> Callable[..., MultiIndex]: # type: ignore[override] + return type(self).from_tuples + + @doc(Index._shallow_copy) + def _shallow_copy(self, values: np.ndarray, name=lib.no_default) -> MultiIndex: + names = name if name is not lib.no_default else self.names + + return type(self).from_tuples(values, sortorder=None, names=names) + + def _view(self) -> MultiIndex: + result = type(self)( + levels=self.levels, + codes=self.codes, + sortorder=self.sortorder, + names=self.names, + verify_integrity=False, + ) + result._cache = self._cache.copy() + result._cache.pop("levels", None) # GH32669 + return result + + # -------------------------------------------------------------------- + + # error: Signature of "copy" incompatible with supertype "Index" + def copy( # type: ignore[override] + self, + names=None, + deep: bool = False, + name=None, + ) -> Self: + """ + Make a copy of this object. + + Names, dtype, levels and codes can be passed and will be set on new copy. + + Parameters + ---------- + names : sequence, optional + deep : bool, default False + name : Label + Kept for compatibility with 1-dimensional Index. Should not be used. + + Returns + ------- + MultiIndex + + Notes + ----- + In most cases, there should be no functional difference from using + ``deep``, but if ``deep`` is passed it will attempt to deepcopy. + This could be potentially expensive on large MultiIndex objects. + + Examples + -------- + >>> mi = pd.MultiIndex.from_arrays([['a'], ['b'], ['c']]) + >>> mi + MultiIndex([('a', 'b', 'c')], + ) + >>> mi.copy() + MultiIndex([('a', 'b', 'c')], + ) + """ + names = self._validate_names(name=name, names=names, deep=deep) + keep_id = not deep + levels, codes = None, None + + if deep: + from copy import deepcopy + + levels = deepcopy(self.levels) + codes = deepcopy(self.codes) + + levels = levels if levels is not None else self.levels + codes = codes if codes is not None else self.codes + + new_index = type(self)( + levels=levels, + codes=codes, + sortorder=self.sortorder, + names=names, + verify_integrity=False, + ) + new_index._cache = self._cache.copy() + new_index._cache.pop("levels", None) # GH32669 + if keep_id: + new_index._id = self._id + return new_index + + def __array__(self, dtype=None, copy=None) -> np.ndarray: + """the array interface, return my values""" + return self.values + + def view(self, cls=None) -> Self: + """this is defined as a copy with the same identity""" + result = self.copy() + result._id = self._id + return result + + @doc(Index.__contains__) + def __contains__(self, key: Any) -> bool: + hash(key) + try: + self.get_loc(key) + return True + except (LookupError, TypeError, ValueError): + return False + + @cache_readonly + def dtype(self) -> np.dtype: + return np.dtype("O") + + def _is_memory_usage_qualified(self) -> bool: + """return a boolean if we need a qualified .info display""" + + def f(level) -> bool: + return "mixed" in level or "string" in level or "unicode" in level + + return any(f(level) for level in self._inferred_type_levels) + + # Cannot determine type of "memory_usage" + @doc(Index.memory_usage) # type: ignore[has-type] + def memory_usage(self, deep: bool = False) -> int: + # we are overwriting our base class to avoid + # computing .values here which could materialize + # a tuple representation unnecessarily + return self._nbytes(deep) + + @cache_readonly + def nbytes(self) -> int: + """return the number of bytes in the underlying data""" + return self._nbytes(False) + + def _nbytes(self, deep: bool = False) -> int: + """ + return the number of bytes in the underlying data + deeply introspect the level data if deep=True + + include the engine hashtable + + *this is in internal routine* + + """ + # for implementations with no useful getsizeof (PyPy) + objsize = 24 + + level_nbytes = sum(i.memory_usage(deep=deep) for i in self.levels) + label_nbytes = sum(i.nbytes for i in self.codes) + names_nbytes = sum(getsizeof(i, objsize) for i in self.names) + result = level_nbytes + label_nbytes + names_nbytes + + # include our engine hashtable + result += self._engine.sizeof(deep=deep) + return result + + # -------------------------------------------------------------------- + # Rendering Methods + + def _formatter_func(self, tup): + """ + Formats each item in tup according to its level's formatter function. + """ + formatter_funcs = [level._formatter_func for level in self.levels] + return tuple(func(val) for func, val in zip(formatter_funcs, tup)) + + def _get_values_for_csv( + self, *, na_rep: str = "nan", **kwargs + ) -> npt.NDArray[np.object_]: + new_levels = [] + new_codes = [] + + # go through the levels and format them + for level, level_codes in zip(self.levels, self.codes): + level_strs = level._get_values_for_csv(na_rep=na_rep, **kwargs) + # add nan values, if there are any + mask = level_codes == -1 + if mask.any(): + nan_index = len(level_strs) + # numpy 1.21 deprecated implicit string casting + level_strs = level_strs.astype(str) + level_strs = np.append(level_strs, na_rep) + assert not level_codes.flags.writeable # i.e. copy is needed + level_codes = level_codes.copy() # make writeable + level_codes[mask] = nan_index + new_levels.append(level_strs) + new_codes.append(level_codes) + + if len(new_levels) == 1: + # a single-level multi-index + return Index(new_levels[0].take(new_codes[0]))._get_values_for_csv() + else: + # reconstruct the multi-index + mi = MultiIndex( + levels=new_levels, + codes=new_codes, + names=self.names, + sortorder=self.sortorder, + verify_integrity=False, + ) + return mi._values + + def format( + self, + name: bool | None = None, + formatter: Callable | None = None, + na_rep: str | None = None, + names: bool = False, + space: int = 2, + sparsify=None, + adjoin: bool = True, + ) -> list: + warnings.warn( + # GH#55413 + f"{type(self).__name__}.format is deprecated and will be removed " + "in a future version. Convert using index.astype(str) or " + "index.map(formatter) instead.", + FutureWarning, + stacklevel=find_stack_level(), + ) + + if name is not None: + names = name + + if len(self) == 0: + return [] + + stringified_levels = [] + for lev, level_codes in zip(self.levels, self.codes): + na = na_rep if na_rep is not None else _get_na_rep(lev.dtype) + + if len(lev) > 0: + formatted = lev.take(level_codes).format(formatter=formatter) + + # we have some NA + mask = level_codes == -1 + if mask.any(): + formatted = np.array(formatted, dtype=object) + formatted[mask] = na + formatted = formatted.tolist() + + else: + # weird all NA case + formatted = [ + pprint_thing(na if isna(x) else x, escape_chars=("\t", "\r", "\n")) + for x in algos.take_nd(lev._values, level_codes) + ] + stringified_levels.append(formatted) + + result_levels = [] + for lev, lev_name in zip(stringified_levels, self.names): + level = [] + + if names: + level.append( + pprint_thing(lev_name, escape_chars=("\t", "\r", "\n")) + if lev_name is not None + else "" + ) + + level.extend(np.array(lev, dtype=object)) + result_levels.append(level) + + if sparsify is None: + sparsify = get_option("display.multi_sparse") + + if sparsify: + sentinel: Literal[""] | bool | lib.NoDefault = "" + # GH3547 use value of sparsify as sentinel if it's "Falsey" + assert isinstance(sparsify, bool) or sparsify is lib.no_default + if sparsify in [False, lib.no_default]: + sentinel = sparsify + # little bit of a kludge job for #1217 + result_levels = sparsify_labels( + result_levels, start=int(names), sentinel=sentinel + ) + + if adjoin: + adj = get_adjustment() + return adj.adjoin(space, *result_levels).split("\n") + else: + return result_levels + + def _format_multi( + self, + *, + include_names: bool, + sparsify: bool | None | lib.NoDefault, + formatter: Callable | None = None, + ) -> list: + if len(self) == 0: + return [] + + stringified_levels = [] + for lev, level_codes in zip(self.levels, self.codes): + na = _get_na_rep(lev.dtype) + + if len(lev) > 0: + taken = formatted = lev.take(level_codes) + formatted = taken._format_flat(include_name=False, formatter=formatter) + + # we have some NA + mask = level_codes == -1 + if mask.any(): + formatted = np.array(formatted, dtype=object) + formatted[mask] = na + formatted = formatted.tolist() + + else: + # weird all NA case + formatted = [ + pprint_thing(na if isna(x) else x, escape_chars=("\t", "\r", "\n")) + for x in algos.take_nd(lev._values, level_codes) + ] + stringified_levels.append(formatted) + + result_levels = [] + for lev, lev_name in zip(stringified_levels, self.names): + level = [] + + if include_names: + level.append( + pprint_thing(lev_name, escape_chars=("\t", "\r", "\n")) + if lev_name is not None + else "" + ) + + level.extend(np.array(lev, dtype=object)) + result_levels.append(level) + + if sparsify is None: + sparsify = get_option("display.multi_sparse") + + if sparsify: + sentinel: Literal[""] | bool | lib.NoDefault = "" + # GH3547 use value of sparsify as sentinel if it's "Falsey" + assert isinstance(sparsify, bool) or sparsify is lib.no_default + if sparsify is lib.no_default: + sentinel = sparsify + # little bit of a kludge job for #1217 + result_levels = sparsify_labels( + result_levels, start=int(include_names), sentinel=sentinel + ) + + return result_levels + + # -------------------------------------------------------------------- + # Names Methods + + def _get_names(self) -> FrozenList: + return FrozenList(self._names) + + def _set_names(self, names, *, level=None, validate: bool = True): + """ + Set new names on index. Each name has to be a hashable type. + + Parameters + ---------- + values : str or sequence + name(s) to set + level : int, level name, or sequence of int/level names (default None) + If the index is a MultiIndex (hierarchical), level(s) to set (None + for all levels). Otherwise level must be None + validate : bool, default True + validate that the names match level lengths + + Raises + ------ + TypeError if each name is not hashable. + + Notes + ----- + sets names on levels. WARNING: mutates! + + Note that you generally want to set this *after* changing levels, so + that it only acts on copies + """ + # GH 15110 + # Don't allow a single string for names in a MultiIndex + if names is not None and not is_list_like(names): + raise ValueError("Names should be list-like for a MultiIndex") + names = list(names) + + if validate: + if level is not None and len(names) != len(level): + raise ValueError("Length of names must match length of level.") + if level is None and len(names) != self.nlevels: + raise ValueError( + "Length of names must match number of levels in MultiIndex." + ) + + if level is None: + level = range(self.nlevels) + else: + level = [self._get_level_number(lev) for lev in level] + + # set the name + for lev, name in zip(level, names): + if name is not None: + # GH 20527 + # All items in 'names' need to be hashable: + if not is_hashable(name): + raise TypeError( + f"{type(self).__name__}.name must be a hashable type" + ) + self._names[lev] = name + + # If .levels has been accessed, the names in our cache will be stale. + self._reset_cache() + + names = property( + fset=_set_names, + fget=_get_names, + doc=""" + Names of levels in MultiIndex. + + Examples + -------- + >>> mi = pd.MultiIndex.from_arrays( + ... [[1, 2], [3, 4], [5, 6]], names=['x', 'y', 'z']) + >>> mi + MultiIndex([(1, 3, 5), + (2, 4, 6)], + names=['x', 'y', 'z']) + >>> mi.names + FrozenList(['x', 'y', 'z']) + """, + ) + + # -------------------------------------------------------------------- + + @cache_readonly + def inferred_type(self) -> str: + return "mixed" + + def _get_level_number(self, level) -> int: + count = self.names.count(level) + if (count > 1) and not is_integer(level): + raise ValueError( + f"The name {level} occurs multiple times, use a level number" + ) + try: + level = self.names.index(level) + except ValueError as err: + if not is_integer(level): + raise KeyError(f"Level {level} not found") from err + if level < 0: + level += self.nlevels + if level < 0: + orig_level = level - self.nlevels + raise IndexError( + f"Too many levels: Index has only {self.nlevels} levels, " + f"{orig_level} is not a valid level number" + ) from err + # Note: levels are zero-based + elif level >= self.nlevels: + raise IndexError( + f"Too many levels: Index has only {self.nlevels} levels, " + f"not {level + 1}" + ) from err + return level + + @cache_readonly + def is_monotonic_increasing(self) -> bool: + """ + Return a boolean if the values are equal or increasing. + """ + if any(-1 in code for code in self.codes): + return False + + if all(level.is_monotonic_increasing for level in self.levels): + # If each level is sorted, we can operate on the codes directly. GH27495 + return libalgos.is_lexsorted( + [x.astype("int64", copy=False) for x in self.codes] + ) + + # reversed() because lexsort() wants the most significant key last. + values = [ + self._get_level_values(i)._values for i in reversed(range(len(self.levels))) + ] + try: + # error: Argument 1 to "lexsort" has incompatible type + # "List[Union[ExtensionArray, ndarray[Any, Any]]]"; + # expected "Union[_SupportsArray[dtype[Any]], + # _NestedSequence[_SupportsArray[dtype[Any]]], bool, + # int, float, complex, str, bytes, _NestedSequence[Union + # [bool, int, float, complex, str, bytes]]]" + sort_order = np.lexsort(values) # type: ignore[arg-type] + return Index(sort_order).is_monotonic_increasing + except TypeError: + # we have mixed types and np.lexsort is not happy + return Index(self._values).is_monotonic_increasing + + @cache_readonly + def is_monotonic_decreasing(self) -> bool: + """ + Return a boolean if the values are equal or decreasing. + """ + # monotonic decreasing if and only if reverse is monotonic increasing + return self[::-1].is_monotonic_increasing + + @cache_readonly + def _inferred_type_levels(self) -> list[str]: + """return a list of the inferred types, one for each level""" + return [i.inferred_type for i in self.levels] + + @doc(Index.duplicated) + def duplicated(self, keep: DropKeep = "first") -> npt.NDArray[np.bool_]: + shape = tuple(len(lev) for lev in self.levels) + ids = get_group_index(self.codes, shape, sort=False, xnull=False) + + return duplicated(ids, keep) + + # error: Cannot override final attribute "_duplicated" + # (previously declared in base class "IndexOpsMixin") + _duplicated = duplicated # type: ignore[misc] + + def fillna(self, value=None, downcast=None): + """ + fillna is not implemented for MultiIndex + """ + raise NotImplementedError("isna is not defined for MultiIndex") + + @doc(Index.dropna) + def dropna(self, how: AnyAll = "any") -> MultiIndex: + nans = [level_codes == -1 for level_codes in self.codes] + if how == "any": + indexer = np.any(nans, axis=0) + elif how == "all": + indexer = np.all(nans, axis=0) + else: + raise ValueError(f"invalid how option: {how}") + + new_codes = [level_codes[~indexer] for level_codes in self.codes] + return self.set_codes(codes=new_codes) + + def _get_level_values(self, level: int, unique: bool = False) -> Index: + """ + Return vector of label values for requested level, + equal to the length of the index + + **this is an internal method** + + Parameters + ---------- + level : int + unique : bool, default False + if True, drop duplicated values + + Returns + ------- + Index + """ + lev = self.levels[level] + level_codes = self.codes[level] + name = self._names[level] + if unique: + level_codes = algos.unique(level_codes) + filled = algos.take_nd(lev._values, level_codes, fill_value=lev._na_value) + return lev._shallow_copy(filled, name=name) + + # error: Signature of "get_level_values" incompatible with supertype "Index" + def get_level_values(self, level) -> Index: # type: ignore[override] + """ + Return vector of label values for requested level. + + Length of returned vector is equal to the length of the index. + + Parameters + ---------- + level : int or str + ``level`` is either the integer position of the level in the + MultiIndex, or the name of the level. + + Returns + ------- + Index + Values is a level of this MultiIndex converted to + a single :class:`Index` (or subclass thereof). + + Notes + ----- + If the level contains missing values, the result may be casted to + ``float`` with missing values specified as ``NaN``. This is because + the level is converted to a regular ``Index``. + + Examples + -------- + Create a MultiIndex: + + >>> mi = pd.MultiIndex.from_arrays((list('abc'), list('def'))) + >>> mi.names = ['level_1', 'level_2'] + + Get level values by supplying level as either integer or name: + + >>> mi.get_level_values(0) + Index(['a', 'b', 'c'], dtype='object', name='level_1') + >>> mi.get_level_values('level_2') + Index(['d', 'e', 'f'], dtype='object', name='level_2') + + If a level contains missing values, the return type of the level + may be cast to ``float``. + + >>> pd.MultiIndex.from_arrays([[1, None, 2], [3, 4, 5]]).dtypes + level_0 int64 + level_1 int64 + dtype: object + >>> pd.MultiIndex.from_arrays([[1, None, 2], [3, 4, 5]]).get_level_values(0) + Index([1.0, nan, 2.0], dtype='float64') + """ + level = self._get_level_number(level) + values = self._get_level_values(level) + return values + + @doc(Index.unique) + def unique(self, level=None): + if level is None: + return self.drop_duplicates() + else: + level = self._get_level_number(level) + return self._get_level_values(level=level, unique=True) + + def to_frame( + self, + index: bool = True, + name=lib.no_default, + allow_duplicates: bool = False, + ) -> DataFrame: + """ + Create a DataFrame with the levels of the MultiIndex as columns. + + Column ordering is determined by the DataFrame constructor with data as + a dict. + + Parameters + ---------- + index : bool, default True + Set the index of the returned DataFrame as the original MultiIndex. + + name : list / sequence of str, optional + The passed names should substitute index level names. + + allow_duplicates : bool, optional default False + Allow duplicate column labels to be created. + + .. versionadded:: 1.5.0 + + Returns + ------- + DataFrame + + See Also + -------- + DataFrame : Two-dimensional, size-mutable, potentially heterogeneous + tabular data. + + Examples + -------- + >>> mi = pd.MultiIndex.from_arrays([['a', 'b'], ['c', 'd']]) + >>> mi + MultiIndex([('a', 'c'), + ('b', 'd')], + ) + + >>> df = mi.to_frame() + >>> df + 0 1 + a c a c + b d b d + + >>> df = mi.to_frame(index=False) + >>> df + 0 1 + 0 a c + 1 b d + + >>> df = mi.to_frame(name=['x', 'y']) + >>> df + x y + a c a c + b d b d + """ + from pandas import DataFrame + + if name is not lib.no_default: + if not is_list_like(name): + raise TypeError("'name' must be a list / sequence of column names.") + + if len(name) != len(self.levels): + raise ValueError( + "'name' should have same length as number of levels on index." + ) + idx_names = name + else: + idx_names = self._get_level_names() + + if not allow_duplicates and len(set(idx_names)) != len(idx_names): + raise ValueError( + "Cannot create duplicate column labels if allow_duplicates is False" + ) + + # Guarantee resulting column order - PY36+ dict maintains insertion order + result = DataFrame( + {level: self._get_level_values(level) for level in range(len(self.levels))}, + copy=False, + ) + result.columns = idx_names + + if index: + result.index = self + return result + + # error: Return type "Index" of "to_flat_index" incompatible with return type + # "MultiIndex" in supertype "Index" + def to_flat_index(self) -> Index: # type: ignore[override] + """ + Convert a MultiIndex to an Index of Tuples containing the level values. + + Returns + ------- + pd.Index + Index with the MultiIndex data represented in Tuples. + + See Also + -------- + MultiIndex.from_tuples : Convert flat index back to MultiIndex. + + Notes + ----- + This method will simply return the caller if called by anything other + than a MultiIndex. + + Examples + -------- + >>> index = pd.MultiIndex.from_product( + ... [['foo', 'bar'], ['baz', 'qux']], + ... names=['a', 'b']) + >>> index.to_flat_index() + Index([('foo', 'baz'), ('foo', 'qux'), + ('bar', 'baz'), ('bar', 'qux')], + dtype='object') + """ + return Index(self._values, tupleize_cols=False) + + def _is_lexsorted(self) -> bool: + """ + Return True if the codes are lexicographically sorted. + + Returns + ------- + bool + + Examples + -------- + In the below examples, the first level of the MultiIndex is sorted because + a>> pd.MultiIndex.from_arrays([['a', 'b', 'c'], + ... ['d', 'e', 'f']])._is_lexsorted() + True + >>> pd.MultiIndex.from_arrays([['a', 'b', 'c'], + ... ['d', 'f', 'e']])._is_lexsorted() + True + + In case there is a tie, the lexicographical sorting looks + at the next level of the MultiIndex. + + >>> pd.MultiIndex.from_arrays([[0, 1, 1], ['a', 'b', 'c']])._is_lexsorted() + True + >>> pd.MultiIndex.from_arrays([[0, 1, 1], ['a', 'c', 'b']])._is_lexsorted() + False + >>> pd.MultiIndex.from_arrays([['a', 'a', 'b', 'b'], + ... ['aa', 'bb', 'aa', 'bb']])._is_lexsorted() + True + >>> pd.MultiIndex.from_arrays([['a', 'a', 'b', 'b'], + ... ['bb', 'aa', 'aa', 'bb']])._is_lexsorted() + False + """ + return self._lexsort_depth == self.nlevels + + @cache_readonly + def _lexsort_depth(self) -> int: + """ + Compute and return the lexsort_depth, the number of levels of the + MultiIndex that are sorted lexically + + Returns + ------- + int + """ + if self.sortorder is not None: + return self.sortorder + return _lexsort_depth(self.codes, self.nlevels) + + def _sort_levels_monotonic(self, raise_if_incomparable: bool = False) -> MultiIndex: + """ + This is an *internal* function. + + Create a new MultiIndex from the current to monotonically sorted + items IN the levels. This does not actually make the entire MultiIndex + monotonic, JUST the levels. + + The resulting MultiIndex will have the same outward + appearance, meaning the same .values and ordering. It will also + be .equals() to the original. + + Returns + ------- + MultiIndex + + Examples + -------- + >>> mi = pd.MultiIndex(levels=[['a', 'b'], ['bb', 'aa']], + ... codes=[[0, 0, 1, 1], [0, 1, 0, 1]]) + >>> mi + MultiIndex([('a', 'bb'), + ('a', 'aa'), + ('b', 'bb'), + ('b', 'aa')], + ) + + >>> mi.sort_values() + MultiIndex([('a', 'aa'), + ('a', 'bb'), + ('b', 'aa'), + ('b', 'bb')], + ) + """ + if self._is_lexsorted() and self.is_monotonic_increasing: + return self + + new_levels = [] + new_codes = [] + + for lev, level_codes in zip(self.levels, self.codes): + if not lev.is_monotonic_increasing: + try: + # indexer to reorder the levels + indexer = lev.argsort() + except TypeError: + if raise_if_incomparable: + raise + else: + lev = lev.take(indexer) + + # indexer to reorder the level codes + indexer = ensure_platform_int(indexer) + ri = lib.get_reverse_indexer(indexer, len(indexer)) + level_codes = algos.take_nd(ri, level_codes, fill_value=-1) + + new_levels.append(lev) + new_codes.append(level_codes) + + return MultiIndex( + new_levels, + new_codes, + names=self.names, + sortorder=self.sortorder, + verify_integrity=False, + ) + + def remove_unused_levels(self) -> MultiIndex: + """ + Create new MultiIndex from current that removes unused levels. + + Unused level(s) means levels that are not expressed in the + labels. The resulting MultiIndex will have the same outward + appearance, meaning the same .values and ordering. It will + also be .equals() to the original. + + Returns + ------- + MultiIndex + + Examples + -------- + >>> mi = pd.MultiIndex.from_product([range(2), list('ab')]) + >>> mi + MultiIndex([(0, 'a'), + (0, 'b'), + (1, 'a'), + (1, 'b')], + ) + + >>> mi[2:] + MultiIndex([(1, 'a'), + (1, 'b')], + ) + + The 0 from the first level is not represented + and can be removed + + >>> mi2 = mi[2:].remove_unused_levels() + >>> mi2.levels + FrozenList([[1], ['a', 'b']]) + """ + new_levels = [] + new_codes = [] + + changed = False + for lev, level_codes in zip(self.levels, self.codes): + # Since few levels are typically unused, bincount() is more + # efficient than unique() - however it only accepts positive values + # (and drops order): + uniques = np.where(np.bincount(level_codes + 1) > 0)[0] - 1 + has_na = int(len(uniques) and (uniques[0] == -1)) + + if len(uniques) != len(lev) + has_na: + if lev.isna().any() and len(uniques) == len(lev): + break + # We have unused levels + changed = True + + # Recalculate uniques, now preserving order. + # Can easily be cythonized by exploiting the already existing + # "uniques" and stop parsing "level_codes" when all items + # are found: + uniques = algos.unique(level_codes) + if has_na: + na_idx = np.where(uniques == -1)[0] + # Just ensure that -1 is in first position: + uniques[[0, na_idx[0]]] = uniques[[na_idx[0], 0]] + + # codes get mapped from uniques to 0:len(uniques) + # -1 (if present) is mapped to last position + code_mapping = np.zeros(len(lev) + has_na) + # ... and reassigned value -1: + code_mapping[uniques] = np.arange(len(uniques)) - has_na + + level_codes = code_mapping[level_codes] + + # new levels are simple + lev = lev.take(uniques[has_na:]) + + new_levels.append(lev) + new_codes.append(level_codes) + + result = self.view() + + if changed: + result._reset_identity() + result._set_levels(new_levels, validate=False) + result._set_codes(new_codes, validate=False) + + return result + + # -------------------------------------------------------------------- + # Pickling Methods + + def __reduce__(self): + """Necessary for making this object picklable""" + d = { + "levels": list(self.levels), + "codes": list(self.codes), + "sortorder": self.sortorder, + "names": list(self.names), + } + return ibase._new_Index, (type(self), d), None + + # -------------------------------------------------------------------- + + def __getitem__(self, key): + if is_scalar(key): + key = com.cast_scalar_indexer(key) + + retval = [] + for lev, level_codes in zip(self.levels, self.codes): + if level_codes[key] == -1: + retval.append(np.nan) + else: + retval.append(lev[level_codes[key]]) + + return tuple(retval) + else: + # in general cannot be sure whether the result will be sorted + sortorder = None + if com.is_bool_indexer(key): + key = np.asarray(key, dtype=bool) + sortorder = self.sortorder + elif isinstance(key, slice): + if key.step is None or key.step > 0: + sortorder = self.sortorder + elif isinstance(key, Index): + key = np.asarray(key) + + new_codes = [level_codes[key] for level_codes in self.codes] + + return MultiIndex( + levels=self.levels, + codes=new_codes, + names=self.names, + sortorder=sortorder, + verify_integrity=False, + ) + + def _getitem_slice(self: MultiIndex, slobj: slice) -> MultiIndex: + """ + Fastpath for __getitem__ when we know we have a slice. + """ + sortorder = None + if slobj.step is None or slobj.step > 0: + sortorder = self.sortorder + + new_codes = [level_codes[slobj] for level_codes in self.codes] + + return type(self)( + levels=self.levels, + codes=new_codes, + names=self._names, + sortorder=sortorder, + verify_integrity=False, + ) + + @Appender(_index_shared_docs["take"] % _index_doc_kwargs) + def take( + self: MultiIndex, + indices, + axis: Axis = 0, + allow_fill: bool = True, + fill_value=None, + **kwargs, + ) -> MultiIndex: + nv.validate_take((), kwargs) + indices = ensure_platform_int(indices) + + # only fill if we are passing a non-None fill_value + allow_fill = self._maybe_disallow_fill(allow_fill, fill_value, indices) + + na_value = -1 + + taken = [lab.take(indices) for lab in self.codes] + if allow_fill: + mask = indices == -1 + if mask.any(): + masked = [] + for new_label in taken: + label_values = new_label + label_values[mask] = na_value + masked.append(np.asarray(label_values)) + taken = masked + + return MultiIndex( + levels=self.levels, codes=taken, names=self.names, verify_integrity=False + ) + + def append(self, other): + """ + Append a collection of Index options together. + + Parameters + ---------- + other : Index or list/tuple of indices + + Returns + ------- + Index + The combined index. + + Examples + -------- + >>> mi = pd.MultiIndex.from_arrays([['a'], ['b']]) + >>> mi + MultiIndex([('a', 'b')], + ) + >>> mi.append(mi) + MultiIndex([('a', 'b'), ('a', 'b')], + ) + """ + if not isinstance(other, (list, tuple)): + other = [other] + + if all( + (isinstance(o, MultiIndex) and o.nlevels >= self.nlevels) for o in other + ): + codes = [] + levels = [] + names = [] + for i in range(self.nlevels): + level_values = self.levels[i] + for mi in other: + level_values = level_values.union(mi.levels[i]) + level_codes = [ + recode_for_categories( + mi.codes[i], mi.levels[i], level_values, copy=False + ) + for mi in ([self, *other]) + ] + level_name = self.names[i] + if any(mi.names[i] != level_name for mi in other): + level_name = None + codes.append(np.concatenate(level_codes)) + levels.append(level_values) + names.append(level_name) + return MultiIndex( + codes=codes, levels=levels, names=names, verify_integrity=False + ) + + to_concat = (self._values,) + tuple(k._values for k in other) + new_tuples = np.concatenate(to_concat) + + # if all(isinstance(x, MultiIndex) for x in other): + try: + # We only get here if other contains at least one index with tuples, + # setting names to None automatically + return MultiIndex.from_tuples(new_tuples) + except (TypeError, IndexError): + return Index(new_tuples) + + def argsort( + self, *args, na_position: str = "last", **kwargs + ) -> npt.NDArray[np.intp]: + target = self._sort_levels_monotonic(raise_if_incomparable=True) + keys = [lev.codes for lev in target._get_codes_for_sorting()] + return lexsort_indexer(keys, na_position=na_position, codes_given=True) + + @Appender(_index_shared_docs["repeat"] % _index_doc_kwargs) + def repeat(self, repeats: int, axis=None) -> MultiIndex: + nv.validate_repeat((), {"axis": axis}) + # error: Incompatible types in assignment (expression has type "ndarray", + # variable has type "int") + repeats = ensure_platform_int(repeats) # type: ignore[assignment] + return MultiIndex( + levels=self.levels, + codes=[ + level_codes.view(np.ndarray).astype(np.intp, copy=False).repeat(repeats) + for level_codes in self.codes + ], + names=self.names, + sortorder=self.sortorder, + verify_integrity=False, + ) + + # error: Signature of "drop" incompatible with supertype "Index" + def drop( # type: ignore[override] + self, + codes, + level: Index | np.ndarray | Iterable[Hashable] | None = None, + errors: IgnoreRaise = "raise", + ) -> MultiIndex: + """ + Make a new :class:`pandas.MultiIndex` with the passed list of codes deleted. + + Parameters + ---------- + codes : array-like + Must be a list of tuples when ``level`` is not specified. + level : int or level name, default None + errors : str, default 'raise' + + Returns + ------- + MultiIndex + + Examples + -------- + >>> idx = pd.MultiIndex.from_product([(0, 1, 2), ('green', 'purple')], + ... names=["number", "color"]) + >>> idx + MultiIndex([(0, 'green'), + (0, 'purple'), + (1, 'green'), + (1, 'purple'), + (2, 'green'), + (2, 'purple')], + names=['number', 'color']) + >>> idx.drop([(1, 'green'), (2, 'purple')]) + MultiIndex([(0, 'green'), + (0, 'purple'), + (1, 'purple'), + (2, 'green')], + names=['number', 'color']) + + We can also drop from a specific level. + + >>> idx.drop('green', level='color') + MultiIndex([(0, 'purple'), + (1, 'purple'), + (2, 'purple')], + names=['number', 'color']) + + >>> idx.drop([1, 2], level=0) + MultiIndex([(0, 'green'), + (0, 'purple')], + names=['number', 'color']) + """ + if level is not None: + return self._drop_from_level(codes, level, errors) + + if not isinstance(codes, (np.ndarray, Index)): + try: + codes = com.index_labels_to_array(codes, dtype=np.dtype("object")) + except ValueError: + pass + + inds = [] + for level_codes in codes: + try: + loc = self.get_loc(level_codes) + # get_loc returns either an integer, a slice, or a boolean + # mask + if isinstance(loc, int): + inds.append(loc) + elif isinstance(loc, slice): + step = loc.step if loc.step is not None else 1 + inds.extend(range(loc.start, loc.stop, step)) + elif com.is_bool_indexer(loc): + if self._lexsort_depth == 0: + warnings.warn( + "dropping on a non-lexsorted multi-index " + "without a level parameter may impact performance.", + PerformanceWarning, + stacklevel=find_stack_level(), + ) + loc = loc.nonzero()[0] + inds.extend(loc) + else: + msg = f"unsupported indexer of type {type(loc)}" + raise AssertionError(msg) + except KeyError: + if errors != "ignore": + raise + + return self.delete(inds) + + def _drop_from_level( + self, codes, level, errors: IgnoreRaise = "raise" + ) -> MultiIndex: + codes = com.index_labels_to_array(codes) + i = self._get_level_number(level) + index = self.levels[i] + values = index.get_indexer(codes) + # If nan should be dropped it will equal -1 here. We have to check which values + # are not nan and equal -1, this means they are missing in the index + nan_codes = isna(codes) + values[(np.equal(nan_codes, False)) & (values == -1)] = -2 + if index.shape[0] == self.shape[0]: + values[np.equal(nan_codes, True)] = -2 + + not_found = codes[values == -2] + if len(not_found) != 0 and errors != "ignore": + raise KeyError(f"labels {not_found} not found in level") + mask = ~algos.isin(self.codes[i], values) + + return self[mask] + + def swaplevel(self, i=-2, j=-1) -> MultiIndex: + """ + Swap level i with level j. + + Calling this method does not change the ordering of the values. + + Parameters + ---------- + i : int, str, default -2 + First level of index to be swapped. Can pass level name as string. + Type of parameters can be mixed. + j : int, str, default -1 + Second level of index to be swapped. Can pass level name as string. + Type of parameters can be mixed. + + Returns + ------- + MultiIndex + A new MultiIndex. + + See Also + -------- + Series.swaplevel : Swap levels i and j in a MultiIndex. + DataFrame.swaplevel : Swap levels i and j in a MultiIndex on a + particular axis. + + Examples + -------- + >>> mi = pd.MultiIndex(levels=[['a', 'b'], ['bb', 'aa']], + ... codes=[[0, 0, 1, 1], [0, 1, 0, 1]]) + >>> mi + MultiIndex([('a', 'bb'), + ('a', 'aa'), + ('b', 'bb'), + ('b', 'aa')], + ) + >>> mi.swaplevel(0, 1) + MultiIndex([('bb', 'a'), + ('aa', 'a'), + ('bb', 'b'), + ('aa', 'b')], + ) + """ + new_levels = list(self.levels) + new_codes = list(self.codes) + new_names = list(self.names) + + i = self._get_level_number(i) + j = self._get_level_number(j) + + new_levels[i], new_levels[j] = new_levels[j], new_levels[i] + new_codes[i], new_codes[j] = new_codes[j], new_codes[i] + new_names[i], new_names[j] = new_names[j], new_names[i] + + return MultiIndex( + levels=new_levels, codes=new_codes, names=new_names, verify_integrity=False + ) + + def reorder_levels(self, order) -> MultiIndex: + """ + Rearrange levels using input order. May not drop or duplicate levels. + + Parameters + ---------- + order : list of int or list of str + List representing new level order. Reference level by number + (position) or by key (label). + + Returns + ------- + MultiIndex + + Examples + -------- + >>> mi = pd.MultiIndex.from_arrays([[1, 2], [3, 4]], names=['x', 'y']) + >>> mi + MultiIndex([(1, 3), + (2, 4)], + names=['x', 'y']) + + >>> mi.reorder_levels(order=[1, 0]) + MultiIndex([(3, 1), + (4, 2)], + names=['y', 'x']) + + >>> mi.reorder_levels(order=['y', 'x']) + MultiIndex([(3, 1), + (4, 2)], + names=['y', 'x']) + """ + order = [self._get_level_number(i) for i in order] + result = self._reorder_ilevels(order) + return result + + def _reorder_ilevels(self, order) -> MultiIndex: + if len(order) != self.nlevels: + raise AssertionError( + f"Length of order must be same as number of levels ({self.nlevels}), " + f"got {len(order)}" + ) + new_levels = [self.levels[i] for i in order] + new_codes = [self.codes[i] for i in order] + new_names = [self.names[i] for i in order] + + return MultiIndex( + levels=new_levels, codes=new_codes, names=new_names, verify_integrity=False + ) + + def _recode_for_new_levels( + self, new_levels, copy: bool = True + ) -> Generator[np.ndarray, None, None]: + if len(new_levels) > self.nlevels: + raise AssertionError( + f"Length of new_levels ({len(new_levels)}) " + f"must be <= self.nlevels ({self.nlevels})" + ) + for i in range(len(new_levels)): + yield recode_for_categories( + self.codes[i], self.levels[i], new_levels[i], copy=copy + ) + + def _get_codes_for_sorting(self) -> list[Categorical]: + """ + we are categorizing our codes by using the + available categories (all, not just observed) + excluding any missing ones (-1); this is in preparation + for sorting, where we need to disambiguate that -1 is not + a valid valid + """ + + def cats(level_codes): + return np.arange( + np.array(level_codes).max() + 1 if len(level_codes) else 0, + dtype=level_codes.dtype, + ) + + return [ + Categorical.from_codes(level_codes, cats(level_codes), True, validate=False) + for level_codes in self.codes + ] + + def sortlevel( + self, + level: IndexLabel = 0, + ascending: bool | list[bool] = True, + sort_remaining: bool = True, + na_position: str = "first", + ) -> tuple[MultiIndex, npt.NDArray[np.intp]]: + """ + Sort MultiIndex at the requested level. + + The result will respect the original ordering of the associated + factor at that level. + + Parameters + ---------- + level : list-like, int or str, default 0 + If a string is given, must be a name of the level. + If list-like must be names or ints of levels. + ascending : bool, default True + False to sort in descending order. + Can also be a list to specify a directed ordering. + sort_remaining : sort by the remaining levels after level + na_position : {'first' or 'last'}, default 'first' + Argument 'first' puts NaNs at the beginning, 'last' puts NaNs at + the end. + + .. versionadded:: 2.1.0 + + Returns + ------- + sorted_index : pd.MultiIndex + Resulting index. + indexer : np.ndarray[np.intp] + Indices of output values in original index. + + Examples + -------- + >>> mi = pd.MultiIndex.from_arrays([[0, 0], [2, 1]]) + >>> mi + MultiIndex([(0, 2), + (0, 1)], + ) + + >>> mi.sortlevel() + (MultiIndex([(0, 1), + (0, 2)], + ), array([1, 0])) + + >>> mi.sortlevel(sort_remaining=False) + (MultiIndex([(0, 2), + (0, 1)], + ), array([0, 1])) + + >>> mi.sortlevel(1) + (MultiIndex([(0, 1), + (0, 2)], + ), array([1, 0])) + + >>> mi.sortlevel(1, ascending=False) + (MultiIndex([(0, 2), + (0, 1)], + ), array([0, 1])) + """ + if not is_list_like(level): + level = [level] + # error: Item "Hashable" of "Union[Hashable, Sequence[Hashable]]" has + # no attribute "__iter__" (not iterable) + level = [ + self._get_level_number(lev) for lev in level # type: ignore[union-attr] + ] + sortorder = None + + codes = [self.codes[lev] for lev in level] + # we have a directed ordering via ascending + if isinstance(ascending, list): + if not len(level) == len(ascending): + raise ValueError("level must have same length as ascending") + elif sort_remaining: + codes.extend( + [self.codes[lev] for lev in range(len(self.levels)) if lev not in level] + ) + else: + sortorder = level[0] + + indexer = lexsort_indexer( + codes, orders=ascending, na_position=na_position, codes_given=True + ) + + indexer = ensure_platform_int(indexer) + new_codes = [level_codes.take(indexer) for level_codes in self.codes] + + new_index = MultiIndex( + codes=new_codes, + levels=self.levels, + names=self.names, + sortorder=sortorder, + verify_integrity=False, + ) + + return new_index, indexer + + def _wrap_reindex_result(self, target, indexer, preserve_names: bool): + if not isinstance(target, MultiIndex): + if indexer is None: + target = self + elif (indexer >= 0).all(): + target = self.take(indexer) + else: + try: + target = MultiIndex.from_tuples(target) + except TypeError: + # not all tuples, see test_constructor_dict_multiindex_reindex_flat + return target + + target = self._maybe_preserve_names(target, preserve_names) + return target + + def _maybe_preserve_names(self, target: Index, preserve_names: bool) -> Index: + if ( + preserve_names + and target.nlevels == self.nlevels + and target.names != self.names + ): + target = target.copy(deep=False) + target.names = self.names + return target + + # -------------------------------------------------------------------- + # Indexing Methods + + def _check_indexing_error(self, key) -> None: + if not is_hashable(key) or is_iterator(key): + # We allow tuples if they are hashable, whereas other Index + # subclasses require scalar. + # We have to explicitly exclude generators, as these are hashable. + raise InvalidIndexError(key) + + @cache_readonly + def _should_fallback_to_positional(self) -> bool: + """ + Should integer key(s) be treated as positional? + """ + # GH#33355 + return self.levels[0]._should_fallback_to_positional + + def _get_indexer_strict( + self, key, axis_name: str + ) -> tuple[Index, npt.NDArray[np.intp]]: + keyarr = key + if not isinstance(keyarr, Index): + keyarr = com.asarray_tuplesafe(keyarr) + + if len(keyarr) and not isinstance(keyarr[0], tuple): + indexer = self._get_indexer_level_0(keyarr) + + self._raise_if_missing(key, indexer, axis_name) + return self[indexer], indexer + + return super()._get_indexer_strict(key, axis_name) + + def _raise_if_missing(self, key, indexer, axis_name: str) -> None: + keyarr = key + if not isinstance(key, Index): + keyarr = com.asarray_tuplesafe(key) + + if len(keyarr) and not isinstance(keyarr[0], tuple): + # i.e. same condition for special case in MultiIndex._get_indexer_strict + + mask = indexer == -1 + if mask.any(): + check = self.levels[0].get_indexer(keyarr) + cmask = check == -1 + if cmask.any(): + raise KeyError(f"{keyarr[cmask]} not in index") + # We get here when levels still contain values which are not + # actually in Index anymore + raise KeyError(f"{keyarr} not in index") + else: + return super()._raise_if_missing(key, indexer, axis_name) + + def _get_indexer_level_0(self, target) -> npt.NDArray[np.intp]: + """ + Optimized equivalent to `self.get_level_values(0).get_indexer_for(target)`. + """ + lev = self.levels[0] + codes = self._codes[0] + cat = Categorical.from_codes(codes=codes, categories=lev, validate=False) + ci = Index(cat) + return ci.get_indexer_for(target) + + def get_slice_bound( + self, + label: Hashable | Sequence[Hashable], + side: Literal["left", "right"], + ) -> int: + """ + For an ordered MultiIndex, compute slice bound + that corresponds to given label. + + Returns leftmost (one-past-the-rightmost if `side=='right') position + of given label. + + Parameters + ---------- + label : object or tuple of objects + side : {'left', 'right'} + + Returns + ------- + int + Index of label. + + Notes + ----- + This method only works if level 0 index of the MultiIndex is lexsorted. + + Examples + -------- + >>> mi = pd.MultiIndex.from_arrays([list('abbc'), list('gefd')]) + + Get the locations from the leftmost 'b' in the first level + until the end of the multiindex: + + >>> mi.get_slice_bound('b', side="left") + 1 + + Like above, but if you get the locations from the rightmost + 'b' in the first level and 'f' in the second level: + + >>> mi.get_slice_bound(('b','f'), side="right") + 3 + + See Also + -------- + MultiIndex.get_loc : Get location for a label or a tuple of labels. + MultiIndex.get_locs : Get location for a label/slice/list/mask or a + sequence of such. + """ + if not isinstance(label, tuple): + label = (label,) + return self._partial_tup_index(label, side=side) + + # pylint: disable-next=useless-parent-delegation + def slice_locs(self, start=None, end=None, step=None) -> tuple[int, int]: + """ + For an ordered MultiIndex, compute the slice locations for input + labels. + + The input labels can be tuples representing partial levels, e.g. for a + MultiIndex with 3 levels, you can pass a single value (corresponding to + the first level), or a 1-, 2-, or 3-tuple. + + Parameters + ---------- + start : label or tuple, default None + If None, defaults to the beginning + end : label or tuple + If None, defaults to the end + step : int or None + Slice step + + Returns + ------- + (start, end) : (int, int) + + Notes + ----- + This method only works if the MultiIndex is properly lexsorted. So, + if only the first 2 levels of a 3-level MultiIndex are lexsorted, + you can only pass two levels to ``.slice_locs``. + + Examples + -------- + >>> mi = pd.MultiIndex.from_arrays([list('abbd'), list('deff')], + ... names=['A', 'B']) + + Get the slice locations from the beginning of 'b' in the first level + until the end of the multiindex: + + >>> mi.slice_locs(start='b') + (1, 4) + + Like above, but stop at the end of 'b' in the first level and 'f' in + the second level: + + >>> mi.slice_locs(start='b', end=('b', 'f')) + (1, 3) + + See Also + -------- + MultiIndex.get_loc : Get location for a label or a tuple of labels. + MultiIndex.get_locs : Get location for a label/slice/list/mask or a + sequence of such. + """ + # This function adds nothing to its parent implementation (the magic + # happens in get_slice_bound method), but it adds meaningful doc. + return super().slice_locs(start, end, step) + + def _partial_tup_index(self, tup: tuple, side: Literal["left", "right"] = "left"): + if len(tup) > self._lexsort_depth: + raise UnsortedIndexError( + f"Key length ({len(tup)}) was greater than MultiIndex lexsort depth " + f"({self._lexsort_depth})" + ) + + n = len(tup) + start, end = 0, len(self) + zipped = zip(tup, self.levels, self.codes) + for k, (lab, lev, level_codes) in enumerate(zipped): + section = level_codes[start:end] + + loc: npt.NDArray[np.intp] | np.intp | int + if lab not in lev and not isna(lab): + # short circuit + try: + loc = algos.searchsorted(lev, lab, side=side) + except TypeError as err: + # non-comparable e.g. test_slice_locs_with_type_mismatch + raise TypeError(f"Level type mismatch: {lab}") from err + if not is_integer(loc): + # non-comparable level, e.g. test_groupby_example + raise TypeError(f"Level type mismatch: {lab}") + if side == "right" and loc >= 0: + loc -= 1 + return start + algos.searchsorted(section, loc, side=side) + + idx = self._get_loc_single_level_index(lev, lab) + if isinstance(idx, slice) and k < n - 1: + # Get start and end value from slice, necessary when a non-integer + # interval is given as input GH#37707 + start = idx.start + end = idx.stop + elif k < n - 1: + # error: Incompatible types in assignment (expression has type + # "Union[ndarray[Any, dtype[signedinteger[Any]]] + end = start + algos.searchsorted( # type: ignore[assignment] + section, idx, side="right" + ) + # error: Incompatible types in assignment (expression has type + # "Union[ndarray[Any, dtype[signedinteger[Any]]] + start = start + algos.searchsorted( # type: ignore[assignment] + section, idx, side="left" + ) + elif isinstance(idx, slice): + idx = idx.start + return start + algos.searchsorted(section, idx, side=side) + else: + return start + algos.searchsorted(section, idx, side=side) + + def _get_loc_single_level_index(self, level_index: Index, key: Hashable) -> int: + """ + If key is NA value, location of index unify as -1. + + Parameters + ---------- + level_index: Index + key : label + + Returns + ------- + loc : int + If key is NA value, loc is -1 + Else, location of key in index. + + See Also + -------- + Index.get_loc : The get_loc method for (single-level) index. + """ + if is_scalar(key) and isna(key): + # TODO: need is_valid_na_for_dtype(key, level_index.dtype) + return -1 + else: + return level_index.get_loc(key) + + def get_loc(self, key): + """ + Get location for a label or a tuple of labels. + + The location is returned as an integer/slice or boolean + mask. + + Parameters + ---------- + key : label or tuple of labels (one for each level) + + Returns + ------- + int, slice object or boolean mask + If the key is past the lexsort depth, the return may be a + boolean mask array, otherwise it is always a slice or int. + + See Also + -------- + Index.get_loc : The get_loc method for (single-level) index. + MultiIndex.slice_locs : Get slice location given start label(s) and + end label(s). + MultiIndex.get_locs : Get location for a label/slice/list/mask or a + sequence of such. + + Notes + ----- + The key cannot be a slice, list of same-level labels, a boolean mask, + or a sequence of such. If you want to use those, use + :meth:`MultiIndex.get_locs` instead. + + Examples + -------- + >>> mi = pd.MultiIndex.from_arrays([list('abb'), list('def')]) + + >>> mi.get_loc('b') + slice(1, 3, None) + + >>> mi.get_loc(('b', 'e')) + 1 + """ + self._check_indexing_error(key) + + def _maybe_to_slice(loc): + """convert integer indexer to boolean mask or slice if possible""" + if not isinstance(loc, np.ndarray) or loc.dtype != np.intp: + return loc + + loc = lib.maybe_indices_to_slice(loc, len(self)) + if isinstance(loc, slice): + return loc + + mask = np.empty(len(self), dtype="bool") + mask.fill(False) + mask[loc] = True + return mask + + if not isinstance(key, tuple): + loc = self._get_level_indexer(key, level=0) + return _maybe_to_slice(loc) + + keylen = len(key) + if self.nlevels < keylen: + raise KeyError( + f"Key length ({keylen}) exceeds index depth ({self.nlevels})" + ) + + if keylen == self.nlevels and self.is_unique: + # TODO: what if we have an IntervalIndex level? + # i.e. do we need _index_as_unique on that level? + try: + return self._engine.get_loc(key) + except KeyError as err: + raise KeyError(key) from err + except TypeError: + # e.g. test_partial_slicing_with_multiindex partial string slicing + loc, _ = self.get_loc_level(key, list(range(self.nlevels))) + return loc + + # -- partial selection or non-unique index + # break the key into 2 parts based on the lexsort_depth of the index; + # the first part returns a continuous slice of the index; the 2nd part + # needs linear search within the slice + i = self._lexsort_depth + lead_key, follow_key = key[:i], key[i:] + + if not lead_key: + start = 0 + stop = len(self) + else: + try: + start, stop = self.slice_locs(lead_key, lead_key) + except TypeError as err: + # e.g. test_groupby_example key = ((0, 0, 1, 2), "new_col") + # when self has 5 integer levels + raise KeyError(key) from err + + if start == stop: + raise KeyError(key) + + if not follow_key: + return slice(start, stop) + + warnings.warn( + "indexing past lexsort depth may impact performance.", + PerformanceWarning, + stacklevel=find_stack_level(), + ) + + loc = np.arange(start, stop, dtype=np.intp) + + for i, k in enumerate(follow_key, len(lead_key)): + mask = self.codes[i][loc] == self._get_loc_single_level_index( + self.levels[i], k + ) + if not mask.all(): + loc = loc[mask] + if not len(loc): + raise KeyError(key) + + return _maybe_to_slice(loc) if len(loc) != stop - start else slice(start, stop) + + def get_loc_level(self, key, level: IndexLabel = 0, drop_level: bool = True): + """ + Get location and sliced index for requested label(s)/level(s). + + Parameters + ---------- + key : label or sequence of labels + level : int/level name or list thereof, optional + drop_level : bool, default True + If ``False``, the resulting index will not drop any level. + + Returns + ------- + tuple + A 2-tuple where the elements : + + Element 0: int, slice object or boolean array. + + Element 1: The resulting sliced multiindex/index. If the key + contains all levels, this will be ``None``. + + See Also + -------- + MultiIndex.get_loc : Get location for a label or a tuple of labels. + MultiIndex.get_locs : Get location for a label/slice/list/mask or a + sequence of such. + + Examples + -------- + >>> mi = pd.MultiIndex.from_arrays([list('abb'), list('def')], + ... names=['A', 'B']) + + >>> mi.get_loc_level('b') + (slice(1, 3, None), Index(['e', 'f'], dtype='object', name='B')) + + >>> mi.get_loc_level('e', level='B') + (array([False, True, False]), Index(['b'], dtype='object', name='A')) + + >>> mi.get_loc_level(['b', 'e']) + (1, None) + """ + if not isinstance(level, (list, tuple)): + level = self._get_level_number(level) + else: + level = [self._get_level_number(lev) for lev in level] + + loc, mi = self._get_loc_level(key, level=level) + if not drop_level: + if lib.is_integer(loc): + # Slice index must be an integer or None + mi = self[loc : loc + 1] + else: + mi = self[loc] + return loc, mi + + def _get_loc_level(self, key, level: int | list[int] = 0): + """ + get_loc_level but with `level` known to be positional, not name-based. + """ + + # different name to distinguish from maybe_droplevels + def maybe_mi_droplevels(indexer, levels): + """ + If level does not exist or all levels were dropped, the exception + has to be handled outside. + """ + new_index = self[indexer] + + for i in sorted(levels, reverse=True): + new_index = new_index._drop_level_numbers([i]) + + return new_index + + if isinstance(level, (tuple, list)): + if len(key) != len(level): + raise AssertionError( + "Key for location must have same length as number of levels" + ) + result = None + for lev, k in zip(level, key): + loc, new_index = self._get_loc_level(k, level=lev) + if isinstance(loc, slice): + mask = np.zeros(len(self), dtype=bool) + mask[loc] = True + loc = mask + result = loc if result is None else result & loc + + try: + # FIXME: we should be only dropping levels on which we are + # scalar-indexing + mi = maybe_mi_droplevels(result, level) + except ValueError: + # droplevel failed because we tried to drop all levels, + # i.e. len(level) == self.nlevels + mi = self[result] + + return result, mi + + # kludge for #1796 + if isinstance(key, list): + key = tuple(key) + + if isinstance(key, tuple) and level == 0: + try: + # Check if this tuple is a single key in our first level + if key in self.levels[0]: + indexer = self._get_level_indexer(key, level=level) + new_index = maybe_mi_droplevels(indexer, [0]) + return indexer, new_index + except (TypeError, InvalidIndexError): + pass + + if not any(isinstance(k, slice) for k in key): + if len(key) == self.nlevels and self.is_unique: + # Complete key in unique index -> standard get_loc + try: + return (self._engine.get_loc(key), None) + except KeyError as err: + raise KeyError(key) from err + except TypeError: + # e.g. partial string indexing + # test_partial_string_timestamp_multiindex + pass + + # partial selection + indexer = self.get_loc(key) + ilevels = [i for i in range(len(key)) if key[i] != slice(None, None)] + if len(ilevels) == self.nlevels: + if is_integer(indexer): + # we are dropping all levels + return indexer, None + + # TODO: in some cases we still need to drop some levels, + # e.g. test_multiindex_perf_warn + # test_partial_string_timestamp_multiindex + ilevels = [ + i + for i in range(len(key)) + if ( + not isinstance(key[i], str) + or not self.levels[i]._supports_partial_string_indexing + ) + and key[i] != slice(None, None) + ] + if len(ilevels) == self.nlevels: + # TODO: why? + ilevels = [] + return indexer, maybe_mi_droplevels(indexer, ilevels) + + else: + indexer = None + for i, k in enumerate(key): + if not isinstance(k, slice): + loc_level = self._get_level_indexer(k, level=i) + if isinstance(loc_level, slice): + if com.is_null_slice(loc_level) or com.is_full_slice( + loc_level, len(self) + ): + # everything + continue + + # e.g. test_xs_IndexSlice_argument_not_implemented + k_index = np.zeros(len(self), dtype=bool) + k_index[loc_level] = True + + else: + k_index = loc_level + + elif com.is_null_slice(k): + # taking everything, does not affect `indexer` below + continue + + else: + # FIXME: this message can be inaccurate, e.g. + # test_series_varied_multiindex_alignment + raise TypeError(f"Expected label or tuple of labels, got {key}") + + if indexer is None: + indexer = k_index + else: + indexer &= k_index + if indexer is None: + indexer = slice(None, None) + ilevels = [i for i in range(len(key)) if key[i] != slice(None, None)] + return indexer, maybe_mi_droplevels(indexer, ilevels) + else: + indexer = self._get_level_indexer(key, level=level) + if ( + isinstance(key, str) + and self.levels[level]._supports_partial_string_indexing + ): + # check to see if we did an exact lookup vs sliced + check = self.levels[level].get_loc(key) + if not is_integer(check): + # e.g. test_partial_string_timestamp_multiindex + return indexer, self[indexer] + + try: + result_index = maybe_mi_droplevels(indexer, [level]) + except ValueError: + result_index = self[indexer] + + return indexer, result_index + + def _get_level_indexer( + self, key, level: int = 0, indexer: npt.NDArray[np.bool_] | None = None + ): + # `level` kwarg is _always_ positional, never name + # return a boolean array or slice showing where the key is + # in the totality of values + # if the indexer is provided, then use this + + level_index = self.levels[level] + level_codes = self.codes[level] + + def convert_indexer(start, stop, step, indexer=indexer, codes=level_codes): + # Compute a bool indexer to identify the positions to take. + # If we have an existing indexer, we only need to examine the + # subset of positions where the existing indexer is True. + if indexer is not None: + # we only need to look at the subset of codes where the + # existing indexer equals True + codes = codes[indexer] + + if step is None or step == 1: + new_indexer = (codes >= start) & (codes < stop) + else: + r = np.arange(start, stop, step, dtype=codes.dtype) + new_indexer = algos.isin(codes, r) + + if indexer is None: + return new_indexer + + indexer = indexer.copy() + indexer[indexer] = new_indexer + return indexer + + if isinstance(key, slice): + # handle a slice, returning a slice if we can + # otherwise a boolean indexer + step = key.step + is_negative_step = step is not None and step < 0 + + try: + if key.start is not None: + start = level_index.get_loc(key.start) + elif is_negative_step: + start = len(level_index) - 1 + else: + start = 0 + + if key.stop is not None: + stop = level_index.get_loc(key.stop) + elif is_negative_step: + stop = 0 + elif isinstance(start, slice): + stop = len(level_index) + else: + stop = len(level_index) - 1 + except KeyError: + # we have a partial slice (like looking up a partial date + # string) + start = stop = level_index.slice_indexer(key.start, key.stop, key.step) + step = start.step + + if isinstance(start, slice) or isinstance(stop, slice): + # we have a slice for start and/or stop + # a partial date slicer on a DatetimeIndex generates a slice + # note that the stop ALREADY includes the stopped point (if + # it was a string sliced) + start = getattr(start, "start", start) + stop = getattr(stop, "stop", stop) + return convert_indexer(start, stop, step) + + elif level > 0 or self._lexsort_depth == 0 or step is not None: + # need to have like semantics here to right + # searching as when we are using a slice + # so adjust the stop by 1 (so we include stop) + stop = (stop - 1) if is_negative_step else (stop + 1) + return convert_indexer(start, stop, step) + else: + # sorted, so can return slice object -> view + i = algos.searchsorted(level_codes, start, side="left") + j = algos.searchsorted(level_codes, stop, side="right") + return slice(i, j, step) + + else: + idx = self._get_loc_single_level_index(level_index, key) + + if level > 0 or self._lexsort_depth == 0: + # Desired level is not sorted + if isinstance(idx, slice): + # test_get_loc_partial_timestamp_multiindex + locs = (level_codes >= idx.start) & (level_codes < idx.stop) + return locs + + locs = np.asarray(level_codes == idx, dtype=bool) + + if not locs.any(): + # The label is present in self.levels[level] but unused: + raise KeyError(key) + return locs + + if isinstance(idx, slice): + # e.g. test_partial_string_timestamp_multiindex + start = algos.searchsorted(level_codes, idx.start, side="left") + # NB: "left" here bc of slice semantics + end = algos.searchsorted(level_codes, idx.stop, side="left") + else: + start = algos.searchsorted(level_codes, idx, side="left") + end = algos.searchsorted(level_codes, idx, side="right") + + if start == end: + # The label is present in self.levels[level] but unused: + raise KeyError(key) + return slice(start, end) + + def get_locs(self, seq) -> npt.NDArray[np.intp]: + """ + Get location for a sequence of labels. + + Parameters + ---------- + seq : label, slice, list, mask or a sequence of such + You should use one of the above for each level. + If a level should not be used, set it to ``slice(None)``. + + Returns + ------- + numpy.ndarray + NumPy array of integers suitable for passing to iloc. + + See Also + -------- + MultiIndex.get_loc : Get location for a label or a tuple of labels. + MultiIndex.slice_locs : Get slice location given start label(s) and + end label(s). + + Examples + -------- + >>> mi = pd.MultiIndex.from_arrays([list('abb'), list('def')]) + + >>> mi.get_locs('b') # doctest: +SKIP + array([1, 2], dtype=int64) + + >>> mi.get_locs([slice(None), ['e', 'f']]) # doctest: +SKIP + array([1, 2], dtype=int64) + + >>> mi.get_locs([[True, False, True], slice('e', 'f')]) # doctest: +SKIP + array([2], dtype=int64) + """ + + # must be lexsorted to at least as many levels + true_slices = [i for (i, s) in enumerate(com.is_true_slices(seq)) if s] + if true_slices and true_slices[-1] >= self._lexsort_depth: + raise UnsortedIndexError( + "MultiIndex slicing requires the index to be lexsorted: slicing " + f"on levels {true_slices}, lexsort depth {self._lexsort_depth}" + ) + + if any(x is Ellipsis for x in seq): + raise NotImplementedError( + "MultiIndex does not support indexing with Ellipsis" + ) + + n = len(self) + + def _to_bool_indexer(indexer) -> npt.NDArray[np.bool_]: + if isinstance(indexer, slice): + new_indexer = np.zeros(n, dtype=np.bool_) + new_indexer[indexer] = True + return new_indexer + return indexer + + # a bool indexer for the positions we want to take + indexer: npt.NDArray[np.bool_] | None = None + + for i, k in enumerate(seq): + lvl_indexer: npt.NDArray[np.bool_] | slice | None = None + + if com.is_bool_indexer(k): + if len(k) != n: + raise ValueError( + "cannot index with a boolean indexer that " + "is not the same length as the index" + ) + lvl_indexer = np.asarray(k) + if indexer is None: + lvl_indexer = lvl_indexer.copy() + + elif is_list_like(k): + # a collection of labels to include from this level (these are or'd) + + # GH#27591 check if this is a single tuple key in the level + try: + lvl_indexer = self._get_level_indexer(k, level=i, indexer=indexer) + except (InvalidIndexError, TypeError, KeyError) as err: + # InvalidIndexError e.g. non-hashable, fall back to treating + # this as a sequence of labels + # KeyError it can be ambiguous if this is a label or sequence + # of labels + # github.com/pandas-dev/pandas/issues/39424#issuecomment-871626708 + for x in k: + if not is_hashable(x): + # e.g. slice + raise err + # GH 39424: Ignore not founds + # GH 42351: No longer ignore not founds & enforced in 2.0 + # TODO: how to handle IntervalIndex level? (no test cases) + item_indexer = self._get_level_indexer( + x, level=i, indexer=indexer + ) + if lvl_indexer is None: + lvl_indexer = _to_bool_indexer(item_indexer) + elif isinstance(item_indexer, slice): + lvl_indexer[item_indexer] = True # type: ignore[index] + else: + lvl_indexer |= item_indexer + + if lvl_indexer is None: + # no matches we are done + # test_loc_getitem_duplicates_multiindex_empty_indexer + return np.array([], dtype=np.intp) + + elif com.is_null_slice(k): + # empty slice + if indexer is None and i == len(seq) - 1: + return np.arange(n, dtype=np.intp) + continue + + else: + # a slice or a single label + lvl_indexer = self._get_level_indexer(k, level=i, indexer=indexer) + + # update indexer + lvl_indexer = _to_bool_indexer(lvl_indexer) + if indexer is None: + indexer = lvl_indexer + else: + indexer &= lvl_indexer + if not np.any(indexer) and np.any(lvl_indexer): + raise KeyError(seq) + + # empty indexer + if indexer is None: + return np.array([], dtype=np.intp) + + pos_indexer = indexer.nonzero()[0] + return self._reorder_indexer(seq, pos_indexer) + + # -------------------------------------------------------------------- + + def _reorder_indexer( + self, + seq: tuple[Scalar | Iterable | AnyArrayLike, ...], + indexer: npt.NDArray[np.intp], + ) -> npt.NDArray[np.intp]: + """ + Reorder an indexer of a MultiIndex (self) so that the labels are in the + same order as given in seq + + Parameters + ---------- + seq : label/slice/list/mask or a sequence of such + indexer: a position indexer of self + + Returns + ------- + indexer : a sorted position indexer of self ordered as seq + """ + + # check if sorting is necessary + need_sort = False + for i, k in enumerate(seq): + if com.is_null_slice(k) or com.is_bool_indexer(k) or is_scalar(k): + pass + elif is_list_like(k): + if len(k) <= 1: # type: ignore[arg-type] + pass + elif self._is_lexsorted(): + # If the index is lexsorted and the list_like label + # in seq are sorted then we do not need to sort + k_codes = self.levels[i].get_indexer(k) + k_codes = k_codes[k_codes >= 0] # Filter absent keys + # True if the given codes are not ordered + need_sort = (k_codes[:-1] > k_codes[1:]).any() + else: + need_sort = True + elif isinstance(k, slice): + if self._is_lexsorted(): + need_sort = k.step is not None and k.step < 0 + else: + need_sort = True + else: + need_sort = True + if need_sort: + break + if not need_sort: + return indexer + + n = len(self) + keys: tuple[np.ndarray, ...] = () + # For each level of the sequence in seq, map the level codes with the + # order they appears in a list-like sequence + # This mapping is then use to reorder the indexer + for i, k in enumerate(seq): + if is_scalar(k): + # GH#34603 we want to treat a scalar the same as an all equal list + k = [k] + if com.is_bool_indexer(k): + new_order = np.arange(n)[indexer] + elif is_list_like(k): + # Generate a map with all level codes as sorted initially + if not isinstance(k, (np.ndarray, ExtensionArray, Index, ABCSeries)): + k = sanitize_array(k, None) + k = algos.unique(k) + key_order_map = np.ones(len(self.levels[i]), dtype=np.uint64) * len( + self.levels[i] + ) + # Set order as given in the indexer list + level_indexer = self.levels[i].get_indexer(k) + level_indexer = level_indexer[level_indexer >= 0] # Filter absent keys + key_order_map[level_indexer] = np.arange(len(level_indexer)) + + new_order = key_order_map[self.codes[i][indexer]] + elif isinstance(k, slice) and k.step is not None and k.step < 0: + # flip order for negative step + new_order = np.arange(n)[::-1][indexer] + elif isinstance(k, slice) and k.start is None and k.stop is None: + # slice(None) should not determine order GH#31330 + new_order = np.ones((n,), dtype=np.intp)[indexer] + else: + # For all other case, use the same order as the level + new_order = np.arange(n)[indexer] + keys = (new_order,) + keys + + # Find the reordering using lexsort on the keys mapping + ind = np.lexsort(keys) + return indexer[ind] + + def truncate(self, before=None, after=None) -> MultiIndex: + """ + Slice index between two labels / tuples, return new MultiIndex. + + Parameters + ---------- + before : label or tuple, can be partial. Default None + None defaults to start. + after : label or tuple, can be partial. Default None + None defaults to end. + + Returns + ------- + MultiIndex + The truncated MultiIndex. + + Examples + -------- + >>> mi = pd.MultiIndex.from_arrays([['a', 'b', 'c'], ['x', 'y', 'z']]) + >>> mi + MultiIndex([('a', 'x'), ('b', 'y'), ('c', 'z')], + ) + >>> mi.truncate(before='a', after='b') + MultiIndex([('a', 'x'), ('b', 'y')], + ) + """ + if after and before and after < before: + raise ValueError("after < before") + + i, j = self.levels[0].slice_locs(before, after) + left, right = self.slice_locs(before, after) + + new_levels = list(self.levels) + new_levels[0] = new_levels[0][i:j] + + new_codes = [level_codes[left:right] for level_codes in self.codes] + new_codes[0] = new_codes[0] - i + + return MultiIndex( + levels=new_levels, + codes=new_codes, + names=self._names, + verify_integrity=False, + ) + + def equals(self, other: object) -> bool: + """ + Determines if two MultiIndex objects have the same labeling information + (the levels themselves do not necessarily have to be the same) + + See Also + -------- + equal_levels + """ + if self.is_(other): + return True + + if not isinstance(other, Index): + return False + + if len(self) != len(other): + return False + + if not isinstance(other, MultiIndex): + # d-level MultiIndex can equal d-tuple Index + if not self._should_compare(other): + # object Index or Categorical[object] may contain tuples + return False + return array_equivalent(self._values, other._values) + + if self.nlevels != other.nlevels: + return False + + for i in range(self.nlevels): + self_codes = self.codes[i] + other_codes = other.codes[i] + self_mask = self_codes == -1 + other_mask = other_codes == -1 + if not np.array_equal(self_mask, other_mask): + return False + self_codes = self_codes[~self_mask] + self_values = self.levels[i]._values.take(self_codes) + + other_codes = other_codes[~other_mask] + other_values = other.levels[i]._values.take(other_codes) + + # since we use NaT both datetime64 and timedelta64 we can have a + # situation where a level is typed say timedelta64 in self (IOW it + # has other values than NaT) but types datetime64 in other (where + # its all NaT) but these are equivalent + if len(self_values) == 0 and len(other_values) == 0: + continue + + if not isinstance(self_values, np.ndarray): + # i.e. ExtensionArray + if not self_values.equals(other_values): + return False + elif not isinstance(other_values, np.ndarray): + # i.e. other is ExtensionArray + if not other_values.equals(self_values): + return False + else: + if not array_equivalent(self_values, other_values): + return False + + return True + + def equal_levels(self, other: MultiIndex) -> bool: + """ + Return True if the levels of both MultiIndex objects are the same + + """ + if self.nlevels != other.nlevels: + return False + + for i in range(self.nlevels): + if not self.levels[i].equals(other.levels[i]): + return False + return True + + # -------------------------------------------------------------------- + # Set Methods + + def _union(self, other, sort) -> MultiIndex: + other, result_names = self._convert_can_do_setop(other) + if other.has_duplicates: + # This is only necessary if other has dupes, + # otherwise difference is faster + result = super()._union(other, sort) + + if isinstance(result, MultiIndex): + return result + return MultiIndex.from_arrays( + zip(*result), sortorder=None, names=result_names + ) + + else: + right_missing = other.difference(self, sort=False) + if len(right_missing): + result = self.append(right_missing) + else: + result = self._get_reconciled_name_object(other) + + if sort is not False: + try: + result = result.sort_values() + except TypeError: + if sort is True: + raise + warnings.warn( + "The values in the array are unorderable. " + "Pass `sort=False` to suppress this warning.", + RuntimeWarning, + stacklevel=find_stack_level(), + ) + return result + + def _is_comparable_dtype(self, dtype: DtypeObj) -> bool: + return is_object_dtype(dtype) + + def _get_reconciled_name_object(self, other) -> MultiIndex: + """ + If the result of a set operation will be self, + return self, unless the names change, in which + case make a shallow copy of self. + """ + names = self._maybe_match_names(other) + if self.names != names: + # error: Cannot determine type of "rename" + return self.rename(names) # type: ignore[has-type] + return self + + def _maybe_match_names(self, other): + """ + Try to find common names to attach to the result of an operation between + a and b. Return a consensus list of names if they match at least partly + or list of None if they have completely different names. + """ + if len(self.names) != len(other.names): + return [None] * len(self.names) + names = [] + for a_name, b_name in zip(self.names, other.names): + if a_name == b_name: + names.append(a_name) + else: + # TODO: what if they both have np.nan for their names? + names.append(None) + return names + + def _wrap_intersection_result(self, other, result) -> MultiIndex: + _, result_names = self._convert_can_do_setop(other) + return result.set_names(result_names) + + def _wrap_difference_result(self, other, result: MultiIndex) -> MultiIndex: + _, result_names = self._convert_can_do_setop(other) + + if len(result) == 0: + return result.remove_unused_levels().set_names(result_names) + else: + return result.set_names(result_names) + + def _convert_can_do_setop(self, other): + result_names = self.names + + if not isinstance(other, Index): + if len(other) == 0: + return self[:0], self.names + else: + msg = "other must be a MultiIndex or a list of tuples" + try: + other = MultiIndex.from_tuples(other, names=self.names) + except (ValueError, TypeError) as err: + # ValueError raised by tuples_to_object_array if we + # have non-object dtype + raise TypeError(msg) from err + else: + result_names = get_unanimous_names(self, other) + + return other, result_names + + # -------------------------------------------------------------------- + + @doc(Index.astype) + def astype(self, dtype, copy: bool = True): + dtype = pandas_dtype(dtype) + if isinstance(dtype, CategoricalDtype): + msg = "> 1 ndim Categorical are not supported at this time" + raise NotImplementedError(msg) + if not is_object_dtype(dtype): + raise TypeError( + "Setting a MultiIndex dtype to anything other than object " + "is not supported" + ) + if copy is True: + return self._view() + return self + + def _validate_fill_value(self, item): + if isinstance(item, MultiIndex): + # GH#43212 + if item.nlevels != self.nlevels: + raise ValueError("Item must have length equal to number of levels.") + return item._values + elif not isinstance(item, tuple): + # Pad the key with empty strings if lower levels of the key + # aren't specified: + item = (item,) + ("",) * (self.nlevels - 1) + elif len(item) != self.nlevels: + raise ValueError("Item must have length equal to number of levels.") + return item + + def putmask(self, mask, value: MultiIndex) -> MultiIndex: + """ + Return a new MultiIndex of the values set with the mask. + + Parameters + ---------- + mask : array like + value : MultiIndex + Must either be the same length as self or length one + + Returns + ------- + MultiIndex + """ + mask, noop = validate_putmask(self, mask) + if noop: + return self.copy() + + if len(mask) == len(value): + subset = value[mask].remove_unused_levels() + else: + subset = value.remove_unused_levels() + + new_levels = [] + new_codes = [] + + for i, (value_level, level, level_codes) in enumerate( + zip(subset.levels, self.levels, self.codes) + ): + new_level = level.union(value_level, sort=False) + value_codes = new_level.get_indexer_for(subset.get_level_values(i)) + new_code = ensure_int64(level_codes) + new_code[mask] = value_codes + new_levels.append(new_level) + new_codes.append(new_code) + + return MultiIndex( + levels=new_levels, codes=new_codes, names=self.names, verify_integrity=False + ) + + def insert(self, loc: int, item) -> MultiIndex: + """ + Make new MultiIndex inserting new item at location + + Parameters + ---------- + loc : int + item : tuple + Must be same length as number of levels in the MultiIndex + + Returns + ------- + new_index : Index + """ + item = self._validate_fill_value(item) + + new_levels = [] + new_codes = [] + for k, level, level_codes in zip(item, self.levels, self.codes): + if k not in level: + # have to insert into level + # must insert at end otherwise you have to recompute all the + # other codes + lev_loc = len(level) + level = level.insert(lev_loc, k) + else: + lev_loc = level.get_loc(k) + + new_levels.append(level) + new_codes.append(np.insert(ensure_int64(level_codes), loc, lev_loc)) + + return MultiIndex( + levels=new_levels, codes=new_codes, names=self.names, verify_integrity=False + ) + + def delete(self, loc) -> MultiIndex: + """ + Make new index with passed location deleted + + Returns + ------- + new_index : MultiIndex + """ + new_codes = [np.delete(level_codes, loc) for level_codes in self.codes] + return MultiIndex( + levels=self.levels, + codes=new_codes, + names=self.names, + verify_integrity=False, + ) + + @doc(Index.isin) + def isin(self, values, level=None) -> npt.NDArray[np.bool_]: + if isinstance(values, Generator): + values = list(values) + + if level is None: + if len(values) == 0: + return np.zeros((len(self),), dtype=np.bool_) + if not isinstance(values, MultiIndex): + values = MultiIndex.from_tuples(values) + return values.unique().get_indexer_for(self) != -1 + else: + num = self._get_level_number(level) + levs = self.get_level_values(num) + + if levs.size == 0: + return np.zeros(len(levs), dtype=np.bool_) + return levs.isin(values) + + # error: Incompatible types in assignment (expression has type overloaded function, + # base class "Index" defined the type as "Callable[[Index, Any, bool], Any]") + rename = Index.set_names # type: ignore[assignment] + + # --------------------------------------------------------------- + # Arithmetic/Numeric Methods - Disabled + + __add__ = make_invalid_op("__add__") + __radd__ = make_invalid_op("__radd__") + __iadd__ = make_invalid_op("__iadd__") + __sub__ = make_invalid_op("__sub__") + __rsub__ = make_invalid_op("__rsub__") + __isub__ = make_invalid_op("__isub__") + __pow__ = make_invalid_op("__pow__") + __rpow__ = make_invalid_op("__rpow__") + __mul__ = make_invalid_op("__mul__") + __rmul__ = make_invalid_op("__rmul__") + __floordiv__ = make_invalid_op("__floordiv__") + __rfloordiv__ = make_invalid_op("__rfloordiv__") + __truediv__ = make_invalid_op("__truediv__") + __rtruediv__ = make_invalid_op("__rtruediv__") + __mod__ = make_invalid_op("__mod__") + __rmod__ = make_invalid_op("__rmod__") + __divmod__ = make_invalid_op("__divmod__") + __rdivmod__ = make_invalid_op("__rdivmod__") + # Unary methods disabled + __neg__ = make_invalid_op("__neg__") + __pos__ = make_invalid_op("__pos__") + __abs__ = make_invalid_op("__abs__") + __invert__ = make_invalid_op("__invert__") + + +def _lexsort_depth(codes: list[np.ndarray], nlevels: int) -> int: + """Count depth (up to a maximum of `nlevels`) with which codes are lexsorted.""" + int64_codes = [ensure_int64(level_codes) for level_codes in codes] + for k in range(nlevels, 0, -1): + if libalgos.is_lexsorted(int64_codes[:k]): + return k + return 0 + + +def sparsify_labels(label_list, start: int = 0, sentinel: object = ""): + pivoted = list(zip(*label_list)) + k = len(label_list) + + result = pivoted[: start + 1] + prev = pivoted[start] + + for cur in pivoted[start + 1 :]: + sparse_cur = [] + + for i, (p, t) in enumerate(zip(prev, cur)): + if i == k - 1: + sparse_cur.append(t) + # error: Argument 1 to "append" of "list" has incompatible + # type "list[Any]"; expected "tuple[Any, ...]" + result.append(sparse_cur) # type: ignore[arg-type] + break + + if p == t: + sparse_cur.append(sentinel) + else: + sparse_cur.extend(cur[i:]) + # error: Argument 1 to "append" of "list" has incompatible + # type "list[Any]"; expected "tuple[Any, ...]" + result.append(sparse_cur) # type: ignore[arg-type] + break + + prev = cur + + return list(zip(*result)) + + +def _get_na_rep(dtype: DtypeObj) -> str: + if isinstance(dtype, ExtensionDtype): + return f"{dtype.na_value}" + else: + dtype_type = dtype.type + + return {np.datetime64: "NaT", np.timedelta64: "NaT"}.get(dtype_type, "NaN") + + +def maybe_droplevels(index: Index, key) -> Index: + """ + Attempt to drop level or levels from the given index. + + Parameters + ---------- + index: Index + key : scalar or tuple + + Returns + ------- + Index + """ + # drop levels + original_index = index + if isinstance(key, tuple): + # Caller is responsible for ensuring the key is not an entry in the first + # level of the MultiIndex. + for _ in key: + try: + index = index._drop_level_numbers([0]) + except ValueError: + # we have dropped too much, so back out + return original_index + else: + try: + index = index._drop_level_numbers([0]) + except ValueError: + pass + + return index + + +def _coerce_indexer_frozen(array_like, categories, copy: bool = False) -> np.ndarray: + """ + Coerce the array-like indexer to the smallest integer dtype that can encode all + of the given categories. + + Parameters + ---------- + array_like : array-like + categories : array-like + copy : bool + + Returns + ------- + np.ndarray + Non-writeable. + """ + array_like = coerce_indexer_dtype(array_like, categories) + if copy: + array_like = array_like.copy() + array_like.flags.writeable = False + return array_like + + +def _require_listlike(level, arr, arrname: str): + """ + Ensure that level is either None or listlike, and arr is list-of-listlike. + """ + if level is not None and not is_list_like(level): + if not is_list_like(arr): + raise TypeError(f"{arrname} must be list-like") + if len(arr) > 0 and is_list_like(arr[0]): + raise TypeError(f"{arrname} must be list-like") + level = [level] + arr = [arr] + elif level is None or is_list_like(level): + if not is_list_like(arr) or not is_list_like(arr[0]): + raise TypeError(f"{arrname} must be list of lists-like") + return level, arr diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/indexes/period.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/indexes/period.py new file mode 100644 index 0000000000000000000000000000000000000000..b2f1933800fd383df9dc52a211b54190985fc32e --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/indexes/period.py @@ -0,0 +1,614 @@ +from __future__ import annotations + +from datetime import ( + datetime, + timedelta, +) +from typing import TYPE_CHECKING +import warnings + +import numpy as np + +from pandas._libs import index as libindex +from pandas._libs.tslibs import ( + BaseOffset, + NaT, + Period, + Resolution, + Tick, +) +from pandas._libs.tslibs.dtypes import OFFSET_TO_PERIOD_FREQSTR +from pandas.util._decorators import ( + cache_readonly, + doc, +) +from pandas.util._exceptions import find_stack_level + +from pandas.core.dtypes.common import is_integer +from pandas.core.dtypes.dtypes import PeriodDtype +from pandas.core.dtypes.generic import ABCSeries +from pandas.core.dtypes.missing import is_valid_na_for_dtype + +from pandas.core.arrays.period import ( + PeriodArray, + period_array, + raise_on_incompatible, + validate_dtype_freq, +) +import pandas.core.common as com +import pandas.core.indexes.base as ibase +from pandas.core.indexes.base import maybe_extract_name +from pandas.core.indexes.datetimelike import DatetimeIndexOpsMixin +from pandas.core.indexes.datetimes import ( + DatetimeIndex, + Index, +) +from pandas.core.indexes.extension import inherit_names + +if TYPE_CHECKING: + from collections.abc import Hashable + + from pandas._typing import ( + Dtype, + DtypeObj, + Self, + npt, + ) + + +_index_doc_kwargs = dict(ibase._index_doc_kwargs) +_index_doc_kwargs.update({"target_klass": "PeriodIndex or list of Periods"}) +_shared_doc_kwargs = { + "klass": "PeriodArray", +} + +# --- Period index sketch + + +def _new_PeriodIndex(cls, **d): + # GH13277 for unpickling + values = d.pop("data") + if values.dtype == "int64": + freq = d.pop("freq", None) + dtype = PeriodDtype(freq) + values = PeriodArray(values, dtype=dtype) + return cls._simple_new(values, **d) + else: + return cls(values, **d) + + +@inherit_names( + ["strftime", "start_time", "end_time"] + PeriodArray._field_ops, + PeriodArray, + wrap=True, +) +@inherit_names(["is_leap_year"], PeriodArray) +class PeriodIndex(DatetimeIndexOpsMixin): + """ + Immutable ndarray holding ordinal values indicating regular periods in time. + + Index keys are boxed to Period objects which carries the metadata (eg, + frequency information). + + Parameters + ---------- + data : array-like (1d int np.ndarray or PeriodArray), optional + Optional period-like data to construct index with. + copy : bool + Make a copy of input ndarray. + freq : str or period object, optional + One of pandas period strings or corresponding objects. + year : int, array, or Series, default None + + .. deprecated:: 2.2.0 + Use PeriodIndex.from_fields instead. + month : int, array, or Series, default None + + .. deprecated:: 2.2.0 + Use PeriodIndex.from_fields instead. + quarter : int, array, or Series, default None + + .. deprecated:: 2.2.0 + Use PeriodIndex.from_fields instead. + day : int, array, or Series, default None + + .. deprecated:: 2.2.0 + Use PeriodIndex.from_fields instead. + hour : int, array, or Series, default None + + .. deprecated:: 2.2.0 + Use PeriodIndex.from_fields instead. + minute : int, array, or Series, default None + + .. deprecated:: 2.2.0 + Use PeriodIndex.from_fields instead. + second : int, array, or Series, default None + + .. deprecated:: 2.2.0 + Use PeriodIndex.from_fields instead. + dtype : str or PeriodDtype, default None + + Attributes + ---------- + day + dayofweek + day_of_week + dayofyear + day_of_year + days_in_month + daysinmonth + end_time + freq + freqstr + hour + is_leap_year + minute + month + quarter + qyear + second + start_time + week + weekday + weekofyear + year + + Methods + ------- + asfreq + strftime + to_timestamp + from_fields + from_ordinals + + See Also + -------- + Index : The base pandas Index type. + Period : Represents a period of time. + DatetimeIndex : Index with datetime64 data. + TimedeltaIndex : Index of timedelta64 data. + period_range : Create a fixed-frequency PeriodIndex. + + Examples + -------- + >>> idx = pd.PeriodIndex.from_fields(year=[2000, 2002], quarter=[1, 3]) + >>> idx + PeriodIndex(['2000Q1', '2002Q3'], dtype='period[Q-DEC]') + """ + + _typ = "periodindex" + + _data: PeriodArray + freq: BaseOffset + dtype: PeriodDtype + + _data_cls = PeriodArray + _supports_partial_string_indexing = True + + @property + def _engine_type(self) -> type[libindex.PeriodEngine]: + return libindex.PeriodEngine + + @cache_readonly + def _resolution_obj(self) -> Resolution: + # for compat with DatetimeIndex + return self.dtype._resolution_obj + + # -------------------------------------------------------------------- + # methods that dispatch to array and wrap result in Index + # These are defined here instead of via inherit_names for mypy + + @doc( + PeriodArray.asfreq, + other="pandas.arrays.PeriodArray", + other_name="PeriodArray", + **_shared_doc_kwargs, + ) + def asfreq(self, freq=None, how: str = "E") -> Self: + arr = self._data.asfreq(freq, how) + return type(self)._simple_new(arr, name=self.name) + + @doc(PeriodArray.to_timestamp) + def to_timestamp(self, freq=None, how: str = "start") -> DatetimeIndex: + arr = self._data.to_timestamp(freq, how) + return DatetimeIndex._simple_new(arr, name=self.name) + + @property + @doc(PeriodArray.hour.fget) + def hour(self) -> Index: + return Index(self._data.hour, name=self.name) + + @property + @doc(PeriodArray.minute.fget) + def minute(self) -> Index: + return Index(self._data.minute, name=self.name) + + @property + @doc(PeriodArray.second.fget) + def second(self) -> Index: + return Index(self._data.second, name=self.name) + + # ------------------------------------------------------------------------ + # Index Constructors + + def __new__( + cls, + data=None, + ordinal=None, + freq=None, + dtype: Dtype | None = None, + copy: bool = False, + name: Hashable | None = None, + **fields, + ) -> Self: + valid_field_set = { + "year", + "month", + "day", + "quarter", + "hour", + "minute", + "second", + } + + refs = None + if not copy and isinstance(data, (Index, ABCSeries)): + refs = data._references + + if not set(fields).issubset(valid_field_set): + argument = next(iter(set(fields) - valid_field_set)) + raise TypeError(f"__new__() got an unexpected keyword argument {argument}") + elif len(fields): + # GH#55960 + warnings.warn( + "Constructing PeriodIndex from fields is deprecated. Use " + "PeriodIndex.from_fields instead.", + FutureWarning, + stacklevel=find_stack_level(), + ) + + if ordinal is not None: + # GH#55960 + warnings.warn( + "The 'ordinal' keyword in PeriodIndex is deprecated and will " + "be removed in a future version. Use PeriodIndex.from_ordinals " + "instead.", + FutureWarning, + stacklevel=find_stack_level(), + ) + + name = maybe_extract_name(name, data, cls) + + if data is None and ordinal is None: + # range-based. + if not fields: + # test_pickle_compat_construction + cls._raise_scalar_data_error(None) + data = cls.from_fields(**fields, freq=freq)._data + copy = False + + elif fields: + if data is not None: + raise ValueError("Cannot pass both data and fields") + raise ValueError("Cannot pass both ordinal and fields") + + else: + freq = validate_dtype_freq(dtype, freq) + + # PeriodIndex allow PeriodIndex(period_index, freq=different) + # Let's not encourage that kind of behavior in PeriodArray. + + if freq and isinstance(data, cls) and data.freq != freq: + # TODO: We can do some of these with no-copy / coercion? + # e.g. D -> 2D seems to be OK + data = data.asfreq(freq) + + if data is None and ordinal is not None: + ordinal = np.asarray(ordinal, dtype=np.int64) + dtype = PeriodDtype(freq) + data = PeriodArray(ordinal, dtype=dtype) + elif data is not None and ordinal is not None: + raise ValueError("Cannot pass both data and ordinal") + else: + # don't pass copy here, since we copy later. + data = period_array(data=data, freq=freq) + + if copy: + data = data.copy() + + return cls._simple_new(data, name=name, refs=refs) + + @classmethod + def from_fields( + cls, + *, + year=None, + quarter=None, + month=None, + day=None, + hour=None, + minute=None, + second=None, + freq=None, + ) -> Self: + fields = { + "year": year, + "quarter": quarter, + "month": month, + "day": day, + "hour": hour, + "minute": minute, + "second": second, + } + fields = {key: value for key, value in fields.items() if value is not None} + arr = PeriodArray._from_fields(fields=fields, freq=freq) + return cls._simple_new(arr) + + @classmethod + def from_ordinals(cls, ordinals, *, freq, name=None) -> Self: + ordinals = np.asarray(ordinals, dtype=np.int64) + dtype = PeriodDtype(freq) + data = PeriodArray._simple_new(ordinals, dtype=dtype) + return cls._simple_new(data, name=name) + + # ------------------------------------------------------------------------ + # Data + + @property + def values(self) -> npt.NDArray[np.object_]: + return np.asarray(self, dtype=object) + + def _maybe_convert_timedelta(self, other) -> int | npt.NDArray[np.int64]: + """ + Convert timedelta-like input to an integer multiple of self.freq + + Parameters + ---------- + other : timedelta, np.timedelta64, DateOffset, int, np.ndarray + + Returns + ------- + converted : int, np.ndarray[int64] + + Raises + ------ + IncompatibleFrequency : if the input cannot be written as a multiple + of self.freq. Note IncompatibleFrequency subclasses ValueError. + """ + if isinstance(other, (timedelta, np.timedelta64, Tick, np.ndarray)): + if isinstance(self.freq, Tick): + # _check_timedeltalike_freq_compat will raise if incompatible + delta = self._data._check_timedeltalike_freq_compat(other) + return delta + elif isinstance(other, BaseOffset): + if other.base == self.freq.base: + return other.n + + raise raise_on_incompatible(self, other) + elif is_integer(other): + assert isinstance(other, int) + return other + + # raise when input doesn't have freq + raise raise_on_incompatible(self, None) + + def _is_comparable_dtype(self, dtype: DtypeObj) -> bool: + """ + Can we compare values of the given dtype to our own? + """ + return self.dtype == dtype + + # ------------------------------------------------------------------------ + # Index Methods + + def asof_locs(self, where: Index, mask: npt.NDArray[np.bool_]) -> np.ndarray: + """ + where : array of timestamps + mask : np.ndarray[bool] + Array of booleans where data is not NA. + """ + if isinstance(where, DatetimeIndex): + where = PeriodIndex(where._values, freq=self.freq) + elif not isinstance(where, PeriodIndex): + raise TypeError("asof_locs `where` must be DatetimeIndex or PeriodIndex") + + return super().asof_locs(where, mask) + + @property + def is_full(self) -> bool: + """ + Returns True if this PeriodIndex is range-like in that all Periods + between start and end are present, in order. + """ + if len(self) == 0: + return True + if not self.is_monotonic_increasing: + raise ValueError("Index is not monotonic") + values = self.asi8 + return bool(((values[1:] - values[:-1]) < 2).all()) + + @property + def inferred_type(self) -> str: + # b/c data is represented as ints make sure we can't have ambiguous + # indexing + return "period" + + # ------------------------------------------------------------------------ + # Indexing Methods + + def _convert_tolerance(self, tolerance, target): + # Returned tolerance must be in dtype/units so that + # `|self._get_engine_target() - target._engine_target()| <= tolerance` + # is meaningful. Since PeriodIndex returns int64 for engine_target, + # we may need to convert timedelta64 tolerance to int64. + tolerance = super()._convert_tolerance(tolerance, target) + + if self.dtype == target.dtype: + # convert tolerance to i8 + tolerance = self._maybe_convert_timedelta(tolerance) + + return tolerance + + def get_loc(self, key): + """ + Get integer location for requested label. + + Parameters + ---------- + key : Period, NaT, str, or datetime + String or datetime key must be parsable as Period. + + Returns + ------- + loc : int or ndarray[int64] + + Raises + ------ + KeyError + Key is not present in the index. + TypeError + If key is listlike or otherwise not hashable. + """ + orig_key = key + + self._check_indexing_error(key) + + if is_valid_na_for_dtype(key, self.dtype): + key = NaT + + elif isinstance(key, str): + try: + parsed, reso = self._parse_with_reso(key) + except ValueError as err: + # A string with invalid format + raise KeyError(f"Cannot interpret '{key}' as period") from err + + if self._can_partial_date_slice(reso): + try: + return self._partial_date_slice(reso, parsed) + except KeyError as err: + raise KeyError(key) from err + + if reso == self._resolution_obj: + # the reso < self._resolution_obj case goes + # through _get_string_slice + key = self._cast_partial_indexing_scalar(parsed) + else: + raise KeyError(key) + + elif isinstance(key, Period): + self._disallow_mismatched_indexing(key) + + elif isinstance(key, datetime): + key = self._cast_partial_indexing_scalar(key) + + else: + # in particular integer, which Period constructor would cast to string + raise KeyError(key) + + try: + return Index.get_loc(self, key) + except KeyError as err: + raise KeyError(orig_key) from err + + def _disallow_mismatched_indexing(self, key: Period) -> None: + if key._dtype != self.dtype: + raise KeyError(key) + + def _cast_partial_indexing_scalar(self, label: datetime) -> Period: + try: + period = Period(label, freq=self.freq) + except ValueError as err: + # we cannot construct the Period + raise KeyError(label) from err + return period + + @doc(DatetimeIndexOpsMixin._maybe_cast_slice_bound) + def _maybe_cast_slice_bound(self, label, side: str): + if isinstance(label, datetime): + label = self._cast_partial_indexing_scalar(label) + + return super()._maybe_cast_slice_bound(label, side) + + def _parsed_string_to_bounds(self, reso: Resolution, parsed: datetime): + freq = OFFSET_TO_PERIOD_FREQSTR.get(reso.attr_abbrev, reso.attr_abbrev) + iv = Period(parsed, freq=freq) + return (iv.asfreq(self.freq, how="start"), iv.asfreq(self.freq, how="end")) + + @doc(DatetimeIndexOpsMixin.shift) + def shift(self, periods: int = 1, freq=None) -> Self: + if freq is not None: + raise TypeError( + f"`freq` argument is not supported for {type(self).__name__}.shift" + ) + return self + periods + + +def period_range( + start=None, + end=None, + periods: int | None = None, + freq=None, + name: Hashable | None = None, +) -> PeriodIndex: + """ + Return a fixed frequency PeriodIndex. + + The day (calendar) is the default frequency. + + Parameters + ---------- + start : str, datetime, date, pandas.Timestamp, or period-like, default None + Left bound for generating periods. + end : str, datetime, date, pandas.Timestamp, or period-like, default None + Right bound for generating periods. + periods : int, default None + Number of periods to generate. + freq : str or DateOffset, optional + Frequency alias. By default the freq is taken from `start` or `end` + if those are Period objects. Otherwise, the default is ``"D"`` for + daily frequency. + name : str, default None + Name of the resulting PeriodIndex. + + Returns + ------- + PeriodIndex + + Notes + ----- + Of the three parameters: ``start``, ``end``, and ``periods``, exactly two + must be specified. + + To learn more about the frequency strings, please see `this link + `__. + + Examples + -------- + >>> pd.period_range(start='2017-01-01', end='2018-01-01', freq='M') + PeriodIndex(['2017-01', '2017-02', '2017-03', '2017-04', '2017-05', '2017-06', + '2017-07', '2017-08', '2017-09', '2017-10', '2017-11', '2017-12', + '2018-01'], + dtype='period[M]') + + If ``start`` or ``end`` are ``Period`` objects, they will be used as anchor + endpoints for a ``PeriodIndex`` with frequency matching that of the + ``period_range`` constructor. + + >>> pd.period_range(start=pd.Period('2017Q1', freq='Q'), + ... end=pd.Period('2017Q2', freq='Q'), freq='M') + PeriodIndex(['2017-03', '2017-04', '2017-05', '2017-06'], + dtype='period[M]') + """ + if com.count_not_none(start, end, periods) != 2: + raise ValueError( + "Of the three parameters: start, end, and periods, " + "exactly two must be specified" + ) + if freq is None and (not isinstance(start, Period) and not isinstance(end, Period)): + freq = "D" + + data, freq = PeriodArray._generate_range(start, end, periods, freq) + dtype = PeriodDtype(freq) + data = PeriodArray(data, dtype=dtype) + return PeriodIndex(data, name=name) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/indexes/range.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/indexes/range.py new file mode 100644 index 0000000000000000000000000000000000000000..62afcf8badb50d95f2cc006bf8d89e79f6fe8615 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/indexes/range.py @@ -0,0 +1,1187 @@ +from __future__ import annotations + +from collections.abc import ( + Hashable, + Iterator, +) +from datetime import timedelta +import operator +from sys import getsizeof +from typing import ( + TYPE_CHECKING, + Any, + Callable, + Literal, + cast, + overload, +) + +import numpy as np + +from pandas._libs import ( + index as libindex, + lib, +) +from pandas._libs.algos import unique_deltas +from pandas._libs.lib import no_default +from pandas.compat.numpy import function as nv +from pandas.util._decorators import ( + cache_readonly, + deprecate_nonkeyword_arguments, + doc, +) + +from pandas.core.dtypes.common import ( + ensure_platform_int, + ensure_python_int, + is_float, + is_integer, + is_scalar, + is_signed_integer_dtype, +) +from pandas.core.dtypes.generic import ABCTimedeltaIndex + +from pandas.core import ops +import pandas.core.common as com +from pandas.core.construction import extract_array +import pandas.core.indexes.base as ibase +from pandas.core.indexes.base import ( + Index, + maybe_extract_name, +) +from pandas.core.ops.common import unpack_zerodim_and_defer + +if TYPE_CHECKING: + from pandas._typing import ( + Axis, + Dtype, + NaPosition, + Self, + npt, + ) +_empty_range = range(0) +_dtype_int64 = np.dtype(np.int64) + + +class RangeIndex(Index): + """ + Immutable Index implementing a monotonic integer range. + + RangeIndex is a memory-saving special case of an Index limited to representing + monotonic ranges with a 64-bit dtype. Using RangeIndex may in some instances + improve computing speed. + + This is the default index type used + by DataFrame and Series when no explicit index is provided by the user. + + Parameters + ---------- + start : int (default: 0), range, or other RangeIndex instance + If int and "stop" is not given, interpreted as "stop" instead. + stop : int (default: 0) + step : int (default: 1) + dtype : np.int64 + Unused, accepted for homogeneity with other index types. + copy : bool, default False + Unused, accepted for homogeneity with other index types. + name : object, optional + Name to be stored in the index. + + Attributes + ---------- + start + stop + step + + Methods + ------- + from_range + + See Also + -------- + Index : The base pandas Index type. + + Examples + -------- + >>> list(pd.RangeIndex(5)) + [0, 1, 2, 3, 4] + + >>> list(pd.RangeIndex(-2, 4)) + [-2, -1, 0, 1, 2, 3] + + >>> list(pd.RangeIndex(0, 10, 2)) + [0, 2, 4, 6, 8] + + >>> list(pd.RangeIndex(2, -10, -3)) + [2, -1, -4, -7] + + >>> list(pd.RangeIndex(0)) + [] + + >>> list(pd.RangeIndex(1, 0)) + [] + """ + + _typ = "rangeindex" + _dtype_validation_metadata = (is_signed_integer_dtype, "signed integer") + _range: range + _values: np.ndarray + + @property + def _engine_type(self) -> type[libindex.Int64Engine]: + return libindex.Int64Engine + + # -------------------------------------------------------------------- + # Constructors + + def __new__( + cls, + start=None, + stop=None, + step=None, + dtype: Dtype | None = None, + copy: bool = False, + name: Hashable | None = None, + ) -> Self: + cls._validate_dtype(dtype) + name = maybe_extract_name(name, start, cls) + + # RangeIndex + if isinstance(start, cls): + return start.copy(name=name) + elif isinstance(start, range): + return cls._simple_new(start, name=name) + + # validate the arguments + if com.all_none(start, stop, step): + raise TypeError("RangeIndex(...) must be called with integers") + + start = ensure_python_int(start) if start is not None else 0 + + if stop is None: + start, stop = 0, start + else: + stop = ensure_python_int(stop) + + step = ensure_python_int(step) if step is not None else 1 + if step == 0: + raise ValueError("Step must not be zero") + + rng = range(start, stop, step) + return cls._simple_new(rng, name=name) + + @classmethod + def from_range(cls, data: range, name=None, dtype: Dtype | None = None) -> Self: + """ + Create :class:`pandas.RangeIndex` from a ``range`` object. + + Returns + ------- + RangeIndex + + Examples + -------- + >>> pd.RangeIndex.from_range(range(5)) + RangeIndex(start=0, stop=5, step=1) + + >>> pd.RangeIndex.from_range(range(2, -10, -3)) + RangeIndex(start=2, stop=-10, step=-3) + """ + if not isinstance(data, range): + raise TypeError( + f"{cls.__name__}(...) must be called with object coercible to a " + f"range, {repr(data)} was passed" + ) + cls._validate_dtype(dtype) + return cls._simple_new(data, name=name) + + # error: Argument 1 of "_simple_new" is incompatible with supertype "Index"; + # supertype defines the argument type as + # "Union[ExtensionArray, ndarray[Any, Any]]" [override] + @classmethod + def _simple_new( # type: ignore[override] + cls, values: range, name: Hashable | None = None + ) -> Self: + result = object.__new__(cls) + + assert isinstance(values, range) + + result._range = values + result._name = name + result._cache = {} + result._reset_identity() + result._references = None + return result + + @classmethod + def _validate_dtype(cls, dtype: Dtype | None) -> None: + if dtype is None: + return + + validation_func, expected = cls._dtype_validation_metadata + if not validation_func(dtype): + raise ValueError( + f"Incorrect `dtype` passed: expected {expected}, received {dtype}" + ) + + # -------------------------------------------------------------------- + + # error: Return type "Type[Index]" of "_constructor" incompatible with return + # type "Type[RangeIndex]" in supertype "Index" + @cache_readonly + def _constructor(self) -> type[Index]: # type: ignore[override] + """return the class to use for construction""" + return Index + + # error: Signature of "_data" incompatible with supertype "Index" + @cache_readonly + def _data(self) -> np.ndarray: # type: ignore[override] + """ + An int array that for performance reasons is created only when needed. + + The constructed array is saved in ``_cache``. + """ + return np.arange(self.start, self.stop, self.step, dtype=np.int64) + + def _get_data_as_items(self) -> list[tuple[str, int]]: + """return a list of tuples of start, stop, step""" + rng = self._range + return [("start", rng.start), ("stop", rng.stop), ("step", rng.step)] + + def __reduce__(self): + d = {"name": self._name} + d.update(dict(self._get_data_as_items())) + return ibase._new_Index, (type(self), d), None + + # -------------------------------------------------------------------- + # Rendering Methods + + def _format_attrs(self): + """ + Return a list of tuples of the (attr, formatted_value) + """ + attrs = cast("list[tuple[str, str | int]]", self._get_data_as_items()) + if self._name is not None: + attrs.append(("name", ibase.default_pprint(self._name))) + return attrs + + def _format_with_header(self, *, header: list[str], na_rep: str) -> list[str]: + # Equivalent to Index implementation, but faster + if not len(self._range): + return header + first_val_str = str(self._range[0]) + last_val_str = str(self._range[-1]) + max_length = max(len(first_val_str), len(last_val_str)) + + return header + [f"{x:<{max_length}}" for x in self._range] + + # -------------------------------------------------------------------- + + @property + def start(self) -> int: + """ + The value of the `start` parameter (``0`` if this was not supplied). + + Examples + -------- + >>> idx = pd.RangeIndex(5) + >>> idx.start + 0 + + >>> idx = pd.RangeIndex(2, -10, -3) + >>> idx.start + 2 + """ + # GH 25710 + return self._range.start + + @property + def stop(self) -> int: + """ + The value of the `stop` parameter. + + Examples + -------- + >>> idx = pd.RangeIndex(5) + >>> idx.stop + 5 + + >>> idx = pd.RangeIndex(2, -10, -3) + >>> idx.stop + -10 + """ + return self._range.stop + + @property + def step(self) -> int: + """ + The value of the `step` parameter (``1`` if this was not supplied). + + Examples + -------- + >>> idx = pd.RangeIndex(5) + >>> idx.step + 1 + + >>> idx = pd.RangeIndex(2, -10, -3) + >>> idx.step + -3 + + Even if :class:`pandas.RangeIndex` is empty, ``step`` is still ``1`` if + not supplied. + + >>> idx = pd.RangeIndex(1, 0) + >>> idx.step + 1 + """ + # GH 25710 + return self._range.step + + @cache_readonly + def nbytes(self) -> int: + """ + Return the number of bytes in the underlying data. + """ + rng = self._range + return getsizeof(rng) + sum( + getsizeof(getattr(rng, attr_name)) + for attr_name in ["start", "stop", "step"] + ) + + def memory_usage(self, deep: bool = False) -> int: + """ + Memory usage of my values + + Parameters + ---------- + deep : bool + Introspect the data deeply, interrogate + `object` dtypes for system-level memory consumption + + Returns + ------- + bytes used + + Notes + ----- + Memory usage does not include memory consumed by elements that + are not components of the array if deep=False + + See Also + -------- + numpy.ndarray.nbytes + """ + return self.nbytes + + @property + def dtype(self) -> np.dtype: + return _dtype_int64 + + @property + def is_unique(self) -> bool: + """return if the index has unique values""" + return True + + @cache_readonly + def is_monotonic_increasing(self) -> bool: + return self._range.step > 0 or len(self) <= 1 + + @cache_readonly + def is_monotonic_decreasing(self) -> bool: + return self._range.step < 0 or len(self) <= 1 + + def __contains__(self, key: Any) -> bool: + hash(key) + try: + key = ensure_python_int(key) + except TypeError: + return False + return key in self._range + + @property + def inferred_type(self) -> str: + return "integer" + + # -------------------------------------------------------------------- + # Indexing Methods + + @doc(Index.get_loc) + def get_loc(self, key) -> int: + if is_integer(key) or (is_float(key) and key.is_integer()): + new_key = int(key) + try: + return self._range.index(new_key) + except ValueError as err: + raise KeyError(key) from err + if isinstance(key, Hashable): + raise KeyError(key) + self._check_indexing_error(key) + raise KeyError(key) + + def _get_indexer( + self, + target: Index, + method: str | None = None, + limit: int | None = None, + tolerance=None, + ) -> npt.NDArray[np.intp]: + if com.any_not_none(method, tolerance, limit): + return super()._get_indexer( + target, method=method, tolerance=tolerance, limit=limit + ) + + if self.step > 0: + start, stop, step = self.start, self.stop, self.step + else: + # GH 28678: work on reversed range for simplicity + reverse = self._range[::-1] + start, stop, step = reverse.start, reverse.stop, reverse.step + + target_array = np.asarray(target) + locs = target_array - start + valid = (locs % step == 0) & (locs >= 0) & (target_array < stop) + locs[~valid] = -1 + locs[valid] = locs[valid] / step + + if step != self.step: + # We reversed this range: transform to original locs + locs[valid] = len(self) - 1 - locs[valid] + return ensure_platform_int(locs) + + @cache_readonly + def _should_fallback_to_positional(self) -> bool: + """ + Should an integer key be treated as positional? + """ + return False + + # -------------------------------------------------------------------- + + def tolist(self) -> list[int]: + return list(self._range) + + @doc(Index.__iter__) + def __iter__(self) -> Iterator[int]: + yield from self._range + + @doc(Index._shallow_copy) + def _shallow_copy(self, values, name: Hashable = no_default): + name = self._name if name is no_default else name + + if values.dtype.kind == "f": + return Index(values, name=name, dtype=np.float64) + # GH 46675 & 43885: If values is equally spaced, return a + # more memory-compact RangeIndex instead of Index with 64-bit dtype + unique_diffs = unique_deltas(values) + if len(unique_diffs) == 1 and unique_diffs[0] != 0: + diff = unique_diffs[0] + new_range = range(values[0], values[-1] + diff, diff) + return type(self)._simple_new(new_range, name=name) + else: + return self._constructor._simple_new(values, name=name) + + def _view(self) -> Self: + result = type(self)._simple_new(self._range, name=self._name) + result._cache = self._cache + return result + + @doc(Index.copy) + def copy(self, name: Hashable | None = None, deep: bool = False) -> Self: + name = self._validate_names(name=name, deep=deep)[0] + new_index = self._rename(name=name) + return new_index + + def _minmax(self, meth: str): + no_steps = len(self) - 1 + if no_steps == -1: + return np.nan + elif (meth == "min" and self.step > 0) or (meth == "max" and self.step < 0): + return self.start + + return self.start + self.step * no_steps + + def min(self, axis=None, skipna: bool = True, *args, **kwargs) -> int: + """The minimum value of the RangeIndex""" + nv.validate_minmax_axis(axis) + nv.validate_min(args, kwargs) + return self._minmax("min") + + def max(self, axis=None, skipna: bool = True, *args, **kwargs) -> int: + """The maximum value of the RangeIndex""" + nv.validate_minmax_axis(axis) + nv.validate_max(args, kwargs) + return self._minmax("max") + + def argsort(self, *args, **kwargs) -> npt.NDArray[np.intp]: + """ + Returns the indices that would sort the index and its + underlying data. + + Returns + ------- + np.ndarray[np.intp] + + See Also + -------- + numpy.ndarray.argsort + """ + ascending = kwargs.pop("ascending", True) # EA compat + kwargs.pop("kind", None) # e.g. "mergesort" is irrelevant + nv.validate_argsort(args, kwargs) + + if self._range.step > 0: + result = np.arange(len(self), dtype=np.intp) + else: + result = np.arange(len(self) - 1, -1, -1, dtype=np.intp) + + if not ascending: + result = result[::-1] + return result + + def factorize( + self, + sort: bool = False, + use_na_sentinel: bool = True, + ) -> tuple[npt.NDArray[np.intp], RangeIndex]: + codes = np.arange(len(self), dtype=np.intp) + uniques = self + if sort and self.step < 0: + codes = codes[::-1] + uniques = uniques[::-1] + return codes, uniques + + def equals(self, other: object) -> bool: + """ + Determines if two Index objects contain the same elements. + """ + if isinstance(other, RangeIndex): + return self._range == other._range + return super().equals(other) + + # error: Signature of "sort_values" incompatible with supertype "Index" + @overload # type: ignore[override] + def sort_values( + self, + *, + return_indexer: Literal[False] = ..., + ascending: bool = ..., + na_position: NaPosition = ..., + key: Callable | None = ..., + ) -> Self: + ... + + @overload + def sort_values( + self, + *, + return_indexer: Literal[True], + ascending: bool = ..., + na_position: NaPosition = ..., + key: Callable | None = ..., + ) -> tuple[Self, np.ndarray | RangeIndex]: + ... + + @overload + def sort_values( + self, + *, + return_indexer: bool = ..., + ascending: bool = ..., + na_position: NaPosition = ..., + key: Callable | None = ..., + ) -> Self | tuple[Self, np.ndarray | RangeIndex]: + ... + + @deprecate_nonkeyword_arguments( + version="3.0", allowed_args=["self"], name="sort_values" + ) + def sort_values( + self, + return_indexer: bool = False, + ascending: bool = True, + na_position: NaPosition = "last", + key: Callable | None = None, + ) -> Self | tuple[Self, np.ndarray | RangeIndex]: + if key is not None: + return super().sort_values( + return_indexer=return_indexer, + ascending=ascending, + na_position=na_position, + key=key, + ) + else: + sorted_index = self + inverse_indexer = False + if ascending: + if self.step < 0: + sorted_index = self[::-1] + inverse_indexer = True + else: + if self.step > 0: + sorted_index = self[::-1] + inverse_indexer = True + + if return_indexer: + if inverse_indexer: + rng = range(len(self) - 1, -1, -1) + else: + rng = range(len(self)) + return sorted_index, RangeIndex(rng) + else: + return sorted_index + + # -------------------------------------------------------------------- + # Set Operations + + def _intersection(self, other: Index, sort: bool = False): + # caller is responsible for checking self and other are both non-empty + + if not isinstance(other, RangeIndex): + return super()._intersection(other, sort=sort) + + first = self._range[::-1] if self.step < 0 else self._range + second = other._range[::-1] if other.step < 0 else other._range + + # check whether intervals intersect + # deals with in- and decreasing ranges + int_low = max(first.start, second.start) + int_high = min(first.stop, second.stop) + if int_high <= int_low: + return self._simple_new(_empty_range) + + # Method hint: linear Diophantine equation + # solve intersection problem + # performance hint: for identical step sizes, could use + # cheaper alternative + gcd, s, _ = self._extended_gcd(first.step, second.step) + + # check whether element sets intersect + if (first.start - second.start) % gcd: + return self._simple_new(_empty_range) + + # calculate parameters for the RangeIndex describing the + # intersection disregarding the lower bounds + tmp_start = first.start + (second.start - first.start) * first.step // gcd * s + new_step = first.step * second.step // gcd + new_range = range(tmp_start, int_high, new_step) + new_index = self._simple_new(new_range) + + # adjust index to limiting interval + new_start = new_index._min_fitting_element(int_low) + new_range = range(new_start, new_index.stop, new_index.step) + new_index = self._simple_new(new_range) + + if (self.step < 0 and other.step < 0) is not (new_index.step < 0): + new_index = new_index[::-1] + + if sort is None: + new_index = new_index.sort_values() + + return new_index + + def _min_fitting_element(self, lower_limit: int) -> int: + """Returns the smallest element greater than or equal to the limit""" + no_steps = -(-(lower_limit - self.start) // abs(self.step)) + return self.start + abs(self.step) * no_steps + + def _extended_gcd(self, a: int, b: int) -> tuple[int, int, int]: + """ + Extended Euclidean algorithms to solve Bezout's identity: + a*x + b*y = gcd(x, y) + Finds one particular solution for x, y: s, t + Returns: gcd, s, t + """ + s, old_s = 0, 1 + t, old_t = 1, 0 + r, old_r = b, a + while r: + quotient = old_r // r + old_r, r = r, old_r - quotient * r + old_s, s = s, old_s - quotient * s + old_t, t = t, old_t - quotient * t + return old_r, old_s, old_t + + def _range_in_self(self, other: range) -> bool: + """Check if other range is contained in self""" + # https://stackoverflow.com/a/32481015 + if not other: + return True + if not self._range: + return False + if len(other) > 1 and other.step % self._range.step: + return False + return other.start in self._range and other[-1] in self._range + + def _union(self, other: Index, sort: bool | None): + """ + Form the union of two Index objects and sorts if possible + + Parameters + ---------- + other : Index or array-like + + sort : bool or None, default None + Whether to sort (monotonically increasing) the resulting index. + ``sort=None|True`` returns a ``RangeIndex`` if possible or a sorted + ``Index`` with a int64 dtype if not. + ``sort=False`` can return a ``RangeIndex`` if self is monotonically + increasing and other is fully contained in self. Otherwise, returns + an unsorted ``Index`` with an int64 dtype. + + Returns + ------- + union : Index + """ + if isinstance(other, RangeIndex): + if sort in (None, True) or ( + sort is False and self.step > 0 and self._range_in_self(other._range) + ): + # GH 47557: Can still return a RangeIndex + # if other range in self and sort=False + start_s, step_s = self.start, self.step + end_s = self.start + self.step * (len(self) - 1) + start_o, step_o = other.start, other.step + end_o = other.start + other.step * (len(other) - 1) + if self.step < 0: + start_s, step_s, end_s = end_s, -step_s, start_s + if other.step < 0: + start_o, step_o, end_o = end_o, -step_o, start_o + if len(self) == 1 and len(other) == 1: + step_s = step_o = abs(self.start - other.start) + elif len(self) == 1: + step_s = step_o + elif len(other) == 1: + step_o = step_s + start_r = min(start_s, start_o) + end_r = max(end_s, end_o) + if step_o == step_s: + if ( + (start_s - start_o) % step_s == 0 + and (start_s - end_o) <= step_s + and (start_o - end_s) <= step_s + ): + return type(self)(start_r, end_r + step_s, step_s) + if ( + (step_s % 2 == 0) + and (abs(start_s - start_o) == step_s / 2) + and (abs(end_s - end_o) == step_s / 2) + ): + # e.g. range(0, 10, 2) and range(1, 11, 2) + # but not range(0, 20, 4) and range(1, 21, 4) GH#44019 + return type(self)(start_r, end_r + step_s / 2, step_s / 2) + + elif step_o % step_s == 0: + if ( + (start_o - start_s) % step_s == 0 + and (start_o + step_s >= start_s) + and (end_o - step_s <= end_s) + ): + return type(self)(start_r, end_r + step_s, step_s) + elif step_s % step_o == 0: + if ( + (start_s - start_o) % step_o == 0 + and (start_s + step_o >= start_o) + and (end_s - step_o <= end_o) + ): + return type(self)(start_r, end_r + step_o, step_o) + + return super()._union(other, sort=sort) + + def _difference(self, other, sort=None): + # optimized set operation if we have another RangeIndex + self._validate_sort_keyword(sort) + self._assert_can_do_setop(other) + other, result_name = self._convert_can_do_setop(other) + + if not isinstance(other, RangeIndex): + return super()._difference(other, sort=sort) + + if sort is not False and self.step < 0: + return self[::-1]._difference(other) + + res_name = ops.get_op_result_name(self, other) + + first = self._range[::-1] if self.step < 0 else self._range + overlap = self.intersection(other) + if overlap.step < 0: + overlap = overlap[::-1] + + if len(overlap) == 0: + return self.rename(name=res_name) + if len(overlap) == len(self): + return self[:0].rename(res_name) + + # overlap.step will always be a multiple of self.step (see _intersection) + + if len(overlap) == 1: + if overlap[0] == self[0]: + return self[1:] + + elif overlap[0] == self[-1]: + return self[:-1] + + elif len(self) == 3 and overlap[0] == self[1]: + return self[::2] + + else: + return super()._difference(other, sort=sort) + + elif len(overlap) == 2 and overlap[0] == first[0] and overlap[-1] == first[-1]: + # e.g. range(-8, 20, 7) and range(13, -9, -3) + return self[1:-1] + + if overlap.step == first.step: + if overlap[0] == first.start: + # The difference is everything after the intersection + new_rng = range(overlap[-1] + first.step, first.stop, first.step) + elif overlap[-1] == first[-1]: + # The difference is everything before the intersection + new_rng = range(first.start, overlap[0], first.step) + elif overlap._range == first[1:-1]: + # e.g. range(4) and range(1, 3) + step = len(first) - 1 + new_rng = first[::step] + else: + # The difference is not range-like + # e.g. range(1, 10, 1) and range(3, 7, 1) + return super()._difference(other, sort=sort) + + else: + # We must have len(self) > 1, bc we ruled out above + # len(overlap) == 0 and len(overlap) == len(self) + assert len(self) > 1 + + if overlap.step == first.step * 2: + if overlap[0] == first[0] and overlap[-1] in (first[-1], first[-2]): + # e.g. range(1, 10, 1) and range(1, 10, 2) + new_rng = first[1::2] + + elif overlap[0] == first[1] and overlap[-1] in (first[-1], first[-2]): + # e.g. range(1, 10, 1) and range(2, 10, 2) + new_rng = first[::2] + + else: + # We can get here with e.g. range(20) and range(0, 10, 2) + return super()._difference(other, sort=sort) + + else: + # e.g. range(10) and range(0, 10, 3) + return super()._difference(other, sort=sort) + + new_index = type(self)._simple_new(new_rng, name=res_name) + if first is not self._range: + new_index = new_index[::-1] + + return new_index + + def symmetric_difference( + self, other, result_name: Hashable | None = None, sort=None + ): + if not isinstance(other, RangeIndex) or sort is not None: + return super().symmetric_difference(other, result_name, sort) + + left = self.difference(other) + right = other.difference(self) + result = left.union(right) + + if result_name is not None: + result = result.rename(result_name) + return result + + # -------------------------------------------------------------------- + + # error: Return type "Index" of "delete" incompatible with return type + # "RangeIndex" in supertype "Index" + def delete(self, loc) -> Index: # type: ignore[override] + # In some cases we can retain RangeIndex, see also + # DatetimeTimedeltaMixin._get_delete_Freq + if is_integer(loc): + if loc in (0, -len(self)): + return self[1:] + if loc in (-1, len(self) - 1): + return self[:-1] + if len(self) == 3 and loc in (1, -2): + return self[::2] + + elif lib.is_list_like(loc): + slc = lib.maybe_indices_to_slice(np.asarray(loc, dtype=np.intp), len(self)) + + if isinstance(slc, slice): + # defer to RangeIndex._difference, which is optimized to return + # a RangeIndex whenever possible + other = self[slc] + return self.difference(other, sort=False) + + return super().delete(loc) + + def insert(self, loc: int, item) -> Index: + if len(self) and (is_integer(item) or is_float(item)): + # We can retain RangeIndex is inserting at the beginning or end, + # or right in the middle. + rng = self._range + if loc == 0 and item == self[0] - self.step: + new_rng = range(rng.start - rng.step, rng.stop, rng.step) + return type(self)._simple_new(new_rng, name=self._name) + + elif loc == len(self) and item == self[-1] + self.step: + new_rng = range(rng.start, rng.stop + rng.step, rng.step) + return type(self)._simple_new(new_rng, name=self._name) + + elif len(self) == 2 and item == self[0] + self.step / 2: + # e.g. inserting 1 into [0, 2] + step = int(self.step / 2) + new_rng = range(self.start, self.stop, step) + return type(self)._simple_new(new_rng, name=self._name) + + return super().insert(loc, item) + + def _concat(self, indexes: list[Index], name: Hashable) -> Index: + """ + Overriding parent method for the case of all RangeIndex instances. + + When all members of "indexes" are of type RangeIndex: result will be + RangeIndex if possible, Index with a int64 dtype otherwise. E.g.: + indexes = [RangeIndex(3), RangeIndex(3, 6)] -> RangeIndex(6) + indexes = [RangeIndex(3), RangeIndex(4, 6)] -> Index([0,1,2,4,5], dtype='int64') + """ + if not all(isinstance(x, RangeIndex) for x in indexes): + return super()._concat(indexes, name) + + elif len(indexes) == 1: + return indexes[0] + + rng_indexes = cast(list[RangeIndex], indexes) + + start = step = next_ = None + + # Filter the empty indexes + non_empty_indexes = [obj for obj in rng_indexes if len(obj)] + + for obj in non_empty_indexes: + rng = obj._range + + if start is None: + # This is set by the first non-empty index + start = rng.start + if step is None and len(rng) > 1: + step = rng.step + elif step is None: + # First non-empty index had only one element + if rng.start == start: + values = np.concatenate([x._values for x in rng_indexes]) + result = self._constructor(values) + return result.rename(name) + + step = rng.start - start + + non_consecutive = (step != rng.step and len(rng) > 1) or ( + next_ is not None and rng.start != next_ + ) + if non_consecutive: + result = self._constructor( + np.concatenate([x._values for x in rng_indexes]) + ) + return result.rename(name) + + if step is not None: + next_ = rng[-1] + step + + if non_empty_indexes: + # Get the stop value from "next" or alternatively + # from the last non-empty index + stop = non_empty_indexes[-1].stop if next_ is None else next_ + return RangeIndex(start, stop, step).rename(name) + + # Here all "indexes" had 0 length, i.e. were empty. + # In this case return an empty range index. + return RangeIndex(0, 0).rename(name) + + def __len__(self) -> int: + """ + return the length of the RangeIndex + """ + return len(self._range) + + @property + def size(self) -> int: + return len(self) + + def __getitem__(self, key): + """ + Conserve RangeIndex type for scalar and slice keys. + """ + if isinstance(key, slice): + return self._getitem_slice(key) + elif is_integer(key): + new_key = int(key) + try: + return self._range[new_key] + except IndexError as err: + raise IndexError( + f"index {key} is out of bounds for axis 0 with size {len(self)}" + ) from err + elif is_scalar(key): + raise IndexError( + "only integers, slices (`:`), " + "ellipsis (`...`), numpy.newaxis (`None`) " + "and integer or boolean " + "arrays are valid indices" + ) + return super().__getitem__(key) + + def _getitem_slice(self, slobj: slice) -> Self: + """ + Fastpath for __getitem__ when we know we have a slice. + """ + res = self._range[slobj] + return type(self)._simple_new(res, name=self._name) + + @unpack_zerodim_and_defer("__floordiv__") + def __floordiv__(self, other): + if is_integer(other) and other != 0: + if len(self) == 0 or self.start % other == 0 and self.step % other == 0: + start = self.start // other + step = self.step // other + stop = start + len(self) * step + new_range = range(start, stop, step or 1) + return self._simple_new(new_range, name=self._name) + if len(self) == 1: + start = self.start // other + new_range = range(start, start + 1, 1) + return self._simple_new(new_range, name=self._name) + + return super().__floordiv__(other) + + # -------------------------------------------------------------------- + # Reductions + + def all(self, *args, **kwargs) -> bool: + return 0 not in self._range + + def any(self, *args, **kwargs) -> bool: + return any(self._range) + + # -------------------------------------------------------------------- + + def _cmp_method(self, other, op): + if isinstance(other, RangeIndex) and self._range == other._range: + # Both are immutable so if ._range attr. are equal, shortcut is possible + return super()._cmp_method(self, op) + return super()._cmp_method(other, op) + + def _arith_method(self, other, op): + """ + Parameters + ---------- + other : Any + op : callable that accepts 2 params + perform the binary op + """ + + if isinstance(other, ABCTimedeltaIndex): + # Defer to TimedeltaIndex implementation + return NotImplemented + elif isinstance(other, (timedelta, np.timedelta64)): + # GH#19333 is_integer evaluated True on timedelta64, + # so we need to catch these explicitly + return super()._arith_method(other, op) + elif lib.is_np_dtype(getattr(other, "dtype", None), "m"): + # Must be an np.ndarray; GH#22390 + return super()._arith_method(other, op) + + if op in [ + operator.pow, + ops.rpow, + operator.mod, + ops.rmod, + operator.floordiv, + ops.rfloordiv, + divmod, + ops.rdivmod, + ]: + return super()._arith_method(other, op) + + step: Callable | None = None + if op in [operator.mul, ops.rmul, operator.truediv, ops.rtruediv]: + step = op + + # TODO: if other is a RangeIndex we may have more efficient options + right = extract_array(other, extract_numpy=True, extract_range=True) + left = self + + try: + # apply if we have an override + if step: + with np.errstate(all="ignore"): + rstep = step(left.step, right) + + # we don't have a representable op + # so return a base index + if not is_integer(rstep) or not rstep: + raise ValueError + + # GH#53255 + else: + rstep = -left.step if op == ops.rsub else left.step + + with np.errstate(all="ignore"): + rstart = op(left.start, right) + rstop = op(left.stop, right) + + res_name = ops.get_op_result_name(self, other) + result = type(self)(rstart, rstop, rstep, name=res_name) + + # for compat with numpy / Index with int64 dtype + # even if we can represent as a RangeIndex, return + # as a float64 Index if we have float-like descriptors + if not all(is_integer(x) for x in [rstart, rstop, rstep]): + result = result.astype("float64") + + return result + + except (ValueError, TypeError, ZeroDivisionError): + # test_arithmetic_explicit_conversions + return super()._arith_method(other, op) + + # error: Return type "Index" of "take" incompatible with return type + # "RangeIndex" in supertype "Index" + def take( # type: ignore[override] + self, + indices, + axis: Axis = 0, + allow_fill: bool = True, + fill_value=None, + **kwargs, + ) -> Index: + if kwargs: + nv.validate_take((), kwargs) + if is_scalar(indices): + raise TypeError("Expected indices to be array-like") + indices = ensure_platform_int(indices) + + # raise an exception if allow_fill is True and fill_value is not None + self._maybe_disallow_fill(allow_fill, fill_value, indices) + + if len(indices) == 0: + taken = np.array([], dtype=self.dtype) + else: + ind_max = indices.max() + if ind_max >= len(self): + raise IndexError( + f"index {ind_max} is out of bounds for axis 0 with size {len(self)}" + ) + ind_min = indices.min() + if ind_min < -len(self): + raise IndexError( + f"index {ind_min} is out of bounds for axis 0 with size {len(self)}" + ) + taken = indices.astype(self.dtype, casting="safe") + if ind_min < 0: + taken %= len(self) + if self.step != 1: + taken *= self.step + if self.start != 0: + taken += self.start + + # _constructor so RangeIndex-> Index with an int64 dtype + return self._constructor._simple_new(taken, name=self.name) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/indexes/timedeltas.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/indexes/timedeltas.py new file mode 100644 index 0000000000000000000000000000000000000000..08a265ba4764892fde0bc50670b6706ff788c8bc --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/indexes/timedeltas.py @@ -0,0 +1,356 @@ +""" implement the TimedeltaIndex """ +from __future__ import annotations + +from typing import TYPE_CHECKING +import warnings + +from pandas._libs import ( + index as libindex, + lib, +) +from pandas._libs.tslibs import ( + Resolution, + Timedelta, + to_offset, +) +from pandas._libs.tslibs.timedeltas import disallow_ambiguous_unit +from pandas.util._exceptions import find_stack_level + +from pandas.core.dtypes.common import ( + is_scalar, + pandas_dtype, +) +from pandas.core.dtypes.generic import ABCSeries + +from pandas.core.arrays.timedeltas import TimedeltaArray +import pandas.core.common as com +from pandas.core.indexes.base import ( + Index, + maybe_extract_name, +) +from pandas.core.indexes.datetimelike import DatetimeTimedeltaMixin +from pandas.core.indexes.extension import inherit_names + +if TYPE_CHECKING: + from pandas._typing import DtypeObj + + +@inherit_names( + ["__neg__", "__pos__", "__abs__", "total_seconds", "round", "floor", "ceil"] + + TimedeltaArray._field_ops, + TimedeltaArray, + wrap=True, +) +@inherit_names( + [ + "components", + "to_pytimedelta", + "sum", + "std", + "median", + ], + TimedeltaArray, +) +class TimedeltaIndex(DatetimeTimedeltaMixin): + """ + Immutable Index of timedelta64 data. + + Represented internally as int64, and scalars returned Timedelta objects. + + Parameters + ---------- + data : array-like (1-dimensional), optional + Optional timedelta-like data to construct index with. + unit : {'D', 'h', 'm', 's', 'ms', 'us', 'ns'}, optional + The unit of ``data``. + + .. deprecated:: 2.2.0 + Use ``pd.to_timedelta`` instead. + + freq : str or pandas offset object, optional + One of pandas date offset strings or corresponding objects. The string + ``'infer'`` can be passed in order to set the frequency of the index as + the inferred frequency upon creation. + dtype : numpy.dtype or str, default None + Valid ``numpy`` dtypes are ``timedelta64[ns]``, ``timedelta64[us]``, + ``timedelta64[ms]``, and ``timedelta64[s]``. + copy : bool + Make a copy of input array. + name : object + Name to be stored in the index. + + Attributes + ---------- + days + seconds + microseconds + nanoseconds + components + inferred_freq + + Methods + ------- + to_pytimedelta + to_series + round + floor + ceil + to_frame + mean + + See Also + -------- + Index : The base pandas Index type. + Timedelta : Represents a duration between two dates or times. + DatetimeIndex : Index of datetime64 data. + PeriodIndex : Index of Period data. + timedelta_range : Create a fixed-frequency TimedeltaIndex. + + Notes + ----- + To learn more about the frequency strings, please see `this link + `__. + + Examples + -------- + >>> pd.TimedeltaIndex(['0 days', '1 days', '2 days', '3 days', '4 days']) + TimedeltaIndex(['0 days', '1 days', '2 days', '3 days', '4 days'], + dtype='timedelta64[ns]', freq=None) + + We can also let pandas infer the frequency when possible. + + >>> pd.TimedeltaIndex(np.arange(5) * 24 * 3600 * 1e9, freq='infer') + TimedeltaIndex(['0 days', '1 days', '2 days', '3 days', '4 days'], + dtype='timedelta64[ns]', freq='D') + """ + + _typ = "timedeltaindex" + + _data_cls = TimedeltaArray + + @property + def _engine_type(self) -> type[libindex.TimedeltaEngine]: + return libindex.TimedeltaEngine + + _data: TimedeltaArray + + # Use base class method instead of DatetimeTimedeltaMixin._get_string_slice + _get_string_slice = Index._get_string_slice + + # error: Signature of "_resolution_obj" incompatible with supertype + # "DatetimeIndexOpsMixin" + @property + def _resolution_obj(self) -> Resolution | None: # type: ignore[override] + return self._data._resolution_obj + + # ------------------------------------------------------------------- + # Constructors + + def __new__( + cls, + data=None, + unit=lib.no_default, + freq=lib.no_default, + closed=lib.no_default, + dtype=None, + copy: bool = False, + name=None, + ): + if closed is not lib.no_default: + # GH#52628 + warnings.warn( + f"The 'closed' keyword in {cls.__name__} construction is " + "deprecated and will be removed in a future version.", + FutureWarning, + stacklevel=find_stack_level(), + ) + + if unit is not lib.no_default: + # GH#55499 + warnings.warn( + f"The 'unit' keyword in {cls.__name__} construction is " + "deprecated and will be removed in a future version. " + "Use pd.to_timedelta instead.", + FutureWarning, + stacklevel=find_stack_level(), + ) + else: + unit = None + + name = maybe_extract_name(name, data, cls) + + if is_scalar(data): + cls._raise_scalar_data_error(data) + + disallow_ambiguous_unit(unit) + if dtype is not None: + dtype = pandas_dtype(dtype) + + if ( + isinstance(data, TimedeltaArray) + and freq is lib.no_default + and (dtype is None or dtype == data.dtype) + ): + if copy: + data = data.copy() + return cls._simple_new(data, name=name) + + if ( + isinstance(data, TimedeltaIndex) + and freq is lib.no_default + and name is None + and (dtype is None or dtype == data.dtype) + ): + if copy: + return data.copy() + else: + return data._view() + + # - Cases checked above all return/raise before reaching here - # + + tdarr = TimedeltaArray._from_sequence_not_strict( + data, freq=freq, unit=unit, dtype=dtype, copy=copy + ) + refs = None + if not copy and isinstance(data, (ABCSeries, Index)): + refs = data._references + + return cls._simple_new(tdarr, name=name, refs=refs) + + # ------------------------------------------------------------------- + + def _is_comparable_dtype(self, dtype: DtypeObj) -> bool: + """ + Can we compare values of the given dtype to our own? + """ + return lib.is_np_dtype(dtype, "m") # aka self._data._is_recognized_dtype + + # ------------------------------------------------------------------- + # Indexing Methods + + def get_loc(self, key): + """ + Get integer location for requested label + + Returns + ------- + loc : int, slice, or ndarray[int] + """ + self._check_indexing_error(key) + + try: + key = self._data._validate_scalar(key, unbox=False) + except TypeError as err: + raise KeyError(key) from err + + return Index.get_loc(self, key) + + def _parse_with_reso(self, label: str): + # the "with_reso" is a no-op for TimedeltaIndex + parsed = Timedelta(label) + return parsed, None + + def _parsed_string_to_bounds(self, reso, parsed: Timedelta): + # reso is unused, included to match signature of DTI/PI + lbound = parsed.round(parsed.resolution_string) + rbound = lbound + to_offset(parsed.resolution_string) - Timedelta(1, "ns") + return lbound, rbound + + # ------------------------------------------------------------------- + + @property + def inferred_type(self) -> str: + return "timedelta64" + + +def timedelta_range( + start=None, + end=None, + periods: int | None = None, + freq=None, + name=None, + closed=None, + *, + unit: str | None = None, +) -> TimedeltaIndex: + """ + Return a fixed frequency TimedeltaIndex with day as the default. + + Parameters + ---------- + start : str or timedelta-like, default None + Left bound for generating timedeltas. + end : str or timedelta-like, default None + Right bound for generating timedeltas. + periods : int, default None + Number of periods to generate. + freq : str, Timedelta, datetime.timedelta, or DateOffset, default 'D' + Frequency strings can have multiples, e.g. '5h'. + name : str, default None + Name of the resulting TimedeltaIndex. + closed : str, default None + Make the interval closed with respect to the given frequency to + the 'left', 'right', or both sides (None). + unit : str, default None + Specify the desired resolution of the result. + + .. versionadded:: 2.0.0 + + Returns + ------- + TimedeltaIndex + + Notes + ----- + Of the four parameters ``start``, ``end``, ``periods``, and ``freq``, + exactly three must be specified. If ``freq`` is omitted, the resulting + ``TimedeltaIndex`` will have ``periods`` linearly spaced elements between + ``start`` and ``end`` (closed on both sides). + + To learn more about the frequency strings, please see `this link + `__. + + Examples + -------- + >>> pd.timedelta_range(start='1 day', periods=4) + TimedeltaIndex(['1 days', '2 days', '3 days', '4 days'], + dtype='timedelta64[ns]', freq='D') + + The ``closed`` parameter specifies which endpoint is included. The default + behavior is to include both endpoints. + + >>> pd.timedelta_range(start='1 day', periods=4, closed='right') + TimedeltaIndex(['2 days', '3 days', '4 days'], + dtype='timedelta64[ns]', freq='D') + + The ``freq`` parameter specifies the frequency of the TimedeltaIndex. + Only fixed frequencies can be passed, non-fixed frequencies such as + 'M' (month end) will raise. + + >>> pd.timedelta_range(start='1 day', end='2 days', freq='6h') + TimedeltaIndex(['1 days 00:00:00', '1 days 06:00:00', '1 days 12:00:00', + '1 days 18:00:00', '2 days 00:00:00'], + dtype='timedelta64[ns]', freq='6h') + + Specify ``start``, ``end``, and ``periods``; the frequency is generated + automatically (linearly spaced). + + >>> pd.timedelta_range(start='1 day', end='5 days', periods=4) + TimedeltaIndex(['1 days 00:00:00', '2 days 08:00:00', '3 days 16:00:00', + '5 days 00:00:00'], + dtype='timedelta64[ns]', freq=None) + + **Specify a unit** + + >>> pd.timedelta_range("1 Day", periods=3, freq="100000D", unit="s") + TimedeltaIndex(['1 days', '100001 days', '200001 days'], + dtype='timedelta64[s]', freq='100000D') + """ + if freq is None and com.any_none(periods, start, end): + freq = "D" + + freq = to_offset(freq) + tdarr = TimedeltaArray._generate_range( + start, end, periods, freq, closed=closed, unit=unit + ) + return TimedeltaIndex._simple_new(tdarr, name=name) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/interchange/__init__.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/interchange/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/interchange/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/interchange/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..fdf28dee5280389a31f109deabd539569b309e46 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/interchange/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/interchange/__pycache__/buffer.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/interchange/__pycache__/buffer.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d9a74cedfd981b4fb08583d2cc0f07e733352484 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/interchange/__pycache__/buffer.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/interchange/__pycache__/column.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/interchange/__pycache__/column.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f3a7f19146b771884bb7e91f538fa7f7e1d30f59 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/interchange/__pycache__/column.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/interchange/__pycache__/dataframe.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/interchange/__pycache__/dataframe.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..fc44fe817dbb87244953862b52c2bae8e9a37250 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/interchange/__pycache__/dataframe.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/interchange/__pycache__/dataframe_protocol.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/interchange/__pycache__/dataframe_protocol.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..8d54af43ec3f4aadb8ced9d8ba2a47b5eef551b0 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/interchange/__pycache__/dataframe_protocol.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/interchange/__pycache__/from_dataframe.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/interchange/__pycache__/from_dataframe.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..5ee17fdf648f5b3b4611ffc18681451e950403cc Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/interchange/__pycache__/from_dataframe.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/interchange/__pycache__/utils.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/interchange/__pycache__/utils.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..8fc4143aa69afdd6253882dd4a633db28a947dcf Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/interchange/__pycache__/utils.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/interchange/buffer.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/interchange/buffer.py new file mode 100644 index 0000000000000000000000000000000000000000..5d24325e67f62a5da0fa3863d81576dd61f86869 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/interchange/buffer.py @@ -0,0 +1,136 @@ +from __future__ import annotations + +from typing import ( + TYPE_CHECKING, + Any, +) + +from pandas.core.interchange.dataframe_protocol import ( + Buffer, + DlpackDeviceType, +) + +if TYPE_CHECKING: + import numpy as np + import pyarrow as pa + + +class PandasBuffer(Buffer): + """ + Data in the buffer is guaranteed to be contiguous in memory. + """ + + def __init__(self, x: np.ndarray, allow_copy: bool = True) -> None: + """ + Handle only regular columns (= numpy arrays) for now. + """ + if x.strides[0] and not x.strides == (x.dtype.itemsize,): + # The protocol does not support strided buffers, so a copy is + # necessary. If that's not allowed, we need to raise an exception. + if allow_copy: + x = x.copy() + else: + raise RuntimeError( + "Exports cannot be zero-copy in the case " + "of a non-contiguous buffer" + ) + + # Store the numpy array in which the data resides as a private + # attribute, so we can use it to retrieve the public attributes + self._x = x + + @property + def bufsize(self) -> int: + """ + Buffer size in bytes. + """ + return self._x.size * self._x.dtype.itemsize + + @property + def ptr(self) -> int: + """ + Pointer to start of the buffer as an integer. + """ + return self._x.__array_interface__["data"][0] + + def __dlpack__(self) -> Any: + """ + Represent this structure as DLPack interface. + """ + return self._x.__dlpack__() + + def __dlpack_device__(self) -> tuple[DlpackDeviceType, int | None]: + """ + Device type and device ID for where the data in the buffer resides. + """ + return (DlpackDeviceType.CPU, None) + + def __repr__(self) -> str: + return ( + "PandasBuffer(" + + str( + { + "bufsize": self.bufsize, + "ptr": self.ptr, + "device": self.__dlpack_device__()[0].name, + } + ) + + ")" + ) + + +class PandasBufferPyarrow(Buffer): + """ + Data in the buffer is guaranteed to be contiguous in memory. + """ + + def __init__( + self, + buffer: pa.Buffer, + *, + length: int, + ) -> None: + """ + Handle pyarrow chunked arrays. + """ + self._buffer = buffer + self._length = length + + @property + def bufsize(self) -> int: + """ + Buffer size in bytes. + """ + return self._buffer.size + + @property + def ptr(self) -> int: + """ + Pointer to start of the buffer as an integer. + """ + return self._buffer.address + + def __dlpack__(self) -> Any: + """ + Represent this structure as DLPack interface. + """ + raise NotImplementedError() + + def __dlpack_device__(self) -> tuple[DlpackDeviceType, int | None]: + """ + Device type and device ID for where the data in the buffer resides. + """ + return (DlpackDeviceType.CPU, None) + + def __repr__(self) -> str: + return ( + "PandasBuffer[pyarrow](" + + str( + { + "bufsize": self.bufsize, + "ptr": self.ptr, + "device": "CPU", + } + ) + + ")" + ) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/interchange/column.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/interchange/column.py new file mode 100644 index 0000000000000000000000000000000000000000..d59a3df694bb3f6ab5716c25592704d49737a215 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/interchange/column.py @@ -0,0 +1,461 @@ +from __future__ import annotations + +from typing import ( + TYPE_CHECKING, + Any, +) + +import numpy as np + +from pandas._libs.lib import infer_dtype +from pandas._libs.tslibs import iNaT +from pandas.errors import NoBufferPresent +from pandas.util._decorators import cache_readonly + +from pandas.core.dtypes.dtypes import BaseMaskedDtype + +import pandas as pd +from pandas import ( + ArrowDtype, + DatetimeTZDtype, +) +from pandas.api.types import is_string_dtype +from pandas.core.interchange.buffer import ( + PandasBuffer, + PandasBufferPyarrow, +) +from pandas.core.interchange.dataframe_protocol import ( + Column, + ColumnBuffers, + ColumnNullType, + DtypeKind, +) +from pandas.core.interchange.utils import ( + ArrowCTypes, + Endianness, + dtype_to_arrow_c_fmt, +) + +if TYPE_CHECKING: + from pandas.core.interchange.dataframe_protocol import Buffer + +_NP_KINDS = { + "i": DtypeKind.INT, + "u": DtypeKind.UINT, + "f": DtypeKind.FLOAT, + "b": DtypeKind.BOOL, + "U": DtypeKind.STRING, + "M": DtypeKind.DATETIME, + "m": DtypeKind.DATETIME, +} + +_NULL_DESCRIPTION = { + DtypeKind.FLOAT: (ColumnNullType.USE_NAN, None), + DtypeKind.DATETIME: (ColumnNullType.USE_SENTINEL, iNaT), + DtypeKind.INT: (ColumnNullType.NON_NULLABLE, None), + DtypeKind.UINT: (ColumnNullType.NON_NULLABLE, None), + DtypeKind.BOOL: (ColumnNullType.NON_NULLABLE, None), + # Null values for categoricals are stored as `-1` sentinel values + # in the category date (e.g., `col.values.codes` is int8 np.ndarray) + DtypeKind.CATEGORICAL: (ColumnNullType.USE_SENTINEL, -1), + # follow Arrow in using 1 as valid value and 0 for missing/null value + DtypeKind.STRING: (ColumnNullType.USE_BYTEMASK, 0), +} + +_NO_VALIDITY_BUFFER = { + ColumnNullType.NON_NULLABLE: "This column is non-nullable", + ColumnNullType.USE_NAN: "This column uses NaN as null", + ColumnNullType.USE_SENTINEL: "This column uses a sentinel value", +} + + +class PandasColumn(Column): + """ + A column object, with only the methods and properties required by the + interchange protocol defined. + A column can contain one or more chunks. Each chunk can contain up to three + buffers - a data buffer, a mask buffer (depending on null representation), + and an offsets buffer (if variable-size binary; e.g., variable-length + strings). + Note: this Column object can only be produced by ``__dataframe__``, so + doesn't need its own version or ``__column__`` protocol. + """ + + def __init__(self, column: pd.Series, allow_copy: bool = True) -> None: + """ + Note: doesn't deal with extension arrays yet, just assume a regular + Series/ndarray for now. + """ + if isinstance(column, pd.DataFrame): + raise TypeError( + "Expected a Series, got a DataFrame. This likely happened " + "because you called __dataframe__ on a DataFrame which, " + "after converting column names to string, resulted in duplicated " + f"names: {column.columns}. Please rename these columns before " + "using the interchange protocol." + ) + if not isinstance(column, pd.Series): + raise NotImplementedError(f"Columns of type {type(column)} not handled yet") + + # Store the column as a private attribute + self._col = column + self._allow_copy = allow_copy + + def size(self) -> int: + """ + Size of the column, in elements. + """ + return self._col.size + + @property + def offset(self) -> int: + """ + Offset of first element. Always zero. + """ + # TODO: chunks are implemented now, probably this should return something + return 0 + + @cache_readonly + def dtype(self) -> tuple[DtypeKind, int, str, str]: + dtype = self._col.dtype + + if isinstance(dtype, pd.CategoricalDtype): + codes = self._col.values.codes + ( + _, + bitwidth, + c_arrow_dtype_f_str, + _, + ) = self._dtype_from_pandasdtype(codes.dtype) + return ( + DtypeKind.CATEGORICAL, + bitwidth, + c_arrow_dtype_f_str, + Endianness.NATIVE, + ) + elif is_string_dtype(dtype): + if infer_dtype(self._col) in ("string", "empty"): + return ( + DtypeKind.STRING, + 8, + dtype_to_arrow_c_fmt(dtype), + Endianness.NATIVE, + ) + raise NotImplementedError("Non-string object dtypes are not supported yet") + else: + return self._dtype_from_pandasdtype(dtype) + + def _dtype_from_pandasdtype(self, dtype) -> tuple[DtypeKind, int, str, str]: + """ + See `self.dtype` for details. + """ + # Note: 'c' (complex) not handled yet (not in array spec v1). + # 'b', 'B' (bytes), 'S', 'a', (old-style string) 'V' (void) not handled + # datetime and timedelta both map to datetime (is timedelta handled?) + + kind = _NP_KINDS.get(dtype.kind, None) + if kind is None: + # Not a NumPy dtype. Check if it's a categorical maybe + raise ValueError(f"Data type {dtype} not supported by interchange protocol") + if isinstance(dtype, ArrowDtype): + byteorder = dtype.numpy_dtype.byteorder + elif isinstance(dtype, DatetimeTZDtype): + byteorder = dtype.base.byteorder # type: ignore[union-attr] + elif isinstance(dtype, BaseMaskedDtype): + byteorder = dtype.numpy_dtype.byteorder + else: + byteorder = dtype.byteorder + + if dtype == "bool[pyarrow]": + # return early to avoid the `* 8` below, as this is a bitmask + # rather than a bytemask + return ( + kind, + dtype.itemsize, # pyright: ignore[reportGeneralTypeIssues] + ArrowCTypes.BOOL, + byteorder, + ) + + return kind, dtype.itemsize * 8, dtype_to_arrow_c_fmt(dtype), byteorder + + @property + def describe_categorical(self): + """ + If the dtype is categorical, there are two options: + - There are only values in the data buffer. + - There is a separate non-categorical Column encoding for categorical values. + + Raises TypeError if the dtype is not categorical + + Content of returned dict: + - "is_ordered" : bool, whether the ordering of dictionary indices is + semantically meaningful. + - "is_dictionary" : bool, whether a dictionary-style mapping of + categorical values to other objects exists + - "categories" : Column representing the (implicit) mapping of indices to + category values (e.g. an array of cat1, cat2, ...). + None if not a dictionary-style categorical. + """ + if not self.dtype[0] == DtypeKind.CATEGORICAL: + raise TypeError( + "describe_categorical only works on a column with categorical dtype!" + ) + + return { + "is_ordered": self._col.cat.ordered, + "is_dictionary": True, + "categories": PandasColumn(pd.Series(self._col.cat.categories)), + } + + @property + def describe_null(self): + if isinstance(self._col.dtype, BaseMaskedDtype): + column_null_dtype = ColumnNullType.USE_BYTEMASK + null_value = 1 + return column_null_dtype, null_value + if isinstance(self._col.dtype, ArrowDtype): + # We already rechunk (if necessary / allowed) upon initialization, so this + # is already single-chunk by the time we get here. + if self._col.array._pa_array.chunks[0].buffers()[0] is None: # type: ignore[attr-defined] + return ColumnNullType.NON_NULLABLE, None + return ColumnNullType.USE_BITMASK, 0 + kind = self.dtype[0] + try: + null, value = _NULL_DESCRIPTION[kind] + except KeyError: + raise NotImplementedError(f"Data type {kind} not yet supported") + + return null, value + + @cache_readonly + def null_count(self) -> int: + """ + Number of null elements. Should always be known. + """ + return self._col.isna().sum().item() + + @property + def metadata(self) -> dict[str, pd.Index]: + """ + Store specific metadata of the column. + """ + return {"pandas.index": self._col.index} + + def num_chunks(self) -> int: + """ + Return the number of chunks the column consists of. + """ + return 1 + + def get_chunks(self, n_chunks: int | None = None): + """ + Return an iterator yielding the chunks. + See `DataFrame.get_chunks` for details on ``n_chunks``. + """ + if n_chunks and n_chunks > 1: + size = len(self._col) + step = size // n_chunks + if size % n_chunks != 0: + step += 1 + for start in range(0, step * n_chunks, step): + yield PandasColumn( + self._col.iloc[start : start + step], self._allow_copy + ) + else: + yield self + + def get_buffers(self) -> ColumnBuffers: + """ + Return a dictionary containing the underlying buffers. + The returned dictionary has the following contents: + - "data": a two-element tuple whose first element is a buffer + containing the data and whose second element is the data + buffer's associated dtype. + - "validity": a two-element tuple whose first element is a buffer + containing mask values indicating missing data and + whose second element is the mask value buffer's + associated dtype. None if the null representation is + not a bit or byte mask. + - "offsets": a two-element tuple whose first element is a buffer + containing the offset values for variable-size binary + data (e.g., variable-length strings) and whose second + element is the offsets buffer's associated dtype. None + if the data buffer does not have an associated offsets + buffer. + """ + buffers: ColumnBuffers = { + "data": self._get_data_buffer(), + "validity": None, + "offsets": None, + } + + try: + buffers["validity"] = self._get_validity_buffer() + except NoBufferPresent: + pass + + try: + buffers["offsets"] = self._get_offsets_buffer() + except NoBufferPresent: + pass + + return buffers + + def _get_data_buffer( + self, + ) -> tuple[Buffer, tuple[DtypeKind, int, str, str]]: + """ + Return the buffer containing the data and the buffer's associated dtype. + """ + buffer: Buffer + if self.dtype[0] in ( + DtypeKind.INT, + DtypeKind.UINT, + DtypeKind.FLOAT, + DtypeKind.BOOL, + DtypeKind.DATETIME, + ): + # self.dtype[2] is an ArrowCTypes.TIMESTAMP where the tz will make + # it longer than 4 characters + dtype = self.dtype + if self.dtype[0] == DtypeKind.DATETIME and len(self.dtype[2]) > 4: + np_arr = self._col.dt.tz_convert(None).to_numpy() + else: + arr = self._col.array + if isinstance(self._col.dtype, BaseMaskedDtype): + np_arr = arr._data # type: ignore[attr-defined] + elif isinstance(self._col.dtype, ArrowDtype): + # We already rechunk (if necessary / allowed) upon initialization, + # so this is already single-chunk by the time we get here. + arr = arr._pa_array.chunks[0] # type: ignore[attr-defined] + buffer = PandasBufferPyarrow( + arr.buffers()[1], # type: ignore[attr-defined] + length=len(arr), + ) + return buffer, dtype + else: + np_arr = arr._ndarray # type: ignore[attr-defined] + buffer = PandasBuffer(np_arr, allow_copy=self._allow_copy) + elif self.dtype[0] == DtypeKind.CATEGORICAL: + codes = self._col.values._codes + buffer = PandasBuffer(codes, allow_copy=self._allow_copy) + dtype = self._dtype_from_pandasdtype(codes.dtype) + elif self.dtype[0] == DtypeKind.STRING: + # Marshal the strings from a NumPy object array into a byte array + buf = self._col.to_numpy() + b = bytearray() + + # TODO: this for-loop is slow; can be implemented in Cython/C/C++ later + for obj in buf: + if isinstance(obj, str): + b.extend(obj.encode(encoding="utf-8")) + + # Convert the byte array to a Pandas "buffer" using + # a NumPy array as the backing store + buffer = PandasBuffer(np.frombuffer(b, dtype="uint8")) + + # Define the dtype for the returned buffer + # TODO: this will need correcting + # https://github.com/pandas-dev/pandas/issues/54781 + dtype = self.dtype + else: + raise NotImplementedError(f"Data type {self._col.dtype} not handled yet") + + return buffer, dtype + + def _get_validity_buffer(self) -> tuple[Buffer, Any] | None: + """ + Return the buffer containing the mask values indicating missing data and + the buffer's associated dtype. + Raises NoBufferPresent if null representation is not a bit or byte mask. + """ + null, invalid = self.describe_null + buffer: Buffer + if isinstance(self._col.dtype, ArrowDtype): + # We already rechunk (if necessary / allowed) upon initialization, so this + # is already single-chunk by the time we get here. + arr = self._col.array._pa_array.chunks[0] # type: ignore[attr-defined] + dtype = (DtypeKind.BOOL, 1, ArrowCTypes.BOOL, Endianness.NATIVE) + if arr.buffers()[0] is None: + return None + buffer = PandasBufferPyarrow( + arr.buffers()[0], + length=len(arr), + ) + return buffer, dtype + + if isinstance(self._col.dtype, BaseMaskedDtype): + mask = self._col.array._mask # type: ignore[attr-defined] + buffer = PandasBuffer(mask) + dtype = (DtypeKind.BOOL, 8, ArrowCTypes.BOOL, Endianness.NATIVE) + return buffer, dtype + + if self.dtype[0] == DtypeKind.STRING: + # For now, use byte array as the mask. + # TODO: maybe store as bit array to save space?.. + buf = self._col.to_numpy() + + # Determine the encoding for valid values + valid = invalid == 0 + invalid = not valid + + mask = np.zeros(shape=(len(buf),), dtype=np.bool_) + for i, obj in enumerate(buf): + mask[i] = valid if isinstance(obj, str) else invalid + + # Convert the mask array to a Pandas "buffer" using + # a NumPy array as the backing store + buffer = PandasBuffer(mask) + + # Define the dtype of the returned buffer + dtype = (DtypeKind.BOOL, 8, ArrowCTypes.BOOL, Endianness.NATIVE) + + return buffer, dtype + + try: + msg = f"{_NO_VALIDITY_BUFFER[null]} so does not have a separate mask" + except KeyError: + # TODO: implement for other bit/byte masks? + raise NotImplementedError("See self.describe_null") + + raise NoBufferPresent(msg) + + def _get_offsets_buffer(self) -> tuple[PandasBuffer, Any]: + """ + Return the buffer containing the offset values for variable-size binary + data (e.g., variable-length strings) and the buffer's associated dtype. + Raises NoBufferPresent if the data buffer does not have an associated + offsets buffer. + """ + if self.dtype[0] == DtypeKind.STRING: + # For each string, we need to manually determine the next offset + values = self._col.to_numpy() + ptr = 0 + offsets = np.zeros(shape=(len(values) + 1,), dtype=np.int64) + for i, v in enumerate(values): + # For missing values (in this case, `np.nan` values) + # we don't increment the pointer + if isinstance(v, str): + b = v.encode(encoding="utf-8") + ptr += len(b) + + offsets[i + 1] = ptr + + # Convert the offsets to a Pandas "buffer" using + # the NumPy array as the backing store + buffer = PandasBuffer(offsets) + + # Assemble the buffer dtype info + dtype = ( + DtypeKind.INT, + 64, + ArrowCTypes.INT64, + Endianness.NATIVE, + ) # note: currently only support native endianness + else: + raise NoBufferPresent( + "This column has a fixed-length dtype so " + "it does not have an offsets buffer" + ) + + return buffer, dtype diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/interchange/dataframe.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/interchange/dataframe.py new file mode 100644 index 0000000000000000000000000000000000000000..1abacddfc7e3b7035a9446cd4118097c6accd385 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/interchange/dataframe.py @@ -0,0 +1,113 @@ +from __future__ import annotations + +from collections import abc +from typing import TYPE_CHECKING + +from pandas.core.interchange.column import PandasColumn +from pandas.core.interchange.dataframe_protocol import DataFrame as DataFrameXchg +from pandas.core.interchange.utils import maybe_rechunk + +if TYPE_CHECKING: + from collections.abc import ( + Iterable, + Sequence, + ) + + from pandas import ( + DataFrame, + Index, + ) + + +class PandasDataFrameXchg(DataFrameXchg): + """ + A data frame class, with only the methods required by the interchange + protocol defined. + Instances of this (private) class are returned from + ``pd.DataFrame.__dataframe__`` as objects with the methods and + attributes defined on this class. + """ + + def __init__(self, df: DataFrame, allow_copy: bool = True) -> None: + """ + Constructor - an instance of this (private) class is returned from + `pd.DataFrame.__dataframe__`. + """ + self._df = df.rename(columns=str, copy=False) + self._allow_copy = allow_copy + for i, _col in enumerate(self._df.columns): + rechunked = maybe_rechunk(self._df.iloc[:, i], allow_copy=allow_copy) + if rechunked is not None: + self._df.isetitem(i, rechunked) + + def __dataframe__( + self, nan_as_null: bool = False, allow_copy: bool = True + ) -> PandasDataFrameXchg: + # `nan_as_null` can be removed here once it's removed from + # Dataframe.__dataframe__ + return PandasDataFrameXchg(self._df, allow_copy) + + @property + def metadata(self) -> dict[str, Index]: + # `index` isn't a regular column, and the protocol doesn't support row + # labels - so we export it as Pandas-specific metadata here. + return {"pandas.index": self._df.index} + + def num_columns(self) -> int: + return len(self._df.columns) + + def num_rows(self) -> int: + return len(self._df) + + def num_chunks(self) -> int: + return 1 + + def column_names(self) -> Index: + return self._df.columns + + def get_column(self, i: int) -> PandasColumn: + return PandasColumn(self._df.iloc[:, i], allow_copy=self._allow_copy) + + def get_column_by_name(self, name: str) -> PandasColumn: + return PandasColumn(self._df[name], allow_copy=self._allow_copy) + + def get_columns(self) -> list[PandasColumn]: + return [ + PandasColumn(self._df[name], allow_copy=self._allow_copy) + for name in self._df.columns + ] + + def select_columns(self, indices: Sequence[int]) -> PandasDataFrameXchg: + if not isinstance(indices, abc.Sequence): + raise ValueError("`indices` is not a sequence") + if not isinstance(indices, list): + indices = list(indices) + + return PandasDataFrameXchg( + self._df.iloc[:, indices], allow_copy=self._allow_copy + ) + + def select_columns_by_name(self, names: list[str]) -> PandasDataFrameXchg: # type: ignore[override] + if not isinstance(names, abc.Sequence): + raise ValueError("`names` is not a sequence") + if not isinstance(names, list): + names = list(names) + + return PandasDataFrameXchg(self._df.loc[:, names], allow_copy=self._allow_copy) + + def get_chunks(self, n_chunks: int | None = None) -> Iterable[PandasDataFrameXchg]: + """ + Return an iterator yielding the chunks. + """ + if n_chunks and n_chunks > 1: + size = len(self._df) + step = size // n_chunks + if size % n_chunks != 0: + step += 1 + for start in range(0, step * n_chunks, step): + yield PandasDataFrameXchg( + self._df.iloc[start : start + step, :], + allow_copy=self._allow_copy, + ) + else: + yield self diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/interchange/dataframe_protocol.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/interchange/dataframe_protocol.py new file mode 100644 index 0000000000000000000000000000000000000000..95e7b6a26f93a8cd10048076bd6906190e04d2ba --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/interchange/dataframe_protocol.py @@ -0,0 +1,465 @@ +""" +A verbatim copy (vendored) of the spec from https://github.com/data-apis/dataframe-api +""" + +from __future__ import annotations + +from abc import ( + ABC, + abstractmethod, +) +import enum +from typing import ( + TYPE_CHECKING, + Any, + TypedDict, +) + +if TYPE_CHECKING: + from collections.abc import ( + Iterable, + Sequence, + ) + + +class DlpackDeviceType(enum.IntEnum): + """Integer enum for device type codes matching DLPack.""" + + CPU = 1 + CUDA = 2 + CPU_PINNED = 3 + OPENCL = 4 + VULKAN = 7 + METAL = 8 + VPI = 9 + ROCM = 10 + + +class DtypeKind(enum.IntEnum): + """ + Integer enum for data types. + + Attributes + ---------- + INT : int + Matches to signed integer data type. + UINT : int + Matches to unsigned integer data type. + FLOAT : int + Matches to floating point data type. + BOOL : int + Matches to boolean data type. + STRING : int + Matches to string data type (UTF-8 encoded). + DATETIME : int + Matches to datetime data type. + CATEGORICAL : int + Matches to categorical data type. + """ + + INT = 0 + UINT = 1 + FLOAT = 2 + BOOL = 20 + STRING = 21 # UTF-8 + DATETIME = 22 + CATEGORICAL = 23 + + +class ColumnNullType(enum.IntEnum): + """ + Integer enum for null type representation. + + Attributes + ---------- + NON_NULLABLE : int + Non-nullable column. + USE_NAN : int + Use explicit float NaN value. + USE_SENTINEL : int + Sentinel value besides NaN/NaT. + USE_BITMASK : int + The bit is set/unset representing a null on a certain position. + USE_BYTEMASK : int + The byte is set/unset representing a null on a certain position. + """ + + NON_NULLABLE = 0 + USE_NAN = 1 + USE_SENTINEL = 2 + USE_BITMASK = 3 + USE_BYTEMASK = 4 + + +class ColumnBuffers(TypedDict): + # first element is a buffer containing the column data; + # second element is the data buffer's associated dtype + data: tuple[Buffer, Any] + + # first element is a buffer containing mask values indicating missing data; + # second element is the mask value buffer's associated dtype. + # None if the null representation is not a bit or byte mask + validity: tuple[Buffer, Any] | None + + # first element is a buffer containing the offset values for + # variable-size binary data (e.g., variable-length strings); + # second element is the offsets buffer's associated dtype. + # None if the data buffer does not have an associated offsets buffer + offsets: tuple[Buffer, Any] | None + + +class CategoricalDescription(TypedDict): + # whether the ordering of dictionary indices is semantically meaningful + is_ordered: bool + # whether a dictionary-style mapping of categorical values to other objects exists + is_dictionary: bool + # Python-level only (e.g. ``{int: str}``). + # None if not a dictionary-style categorical. + categories: Column | None + + +class Buffer(ABC): + """ + Data in the buffer is guaranteed to be contiguous in memory. + + Note that there is no dtype attribute present, a buffer can be thought of + as simply a block of memory. However, if the column that the buffer is + attached to has a dtype that's supported by DLPack and ``__dlpack__`` is + implemented, then that dtype information will be contained in the return + value from ``__dlpack__``. + + This distinction is useful to support both data exchange via DLPack on a + buffer and (b) dtypes like variable-length strings which do not have a + fixed number of bytes per element. + """ + + @property + @abstractmethod + def bufsize(self) -> int: + """ + Buffer size in bytes. + """ + + @property + @abstractmethod + def ptr(self) -> int: + """ + Pointer to start of the buffer as an integer. + """ + + @abstractmethod + def __dlpack__(self): + """ + Produce DLPack capsule (see array API standard). + + Raises: + + - TypeError : if the buffer contains unsupported dtypes. + - NotImplementedError : if DLPack support is not implemented + + Useful to have to connect to array libraries. Support optional because + it's not completely trivial to implement for a Python-only library. + """ + raise NotImplementedError("__dlpack__") + + @abstractmethod + def __dlpack_device__(self) -> tuple[DlpackDeviceType, int | None]: + """ + Device type and device ID for where the data in the buffer resides. + Uses device type codes matching DLPack. + Note: must be implemented even if ``__dlpack__`` is not. + """ + + +class Column(ABC): + """ + A column object, with only the methods and properties required by the + interchange protocol defined. + + A column can contain one or more chunks. Each chunk can contain up to three + buffers - a data buffer, a mask buffer (depending on null representation), + and an offsets buffer (if variable-size binary; e.g., variable-length + strings). + + TBD: Arrow has a separate "null" dtype, and has no separate mask concept. + Instead, it seems to use "children" for both columns with a bit mask, + and for nested dtypes. Unclear whether this is elegant or confusing. + This design requires checking the null representation explicitly. + + The Arrow design requires checking: + 1. the ARROW_FLAG_NULLABLE (for sentinel values) + 2. if a column has two children, combined with one of those children + having a null dtype. + + Making the mask concept explicit seems useful. One null dtype would + not be enough to cover both bit and byte masks, so that would mean + even more checking if we did it the Arrow way. + + TBD: there's also the "chunk" concept here, which is implicit in Arrow as + multiple buffers per array (= column here). Semantically it may make + sense to have both: chunks were meant for example for lazy evaluation + of data which doesn't fit in memory, while multiple buffers per column + could also come from doing a selection operation on a single + contiguous buffer. + + Given these concepts, one would expect chunks to be all of the same + size (say a 10,000 row dataframe could have 10 chunks of 1,000 rows), + while multiple buffers could have data-dependent lengths. Not an issue + in pandas if one column is backed by a single NumPy array, but in + Arrow it seems possible. + Are multiple chunks *and* multiple buffers per column necessary for + the purposes of this interchange protocol, or must producers either + reuse the chunk concept for this or copy the data? + + Note: this Column object can only be produced by ``__dataframe__``, so + doesn't need its own version or ``__column__`` protocol. + """ + + @abstractmethod + def size(self) -> int: + """ + Size of the column, in elements. + + Corresponds to DataFrame.num_rows() if column is a single chunk; + equal to size of this current chunk otherwise. + """ + + @property + @abstractmethod + def offset(self) -> int: + """ + Offset of first element. + + May be > 0 if using chunks; for example for a column with N chunks of + equal size M (only the last chunk may be shorter), + ``offset = n * M``, ``n = 0 .. N-1``. + """ + + @property + @abstractmethod + def dtype(self) -> tuple[DtypeKind, int, str, str]: + """ + Dtype description as a tuple ``(kind, bit-width, format string, endianness)``. + + Bit-width : the number of bits as an integer + Format string : data type description format string in Apache Arrow C + Data Interface format. + Endianness : current only native endianness (``=``) is supported + + Notes: + - Kind specifiers are aligned with DLPack where possible (hence the + jump to 20, leave enough room for future extension) + - Masks must be specified as boolean with either bit width 1 (for bit + masks) or 8 (for byte masks). + - Dtype width in bits was preferred over bytes + - Endianness isn't too useful, but included now in case in the future + we need to support non-native endianness + - Went with Apache Arrow format strings over NumPy format strings + because they're more complete from a dataframe perspective + - Format strings are mostly useful for datetime specification, and + for categoricals. + - For categoricals, the format string describes the type of the + categorical in the data buffer. In case of a separate encoding of + the categorical (e.g. an integer to string mapping), this can + be derived from ``self.describe_categorical``. + - Data types not included: complex, Arrow-style null, binary, decimal, + and nested (list, struct, map, union) dtypes. + """ + + @property + @abstractmethod + def describe_categorical(self) -> CategoricalDescription: + """ + If the dtype is categorical, there are two options: + - There are only values in the data buffer. + - There is a separate non-categorical Column encoding for categorical values. + + Raises TypeError if the dtype is not categorical + + Returns the dictionary with description on how to interpret the data buffer: + - "is_ordered" : bool, whether the ordering of dictionary indices is + semantically meaningful. + - "is_dictionary" : bool, whether a mapping of + categorical values to other objects exists + - "categories" : Column representing the (implicit) mapping of indices to + category values (e.g. an array of cat1, cat2, ...). + None if not a dictionary-style categorical. + + TBD: are there any other in-memory representations that are needed? + """ + + @property + @abstractmethod + def describe_null(self) -> tuple[ColumnNullType, Any]: + """ + Return the missing value (or "null") representation the column dtype + uses, as a tuple ``(kind, value)``. + + Value : if kind is "sentinel value", the actual value. If kind is a bit + mask or a byte mask, the value (0 or 1) indicating a missing value. None + otherwise. + """ + + @property + @abstractmethod + def null_count(self) -> int | None: + """ + Number of null elements, if known. + + Note: Arrow uses -1 to indicate "unknown", but None seems cleaner. + """ + + @property + @abstractmethod + def metadata(self) -> dict[str, Any]: + """ + The metadata for the column. See `DataFrame.metadata` for more details. + """ + + @abstractmethod + def num_chunks(self) -> int: + """ + Return the number of chunks the column consists of. + """ + + @abstractmethod + def get_chunks(self, n_chunks: int | None = None) -> Iterable[Column]: + """ + Return an iterator yielding the chunks. + + See `DataFrame.get_chunks` for details on ``n_chunks``. + """ + + @abstractmethod + def get_buffers(self) -> ColumnBuffers: + """ + Return a dictionary containing the underlying buffers. + + The returned dictionary has the following contents: + + - "data": a two-element tuple whose first element is a buffer + containing the data and whose second element is the data + buffer's associated dtype. + - "validity": a two-element tuple whose first element is a buffer + containing mask values indicating missing data and + whose second element is the mask value buffer's + associated dtype. None if the null representation is + not a bit or byte mask. + - "offsets": a two-element tuple whose first element is a buffer + containing the offset values for variable-size binary + data (e.g., variable-length strings) and whose second + element is the offsets buffer's associated dtype. None + if the data buffer does not have an associated offsets + buffer. + """ + + +# def get_children(self) -> Iterable[Column]: +# """ +# Children columns underneath the column, each object in this iterator +# must adhere to the column specification. +# """ +# pass + + +class DataFrame(ABC): + """ + A data frame class, with only the methods required by the interchange + protocol defined. + + A "data frame" represents an ordered collection of named columns. + A column's "name" must be a unique string. + Columns may be accessed by name or by position. + + This could be a public data frame class, or an object with the methods and + attributes defined on this DataFrame class could be returned from the + ``__dataframe__`` method of a public data frame class in a library adhering + to the dataframe interchange protocol specification. + """ + + version = 0 # version of the protocol + + @abstractmethod + def __dataframe__(self, nan_as_null: bool = False, allow_copy: bool = True): + """Construct a new interchange object, potentially changing the parameters.""" + + @property + @abstractmethod + def metadata(self) -> dict[str, Any]: + """ + The metadata for the data frame, as a dictionary with string keys. The + contents of `metadata` may be anything, they are meant for a library + to store information that it needs to, e.g., roundtrip losslessly or + for two implementations to share data that is not (yet) part of the + interchange protocol specification. For avoiding collisions with other + entries, please add name the keys with the name of the library + followed by a period and the desired name, e.g, ``pandas.indexcol``. + """ + + @abstractmethod + def num_columns(self) -> int: + """ + Return the number of columns in the DataFrame. + """ + + @abstractmethod + def num_rows(self) -> int | None: + # TODO: not happy with Optional, but need to flag it may be expensive + # why include it if it may be None - what do we expect consumers + # to do here? + """ + Return the number of rows in the DataFrame, if available. + """ + + @abstractmethod + def num_chunks(self) -> int: + """ + Return the number of chunks the DataFrame consists of. + """ + + @abstractmethod + def column_names(self) -> Iterable[str]: + """ + Return an iterator yielding the column names. + """ + + @abstractmethod + def get_column(self, i: int) -> Column: + """ + Return the column at the indicated position. + """ + + @abstractmethod + def get_column_by_name(self, name: str) -> Column: + """ + Return the column whose name is the indicated name. + """ + + @abstractmethod + def get_columns(self) -> Iterable[Column]: + """ + Return an iterator yielding the columns. + """ + + @abstractmethod + def select_columns(self, indices: Sequence[int]) -> DataFrame: + """ + Create a new DataFrame by selecting a subset of columns by index. + """ + + @abstractmethod + def select_columns_by_name(self, names: Sequence[str]) -> DataFrame: + """ + Create a new DataFrame by selecting a subset of columns by name. + """ + + @abstractmethod + def get_chunks(self, n_chunks: int | None = None) -> Iterable[DataFrame]: + """ + Return an iterator yielding the chunks. + + By default (None), yields the chunks that the data is stored as by the + producer. If given, ``n_chunks`` must be a multiple of + ``self.num_chunks()``, meaning the producer must subdivide each chunk + before yielding it. + """ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/interchange/from_dataframe.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/interchange/from_dataframe.py new file mode 100644 index 0000000000000000000000000000000000000000..4162ebc33f0d6bff8a1caff23b1db1dca20e4f3a --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/interchange/from_dataframe.py @@ -0,0 +1,526 @@ +from __future__ import annotations + +import ctypes +import re +from typing import Any + +import numpy as np + +from pandas.compat._optional import import_optional_dependency +from pandas.errors import SettingWithCopyError + +import pandas as pd +from pandas.core.interchange.dataframe_protocol import ( + Buffer, + Column, + ColumnNullType, + DataFrame as DataFrameXchg, + DtypeKind, +) +from pandas.core.interchange.utils import ( + ArrowCTypes, + Endianness, +) + +_NP_DTYPES: dict[DtypeKind, dict[int, Any]] = { + DtypeKind.INT: {8: np.int8, 16: np.int16, 32: np.int32, 64: np.int64}, + DtypeKind.UINT: {8: np.uint8, 16: np.uint16, 32: np.uint32, 64: np.uint64}, + DtypeKind.FLOAT: {32: np.float32, 64: np.float64}, + DtypeKind.BOOL: {1: bool, 8: bool}, +} + + +def from_dataframe(df, allow_copy: bool = True) -> pd.DataFrame: + """ + Build a ``pd.DataFrame`` from any DataFrame supporting the interchange protocol. + + Parameters + ---------- + df : DataFrameXchg + Object supporting the interchange protocol, i.e. `__dataframe__` method. + allow_copy : bool, default: True + Whether to allow copying the memory to perform the conversion + (if false then zero-copy approach is requested). + + Returns + ------- + pd.DataFrame + + Examples + -------- + >>> df_not_necessarily_pandas = pd.DataFrame({'A': [1, 2], 'B': [3, 4]}) + >>> interchange_object = df_not_necessarily_pandas.__dataframe__() + >>> interchange_object.column_names() + Index(['A', 'B'], dtype='object') + >>> df_pandas = (pd.api.interchange.from_dataframe + ... (interchange_object.select_columns_by_name(['A']))) + >>> df_pandas + A + 0 1 + 1 2 + + These methods (``column_names``, ``select_columns_by_name``) should work + for any dataframe library which implements the interchange protocol. + """ + if isinstance(df, pd.DataFrame): + return df + + if not hasattr(df, "__dataframe__"): + raise ValueError("`df` does not support __dataframe__") + + return _from_dataframe( + df.__dataframe__(allow_copy=allow_copy), allow_copy=allow_copy + ) + + +def _from_dataframe(df: DataFrameXchg, allow_copy: bool = True): + """ + Build a ``pd.DataFrame`` from the DataFrame interchange object. + + Parameters + ---------- + df : DataFrameXchg + Object supporting the interchange protocol, i.e. `__dataframe__` method. + allow_copy : bool, default: True + Whether to allow copying the memory to perform the conversion + (if false then zero-copy approach is requested). + + Returns + ------- + pd.DataFrame + """ + pandas_dfs = [] + for chunk in df.get_chunks(): + pandas_df = protocol_df_chunk_to_pandas(chunk) + pandas_dfs.append(pandas_df) + + if not allow_copy and len(pandas_dfs) > 1: + raise RuntimeError( + "To join chunks a copy is required which is forbidden by allow_copy=False" + ) + if not pandas_dfs: + pandas_df = protocol_df_chunk_to_pandas(df) + elif len(pandas_dfs) == 1: + pandas_df = pandas_dfs[0] + else: + pandas_df = pd.concat(pandas_dfs, axis=0, ignore_index=True, copy=False) + + index_obj = df.metadata.get("pandas.index", None) + if index_obj is not None: + pandas_df.index = index_obj + + return pandas_df + + +def protocol_df_chunk_to_pandas(df: DataFrameXchg) -> pd.DataFrame: + """ + Convert interchange protocol chunk to ``pd.DataFrame``. + + Parameters + ---------- + df : DataFrameXchg + + Returns + ------- + pd.DataFrame + """ + # We need a dict of columns here, with each column being a NumPy array (at + # least for now, deal with non-NumPy dtypes later). + columns: dict[str, Any] = {} + buffers = [] # hold on to buffers, keeps memory alive + for name in df.column_names(): + if not isinstance(name, str): + raise ValueError(f"Column {name} is not a string") + if name in columns: + raise ValueError(f"Column {name} is not unique") + col = df.get_column_by_name(name) + dtype = col.dtype[0] + if dtype in ( + DtypeKind.INT, + DtypeKind.UINT, + DtypeKind.FLOAT, + DtypeKind.BOOL, + ): + columns[name], buf = primitive_column_to_ndarray(col) + elif dtype == DtypeKind.CATEGORICAL: + columns[name], buf = categorical_column_to_series(col) + elif dtype == DtypeKind.STRING: + columns[name], buf = string_column_to_ndarray(col) + elif dtype == DtypeKind.DATETIME: + columns[name], buf = datetime_column_to_ndarray(col) + else: + raise NotImplementedError(f"Data type {dtype} not handled yet") + + buffers.append(buf) + + pandas_df = pd.DataFrame(columns) + pandas_df.attrs["_INTERCHANGE_PROTOCOL_BUFFERS"] = buffers + return pandas_df + + +def primitive_column_to_ndarray(col: Column) -> tuple[np.ndarray, Any]: + """ + Convert a column holding one of the primitive dtypes to a NumPy array. + + A primitive type is one of: int, uint, float, bool. + + Parameters + ---------- + col : Column + + Returns + ------- + tuple + Tuple of np.ndarray holding the data and the memory owner object + that keeps the memory alive. + """ + buffers = col.get_buffers() + + data_buff, data_dtype = buffers["data"] + data = buffer_to_ndarray( + data_buff, data_dtype, offset=col.offset, length=col.size() + ) + + data = set_nulls(data, col, buffers["validity"]) + return data, buffers + + +def categorical_column_to_series(col: Column) -> tuple[pd.Series, Any]: + """ + Convert a column holding categorical data to a pandas Series. + + Parameters + ---------- + col : Column + + Returns + ------- + tuple + Tuple of pd.Series holding the data and the memory owner object + that keeps the memory alive. + """ + categorical = col.describe_categorical + + if not categorical["is_dictionary"]: + raise NotImplementedError("Non-dictionary categoricals not supported yet") + + cat_column = categorical["categories"] + if hasattr(cat_column, "_col"): + # Item "Column" of "Optional[Column]" has no attribute "_col" + # Item "None" of "Optional[Column]" has no attribute "_col" + categories = np.array(cat_column._col) # type: ignore[union-attr] + else: + raise NotImplementedError( + "Interchanging categorical columns isn't supported yet, and our " + "fallback of using the `col._col` attribute (a ndarray) failed." + ) + buffers = col.get_buffers() + + codes_buff, codes_dtype = buffers["data"] + codes = buffer_to_ndarray( + codes_buff, codes_dtype, offset=col.offset, length=col.size() + ) + + # Doing module in order to not get ``IndexError`` for + # out-of-bounds sentinel values in `codes` + if len(categories) > 0: + values = categories[codes % len(categories)] + else: + values = codes + + cat = pd.Categorical( + values, categories=categories, ordered=categorical["is_ordered"] + ) + data = pd.Series(cat) + + data = set_nulls(data, col, buffers["validity"]) + return data, buffers + + +def string_column_to_ndarray(col: Column) -> tuple[np.ndarray, Any]: + """ + Convert a column holding string data to a NumPy array. + + Parameters + ---------- + col : Column + + Returns + ------- + tuple + Tuple of np.ndarray holding the data and the memory owner object + that keeps the memory alive. + """ + null_kind, sentinel_val = col.describe_null + + if null_kind not in ( + ColumnNullType.NON_NULLABLE, + ColumnNullType.USE_BITMASK, + ColumnNullType.USE_BYTEMASK, + ): + raise NotImplementedError( + f"{null_kind} null kind is not yet supported for string columns." + ) + + buffers = col.get_buffers() + + assert buffers["offsets"], "String buffers must contain offsets" + # Retrieve the data buffer containing the UTF-8 code units + data_buff, _ = buffers["data"] + # We're going to reinterpret the buffer as uint8, so make sure we can do it safely + assert col.dtype[2] in ( + ArrowCTypes.STRING, + ArrowCTypes.LARGE_STRING, + ) # format_str == utf-8 + # Convert the buffers to NumPy arrays. In order to go from STRING to + # an equivalent ndarray, we claim that the buffer is uint8 (i.e., a byte array) + data_dtype = ( + DtypeKind.UINT, + 8, + ArrowCTypes.UINT8, + Endianness.NATIVE, + ) + # Specify zero offset as we don't want to chunk the string data + data = buffer_to_ndarray(data_buff, data_dtype, offset=0, length=data_buff.bufsize) + + # Retrieve the offsets buffer containing the index offsets demarcating + # the beginning and the ending of each string + offset_buff, offset_dtype = buffers["offsets"] + # Offsets buffer contains start-stop positions of strings in the data buffer, + # meaning that it has more elements than in the data buffer, do `col.size() + 1` + # here to pass a proper offsets buffer size + offsets = buffer_to_ndarray( + offset_buff, offset_dtype, offset=col.offset, length=col.size() + 1 + ) + + null_pos = None + if null_kind in (ColumnNullType.USE_BITMASK, ColumnNullType.USE_BYTEMASK): + validity = buffers["validity"] + if validity is not None: + valid_buff, valid_dtype = validity + null_pos = buffer_to_ndarray( + valid_buff, valid_dtype, offset=col.offset, length=col.size() + ) + if sentinel_val == 0: + null_pos = ~null_pos + + # Assemble the strings from the code units + str_list: list[None | float | str] = [None] * col.size() + for i in range(col.size()): + # Check for missing values + if null_pos is not None and null_pos[i]: + str_list[i] = np.nan + continue + + # Extract a range of code units + units = data[offsets[i] : offsets[i + 1]] + + # Convert the list of code units to bytes + str_bytes = bytes(units) + + # Create the string + string = str_bytes.decode(encoding="utf-8") + + # Add to our list of strings + str_list[i] = string + + # Convert the string list to a NumPy array + return np.asarray(str_list, dtype="object"), buffers + + +def parse_datetime_format_str(format_str, data) -> pd.Series | np.ndarray: + """Parse datetime `format_str` to interpret the `data`.""" + # timestamp 'ts{unit}:tz' + timestamp_meta = re.match(r"ts([smun]):(.*)", format_str) + if timestamp_meta: + unit, tz = timestamp_meta.group(1), timestamp_meta.group(2) + if unit != "s": + # the format string describes only a first letter of the unit, so + # add one extra letter to convert the unit to numpy-style: + # 'm' -> 'ms', 'u' -> 'us', 'n' -> 'ns' + unit += "s" + data = data.astype(f"datetime64[{unit}]") + if tz != "": + data = pd.Series(data).dt.tz_localize("UTC").dt.tz_convert(tz) + return data + + # date 'td{Days/Ms}' + date_meta = re.match(r"td([Dm])", format_str) + if date_meta: + unit = date_meta.group(1) + if unit == "D": + # NumPy doesn't support DAY unit, so converting days to seconds + # (converting to uint64 to avoid overflow) + data = (data.astype(np.uint64) * (24 * 60 * 60)).astype("datetime64[s]") + elif unit == "m": + data = data.astype("datetime64[ms]") + else: + raise NotImplementedError(f"Date unit is not supported: {unit}") + return data + + raise NotImplementedError(f"DateTime kind is not supported: {format_str}") + + +def datetime_column_to_ndarray(col: Column) -> tuple[np.ndarray | pd.Series, Any]: + """ + Convert a column holding DateTime data to a NumPy array. + + Parameters + ---------- + col : Column + + Returns + ------- + tuple + Tuple of np.ndarray holding the data and the memory owner object + that keeps the memory alive. + """ + buffers = col.get_buffers() + + _, col_bit_width, format_str, _ = col.dtype + dbuf, _ = buffers["data"] + # Consider dtype being `uint` to get number of units passed since the 01.01.1970 + + data = buffer_to_ndarray( + dbuf, + ( + DtypeKind.INT, + col_bit_width, + getattr(ArrowCTypes, f"INT{col_bit_width}"), + Endianness.NATIVE, + ), + offset=col.offset, + length=col.size(), + ) + + data = parse_datetime_format_str(format_str, data) # type: ignore[assignment] + data = set_nulls(data, col, buffers["validity"]) + return data, buffers + + +def buffer_to_ndarray( + buffer: Buffer, + dtype: tuple[DtypeKind, int, str, str], + *, + length: int, + offset: int = 0, +) -> np.ndarray: + """ + Build a NumPy array from the passed buffer. + + Parameters + ---------- + buffer : Buffer + Buffer to build a NumPy array from. + dtype : tuple + Data type of the buffer conforming protocol dtypes format. + offset : int, default: 0 + Number of elements to offset from the start of the buffer. + length : int, optional + If the buffer is a bit-mask, specifies a number of bits to read + from the buffer. Has no effect otherwise. + + Returns + ------- + np.ndarray + + Notes + ----- + The returned array doesn't own the memory. The caller of this function is + responsible for keeping the memory owner object alive as long as + the returned NumPy array is being used. + """ + kind, bit_width, _, _ = dtype + + column_dtype = _NP_DTYPES.get(kind, {}).get(bit_width, None) + if column_dtype is None: + raise NotImplementedError(f"Conversion for {dtype} is not yet supported.") + + # TODO: No DLPack yet, so need to construct a new ndarray from the data pointer + # and size in the buffer plus the dtype on the column. Use DLPack as NumPy supports + # it since https://github.com/numpy/numpy/pull/19083 + ctypes_type = np.ctypeslib.as_ctypes_type(column_dtype) + + if bit_width == 1: + assert length is not None, "`length` must be specified for a bit-mask buffer." + pa = import_optional_dependency("pyarrow") + arr = pa.BooleanArray.from_buffers( + pa.bool_(), + length, + [None, pa.foreign_buffer(buffer.ptr, length)], + offset=offset, + ) + return np.asarray(arr) + else: + data_pointer = ctypes.cast( + buffer.ptr + (offset * bit_width // 8), ctypes.POINTER(ctypes_type) + ) + if length > 0: + return np.ctypeslib.as_array(data_pointer, shape=(length,)) + return np.array([], dtype=ctypes_type) + + +def set_nulls( + data: np.ndarray | pd.Series, + col: Column, + validity: tuple[Buffer, tuple[DtypeKind, int, str, str]] | None, + allow_modify_inplace: bool = True, +): + """ + Set null values for the data according to the column null kind. + + Parameters + ---------- + data : np.ndarray or pd.Series + Data to set nulls in. + col : Column + Column object that describes the `data`. + validity : tuple(Buffer, dtype) or None + The return value of ``col.buffers()``. We do not access the ``col.buffers()`` + here to not take the ownership of the memory of buffer objects. + allow_modify_inplace : bool, default: True + Whether to modify the `data` inplace when zero-copy is possible (True) or always + modify a copy of the `data` (False). + + Returns + ------- + np.ndarray or pd.Series + Data with the nulls being set. + """ + if validity is None: + return data + null_kind, sentinel_val = col.describe_null + null_pos = None + + if null_kind == ColumnNullType.USE_SENTINEL: + null_pos = pd.Series(data) == sentinel_val + elif null_kind in (ColumnNullType.USE_BITMASK, ColumnNullType.USE_BYTEMASK): + assert validity, "Expected to have a validity buffer for the mask" + valid_buff, valid_dtype = validity + null_pos = buffer_to_ndarray( + valid_buff, valid_dtype, offset=col.offset, length=col.size() + ) + if sentinel_val == 0: + null_pos = ~null_pos + elif null_kind in (ColumnNullType.NON_NULLABLE, ColumnNullType.USE_NAN): + pass + else: + raise NotImplementedError(f"Null kind {null_kind} is not yet supported.") + + if null_pos is not None and np.any(null_pos): + if not allow_modify_inplace: + data = data.copy() + try: + data[null_pos] = None + except TypeError: + # TypeError happens if the `data` dtype appears to be non-nullable + # in numpy notation (bool, int, uint). If this happens, + # cast the `data` to nullable float dtype. + data = data.astype(float) + data[null_pos] = None + except SettingWithCopyError: + # `SettingWithCopyError` may happen for datetime-like with missing values. + data = data.copy() + data[null_pos] = None + + return data diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/interchange/utils.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/interchange/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..fd1c7c96392428a44543bbecc7b37931cf69fe59 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/interchange/utils.py @@ -0,0 +1,178 @@ +""" +Utility functions and objects for implementing the interchange API. +""" + +from __future__ import annotations + +import typing + +import numpy as np + +from pandas._libs import lib + +from pandas.core.dtypes.dtypes import ( + ArrowDtype, + CategoricalDtype, + DatetimeTZDtype, +) + +import pandas as pd + +if typing.TYPE_CHECKING: + from pandas._typing import DtypeObj + + +# Maps str(pyarrow.DataType) = C type format string +# Currently, no pyarrow API for this +PYARROW_CTYPES = { + "null": "n", + "bool": "b", + "uint8": "C", + "uint16": "S", + "uint32": "I", + "uint64": "L", + "int8": "c", + "int16": "S", + "int32": "i", + "int64": "l", + "halffloat": "e", # float16 + "float": "f", # float32 + "double": "g", # float64 + "string": "u", + "large_string": "U", + "binary": "z", + "time32[s]": "tts", + "time32[ms]": "ttm", + "time64[us]": "ttu", + "time64[ns]": "ttn", + "date32[day]": "tdD", + "date64[ms]": "tdm", + "timestamp[s]": "tss:", + "timestamp[ms]": "tsm:", + "timestamp[us]": "tsu:", + "timestamp[ns]": "tsn:", + "duration[s]": "tDs", + "duration[ms]": "tDm", + "duration[us]": "tDu", + "duration[ns]": "tDn", +} + + +class ArrowCTypes: + """ + Enum for Apache Arrow C type format strings. + + The Arrow C data interface: + https://arrow.apache.org/docs/format/CDataInterface.html#data-type-description-format-strings + """ + + NULL = "n" + BOOL = "b" + INT8 = "c" + UINT8 = "C" + INT16 = "s" + UINT16 = "S" + INT32 = "i" + UINT32 = "I" + INT64 = "l" + UINT64 = "L" + FLOAT16 = "e" + FLOAT32 = "f" + FLOAT64 = "g" + STRING = "u" # utf-8 + LARGE_STRING = "U" # utf-8 + DATE32 = "tdD" + DATE64 = "tdm" + # Resoulution: + # - seconds -> 's' + # - milliseconds -> 'm' + # - microseconds -> 'u' + # - nanoseconds -> 'n' + TIMESTAMP = "ts{resolution}:{tz}" + TIME = "tt{resolution}" + + +class Endianness: + """Enum indicating the byte-order of a data-type.""" + + LITTLE = "<" + BIG = ">" + NATIVE = "=" + NA = "|" + + +def dtype_to_arrow_c_fmt(dtype: DtypeObj) -> str: + """ + Represent pandas `dtype` as a format string in Apache Arrow C notation. + + Parameters + ---------- + dtype : np.dtype + Datatype of pandas DataFrame to represent. + + Returns + ------- + str + Format string in Apache Arrow C notation of the given `dtype`. + """ + if isinstance(dtype, CategoricalDtype): + return ArrowCTypes.INT64 + elif dtype == np.dtype("O"): + return ArrowCTypes.STRING + elif isinstance(dtype, ArrowDtype): + import pyarrow as pa + + pa_type = dtype.pyarrow_dtype + if pa.types.is_decimal(pa_type): + return f"d:{pa_type.precision},{pa_type.scale}" + elif pa.types.is_timestamp(pa_type) and pa_type.tz is not None: + return f"ts{pa_type.unit[0]}:{pa_type.tz}" + format_str = PYARROW_CTYPES.get(str(pa_type), None) + if format_str is not None: + return format_str + + format_str = getattr(ArrowCTypes, dtype.name.upper(), None) + if format_str is not None: + return format_str + + if lib.is_np_dtype(dtype, "M"): + # Selecting the first char of resolution string: + # dtype.str -> ' 'n' + resolution = np.datetime_data(dtype)[0][0] + return ArrowCTypes.TIMESTAMP.format(resolution=resolution, tz="") + + elif isinstance(dtype, DatetimeTZDtype): + return ArrowCTypes.TIMESTAMP.format(resolution=dtype.unit[0], tz=dtype.tz) + + elif isinstance(dtype, pd.BooleanDtype): + return ArrowCTypes.BOOL + + raise NotImplementedError( + f"Conversion of {dtype} to Arrow C format string is not implemented." + ) + + +def maybe_rechunk(series: pd.Series, *, allow_copy: bool) -> pd.Series | None: + """ + Rechunk a multi-chunk pyarrow array into a single-chunk array, if necessary. + + - Returns `None` if the input series is not backed by a multi-chunk pyarrow array + (and so doesn't need rechunking) + - Returns a single-chunk-backed-Series if the input is backed by a multi-chunk + pyarrow array and `allow_copy` is `True`. + - Raises a `RuntimeError` if `allow_copy` is `False` and input is a + based by a multi-chunk pyarrow array. + """ + if not isinstance(series.dtype, pd.ArrowDtype): + return None + chunked_array = series.array._pa_array # type: ignore[attr-defined] + if len(chunked_array.chunks) == 1: + return None + if not allow_copy: + raise RuntimeError( + "Found multi-chunk pyarrow array, but `allow_copy` is False. " + "Please rechunk the array before calling this function, or set " + "`allow_copy=True`." + ) + arr = chunked_array.combine_chunks() + return pd.Series(arr, dtype=series.dtype, name=series.name, index=series.index) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/internals/__init__.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/internals/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..2eb413440ba9c1ef4c016cd874d19c2aba6d791e --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/internals/__init__.py @@ -0,0 +1,85 @@ +from pandas.core.internals.api import make_block # 2023-09-18 pyarrow uses this +from pandas.core.internals.array_manager import ( + ArrayManager, + SingleArrayManager, +) +from pandas.core.internals.base import ( + DataManager, + SingleDataManager, +) +from pandas.core.internals.concat import concatenate_managers +from pandas.core.internals.managers import ( + BlockManager, + SingleBlockManager, +) + +__all__ = [ + "Block", # pylint: disable=undefined-all-variable + "DatetimeTZBlock", # pylint: disable=undefined-all-variable + "ExtensionBlock", # pylint: disable=undefined-all-variable + "make_block", + "DataManager", + "ArrayManager", + "BlockManager", + "SingleDataManager", + "SingleBlockManager", + "SingleArrayManager", + "concatenate_managers", +] + + +def __getattr__(name: str): + # GH#55139 + import warnings + + if name == "create_block_manager_from_blocks": + # GH#33892 + warnings.warn( + f"{name} is deprecated and will be removed in a future version. " + "Use public APIs instead.", + DeprecationWarning, + # https://github.com/pandas-dev/pandas/pull/55139#pullrequestreview-1720690758 + # on hard-coding stacklevel + stacklevel=2, + ) + from pandas.core.internals.managers import create_block_manager_from_blocks + + return create_block_manager_from_blocks + + if name in [ + "NumericBlock", + "ObjectBlock", + "Block", + "ExtensionBlock", + "DatetimeTZBlock", + ]: + warnings.warn( + f"{name} is deprecated and will be removed in a future version. " + "Use public APIs instead.", + DeprecationWarning, + # https://github.com/pandas-dev/pandas/pull/55139#pullrequestreview-1720690758 + # on hard-coding stacklevel + stacklevel=2, + ) + if name == "NumericBlock": + from pandas.core.internals.blocks import NumericBlock + + return NumericBlock + elif name == "DatetimeTZBlock": + from pandas.core.internals.blocks import DatetimeTZBlock + + return DatetimeTZBlock + elif name == "ExtensionBlock": + from pandas.core.internals.blocks import ExtensionBlock + + return ExtensionBlock + elif name == "Block": + from pandas.core.internals.blocks import Block + + return Block + else: + from pandas.core.internals.blocks import ObjectBlock + + return ObjectBlock + + raise AttributeError(f"module 'pandas.core.internals' has no attribute '{name}'") diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/internals/api.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/internals/api.py new file mode 100644 index 0000000000000000000000000000000000000000..b0b3937ca47ea06c42b4b51964f6a74830a5d9ee --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/internals/api.py @@ -0,0 +1,156 @@ +""" +This is a pseudo-public API for downstream libraries. We ask that downstream +authors + +1) Try to avoid using internals directly altogether, and failing that, +2) Use only functions exposed here (or in core.internals) + +""" +from __future__ import annotations + +from typing import TYPE_CHECKING + +import numpy as np + +from pandas._libs.internals import BlockPlacement + +from pandas.core.dtypes.common import pandas_dtype +from pandas.core.dtypes.dtypes import ( + DatetimeTZDtype, + PeriodDtype, +) + +from pandas.core.arrays import DatetimeArray +from pandas.core.construction import extract_array +from pandas.core.internals.blocks import ( + check_ndim, + ensure_block_shape, + extract_pandas_array, + get_block_type, + maybe_coerce_values, +) + +if TYPE_CHECKING: + from pandas._typing import Dtype + + from pandas.core.internals.blocks import Block + + +def make_block( + values, placement, klass=None, ndim=None, dtype: Dtype | None = None +) -> Block: + """ + This is a pseudo-public analogue to blocks.new_block. + + We ask that downstream libraries use this rather than any fully-internal + APIs, including but not limited to: + + - core.internals.blocks.make_block + - Block.make_block + - Block.make_block_same_class + - Block.__init__ + """ + if dtype is not None: + dtype = pandas_dtype(dtype) + + values, dtype = extract_pandas_array(values, dtype, ndim) + + from pandas.core.internals.blocks import ( + DatetimeTZBlock, + ExtensionBlock, + ) + + if klass is ExtensionBlock and isinstance(values.dtype, PeriodDtype): + # GH-44681 changed PeriodArray to be stored in the 2D + # NDArrayBackedExtensionBlock instead of ExtensionBlock + # -> still allow ExtensionBlock to be passed in this case for back compat + klass = None + + if klass is None: + dtype = dtype or values.dtype + klass = get_block_type(dtype) + + elif klass is DatetimeTZBlock and not isinstance(values.dtype, DatetimeTZDtype): + # pyarrow calls get here + values = DatetimeArray._simple_new( + # error: Argument "dtype" to "_simple_new" of "DatetimeArray" has + # incompatible type "Union[ExtensionDtype, dtype[Any], None]"; + # expected "Union[dtype[datetime64], DatetimeTZDtype]" + values, + dtype=dtype, # type: ignore[arg-type] + ) + + if not isinstance(placement, BlockPlacement): + placement = BlockPlacement(placement) + + ndim = maybe_infer_ndim(values, placement, ndim) + if isinstance(values.dtype, (PeriodDtype, DatetimeTZDtype)): + # GH#41168 ensure we can pass 1D dt64tz values + # More generally, any EA dtype that isn't is_1d_only_ea_dtype + values = extract_array(values, extract_numpy=True) + values = ensure_block_shape(values, ndim) + + check_ndim(values, placement, ndim) + values = maybe_coerce_values(values) + return klass(values, ndim=ndim, placement=placement) + + +def maybe_infer_ndim(values, placement: BlockPlacement, ndim: int | None) -> int: + """ + If `ndim` is not provided, infer it from placement and values. + """ + if ndim is None: + # GH#38134 Block constructor now assumes ndim is not None + if not isinstance(values.dtype, np.dtype): + if len(placement) != 1: + ndim = 1 + else: + ndim = 2 + else: + ndim = values.ndim + return ndim + + +def __getattr__(name: str): + # GH#55139 + import warnings + + if name in [ + "Block", + "ExtensionBlock", + "DatetimeTZBlock", + "create_block_manager_from_blocks", + ]: + # GH#33892 + warnings.warn( + f"{name} is deprecated and will be removed in a future version. " + "Use public APIs instead.", + DeprecationWarning, + # https://github.com/pandas-dev/pandas/pull/55139#pullrequestreview-1720690758 + # on hard-coding stacklevel + stacklevel=2, + ) + + if name == "create_block_manager_from_blocks": + from pandas.core.internals.managers import create_block_manager_from_blocks + + return create_block_manager_from_blocks + + elif name == "Block": + from pandas.core.internals.blocks import Block + + return Block + + elif name == "DatetimeTZBlock": + from pandas.core.internals.blocks import DatetimeTZBlock + + return DatetimeTZBlock + + elif name == "ExtensionBlock": + from pandas.core.internals.blocks import ExtensionBlock + + return ExtensionBlock + + raise AttributeError( + f"module 'pandas.core.internals.api' has no attribute '{name}'" + ) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/internals/array_manager.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/internals/array_manager.py new file mode 100644 index 0000000000000000000000000000000000000000..e253f82256a5f6dd8b277b576a33597355d69dcc --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/internals/array_manager.py @@ -0,0 +1,1340 @@ +""" +Experimental manager based on storing a collection of 1D arrays +""" +from __future__ import annotations + +import itertools +from typing import ( + TYPE_CHECKING, + Callable, + Literal, +) + +import numpy as np + +from pandas._libs import ( + NaT, + lib, +) + +from pandas.core.dtypes.astype import ( + astype_array, + astype_array_safe, +) +from pandas.core.dtypes.cast import ( + ensure_dtype_can_hold_na, + find_common_type, + infer_dtype_from_scalar, + np_find_common_type, +) +from pandas.core.dtypes.common import ( + ensure_platform_int, + is_datetime64_ns_dtype, + is_integer, + is_numeric_dtype, + is_object_dtype, + is_timedelta64_ns_dtype, +) +from pandas.core.dtypes.dtypes import ExtensionDtype +from pandas.core.dtypes.generic import ( + ABCDataFrame, + ABCSeries, +) +from pandas.core.dtypes.missing import ( + array_equals, + isna, + na_value_for_dtype, +) + +import pandas.core.algorithms as algos +from pandas.core.array_algos.quantile import quantile_compat +from pandas.core.array_algos.take import take_1d +from pandas.core.arrays import ( + DatetimeArray, + ExtensionArray, + NumpyExtensionArray, + TimedeltaArray, +) +from pandas.core.construction import ( + ensure_wrapped_if_datetimelike, + extract_array, + sanitize_array, +) +from pandas.core.indexers import ( + maybe_convert_indices, + validate_indices, +) +from pandas.core.indexes.api import ( + Index, + ensure_index, +) +from pandas.core.indexes.base import get_values_for_csv +from pandas.core.internals.base import ( + DataManager, + SingleDataManager, + ensure_np_dtype, + interleaved_dtype, +) +from pandas.core.internals.blocks import ( + BlockPlacement, + ensure_block_shape, + external_values, + extract_pandas_array, + maybe_coerce_values, + new_block, +) +from pandas.core.internals.managers import make_na_array + +if TYPE_CHECKING: + from collections.abc import Hashable + + from pandas._typing import ( + ArrayLike, + AxisInt, + DtypeObj, + QuantileInterpolation, + Self, + npt, + ) + + +class BaseArrayManager(DataManager): + """ + Core internal data structure to implement DataFrame and Series. + + Alternative to the BlockManager, storing a list of 1D arrays instead of + Blocks. + + This is *not* a public API class + + Parameters + ---------- + arrays : Sequence of arrays + axes : Sequence of Index + verify_integrity : bool, default True + + """ + + __slots__ = [ + "_axes", # private attribute, because 'axes' has different order, see below + "arrays", + ] + + arrays: list[np.ndarray | ExtensionArray] + _axes: list[Index] + + def __init__( + self, + arrays: list[np.ndarray | ExtensionArray], + axes: list[Index], + verify_integrity: bool = True, + ) -> None: + raise NotImplementedError + + def make_empty(self, axes=None) -> Self: + """Return an empty ArrayManager with the items axis of len 0 (no columns)""" + if axes is None: + axes = [self.axes[1:], Index([])] + + arrays: list[np.ndarray | ExtensionArray] = [] + return type(self)(arrays, axes) + + @property + def items(self) -> Index: + return self._axes[-1] + + @property + # error: Signature of "axes" incompatible with supertype "DataManager" + def axes(self) -> list[Index]: # type: ignore[override] + # mypy doesn't work to override attribute with property + # see https://github.com/python/mypy/issues/4125 + """Axes is BlockManager-compatible order (columns, rows)""" + return [self._axes[1], self._axes[0]] + + @property + def shape_proper(self) -> tuple[int, ...]: + # this returns (n_rows, n_columns) + return tuple(len(ax) for ax in self._axes) + + @staticmethod + def _normalize_axis(axis: AxisInt) -> int: + # switch axis + axis = 1 if axis == 0 else 0 + return axis + + def set_axis(self, axis: AxisInt, new_labels: Index) -> None: + # Caller is responsible for ensuring we have an Index object. + self._validate_set_axis(axis, new_labels) + axis = self._normalize_axis(axis) + self._axes[axis] = new_labels + + def get_dtypes(self) -> npt.NDArray[np.object_]: + return np.array([arr.dtype for arr in self.arrays], dtype="object") + + def add_references(self, mgr: BaseArrayManager) -> None: + """ + Only implemented on the BlockManager level + """ + return + + def __getstate__(self): + return self.arrays, self._axes + + def __setstate__(self, state) -> None: + self.arrays = state[0] + self._axes = state[1] + + def __repr__(self) -> str: + output = type(self).__name__ + output += f"\nIndex: {self._axes[0]}" + if self.ndim == 2: + output += f"\nColumns: {self._axes[1]}" + output += f"\n{len(self.arrays)} arrays:" + for arr in self.arrays: + output += f"\n{arr.dtype}" + return output + + def apply( + self, + f, + align_keys: list[str] | None = None, + **kwargs, + ) -> Self: + """ + Iterate over the arrays, collect and create a new ArrayManager. + + Parameters + ---------- + f : str or callable + Name of the Array method to apply. + align_keys: List[str] or None, default None + **kwargs + Keywords to pass to `f` + + Returns + ------- + ArrayManager + """ + assert "filter" not in kwargs + + align_keys = align_keys or [] + result_arrays: list[ArrayLike] = [] + # fillna: Series/DataFrame is responsible for making sure value is aligned + + aligned_args = {k: kwargs[k] for k in align_keys} + + if f == "apply": + f = kwargs.pop("func") + + for i, arr in enumerate(self.arrays): + if aligned_args: + for k, obj in aligned_args.items(): + if isinstance(obj, (ABCSeries, ABCDataFrame)): + # The caller is responsible for ensuring that + # obj.axes[-1].equals(self.items) + if obj.ndim == 1: + kwargs[k] = obj.iloc[i] + else: + kwargs[k] = obj.iloc[:, i]._values + else: + # otherwise we have an array-like + kwargs[k] = obj[i] + + if callable(f): + applied = f(arr, **kwargs) + else: + applied = getattr(arr, f)(**kwargs) + + result_arrays.append(applied) + + new_axes = self._axes + return type(self)(result_arrays, new_axes) + + def apply_with_block(self, f, align_keys=None, **kwargs) -> Self: + # switch axis to follow BlockManager logic + swap_axis = True + if f == "interpolate": + swap_axis = False + if swap_axis and "axis" in kwargs and self.ndim == 2: + kwargs["axis"] = 1 if kwargs["axis"] == 0 else 0 + + align_keys = align_keys or [] + aligned_args = {k: kwargs[k] for k in align_keys} + + result_arrays = [] + + for i, arr in enumerate(self.arrays): + if aligned_args: + for k, obj in aligned_args.items(): + if isinstance(obj, (ABCSeries, ABCDataFrame)): + # The caller is responsible for ensuring that + # obj.axes[-1].equals(self.items) + if obj.ndim == 1: + if self.ndim == 2: + kwargs[k] = obj.iloc[slice(i, i + 1)]._values + else: + kwargs[k] = obj.iloc[:]._values + else: + kwargs[k] = obj.iloc[:, [i]]._values + else: + # otherwise we have an ndarray + if obj.ndim == 2: + kwargs[k] = obj[[i]] + + if isinstance(arr.dtype, np.dtype) and not isinstance(arr, np.ndarray): + # i.e. TimedeltaArray, DatetimeArray with tz=None. Need to + # convert for the Block constructors. + arr = np.asarray(arr) + + arr = maybe_coerce_values(arr) + if self.ndim == 2: + arr = ensure_block_shape(arr, 2) + bp = BlockPlacement(slice(0, 1, 1)) + block = new_block(arr, placement=bp, ndim=2) + else: + bp = BlockPlacement(slice(0, len(self), 1)) + block = new_block(arr, placement=bp, ndim=1) + + applied = getattr(block, f)(**kwargs) + if isinstance(applied, list): + applied = applied[0] + arr = applied.values + if self.ndim == 2 and arr.ndim == 2: + # 2D for np.ndarray or DatetimeArray/TimedeltaArray + assert len(arr) == 1 + # error: No overload variant of "__getitem__" of "ExtensionArray" + # matches argument type "Tuple[int, slice]" + arr = arr[0, :] # type: ignore[call-overload] + result_arrays.append(arr) + + return type(self)(result_arrays, self._axes) + + def setitem(self, indexer, value, warn: bool = True) -> Self: + return self.apply_with_block("setitem", indexer=indexer, value=value) + + def diff(self, n: int) -> Self: + assert self.ndim == 2 # caller ensures + return self.apply(algos.diff, n=n) + + def astype(self, dtype, copy: bool | None = False, errors: str = "raise") -> Self: + if copy is None: + copy = True + + return self.apply(astype_array_safe, dtype=dtype, copy=copy, errors=errors) + + def convert(self, copy: bool | None) -> Self: + if copy is None: + copy = True + + def _convert(arr): + if is_object_dtype(arr.dtype): + # extract NumpyExtensionArray for tests that patch + # NumpyExtensionArray._typ + arr = np.asarray(arr) + result = lib.maybe_convert_objects( + arr, + convert_non_numeric=True, + ) + if result is arr and copy: + return arr.copy() + return result + else: + return arr.copy() if copy else arr + + return self.apply(_convert) + + def get_values_for_csv( + self, *, float_format, date_format, decimal, na_rep: str = "nan", quoting=None + ) -> Self: + return self.apply( + get_values_for_csv, + na_rep=na_rep, + quoting=quoting, + float_format=float_format, + date_format=date_format, + decimal=decimal, + ) + + @property + def any_extension_types(self) -> bool: + """Whether any of the blocks in this manager are extension blocks""" + return False # any(block.is_extension for block in self.blocks) + + @property + def is_view(self) -> bool: + """return a boolean if we are a single block and are a view""" + # TODO what is this used for? + return False + + @property + def is_single_block(self) -> bool: + return len(self.arrays) == 1 + + def _get_data_subset(self, predicate: Callable) -> Self: + indices = [i for i, arr in enumerate(self.arrays) if predicate(arr)] + arrays = [self.arrays[i] for i in indices] + # TODO copy? + # Note: using Index.take ensures we can retain e.g. DatetimeIndex.freq, + # see test_describe_datetime_columns + taker = np.array(indices, dtype="intp") + new_cols = self._axes[1].take(taker) + new_axes = [self._axes[0], new_cols] + return type(self)(arrays, new_axes, verify_integrity=False) + + def get_bool_data(self, copy: bool = False) -> Self: + """ + Select columns that are bool-dtype and object-dtype columns that are all-bool. + + Parameters + ---------- + copy : bool, default False + Whether to copy the blocks + """ + return self._get_data_subset(lambda x: x.dtype == np.dtype(bool)) + + def get_numeric_data(self, copy: bool = False) -> Self: + """ + Select columns that have a numeric dtype. + + Parameters + ---------- + copy : bool, default False + Whether to copy the blocks + """ + return self._get_data_subset( + lambda arr: is_numeric_dtype(arr.dtype) + or getattr(arr.dtype, "_is_numeric", False) + ) + + def copy(self, deep: bool | Literal["all"] | None = True) -> Self: + """ + Make deep or shallow copy of ArrayManager + + Parameters + ---------- + deep : bool or string, default True + If False, return shallow copy (do not copy data) + If 'all', copy data and a deep copy of the index + + Returns + ------- + BlockManager + """ + if deep is None: + # ArrayManager does not yet support CoW, so deep=None always means + # deep=True for now + deep = True + + # this preserves the notion of view copying of axes + if deep: + # hit in e.g. tests.io.json.test_pandas + + def copy_func(ax): + return ax.copy(deep=True) if deep == "all" else ax.view() + + new_axes = [copy_func(ax) for ax in self._axes] + else: + new_axes = list(self._axes) + + if deep: + new_arrays = [arr.copy() for arr in self.arrays] + else: + new_arrays = list(self.arrays) + return type(self)(new_arrays, new_axes, verify_integrity=False) + + def reindex_indexer( + self, + new_axis, + indexer, + axis: AxisInt, + fill_value=None, + allow_dups: bool = False, + copy: bool | None = True, + # ignored keywords + only_slice: bool = False, + # ArrayManager specific keywords + use_na_proxy: bool = False, + ) -> Self: + axis = self._normalize_axis(axis) + return self._reindex_indexer( + new_axis, + indexer, + axis, + fill_value, + allow_dups, + copy, + use_na_proxy, + ) + + def _reindex_indexer( + self, + new_axis, + indexer: npt.NDArray[np.intp] | None, + axis: AxisInt, + fill_value=None, + allow_dups: bool = False, + copy: bool | None = True, + use_na_proxy: bool = False, + ) -> Self: + """ + Parameters + ---------- + new_axis : Index + indexer : ndarray[intp] or None + axis : int + fill_value : object, default None + allow_dups : bool, default False + copy : bool, default True + + + pandas-indexer with -1's only. + """ + if copy is None: + # ArrayManager does not yet support CoW, so deep=None always means + # deep=True for now + copy = True + + if indexer is None: + if new_axis is self._axes[axis] and not copy: + return self + + result = self.copy(deep=copy) + result._axes = list(self._axes) + result._axes[axis] = new_axis + return result + + # some axes don't allow reindexing with dups + if not allow_dups: + self._axes[axis]._validate_can_reindex(indexer) + + if axis >= self.ndim: + raise IndexError("Requested axis not found in manager") + + if axis == 1: + new_arrays = [] + for i in indexer: + if i == -1: + arr = self._make_na_array( + fill_value=fill_value, use_na_proxy=use_na_proxy + ) + else: + arr = self.arrays[i] + if copy: + arr = arr.copy() + new_arrays.append(arr) + + else: + validate_indices(indexer, len(self._axes[0])) + indexer = ensure_platform_int(indexer) + mask = indexer == -1 + needs_masking = mask.any() + new_arrays = [ + take_1d( + arr, + indexer, + allow_fill=needs_masking, + fill_value=fill_value, + mask=mask, + # if fill_value is not None else blk.fill_value + ) + for arr in self.arrays + ] + + new_axes = list(self._axes) + new_axes[axis] = new_axis + + return type(self)(new_arrays, new_axes, verify_integrity=False) + + def take( + self, + indexer: npt.NDArray[np.intp], + axis: AxisInt = 1, + verify: bool = True, + ) -> Self: + """ + Take items along any axis. + """ + assert isinstance(indexer, np.ndarray), type(indexer) + assert indexer.dtype == np.intp, indexer.dtype + + axis = self._normalize_axis(axis) + + if not indexer.ndim == 1: + raise ValueError("indexer should be 1-dimensional") + + n = self.shape_proper[axis] + indexer = maybe_convert_indices(indexer, n, verify=verify) + + new_labels = self._axes[axis].take(indexer) + return self._reindex_indexer( + new_axis=new_labels, indexer=indexer, axis=axis, allow_dups=True + ) + + def _make_na_array(self, fill_value=None, use_na_proxy: bool = False): + if use_na_proxy: + assert fill_value is None + return NullArrayProxy(self.shape_proper[0]) + + if fill_value is None: + fill_value = np.nan + + dtype, fill_value = infer_dtype_from_scalar(fill_value) + array_values = make_na_array(dtype, self.shape_proper[:1], fill_value) + return array_values + + def _equal_values(self, other) -> bool: + """ + Used in .equals defined in base class. Only check the column values + assuming shape and indexes have already been checked. + """ + for left, right in zip(self.arrays, other.arrays): + if not array_equals(left, right): + return False + return True + + # TODO + # to_dict + + +class ArrayManager(BaseArrayManager): + @property + def ndim(self) -> Literal[2]: + return 2 + + def __init__( + self, + arrays: list[np.ndarray | ExtensionArray], + axes: list[Index], + verify_integrity: bool = True, + ) -> None: + # Note: we are storing the axes in "_axes" in the (row, columns) order + # which contrasts the order how it is stored in BlockManager + self._axes = axes + self.arrays = arrays + + if verify_integrity: + self._axes = [ensure_index(ax) for ax in axes] + arrays = [extract_pandas_array(x, None, 1)[0] for x in arrays] + self.arrays = [maybe_coerce_values(arr) for arr in arrays] + self._verify_integrity() + + def _verify_integrity(self) -> None: + n_rows, n_columns = self.shape_proper + if not len(self.arrays) == n_columns: + raise ValueError( + "Number of passed arrays must equal the size of the column Index: " + f"{len(self.arrays)} arrays vs {n_columns} columns." + ) + for arr in self.arrays: + if not len(arr) == n_rows: + raise ValueError( + "Passed arrays should have the same length as the rows Index: " + f"{len(arr)} vs {n_rows} rows" + ) + if not isinstance(arr, (np.ndarray, ExtensionArray)): + raise ValueError( + "Passed arrays should be np.ndarray or ExtensionArray instances, " + f"got {type(arr)} instead" + ) + if not arr.ndim == 1: + raise ValueError( + "Passed arrays should be 1-dimensional, got array with " + f"{arr.ndim} dimensions instead." + ) + + # -------------------------------------------------------------------- + # Indexing + + def fast_xs(self, loc: int) -> SingleArrayManager: + """ + Return the array corresponding to `frame.iloc[loc]`. + + Parameters + ---------- + loc : int + + Returns + ------- + np.ndarray or ExtensionArray + """ + dtype = interleaved_dtype([arr.dtype for arr in self.arrays]) + + values = [arr[loc] for arr in self.arrays] + if isinstance(dtype, ExtensionDtype): + result = dtype.construct_array_type()._from_sequence(values, dtype=dtype) + # for datetime64/timedelta64, the np.ndarray constructor cannot handle pd.NaT + elif is_datetime64_ns_dtype(dtype): + result = DatetimeArray._from_sequence(values, dtype=dtype)._ndarray + elif is_timedelta64_ns_dtype(dtype): + result = TimedeltaArray._from_sequence(values, dtype=dtype)._ndarray + else: + result = np.array(values, dtype=dtype) + return SingleArrayManager([result], [self._axes[1]]) + + def get_slice(self, slobj: slice, axis: AxisInt = 0) -> ArrayManager: + axis = self._normalize_axis(axis) + + if axis == 0: + arrays = [arr[slobj] for arr in self.arrays] + elif axis == 1: + arrays = self.arrays[slobj] + + new_axes = list(self._axes) + new_axes[axis] = new_axes[axis]._getitem_slice(slobj) + + return type(self)(arrays, new_axes, verify_integrity=False) + + def iget(self, i: int) -> SingleArrayManager: + """ + Return the data as a SingleArrayManager. + """ + values = self.arrays[i] + return SingleArrayManager([values], [self._axes[0]]) + + def iget_values(self, i: int) -> ArrayLike: + """ + Return the data for column i as the values (ndarray or ExtensionArray). + """ + return self.arrays[i] + + @property + def column_arrays(self) -> list[ArrayLike]: + """ + Used in the JSON C code to access column arrays. + """ + + return [np.asarray(arr) for arr in self.arrays] + + def iset( + self, + loc: int | slice | np.ndarray, + value: ArrayLike, + inplace: bool = False, + refs=None, + ) -> None: + """ + Set new column(s). + + This changes the ArrayManager in-place, but replaces (an) existing + column(s), not changing column values in-place). + + Parameters + ---------- + loc : integer, slice or boolean mask + Positional location (already bounds checked) + value : np.ndarray or ExtensionArray + inplace : bool, default False + Whether overwrite existing array as opposed to replacing it. + """ + # single column -> single integer index + if lib.is_integer(loc): + # TODO can we avoid needing to unpack this here? That means converting + # DataFrame into 1D array when loc is an integer + if isinstance(value, np.ndarray) and value.ndim == 2: + assert value.shape[1] == 1 + value = value[:, 0] + + # TODO we receive a datetime/timedelta64 ndarray from DataFrame._iset_item + # but we should avoid that and pass directly the proper array + value = maybe_coerce_values(value) + + assert isinstance(value, (np.ndarray, ExtensionArray)) + assert value.ndim == 1 + assert len(value) == len(self._axes[0]) + self.arrays[loc] = value + return + + # multiple columns -> convert slice or array to integer indices + elif isinstance(loc, slice): + indices: range | np.ndarray = range( + loc.start if loc.start is not None else 0, + loc.stop if loc.stop is not None else self.shape_proper[1], + loc.step if loc.step is not None else 1, + ) + else: + assert isinstance(loc, np.ndarray) + assert loc.dtype == "bool" + indices = np.nonzero(loc)[0] + + assert value.ndim == 2 + assert value.shape[0] == len(self._axes[0]) + + for value_idx, mgr_idx in enumerate(indices): + # error: No overload variant of "__getitem__" of "ExtensionArray" matches + # argument type "Tuple[slice, int]" + value_arr = value[:, value_idx] # type: ignore[call-overload] + self.arrays[mgr_idx] = value_arr + return + + def column_setitem( + self, loc: int, idx: int | slice | np.ndarray, value, inplace_only: bool = False + ) -> None: + """ + Set values ("setitem") into a single column (not setting the full column). + + This is a method on the ArrayManager level, to avoid creating an + intermediate Series at the DataFrame level (`s = df[loc]; s[idx] = value`) + """ + if not is_integer(loc): + raise TypeError("The column index should be an integer") + arr = self.arrays[loc] + mgr = SingleArrayManager([arr], [self._axes[0]]) + if inplace_only: + mgr.setitem_inplace(idx, value) + else: + new_mgr = mgr.setitem((idx,), value) + # update existing ArrayManager in-place + self.arrays[loc] = new_mgr.arrays[0] + + def insert(self, loc: int, item: Hashable, value: ArrayLike, refs=None) -> None: + """ + Insert item at selected position. + + Parameters + ---------- + loc : int + item : hashable + value : np.ndarray or ExtensionArray + """ + # insert to the axis; this could possibly raise a TypeError + new_axis = self.items.insert(loc, item) + + value = extract_array(value, extract_numpy=True) + if value.ndim == 2: + if value.shape[0] == 1: + # error: No overload variant of "__getitem__" of "ExtensionArray" + # matches argument type "Tuple[int, slice]" + value = value[0, :] # type: ignore[call-overload] + else: + raise ValueError( + f"Expected a 1D array, got an array with shape {value.shape}" + ) + value = maybe_coerce_values(value) + + # TODO self.arrays can be empty + # assert len(value) == len(self.arrays[0]) + + # TODO is this copy needed? + arrays = self.arrays.copy() + arrays.insert(loc, value) + + self.arrays = arrays + self._axes[1] = new_axis + + def idelete(self, indexer) -> ArrayManager: + """ + Delete selected locations in-place (new block and array, same BlockManager) + """ + to_keep = np.ones(self.shape[0], dtype=np.bool_) + to_keep[indexer] = False + + self.arrays = [self.arrays[i] for i in np.nonzero(to_keep)[0]] + self._axes = [self._axes[0], self._axes[1][to_keep]] + return self + + # -------------------------------------------------------------------- + # Array-wise Operation + + def grouped_reduce(self, func: Callable) -> Self: + """ + Apply grouped reduction function columnwise, returning a new ArrayManager. + + Parameters + ---------- + func : grouped reduction function + + Returns + ------- + ArrayManager + """ + result_arrays: list[np.ndarray] = [] + result_indices: list[int] = [] + + for i, arr in enumerate(self.arrays): + # grouped_reduce functions all expect 2D arrays + arr = ensure_block_shape(arr, ndim=2) + res = func(arr) + if res.ndim == 2: + # reverse of ensure_block_shape + assert res.shape[0] == 1 + res = res[0] + + result_arrays.append(res) + result_indices.append(i) + + if len(result_arrays) == 0: + nrows = 0 + else: + nrows = result_arrays[0].shape[0] + index = Index(range(nrows)) + + columns = self.items + + # error: Argument 1 to "ArrayManager" has incompatible type "List[ndarray]"; + # expected "List[Union[ndarray, ExtensionArray]]" + return type(self)(result_arrays, [index, columns]) # type: ignore[arg-type] + + def reduce(self, func: Callable) -> Self: + """ + Apply reduction function column-wise, returning a single-row ArrayManager. + + Parameters + ---------- + func : reduction function + + Returns + ------- + ArrayManager + """ + result_arrays: list[np.ndarray] = [] + for i, arr in enumerate(self.arrays): + res = func(arr, axis=0) + + # TODO NaT doesn't preserve dtype, so we need to ensure to create + # a timedelta result array if original was timedelta + # what if datetime results in timedelta? (eg std) + dtype = arr.dtype if res is NaT else None + result_arrays.append( + sanitize_array([res], None, dtype=dtype) # type: ignore[arg-type] + ) + + index = Index._simple_new(np.array([None], dtype=object)) # placeholder + columns = self.items + + # error: Argument 1 to "ArrayManager" has incompatible type "List[ndarray]"; + # expected "List[Union[ndarray, ExtensionArray]]" + new_mgr = type(self)(result_arrays, [index, columns]) # type: ignore[arg-type] + return new_mgr + + def operate_blockwise(self, other: ArrayManager, array_op) -> ArrayManager: + """ + Apply array_op blockwise with another (aligned) BlockManager. + """ + # TODO what if `other` is BlockManager ? + left_arrays = self.arrays + right_arrays = other.arrays + result_arrays = [ + array_op(left, right) for left, right in zip(left_arrays, right_arrays) + ] + return type(self)(result_arrays, self._axes) + + def quantile( + self, + *, + qs: Index, # with dtype float64 + transposed: bool = False, + interpolation: QuantileInterpolation = "linear", + ) -> ArrayManager: + arrs = [ensure_block_shape(x, 2) for x in self.arrays] + new_arrs = [ + quantile_compat(x, np.asarray(qs._values), interpolation) for x in arrs + ] + for i, arr in enumerate(new_arrs): + if arr.ndim == 2: + assert arr.shape[0] == 1, arr.shape + new_arrs[i] = arr[0] + + axes = [qs, self._axes[1]] + return type(self)(new_arrs, axes) + + # ---------------------------------------------------------------- + + def unstack(self, unstacker, fill_value) -> ArrayManager: + """ + Return a BlockManager with all blocks unstacked. + + Parameters + ---------- + unstacker : reshape._Unstacker + fill_value : Any + fill_value for newly introduced missing values. + + Returns + ------- + unstacked : BlockManager + """ + indexer, _ = unstacker._indexer_and_to_sort + if unstacker.mask.all(): + new_indexer = indexer + allow_fill = False + new_mask2D = None + needs_masking = None + else: + new_indexer = np.full(unstacker.mask.shape, -1) + new_indexer[unstacker.mask] = indexer + allow_fill = True + # calculating the full mask once and passing it to take_1d is faster + # than letting take_1d calculate it in each repeated call + new_mask2D = (~unstacker.mask).reshape(*unstacker.full_shape) + needs_masking = new_mask2D.any(axis=0) + new_indexer2D = new_indexer.reshape(*unstacker.full_shape) + new_indexer2D = ensure_platform_int(new_indexer2D) + + new_arrays = [] + for arr in self.arrays: + for i in range(unstacker.full_shape[1]): + if allow_fill: + # error: Value of type "Optional[Any]" is not indexable [index] + new_arr = take_1d( + arr, + new_indexer2D[:, i], + allow_fill=needs_masking[i], # type: ignore[index] + fill_value=fill_value, + mask=new_mask2D[:, i], # type: ignore[index] + ) + else: + new_arr = take_1d(arr, new_indexer2D[:, i], allow_fill=False) + new_arrays.append(new_arr) + + new_index = unstacker.new_index + new_columns = unstacker.get_new_columns(self._axes[1]) + new_axes = [new_index, new_columns] + + return type(self)(new_arrays, new_axes, verify_integrity=False) + + def as_array( + self, + dtype=None, + copy: bool = False, + na_value: object = lib.no_default, + ) -> np.ndarray: + """ + Convert the blockmanager data into an numpy array. + + Parameters + ---------- + dtype : object, default None + Data type of the return array. + copy : bool, default False + If True then guarantee that a copy is returned. A value of + False does not guarantee that the underlying data is not + copied. + na_value : object, default lib.no_default + Value to be used as the missing value sentinel. + + Returns + ------- + arr : ndarray + """ + if len(self.arrays) == 0: + empty_arr = np.empty(self.shape, dtype=float) + return empty_arr.transpose() + + # We want to copy when na_value is provided to avoid + # mutating the original object + copy = copy or na_value is not lib.no_default + + if not dtype: + dtype = interleaved_dtype([arr.dtype for arr in self.arrays]) + + dtype = ensure_np_dtype(dtype) + + result = np.empty(self.shape_proper, dtype=dtype) + + for i, arr in enumerate(self.arrays): + arr = arr.astype(dtype, copy=copy) + result[:, i] = arr + + if na_value is not lib.no_default: + result[isna(result)] = na_value + + return result + + @classmethod + def concat_horizontal(cls, mgrs: list[Self], axes: list[Index]) -> Self: + """ + Concatenate uniformly-indexed ArrayManagers horizontally. + """ + # concatting along the columns -> combine reindexed arrays in a single manager + arrays = list(itertools.chain.from_iterable([mgr.arrays for mgr in mgrs])) + new_mgr = cls(arrays, [axes[1], axes[0]], verify_integrity=False) + return new_mgr + + @classmethod + def concat_vertical(cls, mgrs: list[Self], axes: list[Index]) -> Self: + """ + Concatenate uniformly-indexed ArrayManagers vertically. + """ + # concatting along the rows -> concat the reindexed arrays + # TODO(ArrayManager) doesn't yet preserve the correct dtype + arrays = [ + concat_arrays([mgrs[i].arrays[j] for i in range(len(mgrs))]) + for j in range(len(mgrs[0].arrays)) + ] + new_mgr = cls(arrays, [axes[1], axes[0]], verify_integrity=False) + return new_mgr + + +class SingleArrayManager(BaseArrayManager, SingleDataManager): + __slots__ = [ + "_axes", # private attribute, because 'axes' has different order, see below + "arrays", + ] + + arrays: list[np.ndarray | ExtensionArray] + _axes: list[Index] + + @property + def ndim(self) -> Literal[1]: + return 1 + + def __init__( + self, + arrays: list[np.ndarray | ExtensionArray], + axes: list[Index], + verify_integrity: bool = True, + ) -> None: + self._axes = axes + self.arrays = arrays + + if verify_integrity: + assert len(axes) == 1 + assert len(arrays) == 1 + self._axes = [ensure_index(ax) for ax in self._axes] + arr = arrays[0] + arr = maybe_coerce_values(arr) + arr = extract_pandas_array(arr, None, 1)[0] + self.arrays = [arr] + self._verify_integrity() + + def _verify_integrity(self) -> None: + (n_rows,) = self.shape + assert len(self.arrays) == 1 + arr = self.arrays[0] + assert len(arr) == n_rows + if not arr.ndim == 1: + raise ValueError( + "Passed array should be 1-dimensional, got array with " + f"{arr.ndim} dimensions instead." + ) + + @staticmethod + def _normalize_axis(axis): + return axis + + def make_empty(self, axes=None) -> Self: + """Return an empty ArrayManager with index/array of length 0""" + if axes is None: + axes = [Index([], dtype=object)] + array: np.ndarray = np.array([], dtype=self.dtype) + return type(self)([array], axes) + + @classmethod + def from_array(cls, array, index) -> SingleArrayManager: + return cls([array], [index]) + + # error: Cannot override writeable attribute with read-only property + @property + def axes(self) -> list[Index]: # type: ignore[override] + return self._axes + + @property + def index(self) -> Index: + return self._axes[0] + + @property + def dtype(self): + return self.array.dtype + + def external_values(self): + """The array that Series.values returns""" + return external_values(self.array) + + def internal_values(self): + """The array that Series._values returns""" + return self.array + + def array_values(self): + """The array that Series.array returns""" + arr = self.array + if isinstance(arr, np.ndarray): + arr = NumpyExtensionArray(arr) + return arr + + @property + def _can_hold_na(self) -> bool: + if isinstance(self.array, np.ndarray): + return self.array.dtype.kind not in "iub" + else: + # ExtensionArray + return self.array._can_hold_na + + @property + def is_single_block(self) -> bool: + return True + + def fast_xs(self, loc: int) -> SingleArrayManager: + raise NotImplementedError("Use series._values[loc] instead") + + def get_slice(self, slobj: slice, axis: AxisInt = 0) -> SingleArrayManager: + if axis >= self.ndim: + raise IndexError("Requested axis not found in manager") + + new_array = self.array[slobj] + new_index = self.index._getitem_slice(slobj) + return type(self)([new_array], [new_index], verify_integrity=False) + + def get_rows_with_mask(self, indexer: npt.NDArray[np.bool_]) -> SingleArrayManager: + new_array = self.array[indexer] + new_index = self.index[indexer] + return type(self)([new_array], [new_index]) + + # error: Signature of "apply" incompatible with supertype "BaseArrayManager" + def apply(self, func, **kwargs) -> Self: # type: ignore[override] + if callable(func): + new_array = func(self.array, **kwargs) + else: + new_array = getattr(self.array, func)(**kwargs) + return type(self)([new_array], self._axes) + + def setitem(self, indexer, value, warn: bool = True) -> SingleArrayManager: + """ + Set values with indexer. + + For SingleArrayManager, this backs s[indexer] = value + + See `setitem_inplace` for a version that works inplace and doesn't + return a new Manager. + """ + if isinstance(indexer, np.ndarray) and indexer.ndim > self.ndim: + raise ValueError(f"Cannot set values with ndim > {self.ndim}") + return self.apply_with_block("setitem", indexer=indexer, value=value) + + def idelete(self, indexer) -> SingleArrayManager: + """ + Delete selected locations in-place (new array, same ArrayManager) + """ + to_keep = np.ones(self.shape[0], dtype=np.bool_) + to_keep[indexer] = False + + self.arrays = [self.arrays[0][to_keep]] + self._axes = [self._axes[0][to_keep]] + return self + + def _get_data_subset(self, predicate: Callable) -> SingleArrayManager: + # used in get_numeric_data / get_bool_data + if predicate(self.array): + return type(self)(self.arrays, self._axes, verify_integrity=False) + else: + return self.make_empty() + + def set_values(self, values: ArrayLike) -> None: + """ + Set (replace) the values of the SingleArrayManager in place. + + Use at your own risk! This does not check if the passed values are + valid for the current SingleArrayManager (length, dtype, etc). + """ + self.arrays[0] = values + + def to_2d_mgr(self, columns: Index) -> ArrayManager: + """ + Manager analogue of Series.to_frame + """ + arrays = [self.arrays[0]] + axes = [self.axes[0], columns] + + return ArrayManager(arrays, axes, verify_integrity=False) + + +class NullArrayProxy: + """ + Proxy object for an all-NA array. + + Only stores the length of the array, and not the dtype. The dtype + will only be known when actually concatenating (after determining the + common dtype, for which this proxy is ignored). + Using this object avoids that the internals/concat.py needs to determine + the proper dtype and array type. + """ + + ndim = 1 + + def __init__(self, n: int) -> None: + self.n = n + + @property + def shape(self) -> tuple[int]: + return (self.n,) + + def to_array(self, dtype: DtypeObj) -> ArrayLike: + """ + Helper function to create the actual all-NA array from the NullArrayProxy + object. + + Parameters + ---------- + arr : NullArrayProxy + dtype : the dtype for the resulting array + + Returns + ------- + np.ndarray or ExtensionArray + """ + if isinstance(dtype, ExtensionDtype): + empty = dtype.construct_array_type()._from_sequence([], dtype=dtype) + indexer = -np.ones(self.n, dtype=np.intp) + return empty.take(indexer, allow_fill=True) + else: + # when introducing missing values, int becomes float, bool becomes object + dtype = ensure_dtype_can_hold_na(dtype) + fill_value = na_value_for_dtype(dtype) + arr = np.empty(self.n, dtype=dtype) + arr.fill(fill_value) + return ensure_wrapped_if_datetimelike(arr) + + +def concat_arrays(to_concat: list) -> ArrayLike: + """ + Alternative for concat_compat but specialized for use in the ArrayManager. + + Differences: only deals with 1D arrays (no axis keyword), assumes + ensure_wrapped_if_datetimelike and does not skip empty arrays to determine + the dtype. + In addition ensures that all NullArrayProxies get replaced with actual + arrays. + + Parameters + ---------- + to_concat : list of arrays + + Returns + ------- + np.ndarray or ExtensionArray + """ + # ignore the all-NA proxies to determine the resulting dtype + to_concat_no_proxy = [x for x in to_concat if not isinstance(x, NullArrayProxy)] + + dtypes = {x.dtype for x in to_concat_no_proxy} + single_dtype = len(dtypes) == 1 + + if single_dtype: + target_dtype = to_concat_no_proxy[0].dtype + elif all(lib.is_np_dtype(x, "iub") for x in dtypes): + # GH#42092 + target_dtype = np_find_common_type(*dtypes) + else: + target_dtype = find_common_type([arr.dtype for arr in to_concat_no_proxy]) + + to_concat = [ + arr.to_array(target_dtype) + if isinstance(arr, NullArrayProxy) + else astype_array(arr, target_dtype, copy=False) + for arr in to_concat + ] + + if isinstance(to_concat[0], ExtensionArray): + cls = type(to_concat[0]) + return cls._concat_same_type(to_concat) + + result = np.concatenate(to_concat) + + # TODO decide on exact behaviour (we shouldn't do this only for empty result) + # see https://github.com/pandas-dev/pandas/issues/39817 + if len(result) == 0: + # all empties -> check for bool to not coerce to float + kinds = {obj.dtype.kind for obj in to_concat_no_proxy} + if len(kinds) != 1: + if "b" in kinds: + result = result.astype(object) + return result diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/internals/base.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/internals/base.py new file mode 100644 index 0000000000000000000000000000000000000000..ae91f167205a0628c4bcf9b61ce58e888fe6ec8e --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/internals/base.py @@ -0,0 +1,407 @@ +""" +Base class for the internal managers. Both BlockManager and ArrayManager +inherit from this class. +""" +from __future__ import annotations + +from typing import ( + TYPE_CHECKING, + Any, + Literal, + cast, + final, +) + +import numpy as np + +from pandas._config import ( + using_copy_on_write, + warn_copy_on_write, +) + +from pandas._libs import ( + algos as libalgos, + lib, +) +from pandas.errors import AbstractMethodError +from pandas.util._validators import validate_bool_kwarg + +from pandas.core.dtypes.cast import ( + find_common_type, + np_can_hold_element, +) +from pandas.core.dtypes.dtypes import ( + ExtensionDtype, + SparseDtype, +) + +from pandas.core.base import PandasObject +from pandas.core.construction import extract_array +from pandas.core.indexes.api import ( + Index, + default_index, +) + +if TYPE_CHECKING: + from pandas._typing import ( + ArrayLike, + AxisInt, + DtypeObj, + Self, + Shape, + ) + + +class _AlreadyWarned: + def __init__(self): + # This class is used on the manager level to the block level to + # ensure that we warn only once. The block method can update the + # warned_already option without returning a value to keep the + # interface consistent. This is only a temporary solution for + # CoW warnings. + self.warned_already = False + + +class DataManager(PandasObject): + # TODO share more methods/attributes + + axes: list[Index] + + @property + def items(self) -> Index: + raise AbstractMethodError(self) + + @final + def __len__(self) -> int: + return len(self.items) + + @property + def ndim(self) -> int: + return len(self.axes) + + @property + def shape(self) -> Shape: + return tuple(len(ax) for ax in self.axes) + + @final + def _validate_set_axis(self, axis: AxisInt, new_labels: Index) -> None: + # Caller is responsible for ensuring we have an Index object. + old_len = len(self.axes[axis]) + new_len = len(new_labels) + + if axis == 1 and len(self.items) == 0: + # If we are setting the index on a DataFrame with no columns, + # it is OK to change the length. + pass + + elif new_len != old_len: + raise ValueError( + f"Length mismatch: Expected axis has {old_len} elements, new " + f"values have {new_len} elements" + ) + + def reindex_indexer( + self, + new_axis, + indexer, + axis: AxisInt, + fill_value=None, + allow_dups: bool = False, + copy: bool = True, + only_slice: bool = False, + ) -> Self: + raise AbstractMethodError(self) + + @final + def reindex_axis( + self, + new_index: Index, + axis: AxisInt, + fill_value=None, + only_slice: bool = False, + ) -> Self: + """ + Conform data manager to new index. + """ + new_index, indexer = self.axes[axis].reindex(new_index) + + return self.reindex_indexer( + new_index, + indexer, + axis=axis, + fill_value=fill_value, + copy=False, + only_slice=only_slice, + ) + + def _equal_values(self, other: Self) -> bool: + """ + To be implemented by the subclasses. Only check the column values + assuming shape and indexes have already been checked. + """ + raise AbstractMethodError(self) + + @final + def equals(self, other: object) -> bool: + """ + Implementation for DataFrame.equals + """ + if not isinstance(other, type(self)): + return False + + self_axes, other_axes = self.axes, other.axes + if len(self_axes) != len(other_axes): + return False + if not all(ax1.equals(ax2) for ax1, ax2 in zip(self_axes, other_axes)): + return False + + return self._equal_values(other) + + def apply( + self, + f, + align_keys: list[str] | None = None, + **kwargs, + ) -> Self: + raise AbstractMethodError(self) + + def apply_with_block( + self, + f, + align_keys: list[str] | None = None, + **kwargs, + ) -> Self: + raise AbstractMethodError(self) + + @final + def isna(self, func) -> Self: + return self.apply("apply", func=func) + + @final + def fillna(self, value, limit: int | None, inplace: bool, downcast) -> Self: + if limit is not None: + # Do this validation even if we go through one of the no-op paths + limit = libalgos.validate_limit(None, limit=limit) + + return self.apply_with_block( + "fillna", + value=value, + limit=limit, + inplace=inplace, + downcast=downcast, + using_cow=using_copy_on_write(), + already_warned=_AlreadyWarned(), + ) + + @final + def where(self, other, cond, align: bool) -> Self: + if align: + align_keys = ["other", "cond"] + else: + align_keys = ["cond"] + other = extract_array(other, extract_numpy=True) + + return self.apply_with_block( + "where", + align_keys=align_keys, + other=other, + cond=cond, + using_cow=using_copy_on_write(), + ) + + @final + def putmask(self, mask, new, align: bool = True, warn: bool = True) -> Self: + if align: + align_keys = ["new", "mask"] + else: + align_keys = ["mask"] + new = extract_array(new, extract_numpy=True) + + already_warned = None + if warn_copy_on_write(): + already_warned = _AlreadyWarned() + if not warn: + already_warned.warned_already = True + + return self.apply_with_block( + "putmask", + align_keys=align_keys, + mask=mask, + new=new, + using_cow=using_copy_on_write(), + already_warned=already_warned, + ) + + @final + def round(self, decimals: int, using_cow: bool = False) -> Self: + return self.apply_with_block( + "round", + decimals=decimals, + using_cow=using_cow, + ) + + @final + def replace(self, to_replace, value, inplace: bool) -> Self: + inplace = validate_bool_kwarg(inplace, "inplace") + # NDFrame.replace ensures the not-is_list_likes here + assert not lib.is_list_like(to_replace) + assert not lib.is_list_like(value) + return self.apply_with_block( + "replace", + to_replace=to_replace, + value=value, + inplace=inplace, + using_cow=using_copy_on_write(), + already_warned=_AlreadyWarned(), + ) + + @final + def replace_regex(self, **kwargs) -> Self: + return self.apply_with_block( + "_replace_regex", + **kwargs, + using_cow=using_copy_on_write(), + already_warned=_AlreadyWarned(), + ) + + @final + def replace_list( + self, + src_list: list[Any], + dest_list: list[Any], + inplace: bool = False, + regex: bool = False, + ) -> Self: + """do a list replace""" + inplace = validate_bool_kwarg(inplace, "inplace") + + bm = self.apply_with_block( + "replace_list", + src_list=src_list, + dest_list=dest_list, + inplace=inplace, + regex=regex, + using_cow=using_copy_on_write(), + already_warned=_AlreadyWarned(), + ) + bm._consolidate_inplace() + return bm + + def interpolate(self, inplace: bool, **kwargs) -> Self: + return self.apply_with_block( + "interpolate", + inplace=inplace, + **kwargs, + using_cow=using_copy_on_write(), + already_warned=_AlreadyWarned(), + ) + + def pad_or_backfill(self, inplace: bool, **kwargs) -> Self: + return self.apply_with_block( + "pad_or_backfill", + inplace=inplace, + **kwargs, + using_cow=using_copy_on_write(), + already_warned=_AlreadyWarned(), + ) + + def shift(self, periods: int, fill_value) -> Self: + if fill_value is lib.no_default: + fill_value = None + + return self.apply_with_block("shift", periods=periods, fill_value=fill_value) + + # -------------------------------------------------------------------- + # Consolidation: No-ops for all but BlockManager + + def is_consolidated(self) -> bool: + return True + + def consolidate(self) -> Self: + return self + + def _consolidate_inplace(self) -> None: + return + + +class SingleDataManager(DataManager): + @property + def ndim(self) -> Literal[1]: + return 1 + + @final + @property + def array(self) -> ArrayLike: + """ + Quick access to the backing array of the Block or SingleArrayManager. + """ + # error: "SingleDataManager" has no attribute "arrays"; maybe "array" + return self.arrays[0] # type: ignore[attr-defined] + + def setitem_inplace(self, indexer, value, warn: bool = True) -> None: + """ + Set values with indexer. + + For Single[Block/Array]Manager, this backs s[indexer] = value + + This is an inplace version of `setitem()`, mutating the manager/values + in place, not returning a new Manager (and Block), and thus never changing + the dtype. + """ + arr = self.array + + # EAs will do this validation in their own __setitem__ methods. + if isinstance(arr, np.ndarray): + # Note: checking for ndarray instead of np.dtype means we exclude + # dt64/td64, which do their own validation. + value = np_can_hold_element(arr.dtype, value) + + if isinstance(value, np.ndarray) and value.ndim == 1 and len(value) == 1: + # NumPy 1.25 deprecation: https://github.com/numpy/numpy/pull/10615 + value = value[0, ...] + + arr[indexer] = value + + def grouped_reduce(self, func): + arr = self.array + res = func(arr) + index = default_index(len(res)) + + mgr = type(self).from_array(res, index) + return mgr + + @classmethod + def from_array(cls, arr: ArrayLike, index: Index): + raise AbstractMethodError(cls) + + +def interleaved_dtype(dtypes: list[DtypeObj]) -> DtypeObj | None: + """ + Find the common dtype for `blocks`. + + Parameters + ---------- + blocks : List[DtypeObj] + + Returns + ------- + dtype : np.dtype, ExtensionDtype, or None + None is returned when `blocks` is empty. + """ + if not len(dtypes): + return None + + return find_common_type(dtypes) + + +def ensure_np_dtype(dtype: DtypeObj) -> np.dtype: + # TODO: https://github.com/pandas-dev/pandas/issues/22791 + # Give EAs some input on what happens here. Sparse needs this. + if isinstance(dtype, SparseDtype): + dtype = dtype.subtype + dtype = cast(np.dtype, dtype) + elif isinstance(dtype, ExtensionDtype): + dtype = np.dtype("object") + elif dtype == np.dtype(str): + dtype = np.dtype("object") + return dtype diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/internals/blocks.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/internals/blocks.py new file mode 100644 index 0000000000000000000000000000000000000000..259e969112dd7506b97520f3f2a683cbf7f7ef5d --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/internals/blocks.py @@ -0,0 +1,2850 @@ +from __future__ import annotations + +from functools import wraps +import inspect +import re +from typing import ( + TYPE_CHECKING, + Any, + Callable, + Literal, + cast, + final, +) +import warnings +import weakref + +import numpy as np + +from pandas._config import ( + get_option, + using_copy_on_write, + warn_copy_on_write, +) + +from pandas._libs import ( + NaT, + internals as libinternals, + lib, +) +from pandas._libs.internals import ( + BlockPlacement, + BlockValuesRefs, +) +from pandas._libs.missing import NA +from pandas._typing import ( + ArrayLike, + AxisInt, + DtypeBackend, + DtypeObj, + F, + FillnaOptions, + IgnoreRaise, + InterpolateOptions, + QuantileInterpolation, + Self, + Shape, + npt, +) +from pandas.errors import AbstractMethodError +from pandas.util._decorators import cache_readonly +from pandas.util._exceptions import find_stack_level +from pandas.util._validators import validate_bool_kwarg + +from pandas.core.dtypes.astype import ( + astype_array_safe, + astype_is_view, +) +from pandas.core.dtypes.cast import ( + LossySetitemError, + can_hold_element, + convert_dtypes, + find_result_type, + maybe_downcast_to_dtype, + np_can_hold_element, +) +from pandas.core.dtypes.common import ( + is_1d_only_ea_dtype, + is_float_dtype, + is_integer_dtype, + is_list_like, + is_scalar, + is_string_dtype, +) +from pandas.core.dtypes.dtypes import ( + DatetimeTZDtype, + ExtensionDtype, + IntervalDtype, + NumpyEADtype, + PeriodDtype, +) +from pandas.core.dtypes.generic import ( + ABCDataFrame, + ABCIndex, + ABCNumpyExtensionArray, + ABCSeries, +) +from pandas.core.dtypes.missing import ( + is_valid_na_for_dtype, + isna, + na_value_for_dtype, +) + +from pandas.core import missing +import pandas.core.algorithms as algos +from pandas.core.array_algos.putmask import ( + extract_bool_array, + putmask_inplace, + putmask_without_repeat, + setitem_datetimelike_compat, + validate_putmask, +) +from pandas.core.array_algos.quantile import quantile_compat +from pandas.core.array_algos.replace import ( + compare_or_regex_search, + replace_regex, + should_use_regex, +) +from pandas.core.array_algos.transforms import shift +from pandas.core.arrays import ( + Categorical, + DatetimeArray, + ExtensionArray, + IntervalArray, + NumpyExtensionArray, + PeriodArray, + TimedeltaArray, +) +from pandas.core.base import PandasObject +import pandas.core.common as com +from pandas.core.computation import expressions +from pandas.core.construction import ( + ensure_wrapped_if_datetimelike, + extract_array, +) +from pandas.core.indexers import check_setitem_lengths +from pandas.core.indexes.base import get_values_for_csv + +if TYPE_CHECKING: + from collections.abc import ( + Iterable, + Sequence, + ) + + from pandas.core.api import Index + from pandas.core.arrays._mixins import NDArrayBackedExtensionArray + +# comparison is faster than is_object_dtype +_dtype_obj = np.dtype("object") + + +COW_WARNING_GENERAL_MSG = """\ +Setting a value on a view: behaviour will change in pandas 3.0. +You are mutating a Series or DataFrame object, and currently this mutation will +also have effect on other Series or DataFrame objects that share data with this +object. In pandas 3.0 (with Copy-on-Write), updating one Series or DataFrame object +will never modify another. +""" + + +COW_WARNING_SETITEM_MSG = """\ +Setting a value on a view: behaviour will change in pandas 3.0. +Currently, the mutation will also have effect on the object that shares data +with this object. For example, when setting a value in a Series that was +extracted from a column of a DataFrame, that DataFrame will also be updated: + + ser = df["col"] + ser[0] = 0 <--- in pandas 2, this also updates `df` + +In pandas 3.0 (with Copy-on-Write), updating one Series/DataFrame will never +modify another, and thus in the example above, `df` will not be changed. +""" + + +def maybe_split(meth: F) -> F: + """ + If we have a multi-column block, split and operate block-wise. Otherwise + use the original method. + """ + + @wraps(meth) + def newfunc(self, *args, **kwargs) -> list[Block]: + if self.ndim == 1 or self.shape[0] == 1: + return meth(self, *args, **kwargs) + else: + # Split and operate column-by-column + return self.split_and_operate(meth, *args, **kwargs) + + return cast(F, newfunc) + + +class Block(PandasObject, libinternals.Block): + """ + Canonical n-dimensional unit of homogeneous dtype contained in a pandas + data structure + + Index-ignorant; let the container take care of that + """ + + values: np.ndarray | ExtensionArray + ndim: int + refs: BlockValuesRefs + __init__: Callable + + __slots__ = () + is_numeric = False + + @final + @cache_readonly + def _validate_ndim(self) -> bool: + """ + We validate dimension for blocks that can hold 2D values, which for now + means numpy dtypes or DatetimeTZDtype. + """ + dtype = self.dtype + return not isinstance(dtype, ExtensionDtype) or isinstance( + dtype, DatetimeTZDtype + ) + + @final + @cache_readonly + def is_object(self) -> bool: + return self.values.dtype == _dtype_obj + + @final + @cache_readonly + def is_extension(self) -> bool: + return not lib.is_np_dtype(self.values.dtype) + + @final + @cache_readonly + def _can_consolidate(self) -> bool: + # We _could_ consolidate for DatetimeTZDtype but don't for now. + return not self.is_extension + + @final + @cache_readonly + def _consolidate_key(self): + return self._can_consolidate, self.dtype.name + + @final + @cache_readonly + def _can_hold_na(self) -> bool: + """ + Can we store NA values in this Block? + """ + dtype = self.dtype + if isinstance(dtype, np.dtype): + return dtype.kind not in "iub" + return dtype._can_hold_na + + @final + @property + def is_bool(self) -> bool: + """ + We can be bool if a) we are bool dtype or b) object dtype with bool objects. + """ + return self.values.dtype == np.dtype(bool) + + @final + def external_values(self): + return external_values(self.values) + + @final + @cache_readonly + def fill_value(self): + # Used in reindex_indexer + return na_value_for_dtype(self.dtype, compat=False) + + @final + def _standardize_fill_value(self, value): + # if we are passed a scalar None, convert it here + if self.dtype != _dtype_obj and is_valid_na_for_dtype(value, self.dtype): + value = self.fill_value + return value + + @property + def mgr_locs(self) -> BlockPlacement: + return self._mgr_locs + + @mgr_locs.setter + def mgr_locs(self, new_mgr_locs: BlockPlacement) -> None: + self._mgr_locs = new_mgr_locs + + @final + def make_block( + self, + values, + placement: BlockPlacement | None = None, + refs: BlockValuesRefs | None = None, + ) -> Block: + """ + Create a new block, with type inference propagate any values that are + not specified + """ + if placement is None: + placement = self._mgr_locs + if self.is_extension: + values = ensure_block_shape(values, ndim=self.ndim) + + return new_block(values, placement=placement, ndim=self.ndim, refs=refs) + + @final + def make_block_same_class( + self, + values, + placement: BlockPlacement | None = None, + refs: BlockValuesRefs | None = None, + ) -> Self: + """Wrap given values in a block of same type as self.""" + # Pre-2.0 we called ensure_wrapped_if_datetimelike because fastparquet + # relied on it, as of 2.0 the caller is responsible for this. + if placement is None: + placement = self._mgr_locs + + # We assume maybe_coerce_values has already been called + return type(self)(values, placement=placement, ndim=self.ndim, refs=refs) + + @final + def __repr__(self) -> str: + # don't want to print out all of the items here + name = type(self).__name__ + if self.ndim == 1: + result = f"{name}: {len(self)} dtype: {self.dtype}" + else: + shape = " x ".join([str(s) for s in self.shape]) + result = f"{name}: {self.mgr_locs.indexer}, {shape}, dtype: {self.dtype}" + + return result + + @final + def __len__(self) -> int: + return len(self.values) + + @final + def slice_block_columns(self, slc: slice) -> Self: + """ + Perform __getitem__-like, return result as block. + """ + new_mgr_locs = self._mgr_locs[slc] + + new_values = self._slice(slc) + refs = self.refs + return type(self)(new_values, new_mgr_locs, self.ndim, refs=refs) + + @final + def take_block_columns(self, indices: npt.NDArray[np.intp]) -> Self: + """ + Perform __getitem__-like, return result as block. + + Only supports slices that preserve dimensionality. + """ + # Note: only called from is from internals.concat, and we can verify + # that never happens with 1-column blocks, i.e. never for ExtensionBlock. + + new_mgr_locs = self._mgr_locs[indices] + + new_values = self._slice(indices) + return type(self)(new_values, new_mgr_locs, self.ndim, refs=None) + + @final + def getitem_block_columns( + self, slicer: slice, new_mgr_locs: BlockPlacement, ref_inplace_op: bool = False + ) -> Self: + """ + Perform __getitem__-like, return result as block. + + Only supports slices that preserve dimensionality. + """ + new_values = self._slice(slicer) + refs = self.refs if not ref_inplace_op or self.refs.has_reference() else None + return type(self)(new_values, new_mgr_locs, self.ndim, refs=refs) + + @final + def _can_hold_element(self, element: Any) -> bool: + """require the same dtype as ourselves""" + element = extract_array(element, extract_numpy=True) + return can_hold_element(self.values, element) + + @final + def should_store(self, value: ArrayLike) -> bool: + """ + Should we set self.values[indexer] = value inplace or do we need to cast? + + Parameters + ---------- + value : np.ndarray or ExtensionArray + + Returns + ------- + bool + """ + return value.dtype == self.dtype + + # --------------------------------------------------------------------- + # Apply/Reduce and Helpers + + @final + def apply(self, func, **kwargs) -> list[Block]: + """ + apply the function to my values; return a block if we are not + one + """ + result = func(self.values, **kwargs) + + result = maybe_coerce_values(result) + return self._split_op_result(result) + + @final + def reduce(self, func) -> list[Block]: + # We will apply the function and reshape the result into a single-row + # Block with the same mgr_locs; squeezing will be done at a higher level + assert self.ndim == 2 + + result = func(self.values) + + if self.values.ndim == 1: + res_values = result + else: + res_values = result.reshape(-1, 1) + + nb = self.make_block(res_values) + return [nb] + + @final + def _split_op_result(self, result: ArrayLike) -> list[Block]: + # See also: split_and_operate + if result.ndim > 1 and isinstance(result.dtype, ExtensionDtype): + # TODO(EA2D): unnecessary with 2D EAs + # if we get a 2D ExtensionArray, we need to split it into 1D pieces + nbs = [] + for i, loc in enumerate(self._mgr_locs): + if not is_1d_only_ea_dtype(result.dtype): + vals = result[i : i + 1] + else: + vals = result[i] + + bp = BlockPlacement(loc) + block = self.make_block(values=vals, placement=bp) + nbs.append(block) + return nbs + + nb = self.make_block(result) + + return [nb] + + @final + def _split(self) -> list[Block]: + """ + Split a block into a list of single-column blocks. + """ + assert self.ndim == 2 + + new_blocks = [] + for i, ref_loc in enumerate(self._mgr_locs): + vals = self.values[slice(i, i + 1)] + + bp = BlockPlacement(ref_loc) + nb = type(self)(vals, placement=bp, ndim=2, refs=self.refs) + new_blocks.append(nb) + return new_blocks + + @final + def split_and_operate(self, func, *args, **kwargs) -> list[Block]: + """ + Split the block and apply func column-by-column. + + Parameters + ---------- + func : Block method + *args + **kwargs + + Returns + ------- + List[Block] + """ + assert self.ndim == 2 and self.shape[0] != 1 + + res_blocks = [] + for nb in self._split(): + rbs = func(nb, *args, **kwargs) + res_blocks.extend(rbs) + return res_blocks + + # --------------------------------------------------------------------- + # Up/Down-casting + + @final + def coerce_to_target_dtype(self, other, warn_on_upcast: bool = False) -> Block: + """ + coerce the current block to a dtype compat for other + we will return a block, possibly object, and not raise + + we can also safely try to coerce to the same dtype + and will receive the same block + """ + new_dtype = find_result_type(self.values.dtype, other) + if new_dtype == self.dtype: + # GH#52927 avoid RecursionError + raise AssertionError( + "Something has gone wrong, please report a bug at " + "https://github.com/pandas-dev/pandas/issues" + ) + + # In a future version of pandas, the default will be that + # setting `nan` into an integer series won't raise. + if ( + is_scalar(other) + and is_integer_dtype(self.values.dtype) + and isna(other) + and other is not NaT + and not ( + isinstance(other, (np.datetime64, np.timedelta64)) and np.isnat(other) + ) + ): + warn_on_upcast = False + elif ( + isinstance(other, np.ndarray) + and other.ndim == 1 + and is_integer_dtype(self.values.dtype) + and is_float_dtype(other.dtype) + and lib.has_only_ints_or_nan(other) + ): + warn_on_upcast = False + + if warn_on_upcast: + warnings.warn( + f"Setting an item of incompatible dtype is deprecated " + "and will raise an error in a future version of pandas. " + f"Value '{other}' has dtype incompatible with {self.values.dtype}, " + "please explicitly cast to a compatible dtype first.", + FutureWarning, + stacklevel=find_stack_level(), + ) + if self.values.dtype == new_dtype: + raise AssertionError( + f"Did not expect new dtype {new_dtype} to equal self.dtype " + f"{self.values.dtype}. Please report a bug at " + "https://github.com/pandas-dev/pandas/issues." + ) + return self.astype(new_dtype, copy=False) + + @final + def _maybe_downcast( + self, + blocks: list[Block], + downcast, + using_cow: bool, + caller: str, + ) -> list[Block]: + if downcast is False: + return blocks + + if self.dtype == _dtype_obj: + # TODO: does it matter that self.dtype might not match blocks[i].dtype? + # GH#44241 We downcast regardless of the argument; + # respecting 'downcast=None' may be worthwhile at some point, + # but ATM it breaks too much existing code. + # split and convert the blocks + + if caller == "fillna" and get_option("future.no_silent_downcasting"): + return blocks + + nbs = extend_blocks( + [blk.convert(using_cow=using_cow, copy=not using_cow) for blk in blocks] + ) + if caller == "fillna": + if len(nbs) != len(blocks) or not all( + x.dtype == y.dtype for x, y in zip(nbs, blocks) + ): + # GH#54261 + warnings.warn( + "Downcasting object dtype arrays on .fillna, .ffill, .bfill " + "is deprecated and will change in a future version. " + "Call result.infer_objects(copy=False) instead. " + "To opt-in to the future " + "behavior, set " + "`pd.set_option('future.no_silent_downcasting', True)`", + FutureWarning, + stacklevel=find_stack_level(), + ) + + return nbs + + elif downcast is None: + return blocks + elif caller == "where" and get_option("future.no_silent_downcasting") is True: + return blocks + else: + nbs = extend_blocks([b._downcast_2d(downcast, using_cow) for b in blocks]) + + # When _maybe_downcast is called with caller="where", it is either + # a) with downcast=False, which is a no-op (the desired future behavior) + # b) with downcast="infer", which is _not_ passed by the user. + # In the latter case the future behavior is to stop doing inference, + # so we issue a warning if and only if some inference occurred. + if caller == "where": + # GH#53656 + if len(blocks) != len(nbs) or any( + left.dtype != right.dtype for left, right in zip(blocks, nbs) + ): + # In this case _maybe_downcast was _not_ a no-op, so the behavior + # will change, so we issue a warning. + warnings.warn( + "Downcasting behavior in Series and DataFrame methods 'where', " + "'mask', and 'clip' is deprecated. In a future " + "version this will not infer object dtypes or cast all-round " + "floats to integers. Instead call " + "result.infer_objects(copy=False) for object inference, " + "or cast round floats explicitly. To opt-in to the future " + "behavior, set " + "`pd.set_option('future.no_silent_downcasting', True)`", + FutureWarning, + stacklevel=find_stack_level(), + ) + + return nbs + + @final + @maybe_split + def _downcast_2d(self, dtype, using_cow: bool = False) -> list[Block]: + """ + downcast specialized to 2D case post-validation. + + Refactored to allow use of maybe_split. + """ + new_values = maybe_downcast_to_dtype(self.values, dtype=dtype) + new_values = maybe_coerce_values(new_values) + refs = self.refs if new_values is self.values else None + return [self.make_block(new_values, refs=refs)] + + @final + def convert( + self, + *, + copy: bool = True, + using_cow: bool = False, + ) -> list[Block]: + """ + Attempt to coerce any object types to better types. Return a copy + of the block (if copy = True). + """ + if not self.is_object: + if not copy and using_cow: + return [self.copy(deep=False)] + return [self.copy()] if copy else [self] + + if self.ndim != 1 and self.shape[0] != 1: + blocks = self.split_and_operate( + Block.convert, copy=copy, using_cow=using_cow + ) + if all(blk.dtype.kind == "O" for blk in blocks): + # Avoid fragmenting the block if convert is a no-op + if using_cow: + return [self.copy(deep=False)] + return [self.copy()] if copy else [self] + return blocks + + values = self.values + if values.ndim == 2: + # the check above ensures we only get here with values.shape[0] == 1, + # avoid doing .ravel as that might make a copy + values = values[0] + + res_values = lib.maybe_convert_objects( + values, # type: ignore[arg-type] + convert_non_numeric=True, + ) + refs = None + if copy and res_values is values: + res_values = values.copy() + elif res_values is values: + refs = self.refs + + res_values = ensure_block_shape(res_values, self.ndim) + res_values = maybe_coerce_values(res_values) + return [self.make_block(res_values, refs=refs)] + + def convert_dtypes( + self, + copy: bool, + using_cow: bool, + infer_objects: bool = True, + convert_string: bool = True, + convert_integer: bool = True, + convert_boolean: bool = True, + convert_floating: bool = True, + dtype_backend: DtypeBackend = "numpy_nullable", + ) -> list[Block]: + if infer_objects and self.is_object: + blks = self.convert(copy=False, using_cow=using_cow) + else: + blks = [self] + + if not any( + [convert_floating, convert_integer, convert_boolean, convert_string] + ): + return [b.copy(deep=copy) for b in blks] + + rbs = [] + for blk in blks: + # Determine dtype column by column + sub_blks = [blk] if blk.ndim == 1 or self.shape[0] == 1 else blk._split() + dtypes = [ + convert_dtypes( + b.values, + convert_string, + convert_integer, + convert_boolean, + convert_floating, + infer_objects, + dtype_backend, + ) + for b in sub_blks + ] + if all(dtype == self.dtype for dtype in dtypes): + # Avoid block splitting if no dtype changes + rbs.append(blk.copy(deep=copy)) + continue + + for dtype, b in zip(dtypes, sub_blks): + rbs.append(b.astype(dtype=dtype, copy=copy, squeeze=b.ndim != 1)) + return rbs + + # --------------------------------------------------------------------- + # Array-Like Methods + + @final + @cache_readonly + def dtype(self) -> DtypeObj: + return self.values.dtype + + @final + def astype( + self, + dtype: DtypeObj, + copy: bool = False, + errors: IgnoreRaise = "raise", + using_cow: bool = False, + squeeze: bool = False, + ) -> Block: + """ + Coerce to the new dtype. + + Parameters + ---------- + dtype : np.dtype or ExtensionDtype + copy : bool, default False + copy if indicated + errors : str, {'raise', 'ignore'}, default 'raise' + - ``raise`` : allow exceptions to be raised + - ``ignore`` : suppress exceptions. On error return original object + using_cow: bool, default False + Signaling if copy on write copy logic is used. + squeeze : bool, default False + squeeze values to ndim=1 if only one column is given + + Returns + ------- + Block + """ + values = self.values + if squeeze and values.ndim == 2 and is_1d_only_ea_dtype(dtype): + if values.shape[0] != 1: + raise ValueError("Can not squeeze with more than one column.") + values = values[0, :] # type: ignore[call-overload] + + new_values = astype_array_safe(values, dtype, copy=copy, errors=errors) + + new_values = maybe_coerce_values(new_values) + + refs = None + if (using_cow or not copy) and astype_is_view(values.dtype, new_values.dtype): + refs = self.refs + + newb = self.make_block(new_values, refs=refs) + if newb.shape != self.shape: + raise TypeError( + f"cannot set astype for copy = [{copy}] for dtype " + f"({self.dtype.name} [{self.shape}]) to different shape " + f"({newb.dtype.name} [{newb.shape}])" + ) + return newb + + @final + def get_values_for_csv( + self, *, float_format, date_format, decimal, na_rep: str = "nan", quoting=None + ) -> Block: + """convert to our native types format""" + result = get_values_for_csv( + self.values, + na_rep=na_rep, + quoting=quoting, + float_format=float_format, + date_format=date_format, + decimal=decimal, + ) + return self.make_block(result) + + @final + def copy(self, deep: bool = True) -> Self: + """copy constructor""" + values = self.values + refs: BlockValuesRefs | None + if deep: + values = values.copy() + refs = None + else: + refs = self.refs + return type(self)(values, placement=self._mgr_locs, ndim=self.ndim, refs=refs) + + # --------------------------------------------------------------------- + # Copy-on-Write Helpers + + @final + def _maybe_copy(self, using_cow: bool, inplace: bool) -> Self: + if using_cow and inplace: + deep = self.refs.has_reference() + blk = self.copy(deep=deep) + else: + blk = self if inplace else self.copy() + return blk + + @final + def _get_refs_and_copy(self, using_cow: bool, inplace: bool): + refs = None + copy = not inplace + if inplace: + if using_cow and self.refs.has_reference(): + copy = True + else: + refs = self.refs + return copy, refs + + # --------------------------------------------------------------------- + # Replace + + @final + def replace( + self, + to_replace, + value, + inplace: bool = False, + # mask may be pre-computed if we're called from replace_list + mask: npt.NDArray[np.bool_] | None = None, + using_cow: bool = False, + already_warned=None, + ) -> list[Block]: + """ + replace the to_replace value with value, possible to create new + blocks here this is just a call to putmask. + """ + + # Note: the checks we do in NDFrame.replace ensure we never get + # here with listlike to_replace or value, as those cases + # go through replace_list + values = self.values + + if isinstance(values, Categorical): + # TODO: avoid special-casing + # GH49404 + blk = self._maybe_copy(using_cow, inplace) + values = cast(Categorical, blk.values) + values._replace(to_replace=to_replace, value=value, inplace=True) + return [blk] + + if not self._can_hold_element(to_replace): + # We cannot hold `to_replace`, so we know immediately that + # replacing it is a no-op. + # Note: If to_replace were a list, NDFrame.replace would call + # replace_list instead of replace. + if using_cow: + return [self.copy(deep=False)] + else: + return [self] if inplace else [self.copy()] + + if mask is None: + mask = missing.mask_missing(values, to_replace) + if not mask.any(): + # Note: we get here with test_replace_extension_other incorrectly + # bc _can_hold_element is incorrect. + if using_cow: + return [self.copy(deep=False)] + else: + return [self] if inplace else [self.copy()] + + elif self._can_hold_element(value): + # TODO(CoW): Maybe split here as well into columns where mask has True + # and rest? + blk = self._maybe_copy(using_cow, inplace) + putmask_inplace(blk.values, mask, value) + if ( + inplace + and warn_copy_on_write() + and already_warned is not None + and not already_warned.warned_already + ): + if self.refs.has_reference(): + warnings.warn( + COW_WARNING_GENERAL_MSG, + FutureWarning, + stacklevel=find_stack_level(), + ) + already_warned.warned_already = True + + if not (self.is_object and value is None): + # if the user *explicitly* gave None, we keep None, otherwise + # may downcast to NaN + if get_option("future.no_silent_downcasting") is True: + blocks = [blk] + else: + blocks = blk.convert(copy=False, using_cow=using_cow) + if len(blocks) > 1 or blocks[0].dtype != blk.dtype: + warnings.warn( + # GH#54710 + "Downcasting behavior in `replace` is deprecated and " + "will be removed in a future version. To retain the old " + "behavior, explicitly call " + "`result.infer_objects(copy=False)`. " + "To opt-in to the future " + "behavior, set " + "`pd.set_option('future.no_silent_downcasting', True)`", + FutureWarning, + stacklevel=find_stack_level(), + ) + else: + blocks = [blk] + return blocks + + elif self.ndim == 1 or self.shape[0] == 1: + if value is None or value is NA: + blk = self.astype(np.dtype(object)) + else: + blk = self.coerce_to_target_dtype(value) + return blk.replace( + to_replace=to_replace, + value=value, + inplace=True, + mask=mask, + ) + + else: + # split so that we only upcast where necessary + blocks = [] + for i, nb in enumerate(self._split()): + blocks.extend( + type(self).replace( + nb, + to_replace=to_replace, + value=value, + inplace=True, + mask=mask[i : i + 1], + using_cow=using_cow, + ) + ) + return blocks + + @final + def _replace_regex( + self, + to_replace, + value, + inplace: bool = False, + mask=None, + using_cow: bool = False, + already_warned=None, + ) -> list[Block]: + """ + Replace elements by the given value. + + Parameters + ---------- + to_replace : object or pattern + Scalar to replace or regular expression to match. + value : object + Replacement object. + inplace : bool, default False + Perform inplace modification. + mask : array-like of bool, optional + True indicate corresponding element is ignored. + using_cow: bool, default False + Specifying if copy on write is enabled. + + Returns + ------- + List[Block] + """ + if not self._can_hold_element(to_replace): + # i.e. only if self.is_object is True, but could in principle include a + # String ExtensionBlock + if using_cow: + return [self.copy(deep=False)] + return [self] if inplace else [self.copy()] + + rx = re.compile(to_replace) + + block = self._maybe_copy(using_cow, inplace) + + replace_regex(block.values, rx, value, mask) + + if ( + inplace + and warn_copy_on_write() + and already_warned is not None + and not already_warned.warned_already + ): + if self.refs.has_reference(): + warnings.warn( + COW_WARNING_GENERAL_MSG, + FutureWarning, + stacklevel=find_stack_level(), + ) + already_warned.warned_already = True + + nbs = block.convert(copy=False, using_cow=using_cow) + opt = get_option("future.no_silent_downcasting") + if (len(nbs) > 1 or nbs[0].dtype != block.dtype) and not opt: + warnings.warn( + # GH#54710 + "Downcasting behavior in `replace` is deprecated and " + "will be removed in a future version. To retain the old " + "behavior, explicitly call `result.infer_objects(copy=False)`. " + "To opt-in to the future " + "behavior, set " + "`pd.set_option('future.no_silent_downcasting', True)`", + FutureWarning, + stacklevel=find_stack_level(), + ) + return nbs + + @final + def replace_list( + self, + src_list: Iterable[Any], + dest_list: Sequence[Any], + inplace: bool = False, + regex: bool = False, + using_cow: bool = False, + already_warned=None, + ) -> list[Block]: + """ + See BlockManager.replace_list docstring. + """ + values = self.values + + if isinstance(values, Categorical): + # TODO: avoid special-casing + # GH49404 + blk = self._maybe_copy(using_cow, inplace) + values = cast(Categorical, blk.values) + values._replace(to_replace=src_list, value=dest_list, inplace=True) + return [blk] + + # Exclude anything that we know we won't contain + pairs = [ + (x, y) for x, y in zip(src_list, dest_list) if self._can_hold_element(x) + ] + if not len(pairs): + if using_cow: + return [self.copy(deep=False)] + # shortcut, nothing to replace + return [self] if inplace else [self.copy()] + + src_len = len(pairs) - 1 + + if is_string_dtype(values.dtype): + # Calculate the mask once, prior to the call of comp + # in order to avoid repeating the same computations + na_mask = ~isna(values) + masks: Iterable[npt.NDArray[np.bool_]] = ( + extract_bool_array( + cast( + ArrayLike, + compare_or_regex_search( + values, s[0], regex=regex, mask=na_mask + ), + ) + ) + for s in pairs + ) + else: + # GH#38086 faster if we know we dont need to check for regex + masks = (missing.mask_missing(values, s[0]) for s in pairs) + # Materialize if inplace = True, since the masks can change + # as we replace + if inplace: + masks = list(masks) + + if using_cow: + # Don't set up refs here, otherwise we will think that we have + # references when we check again later + rb = [self] + else: + rb = [self if inplace else self.copy()] + + if ( + inplace + and warn_copy_on_write() + and already_warned is not None + and not already_warned.warned_already + ): + if self.refs.has_reference(): + warnings.warn( + COW_WARNING_GENERAL_MSG, + FutureWarning, + stacklevel=find_stack_level(), + ) + already_warned.warned_already = True + + opt = get_option("future.no_silent_downcasting") + for i, ((src, dest), mask) in enumerate(zip(pairs, masks)): + convert = i == src_len # only convert once at the end + new_rb: list[Block] = [] + + # GH-39338: _replace_coerce can split a block into + # single-column blocks, so track the index so we know + # where to index into the mask + for blk_num, blk in enumerate(rb): + if len(rb) == 1: + m = mask + else: + mib = mask + assert not isinstance(mib, bool) + m = mib[blk_num : blk_num + 1] + + # error: Argument "mask" to "_replace_coerce" of "Block" has + # incompatible type "Union[ExtensionArray, ndarray[Any, Any], bool]"; + # expected "ndarray[Any, dtype[bool_]]" + result = blk._replace_coerce( + to_replace=src, + value=dest, + mask=m, + inplace=inplace, + regex=regex, + using_cow=using_cow, + ) + + if using_cow and i != src_len: + # This is ugly, but we have to get rid of intermediate refs + # that did not go out of scope yet, otherwise we will trigger + # many unnecessary copies + for b in result: + ref = weakref.ref(b) + b.refs.referenced_blocks.pop( + b.refs.referenced_blocks.index(ref) + ) + + if ( + not opt + and convert + and blk.is_object + and not all(x is None for x in dest_list) + ): + # GH#44498 avoid unwanted cast-back + nbs = [] + for res_blk in result: + converted = res_blk.convert( + copy=True and not using_cow, using_cow=using_cow + ) + if len(converted) > 1 or converted[0].dtype != res_blk.dtype: + warnings.warn( + # GH#54710 + "Downcasting behavior in `replace` is deprecated " + "and will be removed in a future version. To " + "retain the old behavior, explicitly call " + "`result.infer_objects(copy=False)`. " + "To opt-in to the future " + "behavior, set " + "`pd.set_option('future.no_silent_downcasting', True)`", + FutureWarning, + stacklevel=find_stack_level(), + ) + nbs.extend(converted) + result = nbs + new_rb.extend(result) + rb = new_rb + return rb + + @final + def _replace_coerce( + self, + to_replace, + value, + mask: npt.NDArray[np.bool_], + inplace: bool = True, + regex: bool = False, + using_cow: bool = False, + ) -> list[Block]: + """ + Replace value corresponding to the given boolean array with another + value. + + Parameters + ---------- + to_replace : object or pattern + Scalar to replace or regular expression to match. + value : object + Replacement object. + mask : np.ndarray[bool] + True indicate corresponding element is ignored. + inplace : bool, default True + Perform inplace modification. + regex : bool, default False + If true, perform regular expression substitution. + + Returns + ------- + List[Block] + """ + if should_use_regex(regex, to_replace): + return self._replace_regex( + to_replace, + value, + inplace=inplace, + mask=mask, + ) + else: + if value is None: + # gh-45601, gh-45836, gh-46634 + if mask.any(): + has_ref = self.refs.has_reference() + nb = self.astype(np.dtype(object), copy=False, using_cow=using_cow) + if (nb is self or using_cow) and not inplace: + nb = nb.copy() + elif inplace and has_ref and nb.refs.has_reference() and using_cow: + # no copy in astype and we had refs before + nb = nb.copy() + putmask_inplace(nb.values, mask, value) + return [nb] + if using_cow: + return [self] + return [self] if inplace else [self.copy()] + return self.replace( + to_replace=to_replace, + value=value, + inplace=inplace, + mask=mask, + using_cow=using_cow, + ) + + # --------------------------------------------------------------------- + # 2D Methods - Shared by NumpyBlock and NDArrayBackedExtensionBlock + # but not ExtensionBlock + + def _maybe_squeeze_arg(self, arg: np.ndarray) -> np.ndarray: + """ + For compatibility with 1D-only ExtensionArrays. + """ + return arg + + def _unwrap_setitem_indexer(self, indexer): + """ + For compatibility with 1D-only ExtensionArrays. + """ + return indexer + + # NB: this cannot be made cache_readonly because in mgr.set_values we pin + # new .values that can have different shape GH#42631 + @property + def shape(self) -> Shape: + return self.values.shape + + def iget(self, i: int | tuple[int, int] | tuple[slice, int]) -> np.ndarray: + # In the case where we have a tuple[slice, int], the slice will always + # be slice(None) + # Note: only reached with self.ndim == 2 + # Invalid index type "Union[int, Tuple[int, int], Tuple[slice, int]]" + # for "Union[ndarray[Any, Any], ExtensionArray]"; expected type + # "Union[int, integer[Any]]" + return self.values[i] # type: ignore[index] + + def _slice( + self, slicer: slice | npt.NDArray[np.bool_] | npt.NDArray[np.intp] + ) -> ArrayLike: + """return a slice of my values""" + + return self.values[slicer] + + def set_inplace(self, locs, values: ArrayLike, copy: bool = False) -> None: + """ + Modify block values in-place with new item value. + + If copy=True, first copy the underlying values in place before modifying + (for Copy-on-Write). + + Notes + ----- + `set_inplace` never creates a new array or new Block, whereas `setitem` + _may_ create a new array and always creates a new Block. + + Caller is responsible for checking values.dtype == self.dtype. + """ + if copy: + self.values = self.values.copy() + self.values[locs] = values + + @final + def take_nd( + self, + indexer: npt.NDArray[np.intp], + axis: AxisInt, + new_mgr_locs: BlockPlacement | None = None, + fill_value=lib.no_default, + ) -> Block: + """ + Take values according to indexer and return them as a block. + """ + values = self.values + + if fill_value is lib.no_default: + fill_value = self.fill_value + allow_fill = False + else: + allow_fill = True + + # Note: algos.take_nd has upcast logic similar to coerce_to_target_dtype + new_values = algos.take_nd( + values, indexer, axis=axis, allow_fill=allow_fill, fill_value=fill_value + ) + + # Called from three places in managers, all of which satisfy + # these assertions + if isinstance(self, ExtensionBlock): + # NB: in this case, the 'axis' kwarg will be ignored in the + # algos.take_nd call above. + assert not (self.ndim == 1 and new_mgr_locs is None) + assert not (axis == 0 and new_mgr_locs is None) + + if new_mgr_locs is None: + new_mgr_locs = self._mgr_locs + + if new_values.dtype != self.dtype: + return self.make_block(new_values, new_mgr_locs) + else: + return self.make_block_same_class(new_values, new_mgr_locs) + + def _unstack( + self, + unstacker, + fill_value, + new_placement: npt.NDArray[np.intp], + needs_masking: npt.NDArray[np.bool_], + ): + """ + Return a list of unstacked blocks of self + + Parameters + ---------- + unstacker : reshape._Unstacker + fill_value : int + Only used in ExtensionBlock._unstack + new_placement : np.ndarray[np.intp] + allow_fill : bool + needs_masking : np.ndarray[bool] + + Returns + ------- + blocks : list of Block + New blocks of unstacked values. + mask : array-like of bool + The mask of columns of `blocks` we should keep. + """ + new_values, mask = unstacker.get_new_values( + self.values.T, fill_value=fill_value + ) + + mask = mask.any(0) + # TODO: in all tests we have mask.all(); can we rely on that? + + # Note: these next two lines ensure that + # mask.sum() == sum(len(nb.mgr_locs) for nb in blocks) + # which the calling function needs in order to pass verify_integrity=False + # to the BlockManager constructor + new_values = new_values.T[mask] + new_placement = new_placement[mask] + + bp = BlockPlacement(new_placement) + blocks = [new_block_2d(new_values, placement=bp)] + return blocks, mask + + # --------------------------------------------------------------------- + + def setitem(self, indexer, value, using_cow: bool = False) -> Block: + """ + Attempt self.values[indexer] = value, possibly creating a new array. + + Parameters + ---------- + indexer : tuple, list-like, array-like, slice, int + The subset of self.values to set + value : object + The value being set + using_cow: bool, default False + Signaling if CoW is used. + + Returns + ------- + Block + + Notes + ----- + `indexer` is a direct slice/positional indexer. `value` must + be a compatible shape. + """ + + value = self._standardize_fill_value(value) + + values = cast(np.ndarray, self.values) + if self.ndim == 2: + values = values.T + + # length checking + check_setitem_lengths(indexer, value, values) + + if self.dtype != _dtype_obj: + # GH48933: extract_array would convert a pd.Series value to np.ndarray + value = extract_array(value, extract_numpy=True) + try: + casted = np_can_hold_element(values.dtype, value) + except LossySetitemError: + # current dtype cannot store value, coerce to common dtype + nb = self.coerce_to_target_dtype(value, warn_on_upcast=True) + return nb.setitem(indexer, value) + else: + if self.dtype == _dtype_obj: + # TODO: avoid having to construct values[indexer] + vi = values[indexer] + if lib.is_list_like(vi): + # checking lib.is_scalar here fails on + # test_iloc_setitem_custom_object + casted = setitem_datetimelike_compat(values, len(vi), casted) + + self = self._maybe_copy(using_cow, inplace=True) + values = cast(np.ndarray, self.values.T) + if isinstance(casted, np.ndarray) and casted.ndim == 1 and len(casted) == 1: + # NumPy 1.25 deprecation: https://github.com/numpy/numpy/pull/10615 + casted = casted[0, ...] + try: + values[indexer] = casted + except (TypeError, ValueError) as err: + if is_list_like(casted): + raise ValueError( + "setting an array element with a sequence." + ) from err + raise + return self + + def putmask( + self, mask, new, using_cow: bool = False, already_warned=None + ) -> list[Block]: + """ + putmask the data to the block; it is possible that we may create a + new dtype of block + + Return the resulting block(s). + + Parameters + ---------- + mask : np.ndarray[bool], SparseArray[bool], or BooleanArray + new : a ndarray/object + using_cow: bool, default False + + Returns + ------- + List[Block] + """ + orig_mask = mask + values = cast(np.ndarray, self.values) + mask, noop = validate_putmask(values.T, mask) + assert not isinstance(new, (ABCIndex, ABCSeries, ABCDataFrame)) + + if new is lib.no_default: + new = self.fill_value + + new = self._standardize_fill_value(new) + new = extract_array(new, extract_numpy=True) + + if noop: + if using_cow: + return [self.copy(deep=False)] + return [self] + + if ( + warn_copy_on_write() + and already_warned is not None + and not already_warned.warned_already + ): + if self.refs.has_reference(): + warnings.warn( + COW_WARNING_GENERAL_MSG, + FutureWarning, + stacklevel=find_stack_level(), + ) + already_warned.warned_already = True + + try: + casted = np_can_hold_element(values.dtype, new) + + self = self._maybe_copy(using_cow, inplace=True) + values = cast(np.ndarray, self.values) + + putmask_without_repeat(values.T, mask, casted) + return [self] + except LossySetitemError: + if self.ndim == 1 or self.shape[0] == 1: + # no need to split columns + + if not is_list_like(new): + # using just new[indexer] can't save us the need to cast + return self.coerce_to_target_dtype( + new, warn_on_upcast=True + ).putmask(mask, new) + else: + indexer = mask.nonzero()[0] + nb = self.setitem(indexer, new[indexer], using_cow=using_cow) + return [nb] + + else: + is_array = isinstance(new, np.ndarray) + + res_blocks = [] + nbs = self._split() + for i, nb in enumerate(nbs): + n = new + if is_array: + # we have a different value per-column + n = new[:, i : i + 1] + + submask = orig_mask[:, i : i + 1] + rbs = nb.putmask(submask, n, using_cow=using_cow) + res_blocks.extend(rbs) + return res_blocks + + def where( + self, other, cond, _downcast: str | bool = "infer", using_cow: bool = False + ) -> list[Block]: + """ + evaluate the block; return result block(s) from the result + + Parameters + ---------- + other : a ndarray/object + cond : np.ndarray[bool], SparseArray[bool], or BooleanArray + _downcast : str or None, default "infer" + Private because we only specify it when calling from fillna. + + Returns + ------- + List[Block] + """ + assert cond.ndim == self.ndim + assert not isinstance(other, (ABCIndex, ABCSeries, ABCDataFrame)) + + transpose = self.ndim == 2 + + cond = extract_bool_array(cond) + + # EABlocks override where + values = cast(np.ndarray, self.values) + orig_other = other + if transpose: + values = values.T + + icond, noop = validate_putmask(values, ~cond) + if noop: + # GH-39595: Always return a copy; short-circuit up/downcasting + if using_cow: + return [self.copy(deep=False)] + return [self.copy()] + + if other is lib.no_default: + other = self.fill_value + + other = self._standardize_fill_value(other) + + try: + # try/except here is equivalent to a self._can_hold_element check, + # but this gets us back 'casted' which we will reuse below; + # without using 'casted', expressions.where may do unwanted upcasts. + casted = np_can_hold_element(values.dtype, other) + except (ValueError, TypeError, LossySetitemError): + # we cannot coerce, return a compat dtype + + if self.ndim == 1 or self.shape[0] == 1: + # no need to split columns + + block = self.coerce_to_target_dtype(other) + blocks = block.where(orig_other, cond, using_cow=using_cow) + return self._maybe_downcast( + blocks, downcast=_downcast, using_cow=using_cow, caller="where" + ) + + else: + # since _maybe_downcast would split blocks anyway, we + # can avoid some potential upcast/downcast by splitting + # on the front end. + is_array = isinstance(other, (np.ndarray, ExtensionArray)) + + res_blocks = [] + nbs = self._split() + for i, nb in enumerate(nbs): + oth = other + if is_array: + # we have a different value per-column + oth = other[:, i : i + 1] + + submask = cond[:, i : i + 1] + rbs = nb.where( + oth, submask, _downcast=_downcast, using_cow=using_cow + ) + res_blocks.extend(rbs) + return res_blocks + + else: + other = casted + alt = setitem_datetimelike_compat(values, icond.sum(), other) + if alt is not other: + if is_list_like(other) and len(other) < len(values): + # call np.where with other to get the appropriate ValueError + np.where(~icond, values, other) + raise NotImplementedError( + "This should not be reached; call to np.where above is " + "expected to raise ValueError. Please report a bug at " + "github.com/pandas-dev/pandas" + ) + result = values.copy() + np.putmask(result, icond, alt) + else: + # By the time we get here, we should have all Series/Index + # args extracted to ndarray + if ( + is_list_like(other) + and not isinstance(other, np.ndarray) + and len(other) == self.shape[-1] + ): + # If we don't do this broadcasting here, then expressions.where + # will broadcast a 1D other to be row-like instead of + # column-like. + other = np.array(other).reshape(values.shape) + # If lengths don't match (or len(other)==1), we will raise + # inside expressions.where, see test_series_where + + # Note: expressions.where may upcast. + result = expressions.where(~icond, values, other) + # The np_can_hold_element check _should_ ensure that we always + # have result.dtype == self.dtype here. + + if transpose: + result = result.T + + return [self.make_block(result)] + + def fillna( + self, + value, + limit: int | None = None, + inplace: bool = False, + downcast=None, + using_cow: bool = False, + already_warned=None, + ) -> list[Block]: + """ + fillna on the block with the value. If we fail, then convert to + block to hold objects instead and try again + """ + # Caller is responsible for validating limit; if int it is strictly positive + inplace = validate_bool_kwarg(inplace, "inplace") + + if not self._can_hold_na: + # can short-circuit the isna call + noop = True + else: + mask = isna(self.values) + mask, noop = validate_putmask(self.values, mask) + + if noop: + # we can't process the value, but nothing to do + if inplace: + if using_cow: + return [self.copy(deep=False)] + # Arbitrarily imposing the convention that we ignore downcast + # on no-op when inplace=True + return [self] + else: + # GH#45423 consistent downcasting on no-ops. + nb = self.copy(deep=not using_cow) + nbs = nb._maybe_downcast( + [nb], downcast=downcast, using_cow=using_cow, caller="fillna" + ) + return nbs + + if limit is not None: + mask[mask.cumsum(self.ndim - 1) > limit] = False + + if inplace: + nbs = self.putmask( + mask.T, value, using_cow=using_cow, already_warned=already_warned + ) + else: + # without _downcast, we would break + # test_fillna_dtype_conversion_equiv_replace + nbs = self.where(value, ~mask.T, _downcast=False) + + # Note: blk._maybe_downcast vs self._maybe_downcast(nbs) + # makes a difference bc blk may have object dtype, which has + # different behavior in _maybe_downcast. + return extend_blocks( + [ + blk._maybe_downcast( + [blk], downcast=downcast, using_cow=using_cow, caller="fillna" + ) + for blk in nbs + ] + ) + + def pad_or_backfill( + self, + *, + method: FillnaOptions, + axis: AxisInt = 0, + inplace: bool = False, + limit: int | None = None, + limit_area: Literal["inside", "outside"] | None = None, + downcast: Literal["infer"] | None = None, + using_cow: bool = False, + already_warned=None, + ) -> list[Block]: + if not self._can_hold_na: + # If there are no NAs, then interpolate is a no-op + if using_cow: + return [self.copy(deep=False)] + return [self] if inplace else [self.copy()] + + copy, refs = self._get_refs_and_copy(using_cow, inplace) + + # Dispatch to the NumpyExtensionArray method. + # We know self.array_values is a NumpyExtensionArray bc EABlock overrides + vals = cast(NumpyExtensionArray, self.array_values) + if axis == 1: + vals = vals.T + new_values = vals._pad_or_backfill( + method=method, + limit=limit, + limit_area=limit_area, + copy=copy, + ) + if ( + not copy + and warn_copy_on_write() + and already_warned is not None + and not already_warned.warned_already + ): + if self.refs.has_reference(): + warnings.warn( + COW_WARNING_GENERAL_MSG, + FutureWarning, + stacklevel=find_stack_level(), + ) + already_warned.warned_already = True + if axis == 1: + new_values = new_values.T + + data = extract_array(new_values, extract_numpy=True) + + nb = self.make_block_same_class(data, refs=refs) + return nb._maybe_downcast([nb], downcast, using_cow, caller="fillna") + + @final + def interpolate( + self, + *, + method: InterpolateOptions, + index: Index, + inplace: bool = False, + limit: int | None = None, + limit_direction: Literal["forward", "backward", "both"] = "forward", + limit_area: Literal["inside", "outside"] | None = None, + downcast: Literal["infer"] | None = None, + using_cow: bool = False, + already_warned=None, + **kwargs, + ) -> list[Block]: + inplace = validate_bool_kwarg(inplace, "inplace") + # error: Non-overlapping equality check [...] + if method == "asfreq": # type: ignore[comparison-overlap] + # clean_fill_method used to allow this + missing.clean_fill_method(method) + + if not self._can_hold_na: + # If there are no NAs, then interpolate is a no-op + if using_cow: + return [self.copy(deep=False)] + return [self] if inplace else [self.copy()] + + # TODO(3.0): this case will not be reachable once GH#53638 is enforced + if self.dtype == _dtype_obj: + # only deal with floats + # bc we already checked that can_hold_na, we don't have int dtype here + # test_interp_basic checks that we make a copy here + if using_cow: + return [self.copy(deep=False)] + return [self] if inplace else [self.copy()] + + copy, refs = self._get_refs_and_copy(using_cow, inplace) + + # Dispatch to the EA method. + new_values = self.array_values.interpolate( + method=method, + axis=self.ndim - 1, + index=index, + limit=limit, + limit_direction=limit_direction, + limit_area=limit_area, + copy=copy, + **kwargs, + ) + data = extract_array(new_values, extract_numpy=True) + + if ( + not copy + and warn_copy_on_write() + and already_warned is not None + and not already_warned.warned_already + ): + if self.refs.has_reference(): + warnings.warn( + COW_WARNING_GENERAL_MSG, + FutureWarning, + stacklevel=find_stack_level(), + ) + already_warned.warned_already = True + + nb = self.make_block_same_class(data, refs=refs) + return nb._maybe_downcast([nb], downcast, using_cow, caller="interpolate") + + @final + def diff(self, n: int) -> list[Block]: + """return block for the diff of the values""" + # only reached with ndim == 2 + # TODO(EA2D): transpose will be unnecessary with 2D EAs + new_values = algos.diff(self.values.T, n, axis=0).T + return [self.make_block(values=new_values)] + + def shift(self, periods: int, fill_value: Any = None) -> list[Block]: + """shift the block by periods, possibly upcast""" + # convert integer to float if necessary. need to do a lot more than + # that, handle boolean etc also + axis = self.ndim - 1 + + # Note: periods is never 0 here, as that is handled at the top of + # NDFrame.shift. If that ever changes, we can do a check for periods=0 + # and possibly avoid coercing. + + if not lib.is_scalar(fill_value) and self.dtype != _dtype_obj: + # with object dtype there is nothing to promote, and the user can + # pass pretty much any weird fill_value they like + # see test_shift_object_non_scalar_fill + raise ValueError("fill_value must be a scalar") + + fill_value = self._standardize_fill_value(fill_value) + + try: + # error: Argument 1 to "np_can_hold_element" has incompatible type + # "Union[dtype[Any], ExtensionDtype]"; expected "dtype[Any]" + casted = np_can_hold_element( + self.dtype, fill_value # type: ignore[arg-type] + ) + except LossySetitemError: + nb = self.coerce_to_target_dtype(fill_value) + return nb.shift(periods, fill_value=fill_value) + + else: + values = cast(np.ndarray, self.values) + new_values = shift(values, periods, axis, casted) + return [self.make_block_same_class(new_values)] + + @final + def quantile( + self, + qs: Index, # with dtype float64 + interpolation: QuantileInterpolation = "linear", + ) -> Block: + """ + compute the quantiles of the + + Parameters + ---------- + qs : Index + The quantiles to be computed in float64. + interpolation : str, default 'linear' + Type of interpolation. + + Returns + ------- + Block + """ + # We should always have ndim == 2 because Series dispatches to DataFrame + assert self.ndim == 2 + assert is_list_like(qs) # caller is responsible for this + + result = quantile_compat(self.values, np.asarray(qs._values), interpolation) + # ensure_block_shape needed for cases where we start with EA and result + # is ndarray, e.g. IntegerArray, SparseArray + result = ensure_block_shape(result, ndim=2) + return new_block_2d(result, placement=self._mgr_locs) + + @final + def round(self, decimals: int, using_cow: bool = False) -> Self: + """ + Rounds the values. + If the block is not of an integer or float dtype, nothing happens. + This is consistent with DataFrame.round behavivor. + (Note: Series.round would raise) + + Parameters + ---------- + decimals: int, + Number of decimal places to round to. + Caller is responsible for validating this + using_cow: bool, + Whether Copy on Write is enabled right now + """ + if not self.is_numeric or self.is_bool: + return self.copy(deep=not using_cow) + refs = None + # TODO: round only defined on BaseMaskedArray + # Series also does this, so would need to fix both places + # error: Item "ExtensionArray" of "Union[ndarray[Any, Any], ExtensionArray]" + # has no attribute "round" + values = self.values.round(decimals) # type: ignore[union-attr] + if values is self.values: + if not using_cow: + # Normally would need to do this before, but + # numpy only returns same array when round operation + # is no-op + # https://github.com/numpy/numpy/blob/486878b37fc7439a3b2b87747f50db9b62fea8eb/numpy/core/src/multiarray/calculation.c#L625-L636 + values = values.copy() + else: + refs = self.refs + return self.make_block_same_class(values, refs=refs) + + # --------------------------------------------------------------------- + # Abstract Methods Overridden By EABackedBlock and NumpyBlock + + def delete(self, loc) -> list[Block]: + """Deletes the locs from the block. + + We split the block to avoid copying the underlying data. We create new + blocks for every connected segment of the initial block that is not deleted. + The new blocks point to the initial array. + """ + if not is_list_like(loc): + loc = [loc] + + if self.ndim == 1: + values = cast(np.ndarray, self.values) + values = np.delete(values, loc) + mgr_locs = self._mgr_locs.delete(loc) + return [type(self)(values, placement=mgr_locs, ndim=self.ndim)] + + if np.max(loc) >= self.values.shape[0]: + raise IndexError + + # Add one out-of-bounds indexer as maximum to collect + # all columns after our last indexer if any + loc = np.concatenate([loc, [self.values.shape[0]]]) + mgr_locs_arr = self._mgr_locs.as_array + new_blocks: list[Block] = [] + + previous_loc = -1 + # TODO(CoW): This is tricky, if parent block goes out of scope + # all split blocks are referencing each other even though they + # don't share data + refs = self.refs if self.refs.has_reference() else None + for idx in loc: + if idx == previous_loc + 1: + # There is no column between current and last idx + pass + else: + # No overload variant of "__getitem__" of "ExtensionArray" matches + # argument type "Tuple[slice, slice]" + values = self.values[previous_loc + 1 : idx, :] # type: ignore[call-overload] + locs = mgr_locs_arr[previous_loc + 1 : idx] + nb = type(self)( + values, placement=BlockPlacement(locs), ndim=self.ndim, refs=refs + ) + new_blocks.append(nb) + + previous_loc = idx + + return new_blocks + + @property + def is_view(self) -> bool: + """return a boolean if I am possibly a view""" + raise AbstractMethodError(self) + + @property + def array_values(self) -> ExtensionArray: + """ + The array that Series.array returns. Always an ExtensionArray. + """ + raise AbstractMethodError(self) + + def get_values(self, dtype: DtypeObj | None = None) -> np.ndarray: + """ + return an internal format, currently just the ndarray + this is often overridden to handle to_dense like operations + """ + raise AbstractMethodError(self) + + +class EABackedBlock(Block): + """ + Mixin for Block subclasses backed by ExtensionArray. + """ + + values: ExtensionArray + + @final + def shift(self, periods: int, fill_value: Any = None) -> list[Block]: + """ + Shift the block by `periods`. + + Dispatches to underlying ExtensionArray and re-boxes in an + ExtensionBlock. + """ + # Transpose since EA.shift is always along axis=0, while we want to shift + # along rows. + new_values = self.values.T.shift(periods=periods, fill_value=fill_value).T + return [self.make_block_same_class(new_values)] + + @final + def setitem(self, indexer, value, using_cow: bool = False): + """ + Attempt self.values[indexer] = value, possibly creating a new array. + + This differs from Block.setitem by not allowing setitem to change + the dtype of the Block. + + Parameters + ---------- + indexer : tuple, list-like, array-like, slice, int + The subset of self.values to set + value : object + The value being set + using_cow: bool, default False + Signaling if CoW is used. + + Returns + ------- + Block + + Notes + ----- + `indexer` is a direct slice/positional indexer. `value` must + be a compatible shape. + """ + orig_indexer = indexer + orig_value = value + + indexer = self._unwrap_setitem_indexer(indexer) + value = self._maybe_squeeze_arg(value) + + values = self.values + if values.ndim == 2: + # TODO(GH#45419): string[pyarrow] tests break if we transpose + # unconditionally + values = values.T + check_setitem_lengths(indexer, value, values) + + try: + values[indexer] = value + except (ValueError, TypeError): + if isinstance(self.dtype, IntervalDtype): + # see TestSetitemFloatIntervalWithIntIntervalValues + nb = self.coerce_to_target_dtype(orig_value, warn_on_upcast=True) + return nb.setitem(orig_indexer, orig_value) + + elif isinstance(self, NDArrayBackedExtensionBlock): + nb = self.coerce_to_target_dtype(orig_value, warn_on_upcast=True) + return nb.setitem(orig_indexer, orig_value) + + else: + raise + + else: + return self + + @final + def where( + self, other, cond, _downcast: str | bool = "infer", using_cow: bool = False + ) -> list[Block]: + # _downcast private bc we only specify it when calling from fillna + arr = self.values.T + + cond = extract_bool_array(cond) + + orig_other = other + orig_cond = cond + other = self._maybe_squeeze_arg(other) + cond = self._maybe_squeeze_arg(cond) + + if other is lib.no_default: + other = self.fill_value + + icond, noop = validate_putmask(arr, ~cond) + if noop: + # GH#44181, GH#45135 + # Avoid a) raising for Interval/PeriodDtype and b) unnecessary object upcast + if using_cow: + return [self.copy(deep=False)] + return [self.copy()] + + try: + res_values = arr._where(cond, other).T + except (ValueError, TypeError): + if self.ndim == 1 or self.shape[0] == 1: + if isinstance(self.dtype, IntervalDtype): + # TestSetitemFloatIntervalWithIntIntervalValues + blk = self.coerce_to_target_dtype(orig_other) + nbs = blk.where(orig_other, orig_cond, using_cow=using_cow) + return self._maybe_downcast( + nbs, downcast=_downcast, using_cow=using_cow, caller="where" + ) + + elif isinstance(self, NDArrayBackedExtensionBlock): + # NB: not (yet) the same as + # isinstance(values, NDArrayBackedExtensionArray) + blk = self.coerce_to_target_dtype(orig_other) + nbs = blk.where(orig_other, orig_cond, using_cow=using_cow) + return self._maybe_downcast( + nbs, downcast=_downcast, using_cow=using_cow, caller="where" + ) + + else: + raise + + else: + # Same pattern we use in Block.putmask + is_array = isinstance(orig_other, (np.ndarray, ExtensionArray)) + + res_blocks = [] + nbs = self._split() + for i, nb in enumerate(nbs): + n = orig_other + if is_array: + # we have a different value per-column + n = orig_other[:, i : i + 1] + + submask = orig_cond[:, i : i + 1] + rbs = nb.where(n, submask, using_cow=using_cow) + res_blocks.extend(rbs) + return res_blocks + + nb = self.make_block_same_class(res_values) + return [nb] + + @final + def putmask( + self, mask, new, using_cow: bool = False, already_warned=None + ) -> list[Block]: + """ + See Block.putmask.__doc__ + """ + mask = extract_bool_array(mask) + if new is lib.no_default: + new = self.fill_value + + orig_new = new + orig_mask = mask + new = self._maybe_squeeze_arg(new) + mask = self._maybe_squeeze_arg(mask) + + if not mask.any(): + if using_cow: + return [self.copy(deep=False)] + return [self] + + if ( + warn_copy_on_write() + and already_warned is not None + and not already_warned.warned_already + ): + if self.refs.has_reference(): + warnings.warn( + COW_WARNING_GENERAL_MSG, + FutureWarning, + stacklevel=find_stack_level(), + ) + already_warned.warned_already = True + + self = self._maybe_copy(using_cow, inplace=True) + values = self.values + if values.ndim == 2: + values = values.T + + try: + # Caller is responsible for ensuring matching lengths + values._putmask(mask, new) + except (TypeError, ValueError): + if self.ndim == 1 or self.shape[0] == 1: + if isinstance(self.dtype, IntervalDtype): + # Discussion about what we want to support in the general + # case GH#39584 + blk = self.coerce_to_target_dtype(orig_new, warn_on_upcast=True) + return blk.putmask(orig_mask, orig_new) + + elif isinstance(self, NDArrayBackedExtensionBlock): + # NB: not (yet) the same as + # isinstance(values, NDArrayBackedExtensionArray) + blk = self.coerce_to_target_dtype(orig_new, warn_on_upcast=True) + return blk.putmask(orig_mask, orig_new) + + else: + raise + + else: + # Same pattern we use in Block.putmask + is_array = isinstance(orig_new, (np.ndarray, ExtensionArray)) + + res_blocks = [] + nbs = self._split() + for i, nb in enumerate(nbs): + n = orig_new + if is_array: + # we have a different value per-column + n = orig_new[:, i : i + 1] + + submask = orig_mask[:, i : i + 1] + rbs = nb.putmask(submask, n) + res_blocks.extend(rbs) + return res_blocks + + return [self] + + @final + def delete(self, loc) -> list[Block]: + # This will be unnecessary if/when __array_function__ is implemented + if self.ndim == 1: + values = self.values.delete(loc) + mgr_locs = self._mgr_locs.delete(loc) + return [type(self)(values, placement=mgr_locs, ndim=self.ndim)] + elif self.values.ndim == 1: + # We get here through to_stata + return [] + return super().delete(loc) + + @final + @cache_readonly + def array_values(self) -> ExtensionArray: + return self.values + + @final + def get_values(self, dtype: DtypeObj | None = None) -> np.ndarray: + """ + return object dtype as boxed values, such as Timestamps/Timedelta + """ + values: ArrayLike = self.values + if dtype == _dtype_obj: + values = values.astype(object) + # TODO(EA2D): reshape not needed with 2D EAs + return np.asarray(values).reshape(self.shape) + + @final + def pad_or_backfill( + self, + *, + method: FillnaOptions, + axis: AxisInt = 0, + inplace: bool = False, + limit: int | None = None, + limit_area: Literal["inside", "outside"] | None = None, + downcast: Literal["infer"] | None = None, + using_cow: bool = False, + already_warned=None, + ) -> list[Block]: + values = self.values + + kwargs: dict[str, Any] = {"method": method, "limit": limit} + if "limit_area" in inspect.signature(values._pad_or_backfill).parameters: + kwargs["limit_area"] = limit_area + elif limit_area is not None: + raise NotImplementedError( + f"{type(values).__name__} does not implement limit_area " + "(added in pandas 2.2). 3rd-party ExtnsionArray authors " + "need to add this argument to _pad_or_backfill." + ) + + if values.ndim == 2 and axis == 1: + # NDArrayBackedExtensionArray.fillna assumes axis=0 + new_values = values.T._pad_or_backfill(**kwargs).T + else: + new_values = values._pad_or_backfill(**kwargs) + return [self.make_block_same_class(new_values)] + + +class ExtensionBlock(EABackedBlock): + """ + Block for holding extension types. + + Notes + ----- + This holds all 3rd-party extension array types. It's also the immediate + parent class for our internal extension types' blocks. + + ExtensionArrays are limited to 1-D. + """ + + values: ExtensionArray + + def fillna( + self, + value, + limit: int | None = None, + inplace: bool = False, + downcast=None, + using_cow: bool = False, + already_warned=None, + ) -> list[Block]: + if isinstance(self.dtype, IntervalDtype): + # Block.fillna handles coercion (test_fillna_interval) + return super().fillna( + value=value, + limit=limit, + inplace=inplace, + downcast=downcast, + using_cow=using_cow, + already_warned=already_warned, + ) + if using_cow and self._can_hold_na and not self.values._hasna: + refs = self.refs + new_values = self.values + else: + copy, refs = self._get_refs_and_copy(using_cow, inplace) + + try: + new_values = self.values.fillna( + value=value, method=None, limit=limit, copy=copy + ) + except TypeError: + # 3rd party EA that has not implemented copy keyword yet + refs = None + new_values = self.values.fillna(value=value, method=None, limit=limit) + # issue the warning *after* retrying, in case the TypeError + # was caused by an invalid fill_value + warnings.warn( + # GH#53278 + "ExtensionArray.fillna added a 'copy' keyword in pandas " + "2.1.0. In a future version, ExtensionArray subclasses will " + "need to implement this keyword or an exception will be " + "raised. In the interim, the keyword is ignored by " + f"{type(self.values).__name__}.", + DeprecationWarning, + stacklevel=find_stack_level(), + ) + else: + if ( + not copy + and warn_copy_on_write() + and already_warned is not None + and not already_warned.warned_already + ): + if self.refs.has_reference(): + warnings.warn( + COW_WARNING_GENERAL_MSG, + FutureWarning, + stacklevel=find_stack_level(), + ) + already_warned.warned_already = True + + nb = self.make_block_same_class(new_values, refs=refs) + return nb._maybe_downcast([nb], downcast, using_cow=using_cow, caller="fillna") + + @cache_readonly + def shape(self) -> Shape: + # TODO(EA2D): override unnecessary with 2D EAs + if self.ndim == 1: + return (len(self.values),) + return len(self._mgr_locs), len(self.values) + + def iget(self, i: int | tuple[int, int] | tuple[slice, int]): + # In the case where we have a tuple[slice, int], the slice will always + # be slice(None) + # We _could_ make the annotation more specific, but mypy would + # complain about override mismatch: + # Literal[0] | tuple[Literal[0], int] | tuple[slice, int] + + # Note: only reached with self.ndim == 2 + + if isinstance(i, tuple): + # TODO(EA2D): unnecessary with 2D EAs + col, loc = i + if not com.is_null_slice(col) and col != 0: + raise IndexError(f"{self} only contains one item") + if isinstance(col, slice): + # the is_null_slice check above assures that col is slice(None) + # so what we want is a view on all our columns and row loc + if loc < 0: + loc += len(self.values) + # Note: loc:loc+1 vs [[loc]] makes a difference when called + # from fast_xs because we want to get a view back. + return self.values[loc : loc + 1] + return self.values[loc] + else: + if i != 0: + raise IndexError(f"{self} only contains one item") + return self.values + + def set_inplace(self, locs, values: ArrayLike, copy: bool = False) -> None: + # When an ndarray, we should have locs.tolist() == [0] + # When a BlockPlacement we should have list(locs) == [0] + if copy: + self.values = self.values.copy() + self.values[:] = values + + def _maybe_squeeze_arg(self, arg): + """ + If necessary, squeeze a (N, 1) ndarray to (N,) + """ + # e.g. if we are passed a 2D mask for putmask + if ( + isinstance(arg, (np.ndarray, ExtensionArray)) + and arg.ndim == self.values.ndim + 1 + ): + # TODO(EA2D): unnecessary with 2D EAs + assert arg.shape[1] == 1 + # error: No overload variant of "__getitem__" of "ExtensionArray" + # matches argument type "Tuple[slice, int]" + arg = arg[:, 0] # type: ignore[call-overload] + elif isinstance(arg, ABCDataFrame): + # 2022-01-06 only reached for setitem + # TODO: should we avoid getting here with DataFrame? + assert arg.shape[1] == 1 + arg = arg._ixs(0, axis=1)._values + + return arg + + def _unwrap_setitem_indexer(self, indexer): + """ + Adapt a 2D-indexer to our 1D values. + + This is intended for 'setitem', not 'iget' or '_slice'. + """ + # TODO: ATM this doesn't work for iget/_slice, can we change that? + + if isinstance(indexer, tuple) and len(indexer) == 2: + # TODO(EA2D): not needed with 2D EAs + # Should never have length > 2. Caller is responsible for checking. + # Length 1 is reached vis setitem_single_block and setitem_single_column + # each of which pass indexer=(pi,) + if all(isinstance(x, np.ndarray) and x.ndim == 2 for x in indexer): + # GH#44703 went through indexing.maybe_convert_ix + first, second = indexer + if not ( + second.size == 1 and (second == 0).all() and first.shape[1] == 1 + ): + raise NotImplementedError( + "This should not be reached. Please report a bug at " + "github.com/pandas-dev/pandas/" + ) + indexer = first[:, 0] + + elif lib.is_integer(indexer[1]) and indexer[1] == 0: + # reached via setitem_single_block passing the whole indexer + indexer = indexer[0] + + elif com.is_null_slice(indexer[1]): + indexer = indexer[0] + + elif is_list_like(indexer[1]) and indexer[1][0] == 0: + indexer = indexer[0] + + else: + raise NotImplementedError( + "This should not be reached. Please report a bug at " + "github.com/pandas-dev/pandas/" + ) + return indexer + + @property + def is_view(self) -> bool: + """Extension arrays are never treated as views.""" + return False + + # error: Cannot override writeable attribute with read-only property + @cache_readonly + def is_numeric(self) -> bool: # type: ignore[override] + return self.values.dtype._is_numeric + + def _slice( + self, slicer: slice | npt.NDArray[np.bool_] | npt.NDArray[np.intp] + ) -> ExtensionArray: + """ + Return a slice of my values. + + Parameters + ---------- + slicer : slice, ndarray[int], or ndarray[bool] + Valid (non-reducing) indexer for self.values. + + Returns + ------- + ExtensionArray + """ + # Notes: ndarray[bool] is only reachable when via get_rows_with_mask, which + # is only for Series, i.e. self.ndim == 1. + + # return same dims as we currently have + if self.ndim == 2: + # reached via getitem_block via _slice_take_blocks_ax0 + # TODO(EA2D): won't be necessary with 2D EAs + + if not isinstance(slicer, slice): + raise AssertionError( + "invalid slicing for a 1-ndim ExtensionArray", slicer + ) + # GH#32959 only full-slicers along fake-dim0 are valid + # TODO(EA2D): won't be necessary with 2D EAs + # range(1) instead of self._mgr_locs to avoid exception on [::-1] + # see test_iloc_getitem_slice_negative_step_ea_block + new_locs = range(1)[slicer] + if not len(new_locs): + raise AssertionError( + "invalid slicing for a 1-ndim ExtensionArray", slicer + ) + slicer = slice(None) + + return self.values[slicer] + + @final + def slice_block_rows(self, slicer: slice) -> Self: + """ + Perform __getitem__-like specialized to slicing along index. + """ + # GH#42787 in principle this is equivalent to values[..., slicer], but we don't + # require subclasses of ExtensionArray to support that form (for now). + new_values = self.values[slicer] + return type(self)(new_values, self._mgr_locs, ndim=self.ndim, refs=self.refs) + + def _unstack( + self, + unstacker, + fill_value, + new_placement: npt.NDArray[np.intp], + needs_masking: npt.NDArray[np.bool_], + ): + # ExtensionArray-safe unstack. + # We override Block._unstack, which unstacks directly on the + # values of the array. For EA-backed blocks, this would require + # converting to a 2-D ndarray of objects. + # Instead, we unstack an ndarray of integer positions, followed by + # a `take` on the actual values. + + # Caller is responsible for ensuring self.shape[-1] == len(unstacker.index) + new_values, mask = unstacker.arange_result + + # Note: these next two lines ensure that + # mask.sum() == sum(len(nb.mgr_locs) for nb in blocks) + # which the calling function needs in order to pass verify_integrity=False + # to the BlockManager constructor + new_values = new_values.T[mask] + new_placement = new_placement[mask] + + # needs_masking[i] calculated once in BlockManager.unstack tells + # us if there are any -1s in the relevant indices. When False, + # that allows us to go through a faster path in 'take', among + # other things avoiding e.g. Categorical._validate_scalar. + blocks = [ + # TODO: could cast to object depending on fill_value? + type(self)( + self.values.take( + indices, allow_fill=needs_masking[i], fill_value=fill_value + ), + BlockPlacement(place), + ndim=2, + ) + for i, (indices, place) in enumerate(zip(new_values, new_placement)) + ] + return blocks, mask + + +class NumpyBlock(Block): + values: np.ndarray + __slots__ = () + + @property + def is_view(self) -> bool: + """return a boolean if I am possibly a view""" + return self.values.base is not None + + @property + def array_values(self) -> ExtensionArray: + return NumpyExtensionArray(self.values) + + def get_values(self, dtype: DtypeObj | None = None) -> np.ndarray: + if dtype == _dtype_obj: + return self.values.astype(_dtype_obj) + return self.values + + @cache_readonly + def is_numeric(self) -> bool: # type: ignore[override] + dtype = self.values.dtype + kind = dtype.kind + + return kind in "fciub" + + +class NumericBlock(NumpyBlock): + # this Block type is kept for backwards-compatibility + # TODO(3.0): delete and remove deprecation in __init__.py. + __slots__ = () + + +class ObjectBlock(NumpyBlock): + # this Block type is kept for backwards-compatibility + # TODO(3.0): delete and remove deprecation in __init__.py. + __slots__ = () + + +class NDArrayBackedExtensionBlock(EABackedBlock): + """ + Block backed by an NDArrayBackedExtensionArray + """ + + values: NDArrayBackedExtensionArray + + @property + def is_view(self) -> bool: + """return a boolean if I am possibly a view""" + # check the ndarray values of the DatetimeIndex values + return self.values._ndarray.base is not None + + +class DatetimeLikeBlock(NDArrayBackedExtensionBlock): + """Block for datetime64[ns], timedelta64[ns].""" + + __slots__ = () + is_numeric = False + values: DatetimeArray | TimedeltaArray + + +class DatetimeTZBlock(DatetimeLikeBlock): + """implement a datetime64 block with a tz attribute""" + + values: DatetimeArray + + __slots__ = () + + +# ----------------------------------------------------------------- +# Constructor Helpers + + +def maybe_coerce_values(values: ArrayLike) -> ArrayLike: + """ + Input validation for values passed to __init__. Ensure that + any datetime64/timedelta64 dtypes are in nanoseconds. Ensure + that we do not have string dtypes. + + Parameters + ---------- + values : np.ndarray or ExtensionArray + + Returns + ------- + values : np.ndarray or ExtensionArray + """ + # Caller is responsible for ensuring NumpyExtensionArray is already extracted. + + if isinstance(values, np.ndarray): + values = ensure_wrapped_if_datetimelike(values) + + if issubclass(values.dtype.type, str): + values = np.array(values, dtype=object) + + if isinstance(values, (DatetimeArray, TimedeltaArray)) and values.freq is not None: + # freq is only stored in DatetimeIndex/TimedeltaIndex, not in Series/DataFrame + values = values._with_freq(None) + + return values + + +def get_block_type(dtype: DtypeObj) -> type[Block]: + """ + Find the appropriate Block subclass to use for the given values and dtype. + + Parameters + ---------- + dtype : numpy or pandas dtype + + Returns + ------- + cls : class, subclass of Block + """ + if isinstance(dtype, DatetimeTZDtype): + return DatetimeTZBlock + elif isinstance(dtype, PeriodDtype): + return NDArrayBackedExtensionBlock + elif isinstance(dtype, ExtensionDtype): + # Note: need to be sure NumpyExtensionArray is unwrapped before we get here + return ExtensionBlock + + # We use kind checks because it is much more performant + # than is_foo_dtype + kind = dtype.kind + if kind in "Mm": + return DatetimeLikeBlock + + return NumpyBlock + + +def new_block_2d( + values: ArrayLike, placement: BlockPlacement, refs: BlockValuesRefs | None = None +): + # new_block specialized to case with + # ndim=2 + # isinstance(placement, BlockPlacement) + # check_ndim/ensure_block_shape already checked + klass = get_block_type(values.dtype) + + values = maybe_coerce_values(values) + return klass(values, ndim=2, placement=placement, refs=refs) + + +def new_block( + values, + placement: BlockPlacement, + *, + ndim: int, + refs: BlockValuesRefs | None = None, +) -> Block: + # caller is responsible for ensuring: + # - values is NOT a NumpyExtensionArray + # - check_ndim/ensure_block_shape already checked + # - maybe_coerce_values already called/unnecessary + klass = get_block_type(values.dtype) + return klass(values, ndim=ndim, placement=placement, refs=refs) + + +def check_ndim(values, placement: BlockPlacement, ndim: int) -> None: + """ + ndim inference and validation. + + Validates that values.ndim and ndim are consistent. + Validates that len(values) and len(placement) are consistent. + + Parameters + ---------- + values : array-like + placement : BlockPlacement + ndim : int + + Raises + ------ + ValueError : the number of dimensions do not match + """ + + if values.ndim > ndim: + # Check for both np.ndarray and ExtensionArray + raise ValueError( + "Wrong number of dimensions. " + f"values.ndim > ndim [{values.ndim} > {ndim}]" + ) + + if not is_1d_only_ea_dtype(values.dtype): + # TODO(EA2D): special case not needed with 2D EAs + if values.ndim != ndim: + raise ValueError( + "Wrong number of dimensions. " + f"values.ndim != ndim [{values.ndim} != {ndim}]" + ) + if len(placement) != len(values): + raise ValueError( + f"Wrong number of items passed {len(values)}, " + f"placement implies {len(placement)}" + ) + elif ndim == 2 and len(placement) != 1: + # TODO(EA2D): special case unnecessary with 2D EAs + raise ValueError("need to split") + + +def extract_pandas_array( + values: ArrayLike, dtype: DtypeObj | None, ndim: int +) -> tuple[ArrayLike, DtypeObj | None]: + """ + Ensure that we don't allow NumpyExtensionArray / NumpyEADtype in internals. + """ + # For now, blocks should be backed by ndarrays when possible. + if isinstance(values, ABCNumpyExtensionArray): + values = values.to_numpy() + if ndim and ndim > 1: + # TODO(EA2D): special case not needed with 2D EAs + values = np.atleast_2d(values) + + if isinstance(dtype, NumpyEADtype): + dtype = dtype.numpy_dtype + + return values, dtype + + +# ----------------------------------------------------------------- + + +def extend_blocks(result, blocks=None) -> list[Block]: + """return a new extended blocks, given the result""" + if blocks is None: + blocks = [] + if isinstance(result, list): + for r in result: + if isinstance(r, list): + blocks.extend(r) + else: + blocks.append(r) + else: + assert isinstance(result, Block), type(result) + blocks.append(result) + return blocks + + +def ensure_block_shape(values: ArrayLike, ndim: int = 1) -> ArrayLike: + """ + Reshape if possible to have values.ndim == ndim. + """ + + if values.ndim < ndim: + if not is_1d_only_ea_dtype(values.dtype): + # TODO(EA2D): https://github.com/pandas-dev/pandas/issues/23023 + # block.shape is incorrect for "2D" ExtensionArrays + # We can't, and don't need to, reshape. + values = cast("np.ndarray | DatetimeArray | TimedeltaArray", values) + values = values.reshape(1, -1) + + return values + + +def external_values(values: ArrayLike) -> ArrayLike: + """ + The array that Series.values returns (public attribute). + + This has some historical constraints, and is overridden in block + subclasses to return the correct array (e.g. period returns + object ndarray and datetimetz a datetime64[ns] ndarray instead of + proper extension array). + """ + if isinstance(values, (PeriodArray, IntervalArray)): + return values.astype(object) + elif isinstance(values, (DatetimeArray, TimedeltaArray)): + # NB: for datetime64tz this is different from np.asarray(values), since + # that returns an object-dtype ndarray of Timestamps. + # Avoid raising in .astype in casting from dt64tz to dt64 + values = values._ndarray + + if isinstance(values, np.ndarray) and using_copy_on_write(): + values = values.view() + values.flags.writeable = False + + # TODO(CoW) we should also mark our ExtensionArrays as read-only + + return values diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/internals/concat.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/internals/concat.py new file mode 100644 index 0000000000000000000000000000000000000000..b2d463a8c6c26f62ded5a06283f29275612c9b40 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/internals/concat.py @@ -0,0 +1,598 @@ +from __future__ import annotations + +from typing import ( + TYPE_CHECKING, + cast, +) +import warnings + +import numpy as np + +from pandas._libs import ( + NaT, + algos as libalgos, + internals as libinternals, + lib, +) +from pandas._libs.missing import NA +from pandas.util._decorators import cache_readonly +from pandas.util._exceptions import find_stack_level + +from pandas.core.dtypes.cast import ( + ensure_dtype_can_hold_na, + find_common_type, +) +from pandas.core.dtypes.common import ( + is_1d_only_ea_dtype, + is_scalar, + needs_i8_conversion, +) +from pandas.core.dtypes.concat import concat_compat +from pandas.core.dtypes.dtypes import ( + ExtensionDtype, + SparseDtype, +) +from pandas.core.dtypes.missing import ( + is_valid_na_for_dtype, + isna, + isna_all, +) + +from pandas.core.construction import ensure_wrapped_if_datetimelike +from pandas.core.internals.array_manager import ArrayManager +from pandas.core.internals.blocks import ( + ensure_block_shape, + new_block_2d, +) +from pandas.core.internals.managers import ( + BlockManager, + make_na_array, +) + +if TYPE_CHECKING: + from collections.abc import Sequence + + from pandas._typing import ( + ArrayLike, + AxisInt, + DtypeObj, + Manager2D, + Shape, + ) + + from pandas import Index + from pandas.core.internals.blocks import ( + Block, + BlockPlacement, + ) + + +def _concatenate_array_managers( + mgrs: list[ArrayManager], axes: list[Index], concat_axis: AxisInt +) -> Manager2D: + """ + Concatenate array managers into one. + + Parameters + ---------- + mgrs_indexers : list of (ArrayManager, {axis: indexer,...}) tuples + axes : list of Index + concat_axis : int + + Returns + ------- + ArrayManager + """ + if concat_axis == 1: + return mgrs[0].concat_vertical(mgrs, axes) + else: + # concatting along the columns -> combine reindexed arrays in a single manager + assert concat_axis == 0 + return mgrs[0].concat_horizontal(mgrs, axes) + + +def concatenate_managers( + mgrs_indexers, axes: list[Index], concat_axis: AxisInt, copy: bool +) -> Manager2D: + """ + Concatenate block managers into one. + + Parameters + ---------- + mgrs_indexers : list of (BlockManager, {axis: indexer,...}) tuples + axes : list of Index + concat_axis : int + copy : bool + + Returns + ------- + BlockManager + """ + + needs_copy = copy and concat_axis == 0 + + # TODO(ArrayManager) this assumes that all managers are of the same type + if isinstance(mgrs_indexers[0][0], ArrayManager): + mgrs = _maybe_reindex_columns_na_proxy(axes, mgrs_indexers, needs_copy) + # error: Argument 1 to "_concatenate_array_managers" has incompatible + # type "List[BlockManager]"; expected "List[Union[ArrayManager, + # SingleArrayManager, BlockManager, SingleBlockManager]]" + return _concatenate_array_managers( + mgrs, axes, concat_axis # type: ignore[arg-type] + ) + + # Assertions disabled for performance + # for tup in mgrs_indexers: + # # caller is responsible for ensuring this + # indexers = tup[1] + # assert concat_axis not in indexers + + if concat_axis == 0: + mgrs = _maybe_reindex_columns_na_proxy(axes, mgrs_indexers, needs_copy) + return mgrs[0].concat_horizontal(mgrs, axes) + + if len(mgrs_indexers) > 0 and mgrs_indexers[0][0].nblocks > 0: + first_dtype = mgrs_indexers[0][0].blocks[0].dtype + if first_dtype in [np.float64, np.float32]: + # TODO: support more dtypes here. This will be simpler once + # JoinUnit.is_na behavior is deprecated. + if ( + all(_is_homogeneous_mgr(mgr, first_dtype) for mgr, _ in mgrs_indexers) + and len(mgrs_indexers) > 1 + ): + # Fastpath! + # Length restriction is just to avoid having to worry about 'copy' + shape = tuple(len(x) for x in axes) + nb = _concat_homogeneous_fastpath(mgrs_indexers, shape, first_dtype) + return BlockManager((nb,), axes) + + mgrs = _maybe_reindex_columns_na_proxy(axes, mgrs_indexers, needs_copy) + + if len(mgrs) == 1: + mgr = mgrs[0] + out = mgr.copy(deep=False) + out.axes = axes + return out + + concat_plan = _get_combined_plan(mgrs) + + blocks = [] + values: ArrayLike + + for placement, join_units in concat_plan: + unit = join_units[0] + blk = unit.block + + if _is_uniform_join_units(join_units): + vals = [ju.block.values for ju in join_units] + + if not blk.is_extension: + # _is_uniform_join_units ensures a single dtype, so + # we can use np.concatenate, which is more performant + # than concat_compat + # error: Argument 1 to "concatenate" has incompatible type + # "List[Union[ndarray[Any, Any], ExtensionArray]]"; + # expected "Union[_SupportsArray[dtype[Any]], + # _NestedSequence[_SupportsArray[dtype[Any]]]]" + values = np.concatenate(vals, axis=1) # type: ignore[arg-type] + elif is_1d_only_ea_dtype(blk.dtype): + # TODO(EA2D): special-casing not needed with 2D EAs + values = concat_compat(vals, axis=0, ea_compat_axis=True) + values = ensure_block_shape(values, ndim=2) + else: + values = concat_compat(vals, axis=1) + + values = ensure_wrapped_if_datetimelike(values) + + fastpath = blk.values.dtype == values.dtype + else: + values = _concatenate_join_units(join_units, copy=copy) + fastpath = False + + if fastpath: + b = blk.make_block_same_class(values, placement=placement) + else: + b = new_block_2d(values, placement=placement) + + blocks.append(b) + + return BlockManager(tuple(blocks), axes) + + +def _maybe_reindex_columns_na_proxy( + axes: list[Index], + mgrs_indexers: list[tuple[BlockManager, dict[int, np.ndarray]]], + needs_copy: bool, +) -> list[BlockManager]: + """ + Reindex along columns so that all of the BlockManagers being concatenated + have matching columns. + + Columns added in this reindexing have dtype=np.void, indicating they + should be ignored when choosing a column's final dtype. + """ + new_mgrs = [] + + for mgr, indexers in mgrs_indexers: + # For axis=0 (i.e. columns) we use_na_proxy and only_slice, so this + # is a cheap reindexing. + for i, indexer in indexers.items(): + mgr = mgr.reindex_indexer( + axes[i], + indexers[i], + axis=i, + copy=False, + only_slice=True, # only relevant for i==0 + allow_dups=True, + use_na_proxy=True, # only relevant for i==0 + ) + if needs_copy and not indexers: + mgr = mgr.copy() + + new_mgrs.append(mgr) + return new_mgrs + + +def _is_homogeneous_mgr(mgr: BlockManager, first_dtype: DtypeObj) -> bool: + """ + Check if this Manager can be treated as a single ndarray. + """ + if mgr.nblocks != 1: + return False + blk = mgr.blocks[0] + if not (blk.mgr_locs.is_slice_like and blk.mgr_locs.as_slice.step == 1): + return False + + return blk.dtype == first_dtype + + +def _concat_homogeneous_fastpath( + mgrs_indexers, shape: Shape, first_dtype: np.dtype +) -> Block: + """ + With single-Block managers with homogeneous dtypes (that can already hold nan), + we avoid [...] + """ + # assumes + # all(_is_homogeneous_mgr(mgr, first_dtype) for mgr, _ in in mgrs_indexers) + + if all(not indexers for _, indexers in mgrs_indexers): + # https://github.com/pandas-dev/pandas/pull/52685#issuecomment-1523287739 + arrs = [mgr.blocks[0].values.T for mgr, _ in mgrs_indexers] + arr = np.concatenate(arrs).T + bp = libinternals.BlockPlacement(slice(shape[0])) + nb = new_block_2d(arr, bp) + return nb + + arr = np.empty(shape, dtype=first_dtype) + + if first_dtype == np.float64: + take_func = libalgos.take_2d_axis0_float64_float64 + else: + take_func = libalgos.take_2d_axis0_float32_float32 + + start = 0 + for mgr, indexers in mgrs_indexers: + mgr_len = mgr.shape[1] + end = start + mgr_len + + if 0 in indexers: + take_func( + mgr.blocks[0].values, + indexers[0], + arr[:, start:end], + ) + else: + # No reindexing necessary, we can copy values directly + arr[:, start:end] = mgr.blocks[0].values + + start += mgr_len + + bp = libinternals.BlockPlacement(slice(shape[0])) + nb = new_block_2d(arr, bp) + return nb + + +def _get_combined_plan( + mgrs: list[BlockManager], +) -> list[tuple[BlockPlacement, list[JoinUnit]]]: + plan = [] + + max_len = mgrs[0].shape[0] + + blknos_list = [mgr.blknos for mgr in mgrs] + pairs = libinternals.get_concat_blkno_indexers(blknos_list) + for ind, (blknos, bp) in enumerate(pairs): + # assert bp.is_slice_like + # assert len(bp) > 0 + + units_for_bp = [] + for k, mgr in enumerate(mgrs): + blkno = blknos[k] + + nb = _get_block_for_concat_plan(mgr, bp, blkno, max_len=max_len) + unit = JoinUnit(nb) + units_for_bp.append(unit) + + plan.append((bp, units_for_bp)) + + return plan + + +def _get_block_for_concat_plan( + mgr: BlockManager, bp: BlockPlacement, blkno: int, *, max_len: int +) -> Block: + blk = mgr.blocks[blkno] + # Assertions disabled for performance: + # assert bp.is_slice_like + # assert blkno != -1 + # assert (mgr.blknos[bp] == blkno).all() + + if len(bp) == len(blk.mgr_locs) and ( + blk.mgr_locs.is_slice_like and blk.mgr_locs.as_slice.step == 1 + ): + nb = blk + else: + ax0_blk_indexer = mgr.blklocs[bp.indexer] + + slc = lib.maybe_indices_to_slice(ax0_blk_indexer, max_len) + # TODO: in all extant test cases 2023-04-08 we have a slice here. + # Will this always be the case? + if isinstance(slc, slice): + nb = blk.slice_block_columns(slc) + else: + nb = blk.take_block_columns(slc) + + # assert nb.shape == (len(bp), mgr.shape[1]) + return nb + + +class JoinUnit: + def __init__(self, block: Block) -> None: + self.block = block + + def __repr__(self) -> str: + return f"{type(self).__name__}({repr(self.block)})" + + def _is_valid_na_for(self, dtype: DtypeObj) -> bool: + """ + Check that we are all-NA of a type/dtype that is compatible with this dtype. + Augments `self.is_na` with an additional check of the type of NA values. + """ + if not self.is_na: + return False + + blk = self.block + if blk.dtype.kind == "V": + return True + + if blk.dtype == object: + values = blk.values + return all(is_valid_na_for_dtype(x, dtype) for x in values.ravel(order="K")) + + na_value = blk.fill_value + if na_value is NaT and blk.dtype != dtype: + # e.g. we are dt64 and other is td64 + # fill_values match but we should not cast blk.values to dtype + # TODO: this will need updating if we ever have non-nano dt64/td64 + return False + + if na_value is NA and needs_i8_conversion(dtype): + # FIXME: kludge; test_append_empty_frame_with_timedelta64ns_nat + # e.g. blk.dtype == "Int64" and dtype is td64, we dont want + # to consider these as matching + return False + + # TODO: better to use can_hold_element? + return is_valid_na_for_dtype(na_value, dtype) + + @cache_readonly + def is_na(self) -> bool: + blk = self.block + if blk.dtype.kind == "V": + return True + + if not blk._can_hold_na: + return False + + values = blk.values + if values.size == 0: + # GH#39122 this case will return False once deprecation is enforced + return True + + if isinstance(values.dtype, SparseDtype): + return False + + if values.ndim == 1: + # TODO(EA2D): no need for special case with 2D EAs + val = values[0] + if not is_scalar(val) or not isna(val): + # ideally isna_all would do this short-circuiting + return False + return isna_all(values) + else: + val = values[0][0] + if not is_scalar(val) or not isna(val): + # ideally isna_all would do this short-circuiting + return False + return all(isna_all(row) for row in values) + + @cache_readonly + def is_na_after_size_and_isna_all_deprecation(self) -> bool: + """ + Will self.is_na be True after values.size == 0 deprecation and isna_all + deprecation are enforced? + """ + blk = self.block + if blk.dtype.kind == "V": + return True + return False + + def get_reindexed_values(self, empty_dtype: DtypeObj, upcasted_na) -> ArrayLike: + values: ArrayLike + + if upcasted_na is None and self.block.dtype.kind != "V": + # No upcasting is necessary + return self.block.values + else: + fill_value = upcasted_na + + if self._is_valid_na_for(empty_dtype): + # note: always holds when self.block.dtype.kind == "V" + blk_dtype = self.block.dtype + + if blk_dtype == np.dtype("object"): + # we want to avoid filling with np.nan if we are + # using None; we already know that we are all + # nulls + values = cast(np.ndarray, self.block.values) + if values.size and values[0, 0] is None: + fill_value = None + + return make_na_array(empty_dtype, self.block.shape, fill_value) + + return self.block.values + + +def _concatenate_join_units(join_units: list[JoinUnit], copy: bool) -> ArrayLike: + """ + Concatenate values from several join units along axis=1. + """ + empty_dtype, empty_dtype_future = _get_empty_dtype(join_units) + + has_none_blocks = any(unit.block.dtype.kind == "V" for unit in join_units) + upcasted_na = _dtype_to_na_value(empty_dtype, has_none_blocks) + + to_concat = [ + ju.get_reindexed_values(empty_dtype=empty_dtype, upcasted_na=upcasted_na) + for ju in join_units + ] + + if any(is_1d_only_ea_dtype(t.dtype) for t in to_concat): + # TODO(EA2D): special case not needed if all EAs used HybridBlocks + + # error: No overload variant of "__getitem__" of "ExtensionArray" matches + # argument type "Tuple[int, slice]" + to_concat = [ + t + if is_1d_only_ea_dtype(t.dtype) + else t[0, :] # type: ignore[call-overload] + for t in to_concat + ] + concat_values = concat_compat(to_concat, axis=0, ea_compat_axis=True) + concat_values = ensure_block_shape(concat_values, 2) + + else: + concat_values = concat_compat(to_concat, axis=1) + + if empty_dtype != empty_dtype_future: + if empty_dtype == concat_values.dtype: + # GH#39122, GH#40893 + warnings.warn( + "The behavior of DataFrame concatenation with empty or all-NA " + "entries is deprecated. In a future version, this will no longer " + "exclude empty or all-NA columns when determining the result dtypes. " + "To retain the old behavior, exclude the relevant entries before " + "the concat operation.", + FutureWarning, + stacklevel=find_stack_level(), + ) + return concat_values + + +def _dtype_to_na_value(dtype: DtypeObj, has_none_blocks: bool): + """ + Find the NA value to go with this dtype. + """ + if isinstance(dtype, ExtensionDtype): + return dtype.na_value + elif dtype.kind in "mM": + return dtype.type("NaT") + elif dtype.kind in "fc": + return dtype.type("NaN") + elif dtype.kind == "b": + # different from missing.na_value_for_dtype + return None + elif dtype.kind in "iu": + if not has_none_blocks: + # different from missing.na_value_for_dtype + return None + return np.nan + elif dtype.kind == "O": + return np.nan + raise NotImplementedError + + +def _get_empty_dtype(join_units: Sequence[JoinUnit]) -> tuple[DtypeObj, DtypeObj]: + """ + Return dtype and N/A values to use when concatenating specified units. + + Returned N/A value may be None which means there was no casting involved. + + Returns + ------- + dtype + """ + if lib.dtypes_all_equal([ju.block.dtype for ju in join_units]): + empty_dtype = join_units[0].block.dtype + return empty_dtype, empty_dtype + + has_none_blocks = any(unit.block.dtype.kind == "V" for unit in join_units) + + dtypes = [unit.block.dtype for unit in join_units if not unit.is_na] + if not len(dtypes): + dtypes = [ + unit.block.dtype for unit in join_units if unit.block.dtype.kind != "V" + ] + + dtype = find_common_type(dtypes) + if has_none_blocks: + dtype = ensure_dtype_can_hold_na(dtype) + + dtype_future = dtype + if len(dtypes) != len(join_units): + dtypes_future = [ + unit.block.dtype + for unit in join_units + if not unit.is_na_after_size_and_isna_all_deprecation + ] + if not len(dtypes_future): + dtypes_future = [ + unit.block.dtype for unit in join_units if unit.block.dtype.kind != "V" + ] + + if len(dtypes) != len(dtypes_future): + dtype_future = find_common_type(dtypes_future) + if has_none_blocks: + dtype_future = ensure_dtype_can_hold_na(dtype_future) + + return dtype, dtype_future + + +def _is_uniform_join_units(join_units: list[JoinUnit]) -> bool: + """ + Check if the join units consist of blocks of uniform type that can + be concatenated using Block.concat_same_type instead of the generic + _concatenate_join_units (which uses `concat_compat`). + + """ + first = join_units[0].block + if first.dtype.kind == "V": + return False + return ( + # exclude cases where a) ju.block is None or b) we have e.g. Int64+int64 + all(type(ju.block) is type(first) for ju in join_units) + and + # e.g. DatetimeLikeBlock can be dt64 or td64, but these are not uniform + all( + ju.block.dtype == first.dtype + # GH#42092 we only want the dtype_equal check for non-numeric blocks + # (for now, may change but that would need a deprecation) + or ju.block.dtype.kind in "iub" + for ju in join_units + ) + and + # no blocks that would get missing values (can lead to type upcasts) + # unless we're an extension dtype. + all(not ju.is_na or ju.block.is_extension for ju in join_units) + ) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/internals/construction.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/internals/construction.py new file mode 100644 index 0000000000000000000000000000000000000000..609d2c9a7a285ec23569f9fa06067f0a5b0a00cc --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/internals/construction.py @@ -0,0 +1,1072 @@ +""" +Functions for preparing various inputs passed to the DataFrame or Series +constructors before passing them to a BlockManager. +""" +from __future__ import annotations + +from collections import abc +from typing import ( + TYPE_CHECKING, + Any, +) + +import numpy as np +from numpy import ma + +from pandas._config import using_pyarrow_string_dtype + +from pandas._libs import lib + +from pandas.core.dtypes.astype import astype_is_view +from pandas.core.dtypes.cast import ( + construct_1d_arraylike_from_scalar, + dict_compat, + maybe_cast_to_datetime, + maybe_convert_platform, + maybe_infer_to_datetimelike, +) +from pandas.core.dtypes.common import ( + is_1d_only_ea_dtype, + is_integer_dtype, + is_list_like, + is_named_tuple, + is_object_dtype, +) +from pandas.core.dtypes.dtypes import ExtensionDtype +from pandas.core.dtypes.generic import ( + ABCDataFrame, + ABCSeries, +) + +from pandas.core import ( + algorithms, + common as com, +) +from pandas.core.arrays import ExtensionArray +from pandas.core.arrays.string_ import StringDtype +from pandas.core.construction import ( + array as pd_array, + ensure_wrapped_if_datetimelike, + extract_array, + range_to_ndarray, + sanitize_array, +) +from pandas.core.indexes.api import ( + DatetimeIndex, + Index, + TimedeltaIndex, + default_index, + ensure_index, + get_objs_combined_axis, + union_indexes, +) +from pandas.core.internals.array_manager import ( + ArrayManager, + SingleArrayManager, +) +from pandas.core.internals.blocks import ( + BlockPlacement, + ensure_block_shape, + new_block, + new_block_2d, +) +from pandas.core.internals.managers import ( + BlockManager, + SingleBlockManager, + create_block_manager_from_blocks, + create_block_manager_from_column_arrays, +) + +if TYPE_CHECKING: + from collections.abc import ( + Hashable, + Sequence, + ) + + from pandas._typing import ( + ArrayLike, + DtypeObj, + Manager, + npt, + ) +# --------------------------------------------------------------------- +# BlockManager Interface + + +def arrays_to_mgr( + arrays, + columns: Index, + index, + *, + dtype: DtypeObj | None = None, + verify_integrity: bool = True, + typ: str | None = None, + consolidate: bool = True, +) -> Manager: + """ + Segregate Series based on type and coerce into matrices. + + Needs to handle a lot of exceptional cases. + """ + if verify_integrity: + # figure out the index, if necessary + if index is None: + index = _extract_index(arrays) + else: + index = ensure_index(index) + + # don't force copy because getting jammed in an ndarray anyway + arrays, refs = _homogenize(arrays, index, dtype) + # _homogenize ensures + # - all(len(x) == len(index) for x in arrays) + # - all(x.ndim == 1 for x in arrays) + # - all(isinstance(x, (np.ndarray, ExtensionArray)) for x in arrays) + # - all(type(x) is not NumpyExtensionArray for x in arrays) + + else: + index = ensure_index(index) + arrays = [extract_array(x, extract_numpy=True) for x in arrays] + # with _from_arrays, the passed arrays should never be Series objects + refs = [None] * len(arrays) + + # Reached via DataFrame._from_arrays; we do minimal validation here + for arr in arrays: + if ( + not isinstance(arr, (np.ndarray, ExtensionArray)) + or arr.ndim != 1 + or len(arr) != len(index) + ): + raise ValueError( + "Arrays must be 1-dimensional np.ndarray or ExtensionArray " + "with length matching len(index)" + ) + + columns = ensure_index(columns) + if len(columns) != len(arrays): + raise ValueError("len(arrays) must match len(columns)") + + # from BlockManager perspective + axes = [columns, index] + + if typ == "block": + return create_block_manager_from_column_arrays( + arrays, axes, consolidate=consolidate, refs=refs + ) + elif typ == "array": + return ArrayManager(arrays, [index, columns]) + else: + raise ValueError(f"'typ' needs to be one of {{'block', 'array'}}, got '{typ}'") + + +def rec_array_to_mgr( + data: np.rec.recarray | np.ndarray, + index, + columns, + dtype: DtypeObj | None, + copy: bool, + typ: str, +) -> Manager: + """ + Extract from a masked rec array and create the manager. + """ + # essentially process a record array then fill it + fdata = ma.getdata(data) + if index is None: + index = default_index(len(fdata)) + else: + index = ensure_index(index) + + if columns is not None: + columns = ensure_index(columns) + arrays, arr_columns = to_arrays(fdata, columns) + + # create the manager + + arrays, arr_columns = reorder_arrays(arrays, arr_columns, columns, len(index)) + if columns is None: + columns = arr_columns + + mgr = arrays_to_mgr(arrays, columns, index, dtype=dtype, typ=typ) + + if copy: + mgr = mgr.copy() + return mgr + + +def mgr_to_mgr(mgr, typ: str, copy: bool = True) -> Manager: + """ + Convert to specific type of Manager. Does not copy if the type is already + correct. Does not guarantee a copy otherwise. `copy` keyword only controls + whether conversion from Block->ArrayManager copies the 1D arrays. + """ + new_mgr: Manager + + if typ == "block": + if isinstance(mgr, BlockManager): + new_mgr = mgr + else: + if mgr.ndim == 2: + new_mgr = arrays_to_mgr( + mgr.arrays, mgr.axes[0], mgr.axes[1], typ="block" + ) + else: + new_mgr = SingleBlockManager.from_array(mgr.arrays[0], mgr.index) + elif typ == "array": + if isinstance(mgr, ArrayManager): + new_mgr = mgr + else: + if mgr.ndim == 2: + arrays = [mgr.iget_values(i) for i in range(len(mgr.axes[0]))] + if copy: + arrays = [arr.copy() for arr in arrays] + new_mgr = ArrayManager(arrays, [mgr.axes[1], mgr.axes[0]]) + else: + array = mgr.internal_values() + if copy: + array = array.copy() + new_mgr = SingleArrayManager([array], [mgr.index]) + else: + raise ValueError(f"'typ' needs to be one of {{'block', 'array'}}, got '{typ}'") + return new_mgr + + +# --------------------------------------------------------------------- +# DataFrame Constructor Interface + + +def ndarray_to_mgr( + values, index, columns, dtype: DtypeObj | None, copy: bool, typ: str +) -> Manager: + # used in DataFrame.__init__ + # input must be a ndarray, list, Series, Index, ExtensionArray + + if isinstance(values, ABCSeries): + if columns is None: + if values.name is not None: + columns = Index([values.name]) + if index is None: + index = values.index + else: + values = values.reindex(index) + + # zero len case (GH #2234) + if not len(values) and columns is not None and len(columns): + values = np.empty((0, 1), dtype=object) + + # if the array preparation does a copy -> avoid this for ArrayManager, + # since the copy is done on conversion to 1D arrays + copy_on_sanitize = False if typ == "array" else copy + + vdtype = getattr(values, "dtype", None) + refs = None + if is_1d_only_ea_dtype(vdtype) or is_1d_only_ea_dtype(dtype): + # GH#19157 + + if isinstance(values, (np.ndarray, ExtensionArray)) and values.ndim > 1: + # GH#12513 a EA dtype passed with a 2D array, split into + # multiple EAs that view the values + # error: No overload variant of "__getitem__" of "ExtensionArray" + # matches argument type "Tuple[slice, int]" + values = [ + values[:, n] # type: ignore[call-overload] + for n in range(values.shape[1]) + ] + else: + values = [values] + + if columns is None: + columns = Index(range(len(values))) + else: + columns = ensure_index(columns) + + return arrays_to_mgr(values, columns, index, dtype=dtype, typ=typ) + + elif isinstance(vdtype, ExtensionDtype): + # i.e. Datetime64TZ, PeriodDtype; cases with is_1d_only_ea_dtype(vdtype) + # are already caught above + values = extract_array(values, extract_numpy=True) + if copy: + values = values.copy() + if values.ndim == 1: + values = values.reshape(-1, 1) + + elif isinstance(values, (ABCSeries, Index)): + if not copy_on_sanitize and ( + dtype is None or astype_is_view(values.dtype, dtype) + ): + refs = values._references + + if copy_on_sanitize: + values = values._values.copy() + else: + values = values._values + + values = _ensure_2d(values) + + elif isinstance(values, (np.ndarray, ExtensionArray)): + # drop subclass info + _copy = ( + copy_on_sanitize + if (dtype is None or astype_is_view(values.dtype, dtype)) + else False + ) + values = np.array(values, copy=_copy) + values = _ensure_2d(values) + + else: + # by definition an array here + # the dtypes will be coerced to a single dtype + values = _prep_ndarraylike(values, copy=copy_on_sanitize) + + if dtype is not None and values.dtype != dtype: + # GH#40110 see similar check inside sanitize_array + values = sanitize_array( + values, + None, + dtype=dtype, + copy=copy_on_sanitize, + allow_2d=True, + ) + + # _prep_ndarraylike ensures that values.ndim == 2 at this point + index, columns = _get_axes( + values.shape[0], values.shape[1], index=index, columns=columns + ) + + _check_values_indices_shape_match(values, index, columns) + + if typ == "array": + if issubclass(values.dtype.type, str): + values = np.array(values, dtype=object) + + if dtype is None and is_object_dtype(values.dtype): + arrays = [ + ensure_wrapped_if_datetimelike( + maybe_infer_to_datetimelike(values[:, i]) + ) + for i in range(values.shape[1]) + ] + else: + if lib.is_np_dtype(values.dtype, "mM"): + values = ensure_wrapped_if_datetimelike(values) + arrays = [values[:, i] for i in range(values.shape[1])] + + if copy: + arrays = [arr.copy() for arr in arrays] + + return ArrayManager(arrays, [index, columns], verify_integrity=False) + + values = values.T + + # if we don't have a dtype specified, then try to convert objects + # on the entire block; this is to convert if we have datetimelike's + # embedded in an object type + if dtype is None and is_object_dtype(values.dtype): + obj_columns = list(values) + maybe_datetime = [maybe_infer_to_datetimelike(x) for x in obj_columns] + # don't convert (and copy) the objects if no type inference occurs + if any(x is not y for x, y in zip(obj_columns, maybe_datetime)): + dvals_list = [ensure_block_shape(dval, 2) for dval in maybe_datetime] + block_values = [ + new_block_2d(dvals_list[n], placement=BlockPlacement(n)) + for n in range(len(dvals_list)) + ] + else: + bp = BlockPlacement(slice(len(columns))) + nb = new_block_2d(values, placement=bp, refs=refs) + block_values = [nb] + elif dtype is None and values.dtype.kind == "U" and using_pyarrow_string_dtype(): + dtype = StringDtype(storage="pyarrow_numpy") + + obj_columns = list(values) + block_values = [ + new_block( + dtype.construct_array_type()._from_sequence(data, dtype=dtype), + BlockPlacement(slice(i, i + 1)), + ndim=2, + ) + for i, data in enumerate(obj_columns) + ] + + else: + bp = BlockPlacement(slice(len(columns))) + nb = new_block_2d(values, placement=bp, refs=refs) + block_values = [nb] + + if len(columns) == 0: + # TODO: check len(values) == 0? + block_values = [] + + return create_block_manager_from_blocks( + block_values, [columns, index], verify_integrity=False + ) + + +def _check_values_indices_shape_match( + values: np.ndarray, index: Index, columns: Index +) -> None: + """ + Check that the shape implied by our axes matches the actual shape of the + data. + """ + if values.shape[1] != len(columns) or values.shape[0] != len(index): + # Could let this raise in Block constructor, but we get a more + # helpful exception message this way. + if values.shape[0] == 0 < len(index): + raise ValueError("Empty data passed with indices specified.") + + passed = values.shape + implied = (len(index), len(columns)) + raise ValueError(f"Shape of passed values is {passed}, indices imply {implied}") + + +def dict_to_mgr( + data: dict, + index, + columns, + *, + dtype: DtypeObj | None = None, + typ: str = "block", + copy: bool = True, +) -> Manager: + """ + Segregate Series based on type and coerce into matrices. + Needs to handle a lot of exceptional cases. + + Used in DataFrame.__init__ + """ + arrays: Sequence[Any] | Series + + if columns is not None: + from pandas.core.series import Series + + arrays = Series(data, index=columns, dtype=object) + missing = arrays.isna() + if index is None: + # GH10856 + # raise ValueError if only scalars in dict + index = _extract_index(arrays[~missing]) + else: + index = ensure_index(index) + + # no obvious "empty" int column + if missing.any() and not is_integer_dtype(dtype): + nan_dtype: DtypeObj + + if dtype is not None: + # calling sanitize_array ensures we don't mix-and-match + # NA dtypes + midxs = missing.values.nonzero()[0] + for i in midxs: + arr = sanitize_array(arrays.iat[i], index, dtype=dtype) + arrays.iat[i] = arr + else: + # GH#1783 + nan_dtype = np.dtype("object") + val = construct_1d_arraylike_from_scalar(np.nan, len(index), nan_dtype) + nmissing = missing.sum() + if copy: + rhs = [val] * nmissing + else: + # GH#45369 + rhs = [val.copy() for _ in range(nmissing)] + arrays.loc[missing] = rhs + + arrays = list(arrays) + columns = ensure_index(columns) + + else: + keys = list(data.keys()) + columns = Index(keys) if keys else default_index(0) + arrays = [com.maybe_iterable_to_list(data[k]) for k in keys] + + if copy: + if typ == "block": + # We only need to copy arrays that will not get consolidated, i.e. + # only EA arrays + arrays = [ + x.copy() + if isinstance(x, ExtensionArray) + else x.copy(deep=True) + if ( + isinstance(x, Index) + or isinstance(x, ABCSeries) + and is_1d_only_ea_dtype(x.dtype) + ) + else x + for x in arrays + ] + else: + # dtype check to exclude e.g. range objects, scalars + arrays = [x.copy() if hasattr(x, "dtype") else x for x in arrays] + + return arrays_to_mgr(arrays, columns, index, dtype=dtype, typ=typ, consolidate=copy) + + +def nested_data_to_arrays( + data: Sequence, + columns: Index | None, + index: Index | None, + dtype: DtypeObj | None, +) -> tuple[list[ArrayLike], Index, Index]: + """ + Convert a single sequence of arrays to multiple arrays. + """ + # By the time we get here we have already checked treat_as_nested(data) + + if is_named_tuple(data[0]) and columns is None: + columns = ensure_index(data[0]._fields) + + arrays, columns = to_arrays(data, columns, dtype=dtype) + columns = ensure_index(columns) + + if index is None: + if isinstance(data[0], ABCSeries): + index = _get_names_from_index(data) + else: + index = default_index(len(data)) + + return arrays, columns, index + + +def treat_as_nested(data) -> bool: + """ + Check if we should use nested_data_to_arrays. + """ + return ( + len(data) > 0 + and is_list_like(data[0]) + and getattr(data[0], "ndim", 1) == 1 + and not (isinstance(data, ExtensionArray) and data.ndim == 2) + ) + + +# --------------------------------------------------------------------- + + +def _prep_ndarraylike(values, copy: bool = True) -> np.ndarray: + # values is specifically _not_ ndarray, EA, Index, or Series + # We only get here with `not treat_as_nested(values)` + + if len(values) == 0: + # TODO: check for length-zero range, in which case return int64 dtype? + # TODO: reuse anything in try_cast? + return np.empty((0, 0), dtype=object) + elif isinstance(values, range): + arr = range_to_ndarray(values) + return arr[..., np.newaxis] + + def convert(v): + if not is_list_like(v) or isinstance(v, ABCDataFrame): + return v + + v = extract_array(v, extract_numpy=True) + res = maybe_convert_platform(v) + # We don't do maybe_infer_to_datetimelike here bc we will end up doing + # it column-by-column in ndarray_to_mgr + return res + + # we could have a 1-dim or 2-dim list here + # this is equiv of np.asarray, but does object conversion + # and platform dtype preservation + # does not convert e.g. [1, "a", True] to ["1", "a", "True"] like + # np.asarray would + if is_list_like(values[0]): + values = np.array([convert(v) for v in values]) + elif isinstance(values[0], np.ndarray) and values[0].ndim == 0: + # GH#21861 see test_constructor_list_of_lists + values = np.array([convert(v) for v in values]) + else: + values = convert(values) + + return _ensure_2d(values) + + +def _ensure_2d(values: np.ndarray) -> np.ndarray: + """ + Reshape 1D values, raise on anything else other than 2D. + """ + if values.ndim == 1: + values = values.reshape((values.shape[0], 1)) + elif values.ndim != 2: + raise ValueError(f"Must pass 2-d input. shape={values.shape}") + return values + + +def _homogenize( + data, index: Index, dtype: DtypeObj | None +) -> tuple[list[ArrayLike], list[Any]]: + oindex = None + homogenized = [] + # if the original array-like in `data` is a Series, keep track of this Series' refs + refs: list[Any] = [] + + for val in data: + if isinstance(val, (ABCSeries, Index)): + if dtype is not None: + val = val.astype(dtype, copy=False) + if isinstance(val, ABCSeries) and val.index is not index: + # Forces alignment. No need to copy data since we + # are putting it into an ndarray later + val = val.reindex(index, copy=False) + refs.append(val._references) + val = val._values + else: + if isinstance(val, dict): + # GH#41785 this _should_ be equivalent to (but faster than) + # val = Series(val, index=index)._values + if oindex is None: + oindex = index.astype("O") + + if isinstance(index, (DatetimeIndex, TimedeltaIndex)): + # see test_constructor_dict_datetime64_index + val = dict_compat(val) + else: + # see test_constructor_subclass_dict + val = dict(val) + val = lib.fast_multiget(val, oindex._values, default=np.nan) + + val = sanitize_array(val, index, dtype=dtype, copy=False) + com.require_length_match(val, index) + refs.append(None) + + homogenized.append(val) + + return homogenized, refs + + +def _extract_index(data) -> Index: + """ + Try to infer an Index from the passed data, raise ValueError on failure. + """ + index: Index + if len(data) == 0: + return default_index(0) + + raw_lengths = [] + indexes: list[list[Hashable] | Index] = [] + + have_raw_arrays = False + have_series = False + have_dicts = False + + for val in data: + if isinstance(val, ABCSeries): + have_series = True + indexes.append(val.index) + elif isinstance(val, dict): + have_dicts = True + indexes.append(list(val.keys())) + elif is_list_like(val) and getattr(val, "ndim", 1) == 1: + have_raw_arrays = True + raw_lengths.append(len(val)) + elif isinstance(val, np.ndarray) and val.ndim > 1: + raise ValueError("Per-column arrays must each be 1-dimensional") + + if not indexes and not raw_lengths: + raise ValueError("If using all scalar values, you must pass an index") + + if have_series: + index = union_indexes(indexes) + elif have_dicts: + index = union_indexes(indexes, sort=False) + + if have_raw_arrays: + lengths = list(set(raw_lengths)) + if len(lengths) > 1: + raise ValueError("All arrays must be of the same length") + + if have_dicts: + raise ValueError( + "Mixing dicts with non-Series may lead to ambiguous ordering." + ) + + if have_series: + if lengths[0] != len(index): + msg = ( + f"array length {lengths[0]} does not match index " + f"length {len(index)}" + ) + raise ValueError(msg) + else: + index = default_index(lengths[0]) + + return ensure_index(index) + + +def reorder_arrays( + arrays: list[ArrayLike], arr_columns: Index, columns: Index | None, length: int +) -> tuple[list[ArrayLike], Index]: + """ + Pre-emptively (cheaply) reindex arrays with new columns. + """ + # reorder according to the columns + if columns is not None: + if not columns.equals(arr_columns): + # if they are equal, there is nothing to do + new_arrays: list[ArrayLike] = [] + indexer = arr_columns.get_indexer(columns) + for i, k in enumerate(indexer): + if k == -1: + # by convention default is all-NaN object dtype + arr = np.empty(length, dtype=object) + arr.fill(np.nan) + else: + arr = arrays[k] + new_arrays.append(arr) + + arrays = new_arrays + arr_columns = columns + + return arrays, arr_columns + + +def _get_names_from_index(data) -> Index: + has_some_name = any(getattr(s, "name", None) is not None for s in data) + if not has_some_name: + return default_index(len(data)) + + index: list[Hashable] = list(range(len(data))) + count = 0 + for i, s in enumerate(data): + n = getattr(s, "name", None) + if n is not None: + index[i] = n + else: + index[i] = f"Unnamed {count}" + count += 1 + + return Index(index) + + +def _get_axes( + N: int, K: int, index: Index | None, columns: Index | None +) -> tuple[Index, Index]: + # helper to create the axes as indexes + # return axes or defaults + + if index is None: + index = default_index(N) + else: + index = ensure_index(index) + + if columns is None: + columns = default_index(K) + else: + columns = ensure_index(columns) + return index, columns + + +def dataclasses_to_dicts(data): + """ + Converts a list of dataclass instances to a list of dictionaries. + + Parameters + ---------- + data : List[Type[dataclass]] + + Returns + -------- + list_dict : List[dict] + + Examples + -------- + >>> from dataclasses import dataclass + >>> @dataclass + ... class Point: + ... x: int + ... y: int + + >>> dataclasses_to_dicts([Point(1, 2), Point(2, 3)]) + [{'x': 1, 'y': 2}, {'x': 2, 'y': 3}] + + """ + from dataclasses import asdict + + return list(map(asdict, data)) + + +# --------------------------------------------------------------------- +# Conversion of Inputs to Arrays + + +def to_arrays( + data, columns: Index | None, dtype: DtypeObj | None = None +) -> tuple[list[ArrayLike], Index]: + """ + Return list of arrays, columns. + + Returns + ------- + list[ArrayLike] + These will become columns in a DataFrame. + Index + This will become frame.columns. + + Notes + ----- + Ensures that len(result_arrays) == len(result_index). + """ + + if not len(data): + if isinstance(data, np.ndarray): + if data.dtype.names is not None: + # i.e. numpy structured array + columns = ensure_index(data.dtype.names) + arrays = [data[name] for name in columns] + + if len(data) == 0: + # GH#42456 the indexing above results in list of 2D ndarrays + # TODO: is that an issue with numpy? + for i, arr in enumerate(arrays): + if arr.ndim == 2: + arrays[i] = arr[:, 0] + + return arrays, columns + return [], ensure_index([]) + + elif isinstance(data, np.ndarray) and data.dtype.names is not None: + # e.g. recarray + columns = Index(list(data.dtype.names)) + arrays = [data[k] for k in columns] + return arrays, columns + + if isinstance(data[0], (list, tuple)): + arr = _list_to_arrays(data) + elif isinstance(data[0], abc.Mapping): + arr, columns = _list_of_dict_to_arrays(data, columns) + elif isinstance(data[0], ABCSeries): + arr, columns = _list_of_series_to_arrays(data, columns) + else: + # last ditch effort + data = [tuple(x) for x in data] + arr = _list_to_arrays(data) + + content, columns = _finalize_columns_and_data(arr, columns, dtype) + return content, columns + + +def _list_to_arrays(data: list[tuple | list]) -> np.ndarray: + # Returned np.ndarray has ndim = 2 + # Note: we already check len(data) > 0 before getting hre + if isinstance(data[0], tuple): + content = lib.to_object_array_tuples(data) + else: + # list of lists + content = lib.to_object_array(data) + return content + + +def _list_of_series_to_arrays( + data: list, + columns: Index | None, +) -> tuple[np.ndarray, Index]: + # returned np.ndarray has ndim == 2 + + if columns is None: + # We know pass_data is non-empty because data[0] is a Series + pass_data = [x for x in data if isinstance(x, (ABCSeries, ABCDataFrame))] + columns = get_objs_combined_axis(pass_data, sort=False) + + indexer_cache: dict[int, np.ndarray] = {} + + aligned_values = [] + for s in data: + index = getattr(s, "index", None) + if index is None: + index = default_index(len(s)) + + if id(index) in indexer_cache: + indexer = indexer_cache[id(index)] + else: + indexer = indexer_cache[id(index)] = index.get_indexer(columns) + + values = extract_array(s, extract_numpy=True) + aligned_values.append(algorithms.take_nd(values, indexer)) + + content = np.vstack(aligned_values) + return content, columns + + +def _list_of_dict_to_arrays( + data: list[dict], + columns: Index | None, +) -> tuple[np.ndarray, Index]: + """ + Convert list of dicts to numpy arrays + + if `columns` is not passed, column names are inferred from the records + - for OrderedDict and dicts, the column names match + the key insertion-order from the first record to the last. + - For other kinds of dict-likes, the keys are lexically sorted. + + Parameters + ---------- + data : iterable + collection of records (OrderedDict, dict) + columns: iterables or None + + Returns + ------- + content : np.ndarray[object, ndim=2] + columns : Index + """ + if columns is None: + gen = (list(x.keys()) for x in data) + sort = not any(isinstance(d, dict) for d in data) + pre_cols = lib.fast_unique_multiple_list_gen(gen, sort=sort) + columns = ensure_index(pre_cols) + + # assure that they are of the base dict class and not of derived + # classes + data = [d if type(d) is dict else dict(d) for d in data] # noqa: E721 + + content = lib.dicts_to_array(data, list(columns)) + return content, columns + + +def _finalize_columns_and_data( + content: np.ndarray, # ndim == 2 + columns: Index | None, + dtype: DtypeObj | None, +) -> tuple[list[ArrayLike], Index]: + """ + Ensure we have valid columns, cast object dtypes if possible. + """ + contents = list(content.T) + + try: + columns = _validate_or_indexify_columns(contents, columns) + except AssertionError as err: + # GH#26429 do not raise user-facing AssertionError + raise ValueError(err) from err + + if len(contents) and contents[0].dtype == np.object_: + contents = convert_object_array(contents, dtype=dtype) + + return contents, columns + + +def _validate_or_indexify_columns( + content: list[np.ndarray], columns: Index | None +) -> Index: + """ + If columns is None, make numbers as column names; Otherwise, validate that + columns have valid length. + + Parameters + ---------- + content : list of np.ndarrays + columns : Index or None + + Returns + ------- + Index + If columns is None, assign positional column index value as columns. + + Raises + ------ + 1. AssertionError when content is not composed of list of lists, and if + length of columns is not equal to length of content. + 2. ValueError when content is list of lists, but length of each sub-list + is not equal + 3. ValueError when content is list of lists, but length of sub-list is + not equal to length of content + """ + if columns is None: + columns = default_index(len(content)) + else: + # Add mask for data which is composed of list of lists + is_mi_list = isinstance(columns, list) and all( + isinstance(col, list) for col in columns + ) + + if not is_mi_list and len(columns) != len(content): # pragma: no cover + # caller's responsibility to check for this... + raise AssertionError( + f"{len(columns)} columns passed, passed data had " + f"{len(content)} columns" + ) + if is_mi_list: + # check if nested list column, length of each sub-list should be equal + if len({len(col) for col in columns}) > 1: + raise ValueError( + "Length of columns passed for MultiIndex columns is different" + ) + + # if columns is not empty and length of sublist is not equal to content + if columns and len(columns[0]) != len(content): + raise ValueError( + f"{len(columns[0])} columns passed, passed data had " + f"{len(content)} columns" + ) + return columns + + +def convert_object_array( + content: list[npt.NDArray[np.object_]], + dtype: DtypeObj | None, + dtype_backend: str = "numpy", + coerce_float: bool = False, +) -> list[ArrayLike]: + """ + Internal function to convert object array. + + Parameters + ---------- + content: List[np.ndarray] + dtype: np.dtype or ExtensionDtype + dtype_backend: Controls if nullable/pyarrow dtypes are returned. + coerce_float: Cast floats that are integers to int. + + Returns + ------- + List[ArrayLike] + """ + # provide soft conversion of object dtypes + + def convert(arr): + if dtype != np.dtype("O"): + arr = lib.maybe_convert_objects( + arr, + try_float=coerce_float, + convert_to_nullable_dtype=dtype_backend != "numpy", + ) + # Notes on cases that get here 2023-02-15 + # 1) we DO get here when arr is all Timestamps and dtype=None + # 2) disabling this doesn't break the world, so this must be + # getting caught at a higher level + # 3) passing convert_non_numeric to maybe_convert_objects get this right + # 4) convert_non_numeric? + + if dtype is None: + if arr.dtype == np.dtype("O"): + # i.e. maybe_convert_objects didn't convert + arr = maybe_infer_to_datetimelike(arr) + if dtype_backend != "numpy" and arr.dtype == np.dtype("O"): + new_dtype = StringDtype() + arr_cls = new_dtype.construct_array_type() + arr = arr_cls._from_sequence(arr, dtype=new_dtype) + elif dtype_backend != "numpy" and isinstance(arr, np.ndarray): + if arr.dtype.kind in "iufb": + arr = pd_array(arr, copy=False) + + elif isinstance(dtype, ExtensionDtype): + # TODO: test(s) that get here + # TODO: try to de-duplicate this convert function with + # core.construction functions + cls = dtype.construct_array_type() + arr = cls._from_sequence(arr, dtype=dtype, copy=False) + elif dtype.kind in "mM": + # This restriction is harmless bc these are the only cases + # where maybe_cast_to_datetime is not a no-op. + # Here we know: + # 1) dtype.kind in "mM" and + # 2) arr is either object or numeric dtype + arr = maybe_cast_to_datetime(arr, dtype) + + return arr + + arrays = [convert(arr) for arr in content] + + return arrays diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/internals/managers.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/internals/managers.py new file mode 100644 index 0000000000000000000000000000000000000000..2e0e04717373fbe80490990d929c35130a7c733a --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/internals/managers.py @@ -0,0 +1,2375 @@ +from __future__ import annotations + +from collections.abc import ( + Hashable, + Sequence, +) +import itertools +from typing import ( + TYPE_CHECKING, + Callable, + Literal, + cast, +) +import warnings + +import numpy as np + +from pandas._config import ( + using_copy_on_write, + warn_copy_on_write, +) + +from pandas._libs import ( + internals as libinternals, + lib, +) +from pandas._libs.internals import ( + BlockPlacement, + BlockValuesRefs, +) +from pandas._libs.tslibs import Timestamp +from pandas.errors import PerformanceWarning +from pandas.util._decorators import cache_readonly +from pandas.util._exceptions import find_stack_level + +from pandas.core.dtypes.cast import infer_dtype_from_scalar +from pandas.core.dtypes.common import ( + ensure_platform_int, + is_1d_only_ea_dtype, + is_list_like, +) +from pandas.core.dtypes.dtypes import ( + DatetimeTZDtype, + ExtensionDtype, +) +from pandas.core.dtypes.generic import ( + ABCDataFrame, + ABCSeries, +) +from pandas.core.dtypes.missing import ( + array_equals, + isna, +) + +import pandas.core.algorithms as algos +from pandas.core.arrays import ( + ArrowExtensionArray, + ArrowStringArray, + DatetimeArray, +) +from pandas.core.arrays._mixins import NDArrayBackedExtensionArray +from pandas.core.construction import ( + ensure_wrapped_if_datetimelike, + extract_array, +) +from pandas.core.indexers import maybe_convert_indices +from pandas.core.indexes.api import ( + Index, + ensure_index, +) +from pandas.core.internals.base import ( + DataManager, + SingleDataManager, + ensure_np_dtype, + interleaved_dtype, +) +from pandas.core.internals.blocks import ( + COW_WARNING_GENERAL_MSG, + COW_WARNING_SETITEM_MSG, + Block, + NumpyBlock, + ensure_block_shape, + extend_blocks, + get_block_type, + maybe_coerce_values, + new_block, + new_block_2d, +) +from pandas.core.internals.ops import ( + blockwise_all, + operate_blockwise, +) + +if TYPE_CHECKING: + from pandas._typing import ( + ArrayLike, + AxisInt, + DtypeObj, + QuantileInterpolation, + Self, + Shape, + npt, + ) + + from pandas.api.extensions import ExtensionArray + + +class BaseBlockManager(DataManager): + """ + Core internal data structure to implement DataFrame, Series, etc. + + Manage a bunch of labeled 2D mixed-type ndarrays. Essentially it's a + lightweight blocked set of labeled data to be manipulated by the DataFrame + public API class + + Attributes + ---------- + shape + ndim + axes + values + items + + Methods + ------- + set_axis(axis, new_labels) + copy(deep=True) + + get_dtypes + + apply(func, axes, block_filter_fn) + + get_bool_data + get_numeric_data + + get_slice(slice_like, axis) + get(label) + iget(loc) + + take(indexer, axis) + reindex_axis(new_labels, axis) + reindex_indexer(new_labels, indexer, axis) + + delete(label) + insert(loc, label, value) + set(label, value) + + Parameters + ---------- + blocks: Sequence of Block + axes: Sequence of Index + verify_integrity: bool, default True + + Notes + ----- + This is *not* a public API class + """ + + __slots__ = () + + _blknos: npt.NDArray[np.intp] + _blklocs: npt.NDArray[np.intp] + blocks: tuple[Block, ...] + axes: list[Index] + + @property + def ndim(self) -> int: + raise NotImplementedError + + _known_consolidated: bool + _is_consolidated: bool + + def __init__(self, blocks, axes, verify_integrity: bool = True) -> None: + raise NotImplementedError + + @classmethod + def from_blocks(cls, blocks: list[Block], axes: list[Index]) -> Self: + raise NotImplementedError + + @property + def blknos(self) -> npt.NDArray[np.intp]: + """ + Suppose we want to find the array corresponding to our i'th column. + + blknos[i] identifies the block from self.blocks that contains this column. + + blklocs[i] identifies the column of interest within + self.blocks[self.blknos[i]] + """ + if self._blknos is None: + # Note: these can be altered by other BlockManager methods. + self._rebuild_blknos_and_blklocs() + + return self._blknos + + @property + def blklocs(self) -> npt.NDArray[np.intp]: + """ + See blknos.__doc__ + """ + if self._blklocs is None: + # Note: these can be altered by other BlockManager methods. + self._rebuild_blknos_and_blklocs() + + return self._blklocs + + def make_empty(self, axes=None) -> Self: + """return an empty BlockManager with the items axis of len 0""" + if axes is None: + axes = [Index([])] + self.axes[1:] + + # preserve dtype if possible + if self.ndim == 1: + assert isinstance(self, SingleBlockManager) # for mypy + blk = self.blocks[0] + arr = blk.values[:0] + bp = BlockPlacement(slice(0, 0)) + nb = blk.make_block_same_class(arr, placement=bp) + blocks = [nb] + else: + blocks = [] + return type(self).from_blocks(blocks, axes) + + def __nonzero__(self) -> bool: + return True + + # Python3 compat + __bool__ = __nonzero__ + + def _normalize_axis(self, axis: AxisInt) -> int: + # switch axis to follow BlockManager logic + if self.ndim == 2: + axis = 1 if axis == 0 else 0 + return axis + + def set_axis(self, axis: AxisInt, new_labels: Index) -> None: + # Caller is responsible for ensuring we have an Index object. + self._validate_set_axis(axis, new_labels) + self.axes[axis] = new_labels + + @property + def is_single_block(self) -> bool: + # Assumes we are 2D; overridden by SingleBlockManager + return len(self.blocks) == 1 + + @property + def items(self) -> Index: + return self.axes[0] + + def _has_no_reference(self, i: int) -> bool: + """ + Check for column `i` if it has references. + (whether it references another array or is itself being referenced) + Returns True if the column has no references. + """ + blkno = self.blknos[i] + return self._has_no_reference_block(blkno) + + def _has_no_reference_block(self, blkno: int) -> bool: + """ + Check for block `i` if it has references. + (whether it references another array or is itself being referenced) + Returns True if the block has no references. + """ + return not self.blocks[blkno].refs.has_reference() + + def add_references(self, mgr: BaseBlockManager) -> None: + """ + Adds the references from one manager to another. We assume that both + managers have the same block structure. + """ + if len(self.blocks) != len(mgr.blocks): + # If block structure changes, then we made a copy + return + for i, blk in enumerate(self.blocks): + blk.refs = mgr.blocks[i].refs + blk.refs.add_reference(blk) + + def references_same_values(self, mgr: BaseBlockManager, blkno: int) -> bool: + """ + Checks if two blocks from two different block managers reference the + same underlying values. + """ + blk = self.blocks[blkno] + return any(blk is ref() for ref in mgr.blocks[blkno].refs.referenced_blocks) + + def get_dtypes(self) -> npt.NDArray[np.object_]: + dtypes = np.array([blk.dtype for blk in self.blocks], dtype=object) + return dtypes.take(self.blknos) + + @property + def arrays(self) -> list[ArrayLike]: + """ + Quick access to the backing arrays of the Blocks. + + Only for compatibility with ArrayManager for testing convenience. + Not to be used in actual code, and return value is not the same as the + ArrayManager method (list of 1D arrays vs iterator of 2D ndarrays / 1D EAs). + + Warning! The returned arrays don't handle Copy-on-Write, so this should + be used with caution (only in read-mode). + """ + return [blk.values for blk in self.blocks] + + def __repr__(self) -> str: + output = type(self).__name__ + for i, ax in enumerate(self.axes): + if i == 0: + output += f"\nItems: {ax}" + else: + output += f"\nAxis {i}: {ax}" + + for block in self.blocks: + output += f"\n{block}" + return output + + def apply( + self, + f, + align_keys: list[str] | None = None, + **kwargs, + ) -> Self: + """ + Iterate over the blocks, collect and create a new BlockManager. + + Parameters + ---------- + f : str or callable + Name of the Block method to apply. + align_keys: List[str] or None, default None + **kwargs + Keywords to pass to `f` + + Returns + ------- + BlockManager + """ + assert "filter" not in kwargs + + align_keys = align_keys or [] + result_blocks: list[Block] = [] + # fillna: Series/DataFrame is responsible for making sure value is aligned + + aligned_args = {k: kwargs[k] for k in align_keys} + + for b in self.blocks: + if aligned_args: + for k, obj in aligned_args.items(): + if isinstance(obj, (ABCSeries, ABCDataFrame)): + # The caller is responsible for ensuring that + # obj.axes[-1].equals(self.items) + if obj.ndim == 1: + kwargs[k] = obj.iloc[b.mgr_locs.indexer]._values + else: + kwargs[k] = obj.iloc[:, b.mgr_locs.indexer]._values + else: + # otherwise we have an ndarray + kwargs[k] = obj[b.mgr_locs.indexer] + + if callable(f): + applied = b.apply(f, **kwargs) + else: + applied = getattr(b, f)(**kwargs) + result_blocks = extend_blocks(applied, result_blocks) + + out = type(self).from_blocks(result_blocks, self.axes) + return out + + # Alias so we can share code with ArrayManager + apply_with_block = apply + + def setitem(self, indexer, value, warn: bool = True) -> Self: + """ + Set values with indexer. + + For SingleBlockManager, this backs s[indexer] = value + """ + if isinstance(indexer, np.ndarray) and indexer.ndim > self.ndim: + raise ValueError(f"Cannot set values with ndim > {self.ndim}") + + if warn and warn_copy_on_write() and not self._has_no_reference(0): + warnings.warn( + COW_WARNING_GENERAL_MSG, + FutureWarning, + stacklevel=find_stack_level(), + ) + + elif using_copy_on_write() and not self._has_no_reference(0): + # this method is only called if there is a single block -> hardcoded 0 + # Split blocks to only copy the columns we want to modify + if self.ndim == 2 and isinstance(indexer, tuple): + blk_loc = self.blklocs[indexer[1]] + if is_list_like(blk_loc) and blk_loc.ndim == 2: + blk_loc = np.squeeze(blk_loc, axis=0) + elif not is_list_like(blk_loc): + # Keep dimension and copy data later + blk_loc = [blk_loc] # type: ignore[assignment] + if len(blk_loc) == 0: + return self.copy(deep=False) + + values = self.blocks[0].values + if values.ndim == 2: + values = values[blk_loc] + # "T" has no attribute "_iset_split_block" + self._iset_split_block( # type: ignore[attr-defined] + 0, blk_loc, values + ) + # first block equals values + self.blocks[0].setitem((indexer[0], np.arange(len(blk_loc))), value) + return self + # No need to split if we either set all columns or on a single block + # manager + self = self.copy() + + return self.apply("setitem", indexer=indexer, value=value) + + def diff(self, n: int) -> Self: + # only reached with self.ndim == 2 + return self.apply("diff", n=n) + + def astype(self, dtype, copy: bool | None = False, errors: str = "raise") -> Self: + if copy is None: + if using_copy_on_write(): + copy = False + else: + copy = True + elif using_copy_on_write(): + copy = False + + return self.apply( + "astype", + dtype=dtype, + copy=copy, + errors=errors, + using_cow=using_copy_on_write(), + ) + + def convert(self, copy: bool | None) -> Self: + if copy is None: + if using_copy_on_write(): + copy = False + else: + copy = True + elif using_copy_on_write(): + copy = False + + return self.apply("convert", copy=copy, using_cow=using_copy_on_write()) + + def convert_dtypes(self, **kwargs): + if using_copy_on_write(): + copy = False + else: + copy = True + + return self.apply( + "convert_dtypes", copy=copy, using_cow=using_copy_on_write(), **kwargs + ) + + def get_values_for_csv( + self, *, float_format, date_format, decimal, na_rep: str = "nan", quoting=None + ) -> Self: + """ + Convert values to native types (strings / python objects) that are used + in formatting (repr / csv). + """ + return self.apply( + "get_values_for_csv", + na_rep=na_rep, + quoting=quoting, + float_format=float_format, + date_format=date_format, + decimal=decimal, + ) + + @property + def any_extension_types(self) -> bool: + """Whether any of the blocks in this manager are extension blocks""" + return any(block.is_extension for block in self.blocks) + + @property + def is_view(self) -> bool: + """return a boolean if we are a single block and are a view""" + if len(self.blocks) == 1: + return self.blocks[0].is_view + + # It is technically possible to figure out which blocks are views + # e.g. [ b.values.base is not None for b in self.blocks ] + # but then we have the case of possibly some blocks being a view + # and some blocks not. setting in theory is possible on the non-view + # blocks w/o causing a SettingWithCopy raise/warn. But this is a bit + # complicated + + return False + + def _get_data_subset(self, predicate: Callable) -> Self: + blocks = [blk for blk in self.blocks if predicate(blk.values)] + return self._combine(blocks) + + def get_bool_data(self) -> Self: + """ + Select blocks that are bool-dtype and columns from object-dtype blocks + that are all-bool. + """ + + new_blocks = [] + + for blk in self.blocks: + if blk.dtype == bool: + new_blocks.append(blk) + + elif blk.is_object: + nbs = blk._split() + new_blocks.extend(nb for nb in nbs if nb.is_bool) + + return self._combine(new_blocks) + + def get_numeric_data(self) -> Self: + numeric_blocks = [blk for blk in self.blocks if blk.is_numeric] + if len(numeric_blocks) == len(self.blocks): + # Avoid somewhat expensive _combine + return self + return self._combine(numeric_blocks) + + def _combine(self, blocks: list[Block], index: Index | None = None) -> Self: + """return a new manager with the blocks""" + if len(blocks) == 0: + if self.ndim == 2: + # retain our own Index dtype + if index is not None: + axes = [self.items[:0], index] + else: + axes = [self.items[:0]] + self.axes[1:] + return self.make_empty(axes) + return self.make_empty() + + # FIXME: optimization potential + indexer = np.sort(np.concatenate([b.mgr_locs.as_array for b in blocks])) + inv_indexer = lib.get_reverse_indexer(indexer, self.shape[0]) + + new_blocks: list[Block] = [] + for b in blocks: + nb = b.copy(deep=False) + nb.mgr_locs = BlockPlacement(inv_indexer[nb.mgr_locs.indexer]) + new_blocks.append(nb) + + axes = list(self.axes) + if index is not None: + axes[-1] = index + axes[0] = self.items.take(indexer) + + return type(self).from_blocks(new_blocks, axes) + + @property + def nblocks(self) -> int: + return len(self.blocks) + + def copy(self, deep: bool | None | Literal["all"] = True) -> Self: + """ + Make deep or shallow copy of BlockManager + + Parameters + ---------- + deep : bool, string or None, default True + If False or None, return a shallow copy (do not copy data) + If 'all', copy data and a deep copy of the index + + Returns + ------- + BlockManager + """ + if deep is None: + if using_copy_on_write(): + # use shallow copy + deep = False + else: + # preserve deep copy for BlockManager with copy=None + deep = True + + # this preserves the notion of view copying of axes + if deep: + # hit in e.g. tests.io.json.test_pandas + + def copy_func(ax): + return ax.copy(deep=True) if deep == "all" else ax.view() + + new_axes = [copy_func(ax) for ax in self.axes] + else: + if using_copy_on_write(): + new_axes = [ax.view() for ax in self.axes] + else: + new_axes = list(self.axes) + + res = self.apply("copy", deep=deep) + res.axes = new_axes + + if self.ndim > 1: + # Avoid needing to re-compute these + blknos = self._blknos + if blknos is not None: + res._blknos = blknos.copy() + res._blklocs = self._blklocs.copy() + + if deep: + res._consolidate_inplace() + return res + + def consolidate(self) -> Self: + """ + Join together blocks having same dtype + + Returns + ------- + y : BlockManager + """ + if self.is_consolidated(): + return self + + bm = type(self)(self.blocks, self.axes, verify_integrity=False) + bm._is_consolidated = False + bm._consolidate_inplace() + return bm + + def reindex_indexer( + self, + new_axis: Index, + indexer: npt.NDArray[np.intp] | None, + axis: AxisInt, + fill_value=None, + allow_dups: bool = False, + copy: bool | None = True, + only_slice: bool = False, + *, + use_na_proxy: bool = False, + ) -> Self: + """ + Parameters + ---------- + new_axis : Index + indexer : ndarray[intp] or None + axis : int + fill_value : object, default None + allow_dups : bool, default False + copy : bool or None, default True + If None, regard as False to get shallow copy. + only_slice : bool, default False + Whether to take views, not copies, along columns. + use_na_proxy : bool, default False + Whether to use a np.void ndarray for newly introduced columns. + + pandas-indexer with -1's only. + """ + if copy is None: + if using_copy_on_write(): + # use shallow copy + copy = False + else: + # preserve deep copy for BlockManager with copy=None + copy = True + + if indexer is None: + if new_axis is self.axes[axis] and not copy: + return self + + result = self.copy(deep=copy) + result.axes = list(self.axes) + result.axes[axis] = new_axis + return result + + # Should be intp, but in some cases we get int64 on 32bit builds + assert isinstance(indexer, np.ndarray) + + # some axes don't allow reindexing with dups + if not allow_dups: + self.axes[axis]._validate_can_reindex(indexer) + + if axis >= self.ndim: + raise IndexError("Requested axis not found in manager") + + if axis == 0: + new_blocks = self._slice_take_blocks_ax0( + indexer, + fill_value=fill_value, + only_slice=only_slice, + use_na_proxy=use_na_proxy, + ) + else: + new_blocks = [ + blk.take_nd( + indexer, + axis=1, + fill_value=( + fill_value if fill_value is not None else blk.fill_value + ), + ) + for blk in self.blocks + ] + + new_axes = list(self.axes) + new_axes[axis] = new_axis + + new_mgr = type(self).from_blocks(new_blocks, new_axes) + if axis == 1: + # We can avoid the need to rebuild these + new_mgr._blknos = self.blknos.copy() + new_mgr._blklocs = self.blklocs.copy() + return new_mgr + + def _slice_take_blocks_ax0( + self, + slice_or_indexer: slice | np.ndarray, + fill_value=lib.no_default, + only_slice: bool = False, + *, + use_na_proxy: bool = False, + ref_inplace_op: bool = False, + ) -> list[Block]: + """ + Slice/take blocks along axis=0. + + Overloaded for SingleBlock + + Parameters + ---------- + slice_or_indexer : slice or np.ndarray[int64] + fill_value : scalar, default lib.no_default + only_slice : bool, default False + If True, we always return views on existing arrays, never copies. + This is used when called from ops.blockwise.operate_blockwise. + use_na_proxy : bool, default False + Whether to use a np.void ndarray for newly introduced columns. + ref_inplace_op: bool, default False + Don't track refs if True because we operate inplace + + Returns + ------- + new_blocks : list of Block + """ + allow_fill = fill_value is not lib.no_default + + sl_type, slobj, sllen = _preprocess_slice_or_indexer( + slice_or_indexer, self.shape[0], allow_fill=allow_fill + ) + + if self.is_single_block: + blk = self.blocks[0] + + if sl_type == "slice": + # GH#32959 EABlock would fail since we can't make 0-width + # TODO(EA2D): special casing unnecessary with 2D EAs + if sllen == 0: + return [] + bp = BlockPlacement(slice(0, sllen)) + return [blk.getitem_block_columns(slobj, new_mgr_locs=bp)] + elif not allow_fill or self.ndim == 1: + if allow_fill and fill_value is None: + fill_value = blk.fill_value + + if not allow_fill and only_slice: + # GH#33597 slice instead of take, so we get + # views instead of copies + blocks = [ + blk.getitem_block_columns( + slice(ml, ml + 1), + new_mgr_locs=BlockPlacement(i), + ref_inplace_op=ref_inplace_op, + ) + for i, ml in enumerate(slobj) + ] + return blocks + else: + bp = BlockPlacement(slice(0, sllen)) + return [ + blk.take_nd( + slobj, + axis=0, + new_mgr_locs=bp, + fill_value=fill_value, + ) + ] + + if sl_type == "slice": + blknos = self.blknos[slobj] + blklocs = self.blklocs[slobj] + else: + blknos = algos.take_nd( + self.blknos, slobj, fill_value=-1, allow_fill=allow_fill + ) + blklocs = algos.take_nd( + self.blklocs, slobj, fill_value=-1, allow_fill=allow_fill + ) + + # When filling blknos, make sure blknos is updated before appending to + # blocks list, that way new blkno is exactly len(blocks). + blocks = [] + group = not only_slice + for blkno, mgr_locs in libinternals.get_blkno_placements(blknos, group=group): + if blkno == -1: + # If we've got here, fill_value was not lib.no_default + + blocks.append( + self._make_na_block( + placement=mgr_locs, + fill_value=fill_value, + use_na_proxy=use_na_proxy, + ) + ) + else: + blk = self.blocks[blkno] + + # Otherwise, slicing along items axis is necessary. + if not blk._can_consolidate and not blk._validate_ndim: + # i.e. we dont go through here for DatetimeTZBlock + # A non-consolidatable block, it's easy, because there's + # only one item and each mgr loc is a copy of that single + # item. + deep = not (only_slice or using_copy_on_write()) + for mgr_loc in mgr_locs: + newblk = blk.copy(deep=deep) + newblk.mgr_locs = BlockPlacement(slice(mgr_loc, mgr_loc + 1)) + blocks.append(newblk) + + else: + # GH#32779 to avoid the performance penalty of copying, + # we may try to only slice + taker = blklocs[mgr_locs.indexer] + max_len = max(len(mgr_locs), taker.max() + 1) + if only_slice or using_copy_on_write(): + taker = lib.maybe_indices_to_slice(taker, max_len) + + if isinstance(taker, slice): + nb = blk.getitem_block_columns(taker, new_mgr_locs=mgr_locs) + blocks.append(nb) + elif only_slice: + # GH#33597 slice instead of take, so we get + # views instead of copies + for i, ml in zip(taker, mgr_locs): + slc = slice(i, i + 1) + bp = BlockPlacement(ml) + nb = blk.getitem_block_columns(slc, new_mgr_locs=bp) + # We have np.shares_memory(nb.values, blk.values) + blocks.append(nb) + else: + nb = blk.take_nd(taker, axis=0, new_mgr_locs=mgr_locs) + blocks.append(nb) + + return blocks + + def _make_na_block( + self, placement: BlockPlacement, fill_value=None, use_na_proxy: bool = False + ) -> Block: + # Note: we only get here with self.ndim == 2 + + if use_na_proxy: + assert fill_value is None + shape = (len(placement), self.shape[1]) + vals = np.empty(shape, dtype=np.void) + nb = NumpyBlock(vals, placement, ndim=2) + return nb + + if fill_value is None: + fill_value = np.nan + + shape = (len(placement), self.shape[1]) + + dtype, fill_value = infer_dtype_from_scalar(fill_value) + block_values = make_na_array(dtype, shape, fill_value) + return new_block_2d(block_values, placement=placement) + + def take( + self, + indexer: npt.NDArray[np.intp], + axis: AxisInt = 1, + verify: bool = True, + ) -> Self: + """ + Take items along any axis. + + indexer : np.ndarray[np.intp] + axis : int, default 1 + verify : bool, default True + Check that all entries are between 0 and len(self) - 1, inclusive. + Pass verify=False if this check has been done by the caller. + + Returns + ------- + BlockManager + """ + # Caller is responsible for ensuring indexer annotation is accurate + + n = self.shape[axis] + indexer = maybe_convert_indices(indexer, n, verify=verify) + + new_labels = self.axes[axis].take(indexer) + return self.reindex_indexer( + new_axis=new_labels, + indexer=indexer, + axis=axis, + allow_dups=True, + copy=None, + ) + + +class BlockManager(libinternals.BlockManager, BaseBlockManager): + """ + BaseBlockManager that holds 2D blocks. + """ + + ndim = 2 + + # ---------------------------------------------------------------- + # Constructors + + def __init__( + self, + blocks: Sequence[Block], + axes: Sequence[Index], + verify_integrity: bool = True, + ) -> None: + if verify_integrity: + # Assertion disabled for performance + # assert all(isinstance(x, Index) for x in axes) + + for block in blocks: + if self.ndim != block.ndim: + raise AssertionError( + f"Number of Block dimensions ({block.ndim}) must equal " + f"number of axes ({self.ndim})" + ) + # As of 2.0, the caller is responsible for ensuring that + # DatetimeTZBlock with block.ndim == 2 has block.values.ndim ==2; + # previously there was a special check for fastparquet compat. + + self._verify_integrity() + + def _verify_integrity(self) -> None: + mgr_shape = self.shape + tot_items = sum(len(x.mgr_locs) for x in self.blocks) + for block in self.blocks: + if block.shape[1:] != mgr_shape[1:]: + raise_construction_error(tot_items, block.shape[1:], self.axes) + if len(self.items) != tot_items: + raise AssertionError( + "Number of manager items must equal union of " + f"block items\n# manager items: {len(self.items)}, # " + f"tot_items: {tot_items}" + ) + + @classmethod + def from_blocks(cls, blocks: list[Block], axes: list[Index]) -> Self: + """ + Constructor for BlockManager and SingleBlockManager with same signature. + """ + return cls(blocks, axes, verify_integrity=False) + + # ---------------------------------------------------------------- + # Indexing + + def fast_xs(self, loc: int) -> SingleBlockManager: + """ + Return the array corresponding to `frame.iloc[loc]`. + + Parameters + ---------- + loc : int + + Returns + ------- + np.ndarray or ExtensionArray + """ + if len(self.blocks) == 1: + # TODO: this could be wrong if blk.mgr_locs is not slice(None)-like; + # is this ruled out in the general case? + result = self.blocks[0].iget((slice(None), loc)) + # in the case of a single block, the new block is a view + bp = BlockPlacement(slice(0, len(result))) + block = new_block( + result, + placement=bp, + ndim=1, + refs=self.blocks[0].refs, + ) + return SingleBlockManager(block, self.axes[0]) + + dtype = interleaved_dtype([blk.dtype for blk in self.blocks]) + + n = len(self) + + if isinstance(dtype, ExtensionDtype): + # TODO: use object dtype as workaround for non-performant + # EA.__setitem__ methods. (primarily ArrowExtensionArray.__setitem__ + # when iteratively setting individual values) + # https://github.com/pandas-dev/pandas/pull/54508#issuecomment-1675827918 + result = np.empty(n, dtype=object) + else: + result = np.empty(n, dtype=dtype) + result = ensure_wrapped_if_datetimelike(result) + + for blk in self.blocks: + # Such assignment may incorrectly coerce NaT to None + # result[blk.mgr_locs] = blk._slice((slice(None), loc)) + for i, rl in enumerate(blk.mgr_locs): + result[rl] = blk.iget((i, loc)) + + if isinstance(dtype, ExtensionDtype): + cls = dtype.construct_array_type() + result = cls._from_sequence(result, dtype=dtype) + + bp = BlockPlacement(slice(0, len(result))) + block = new_block(result, placement=bp, ndim=1) + return SingleBlockManager(block, self.axes[0]) + + def iget(self, i: int, track_ref: bool = True) -> SingleBlockManager: + """ + Return the data as a SingleBlockManager. + """ + block = self.blocks[self.blknos[i]] + values = block.iget(self.blklocs[i]) + + # shortcut for select a single-dim from a 2-dim BM + bp = BlockPlacement(slice(0, len(values))) + nb = type(block)( + values, placement=bp, ndim=1, refs=block.refs if track_ref else None + ) + return SingleBlockManager(nb, self.axes[1]) + + def iget_values(self, i: int) -> ArrayLike: + """ + Return the data for column i as the values (ndarray or ExtensionArray). + + Warning! The returned array is a view but doesn't handle Copy-on-Write, + so this should be used with caution. + """ + # TODO(CoW) making the arrays read-only might make this safer to use? + block = self.blocks[self.blknos[i]] + values = block.iget(self.blklocs[i]) + return values + + @property + def column_arrays(self) -> list[np.ndarray]: + """ + Used in the JSON C code to access column arrays. + This optimizes compared to using `iget_values` by converting each + + Warning! This doesn't handle Copy-on-Write, so should be used with + caution (current use case of consuming this in the JSON code is fine). + """ + # This is an optimized equivalent to + # result = [self.iget_values(i) for i in range(len(self.items))] + result: list[np.ndarray | None] = [None] * len(self.items) + + for blk in self.blocks: + mgr_locs = blk._mgr_locs + values = blk.array_values._values_for_json() + if values.ndim == 1: + # TODO(EA2D): special casing not needed with 2D EAs + result[mgr_locs[0]] = values + + else: + for i, loc in enumerate(mgr_locs): + result[loc] = values[i] + + # error: Incompatible return value type (got "List[None]", + # expected "List[ndarray[Any, Any]]") + return result # type: ignore[return-value] + + def iset( + self, + loc: int | slice | np.ndarray, + value: ArrayLike, + inplace: bool = False, + refs: BlockValuesRefs | None = None, + ) -> None: + """ + Set new item in-place. Does not consolidate. Adds new Block if not + contained in the current set of items + """ + + # FIXME: refactor, clearly separate broadcasting & zip-like assignment + # can prob also fix the various if tests for sparse/categorical + if self._blklocs is None and self.ndim > 1: + self._rebuild_blknos_and_blklocs() + + # Note: we exclude DTA/TDA here + value_is_extension_type = is_1d_only_ea_dtype(value.dtype) + if not value_is_extension_type: + if value.ndim == 2: + value = value.T + else: + value = ensure_block_shape(value, ndim=2) + + if value.shape[1:] != self.shape[1:]: + raise AssertionError( + "Shape of new values must be compatible with manager shape" + ) + + if lib.is_integer(loc): + # We have 6 tests where loc is _not_ an int. + # In this case, get_blkno_placements will yield only one tuple, + # containing (self._blknos[loc], BlockPlacement(slice(0, 1, 1))) + + # Check if we can use _iset_single fastpath + loc = cast(int, loc) + blkno = self.blknos[loc] + blk = self.blocks[blkno] + if len(blk._mgr_locs) == 1: # TODO: fastest way to check this? + return self._iset_single( + loc, + value, + inplace=inplace, + blkno=blkno, + blk=blk, + refs=refs, + ) + + # error: Incompatible types in assignment (expression has type + # "List[Union[int, slice, ndarray]]", variable has type "Union[int, + # slice, ndarray]") + loc = [loc] # type: ignore[assignment] + + # categorical/sparse/datetimetz + if value_is_extension_type: + + def value_getitem(placement): + return value + + else: + + def value_getitem(placement): + return value[placement.indexer] + + # Accessing public blknos ensures the public versions are initialized + blknos = self.blknos[loc] + blklocs = self.blklocs[loc].copy() + + unfit_mgr_locs = [] + unfit_val_locs = [] + removed_blknos = [] + for blkno_l, val_locs in libinternals.get_blkno_placements(blknos, group=True): + blk = self.blocks[blkno_l] + blk_locs = blklocs[val_locs.indexer] + if inplace and blk.should_store(value): + # Updating inplace -> check if we need to do Copy-on-Write + if using_copy_on_write() and not self._has_no_reference_block(blkno_l): + self._iset_split_block( + blkno_l, blk_locs, value_getitem(val_locs), refs=refs + ) + else: + blk.set_inplace(blk_locs, value_getitem(val_locs)) + continue + else: + unfit_mgr_locs.append(blk.mgr_locs.as_array[blk_locs]) + unfit_val_locs.append(val_locs) + + # If all block items are unfit, schedule the block for removal. + if len(val_locs) == len(blk.mgr_locs): + removed_blknos.append(blkno_l) + continue + else: + # Defer setting the new values to enable consolidation + self._iset_split_block(blkno_l, blk_locs, refs=refs) + + if len(removed_blknos): + # Remove blocks & update blknos accordingly + is_deleted = np.zeros(self.nblocks, dtype=np.bool_) + is_deleted[removed_blknos] = True + + new_blknos = np.empty(self.nblocks, dtype=np.intp) + new_blknos.fill(-1) + new_blknos[~is_deleted] = np.arange(self.nblocks - len(removed_blknos)) + self._blknos = new_blknos[self._blknos] + self.blocks = tuple( + blk for i, blk in enumerate(self.blocks) if i not in set(removed_blknos) + ) + + if unfit_val_locs: + unfit_idxr = np.concatenate(unfit_mgr_locs) + unfit_count = len(unfit_idxr) + + new_blocks: list[Block] = [] + if value_is_extension_type: + # This code (ab-)uses the fact that EA blocks contain only + # one item. + # TODO(EA2D): special casing unnecessary with 2D EAs + new_blocks.extend( + new_block_2d( + values=value, + placement=BlockPlacement(slice(mgr_loc, mgr_loc + 1)), + refs=refs, + ) + for mgr_loc in unfit_idxr + ) + + self._blknos[unfit_idxr] = np.arange(unfit_count) + len(self.blocks) + self._blklocs[unfit_idxr] = 0 + + else: + # unfit_val_locs contains BlockPlacement objects + unfit_val_items = unfit_val_locs[0].append(unfit_val_locs[1:]) + + new_blocks.append( + new_block_2d( + values=value_getitem(unfit_val_items), + placement=BlockPlacement(unfit_idxr), + refs=refs, + ) + ) + + self._blknos[unfit_idxr] = len(self.blocks) + self._blklocs[unfit_idxr] = np.arange(unfit_count) + + self.blocks += tuple(new_blocks) + + # Newly created block's dtype may already be present. + self._known_consolidated = False + + def _iset_split_block( + self, + blkno_l: int, + blk_locs: np.ndarray | list[int], + value: ArrayLike | None = None, + refs: BlockValuesRefs | None = None, + ) -> None: + """Removes columns from a block by splitting the block. + + Avoids copying the whole block through slicing and updates the manager + after determinint the new block structure. Optionally adds a new block, + otherwise has to be done by the caller. + + Parameters + ---------- + blkno_l: The block number to operate on, relevant for updating the manager + blk_locs: The locations of our block that should be deleted. + value: The value to set as a replacement. + refs: The reference tracking object of the value to set. + """ + blk = self.blocks[blkno_l] + + if self._blklocs is None: + self._rebuild_blknos_and_blklocs() + + nbs_tup = tuple(blk.delete(blk_locs)) + if value is not None: + locs = blk.mgr_locs.as_array[blk_locs] + first_nb = new_block_2d(value, BlockPlacement(locs), refs=refs) + else: + first_nb = nbs_tup[0] + nbs_tup = tuple(nbs_tup[1:]) + + nr_blocks = len(self.blocks) + blocks_tup = ( + self.blocks[:blkno_l] + (first_nb,) + self.blocks[blkno_l + 1 :] + nbs_tup + ) + self.blocks = blocks_tup + + if not nbs_tup and value is not None: + # No need to update anything if split did not happen + return + + self._blklocs[first_nb.mgr_locs.indexer] = np.arange(len(first_nb)) + + for i, nb in enumerate(nbs_tup): + self._blklocs[nb.mgr_locs.indexer] = np.arange(len(nb)) + self._blknos[nb.mgr_locs.indexer] = i + nr_blocks + + def _iset_single( + self, + loc: int, + value: ArrayLike, + inplace: bool, + blkno: int, + blk: Block, + refs: BlockValuesRefs | None = None, + ) -> None: + """ + Fastpath for iset when we are only setting a single position and + the Block currently in that position is itself single-column. + + In this case we can swap out the entire Block and blklocs and blknos + are unaffected. + """ + # Caller is responsible for verifying value.shape + + if inplace and blk.should_store(value): + copy = False + if using_copy_on_write() and not self._has_no_reference_block(blkno): + # perform Copy-on-Write and clear the reference + copy = True + iloc = self.blklocs[loc] + blk.set_inplace(slice(iloc, iloc + 1), value, copy=copy) + return + + nb = new_block_2d(value, placement=blk._mgr_locs, refs=refs) + old_blocks = self.blocks + new_blocks = old_blocks[:blkno] + (nb,) + old_blocks[blkno + 1 :] + self.blocks = new_blocks + return + + def column_setitem( + self, loc: int, idx: int | slice | np.ndarray, value, inplace_only: bool = False + ) -> None: + """ + Set values ("setitem") into a single column (not setting the full column). + + This is a method on the BlockManager level, to avoid creating an + intermediate Series at the DataFrame level (`s = df[loc]; s[idx] = value`) + """ + needs_to_warn = False + if warn_copy_on_write() and not self._has_no_reference(loc): + if not isinstance( + self.blocks[self.blknos[loc]].values, + (ArrowExtensionArray, ArrowStringArray), + ): + # We might raise if we are in an expansion case, so defer + # warning till we actually updated + needs_to_warn = True + + elif using_copy_on_write() and not self._has_no_reference(loc): + blkno = self.blknos[loc] + # Split blocks to only copy the column we want to modify + blk_loc = self.blklocs[loc] + # Copy our values + values = self.blocks[blkno].values + if values.ndim == 1: + values = values.copy() + else: + # Use [blk_loc] as indexer to keep ndim=2, this already results in a + # copy + values = values[[blk_loc]] + self._iset_split_block(blkno, [blk_loc], values) + + # this manager is only created temporarily to mutate the values in place + # so don't track references, otherwise the `setitem` would perform CoW again + col_mgr = self.iget(loc, track_ref=False) + if inplace_only: + col_mgr.setitem_inplace(idx, value) + else: + new_mgr = col_mgr.setitem((idx,), value) + self.iset(loc, new_mgr._block.values, inplace=True) + + if needs_to_warn: + warnings.warn( + COW_WARNING_GENERAL_MSG, + FutureWarning, + stacklevel=find_stack_level(), + ) + + def insert(self, loc: int, item: Hashable, value: ArrayLike, refs=None) -> None: + """ + Insert item at selected position. + + Parameters + ---------- + loc : int + item : hashable + value : np.ndarray or ExtensionArray + refs : The reference tracking object of the value to set. + """ + with warnings.catch_warnings(): + # TODO: re-issue this with setitem-specific message? + warnings.filterwarnings( + "ignore", + "The behavior of Index.insert with object-dtype is deprecated", + category=FutureWarning, + ) + new_axis = self.items.insert(loc, item) + + if value.ndim == 2: + value = value.T + if len(value) > 1: + raise ValueError( + f"Expected a 1D array, got an array with shape {value.T.shape}" + ) + else: + value = ensure_block_shape(value, ndim=self.ndim) + + bp = BlockPlacement(slice(loc, loc + 1)) + block = new_block_2d(values=value, placement=bp, refs=refs) + + if not len(self.blocks): + # Fastpath + self._blklocs = np.array([0], dtype=np.intp) + self._blknos = np.array([0], dtype=np.intp) + else: + self._insert_update_mgr_locs(loc) + self._insert_update_blklocs_and_blknos(loc) + + self.axes[0] = new_axis + self.blocks += (block,) + + self._known_consolidated = False + + if sum(not block.is_extension for block in self.blocks) > 100: + warnings.warn( + "DataFrame is highly fragmented. This is usually the result " + "of calling `frame.insert` many times, which has poor performance. " + "Consider joining all columns at once using pd.concat(axis=1) " + "instead. To get a de-fragmented frame, use `newframe = frame.copy()`", + PerformanceWarning, + stacklevel=find_stack_level(), + ) + + def _insert_update_mgr_locs(self, loc) -> None: + """ + When inserting a new Block at location 'loc', we increment + all of the mgr_locs of blocks above that by one. + """ + for blkno, count in _fast_count_smallints(self.blknos[loc:]): + # .620 this way, .326 of which is in increment_above + blk = self.blocks[blkno] + blk._mgr_locs = blk._mgr_locs.increment_above(loc) + + def _insert_update_blklocs_and_blknos(self, loc) -> None: + """ + When inserting a new Block at location 'loc', we update our + _blklocs and _blknos. + """ + + # Accessing public blklocs ensures the public versions are initialized + if loc == self.blklocs.shape[0]: + # np.append is a lot faster, let's use it if we can. + self._blklocs = np.append(self._blklocs, 0) + self._blknos = np.append(self._blknos, len(self.blocks)) + elif loc == 0: + # np.append is a lot faster, let's use it if we can. + self._blklocs = np.append(self._blklocs[::-1], 0)[::-1] + self._blknos = np.append(self._blknos[::-1], len(self.blocks))[::-1] + else: + new_blklocs, new_blknos = libinternals.update_blklocs_and_blknos( + self.blklocs, self.blknos, loc, len(self.blocks) + ) + self._blklocs = new_blklocs + self._blknos = new_blknos + + def idelete(self, indexer) -> BlockManager: + """ + Delete selected locations, returning a new BlockManager. + """ + is_deleted = np.zeros(self.shape[0], dtype=np.bool_) + is_deleted[indexer] = True + taker = (~is_deleted).nonzero()[0] + + nbs = self._slice_take_blocks_ax0(taker, only_slice=True, ref_inplace_op=True) + new_columns = self.items[~is_deleted] + axes = [new_columns, self.axes[1]] + return type(self)(tuple(nbs), axes, verify_integrity=False) + + # ---------------------------------------------------------------- + # Block-wise Operation + + def grouped_reduce(self, func: Callable) -> Self: + """ + Apply grouped reduction function blockwise, returning a new BlockManager. + + Parameters + ---------- + func : grouped reduction function + + Returns + ------- + BlockManager + """ + result_blocks: list[Block] = [] + + for blk in self.blocks: + if blk.is_object: + # split on object-dtype blocks bc some columns may raise + # while others do not. + for sb in blk._split(): + applied = sb.apply(func) + result_blocks = extend_blocks(applied, result_blocks) + else: + applied = blk.apply(func) + result_blocks = extend_blocks(applied, result_blocks) + + if len(result_blocks) == 0: + nrows = 0 + else: + nrows = result_blocks[0].values.shape[-1] + index = Index(range(nrows)) + + return type(self).from_blocks(result_blocks, [self.axes[0], index]) + + def reduce(self, func: Callable) -> Self: + """ + Apply reduction function blockwise, returning a single-row BlockManager. + + Parameters + ---------- + func : reduction function + + Returns + ------- + BlockManager + """ + # If 2D, we assume that we're operating column-wise + assert self.ndim == 2 + + res_blocks: list[Block] = [] + for blk in self.blocks: + nbs = blk.reduce(func) + res_blocks.extend(nbs) + + index = Index([None]) # placeholder + new_mgr = type(self).from_blocks(res_blocks, [self.items, index]) + return new_mgr + + def operate_blockwise(self, other: BlockManager, array_op) -> BlockManager: + """ + Apply array_op blockwise with another (aligned) BlockManager. + """ + return operate_blockwise(self, other, array_op) + + def _equal_values(self: BlockManager, other: BlockManager) -> bool: + """ + Used in .equals defined in base class. Only check the column values + assuming shape and indexes have already been checked. + """ + return blockwise_all(self, other, array_equals) + + def quantile( + self, + *, + qs: Index, # with dtype float 64 + interpolation: QuantileInterpolation = "linear", + ) -> Self: + """ + Iterate over blocks applying quantile reduction. + This routine is intended for reduction type operations and + will do inference on the generated blocks. + + Parameters + ---------- + interpolation : type of interpolation, default 'linear' + qs : list of the quantiles to be computed + + Returns + ------- + BlockManager + """ + # Series dispatches to DataFrame for quantile, which allows us to + # simplify some of the code here and in the blocks + assert self.ndim >= 2 + assert is_list_like(qs) # caller is responsible for this + + new_axes = list(self.axes) + new_axes[1] = Index(qs, dtype=np.float64) + + blocks = [ + blk.quantile(qs=qs, interpolation=interpolation) for blk in self.blocks + ] + + return type(self)(blocks, new_axes) + + # ---------------------------------------------------------------- + + def unstack(self, unstacker, fill_value) -> BlockManager: + """ + Return a BlockManager with all blocks unstacked. + + Parameters + ---------- + unstacker : reshape._Unstacker + fill_value : Any + fill_value for newly introduced missing values. + + Returns + ------- + unstacked : BlockManager + """ + new_columns = unstacker.get_new_columns(self.items) + new_index = unstacker.new_index + + allow_fill = not unstacker.mask_all + if allow_fill: + # calculating the full mask once and passing it to Block._unstack is + # faster than letting calculating it in each repeated call + new_mask2D = (~unstacker.mask).reshape(*unstacker.full_shape) + needs_masking = new_mask2D.any(axis=0) + else: + needs_masking = np.zeros(unstacker.full_shape[1], dtype=bool) + + new_blocks: list[Block] = [] + columns_mask: list[np.ndarray] = [] + + if len(self.items) == 0: + factor = 1 + else: + fac = len(new_columns) / len(self.items) + assert fac == int(fac) + factor = int(fac) + + for blk in self.blocks: + mgr_locs = blk.mgr_locs + new_placement = mgr_locs.tile_for_unstack(factor) + + blocks, mask = blk._unstack( + unstacker, + fill_value, + new_placement=new_placement, + needs_masking=needs_masking, + ) + + new_blocks.extend(blocks) + columns_mask.extend(mask) + + # Block._unstack should ensure this holds, + assert mask.sum() == sum(len(nb._mgr_locs) for nb in blocks) + # In turn this ensures that in the BlockManager call below + # we have len(new_columns) == sum(x.shape[0] for x in new_blocks) + # which suffices to allow us to pass verify_inegrity=False + + new_columns = new_columns[columns_mask] + + bm = BlockManager(new_blocks, [new_columns, new_index], verify_integrity=False) + return bm + + def to_dict(self) -> dict[str, Self]: + """ + Return a dict of str(dtype) -> BlockManager + + Returns + ------- + values : a dict of dtype -> BlockManager + """ + + bd: dict[str, list[Block]] = {} + for b in self.blocks: + bd.setdefault(str(b.dtype), []).append(b) + + # TODO(EA2D): the combine will be unnecessary with 2D EAs + return {dtype: self._combine(blocks) for dtype, blocks in bd.items()} + + def as_array( + self, + dtype: np.dtype | None = None, + copy: bool = False, + na_value: object = lib.no_default, + ) -> np.ndarray: + """ + Convert the blockmanager data into an numpy array. + + Parameters + ---------- + dtype : np.dtype or None, default None + Data type of the return array. + copy : bool, default False + If True then guarantee that a copy is returned. A value of + False does not guarantee that the underlying data is not + copied. + na_value : object, default lib.no_default + Value to be used as the missing value sentinel. + + Returns + ------- + arr : ndarray + """ + passed_nan = lib.is_float(na_value) and isna(na_value) + + if len(self.blocks) == 0: + arr = np.empty(self.shape, dtype=float) + return arr.transpose() + + if self.is_single_block: + blk = self.blocks[0] + + if na_value is not lib.no_default: + # We want to copy when na_value is provided to avoid + # mutating the original object + if lib.is_np_dtype(blk.dtype, "f") and passed_nan: + # We are already numpy-float and na_value=np.nan + pass + else: + copy = True + + if blk.is_extension: + # Avoid implicit conversion of extension blocks to object + + # error: Item "ndarray" of "Union[ndarray, ExtensionArray]" has no + # attribute "to_numpy" + arr = blk.values.to_numpy( # type: ignore[union-attr] + dtype=dtype, + na_value=na_value, + copy=copy, + ).reshape(blk.shape) + elif not copy: + arr = np.asarray(blk.values, dtype=dtype) + else: + arr = np.array(blk.values, dtype=dtype, copy=copy) + + if using_copy_on_write() and not copy: + arr = arr.view() + arr.flags.writeable = False + else: + arr = self._interleave(dtype=dtype, na_value=na_value) + # The underlying data was copied within _interleave, so no need + # to further copy if copy=True or setting na_value + + if na_value is lib.no_default: + pass + elif arr.dtype.kind == "f" and passed_nan: + pass + else: + arr[isna(arr)] = na_value + + return arr.transpose() + + def _interleave( + self, + dtype: np.dtype | None = None, + na_value: object = lib.no_default, + ) -> np.ndarray: + """ + Return ndarray from blocks with specified item order + Items must be contained in the blocks + """ + if not dtype: + # Incompatible types in assignment (expression has type + # "Optional[Union[dtype[Any], ExtensionDtype]]", variable has + # type "Optional[dtype[Any]]") + dtype = interleaved_dtype( # type: ignore[assignment] + [blk.dtype for blk in self.blocks] + ) + + # error: Argument 1 to "ensure_np_dtype" has incompatible type + # "Optional[dtype[Any]]"; expected "Union[dtype[Any], ExtensionDtype]" + dtype = ensure_np_dtype(dtype) # type: ignore[arg-type] + result = np.empty(self.shape, dtype=dtype) + + itemmask = np.zeros(self.shape[0]) + + if dtype == np.dtype("object") and na_value is lib.no_default: + # much more performant than using to_numpy below + for blk in self.blocks: + rl = blk.mgr_locs + arr = blk.get_values(dtype) + result[rl.indexer] = arr + itemmask[rl.indexer] = 1 + return result + + for blk in self.blocks: + rl = blk.mgr_locs + if blk.is_extension: + # Avoid implicit conversion of extension blocks to object + + # error: Item "ndarray" of "Union[ndarray, ExtensionArray]" has no + # attribute "to_numpy" + arr = blk.values.to_numpy( # type: ignore[union-attr] + dtype=dtype, + na_value=na_value, + ) + else: + arr = blk.get_values(dtype) + result[rl.indexer] = arr + itemmask[rl.indexer] = 1 + + if not itemmask.all(): + raise AssertionError("Some items were not contained in blocks") + + return result + + # ---------------------------------------------------------------- + # Consolidation + + def is_consolidated(self) -> bool: + """ + Return True if more than one block with the same dtype + """ + if not self._known_consolidated: + self._consolidate_check() + return self._is_consolidated + + def _consolidate_check(self) -> None: + if len(self.blocks) == 1: + # fastpath + self._is_consolidated = True + self._known_consolidated = True + return + dtypes = [blk.dtype for blk in self.blocks if blk._can_consolidate] + self._is_consolidated = len(dtypes) == len(set(dtypes)) + self._known_consolidated = True + + def _consolidate_inplace(self) -> None: + # In general, _consolidate_inplace should only be called via + # DataFrame._consolidate_inplace, otherwise we will fail to invalidate + # the DataFrame's _item_cache. The exception is for newly-created + # BlockManager objects not yet attached to a DataFrame. + if not self.is_consolidated(): + self.blocks = _consolidate(self.blocks) + self._is_consolidated = True + self._known_consolidated = True + self._rebuild_blknos_and_blklocs() + + # ---------------------------------------------------------------- + # Concatenation + + @classmethod + def concat_horizontal(cls, mgrs: list[Self], axes: list[Index]) -> Self: + """ + Concatenate uniformly-indexed BlockManagers horizontally. + """ + offset = 0 + blocks: list[Block] = [] + for mgr in mgrs: + for blk in mgr.blocks: + # We need to do getitem_block here otherwise we would be altering + # blk.mgr_locs in place, which would render it invalid. This is only + # relevant in the copy=False case. + nb = blk.slice_block_columns(slice(None)) + nb._mgr_locs = nb._mgr_locs.add(offset) + blocks.append(nb) + + offset += len(mgr.items) + + new_mgr = cls(tuple(blocks), axes) + return new_mgr + + @classmethod + def concat_vertical(cls, mgrs: list[Self], axes: list[Index]) -> Self: + """ + Concatenate uniformly-indexed BlockManagers vertically. + """ + raise NotImplementedError("This logic lives (for now) in internals.concat") + + +class SingleBlockManager(BaseBlockManager, SingleDataManager): + """manage a single block with""" + + @property + def ndim(self) -> Literal[1]: + return 1 + + _is_consolidated = True + _known_consolidated = True + __slots__ = () + is_single_block = True + + def __init__( + self, + block: Block, + axis: Index, + verify_integrity: bool = False, + ) -> None: + # Assertions disabled for performance + # assert isinstance(block, Block), type(block) + # assert isinstance(axis, Index), type(axis) + + self.axes = [axis] + self.blocks = (block,) + + @classmethod + def from_blocks( + cls, + blocks: list[Block], + axes: list[Index], + ) -> Self: + """ + Constructor for BlockManager and SingleBlockManager with same signature. + """ + assert len(blocks) == 1 + assert len(axes) == 1 + return cls(blocks[0], axes[0], verify_integrity=False) + + @classmethod + def from_array( + cls, array: ArrayLike, index: Index, refs: BlockValuesRefs | None = None + ) -> SingleBlockManager: + """ + Constructor for if we have an array that is not yet a Block. + """ + array = maybe_coerce_values(array) + bp = BlockPlacement(slice(0, len(index))) + block = new_block(array, placement=bp, ndim=1, refs=refs) + return cls(block, index) + + def to_2d_mgr(self, columns: Index) -> BlockManager: + """ + Manager analogue of Series.to_frame + """ + blk = self.blocks[0] + arr = ensure_block_shape(blk.values, ndim=2) + bp = BlockPlacement(0) + new_blk = type(blk)(arr, placement=bp, ndim=2, refs=blk.refs) + axes = [columns, self.axes[0]] + return BlockManager([new_blk], axes=axes, verify_integrity=False) + + def _has_no_reference(self, i: int = 0) -> bool: + """ + Check for column `i` if it has references. + (whether it references another array or is itself being referenced) + Returns True if the column has no references. + """ + return not self.blocks[0].refs.has_reference() + + def __getstate__(self): + block_values = [b.values for b in self.blocks] + block_items = [self.items[b.mgr_locs.indexer] for b in self.blocks] + axes_array = list(self.axes) + + extra_state = { + "0.14.1": { + "axes": axes_array, + "blocks": [ + {"values": b.values, "mgr_locs": b.mgr_locs.indexer} + for b in self.blocks + ], + } + } + + # First three elements of the state are to maintain forward + # compatibility with 0.13.1. + return axes_array, block_values, block_items, extra_state + + def __setstate__(self, state) -> None: + def unpickle_block(values, mgr_locs, ndim: int) -> Block: + # TODO(EA2D): ndim would be unnecessary with 2D EAs + # older pickles may store e.g. DatetimeIndex instead of DatetimeArray + values = extract_array(values, extract_numpy=True) + if not isinstance(mgr_locs, BlockPlacement): + mgr_locs = BlockPlacement(mgr_locs) + + values = maybe_coerce_values(values) + return new_block(values, placement=mgr_locs, ndim=ndim) + + if isinstance(state, tuple) and len(state) >= 4 and "0.14.1" in state[3]: + state = state[3]["0.14.1"] + self.axes = [ensure_index(ax) for ax in state["axes"]] + ndim = len(self.axes) + self.blocks = tuple( + unpickle_block(b["values"], b["mgr_locs"], ndim=ndim) + for b in state["blocks"] + ) + else: + raise NotImplementedError("pre-0.14.1 pickles are no longer supported") + + self._post_setstate() + + def _post_setstate(self) -> None: + pass + + @cache_readonly + def _block(self) -> Block: + return self.blocks[0] + + @property + def _blknos(self): + """compat with BlockManager""" + return None + + @property + def _blklocs(self): + """compat with BlockManager""" + return None + + def get_rows_with_mask(self, indexer: npt.NDArray[np.bool_]) -> Self: + # similar to get_slice, but not restricted to slice indexer + blk = self._block + if using_copy_on_write() and len(indexer) > 0 and indexer.all(): + return type(self)(blk.copy(deep=False), self.index) + array = blk.values[indexer] + + if isinstance(indexer, np.ndarray) and indexer.dtype.kind == "b": + # boolean indexing always gives a copy with numpy + refs = None + else: + # TODO(CoW) in theory only need to track reference if new_array is a view + refs = blk.refs + + bp = BlockPlacement(slice(0, len(array))) + block = type(blk)(array, placement=bp, ndim=1, refs=refs) + + new_idx = self.index[indexer] + return type(self)(block, new_idx) + + def get_slice(self, slobj: slice, axis: AxisInt = 0) -> SingleBlockManager: + # Assertion disabled for performance + # assert isinstance(slobj, slice), type(slobj) + if axis >= self.ndim: + raise IndexError("Requested axis not found in manager") + + blk = self._block + array = blk.values[slobj] + bp = BlockPlacement(slice(0, len(array))) + # TODO this method is only used in groupby SeriesSplitter at the moment, + # so passing refs is not yet covered by the tests + block = type(blk)(array, placement=bp, ndim=1, refs=blk.refs) + new_index = self.index._getitem_slice(slobj) + return type(self)(block, new_index) + + @property + def index(self) -> Index: + return self.axes[0] + + @property + def dtype(self) -> DtypeObj: + return self._block.dtype + + def get_dtypes(self) -> npt.NDArray[np.object_]: + return np.array([self._block.dtype], dtype=object) + + def external_values(self): + """The array that Series.values returns""" + return self._block.external_values() + + def internal_values(self): + """The array that Series._values returns""" + return self._block.values + + def array_values(self) -> ExtensionArray: + """The array that Series.array returns""" + return self._block.array_values + + def get_numeric_data(self) -> Self: + if self._block.is_numeric: + return self.copy(deep=False) + return self.make_empty() + + @property + def _can_hold_na(self) -> bool: + return self._block._can_hold_na + + def setitem_inplace(self, indexer, value, warn: bool = True) -> None: + """ + Set values with indexer. + + For Single[Block/Array]Manager, this backs s[indexer] = value + + This is an inplace version of `setitem()`, mutating the manager/values + in place, not returning a new Manager (and Block), and thus never changing + the dtype. + """ + using_cow = using_copy_on_write() + warn_cow = warn_copy_on_write() + if (using_cow or warn_cow) and not self._has_no_reference(0): + if using_cow: + self.blocks = (self._block.copy(),) + self._cache.clear() + elif warn_cow and warn: + warnings.warn( + COW_WARNING_SETITEM_MSG, + FutureWarning, + stacklevel=find_stack_level(), + ) + + super().setitem_inplace(indexer, value) + + def idelete(self, indexer) -> SingleBlockManager: + """ + Delete single location from SingleBlockManager. + + Ensures that self.blocks doesn't become empty. + """ + nb = self._block.delete(indexer)[0] + self.blocks = (nb,) + self.axes[0] = self.axes[0].delete(indexer) + self._cache.clear() + return self + + def fast_xs(self, loc): + """ + fast path for getting a cross-section + return a view of the data + """ + raise NotImplementedError("Use series._values[loc] instead") + + def set_values(self, values: ArrayLike) -> None: + """ + Set the values of the single block in place. + + Use at your own risk! This does not check if the passed values are + valid for the current Block/SingleBlockManager (length, dtype, etc), + and this does not properly keep track of references. + """ + # NOTE(CoW) Currently this is only used for FrameColumnApply.series_generator + # which handles CoW by setting the refs manually if necessary + self.blocks[0].values = values + self.blocks[0]._mgr_locs = BlockPlacement(slice(len(values))) + + def _equal_values(self, other: Self) -> bool: + """ + Used in .equals defined in base class. Only check the column values + assuming shape and indexes have already been checked. + """ + # For SingleBlockManager (i.e.Series) + if other.ndim != 1: + return False + left = self.blocks[0].values + right = other.blocks[0].values + return array_equals(left, right) + + +# -------------------------------------------------------------------- +# Constructor Helpers + + +def create_block_manager_from_blocks( + blocks: list[Block], + axes: list[Index], + consolidate: bool = True, + verify_integrity: bool = True, +) -> BlockManager: + # If verify_integrity=False, then caller is responsible for checking + # all(x.shape[-1] == len(axes[1]) for x in blocks) + # sum(x.shape[0] for x in blocks) == len(axes[0]) + # set(x for blk in blocks for x in blk.mgr_locs) == set(range(len(axes[0]))) + # all(blk.ndim == 2 for blk in blocks) + # This allows us to safely pass verify_integrity=False + + try: + mgr = BlockManager(blocks, axes, verify_integrity=verify_integrity) + + except ValueError as err: + arrays = [blk.values for blk in blocks] + tot_items = sum(arr.shape[0] for arr in arrays) + raise_construction_error(tot_items, arrays[0].shape[1:], axes, err) + + if consolidate: + mgr._consolidate_inplace() + return mgr + + +def create_block_manager_from_column_arrays( + arrays: list[ArrayLike], + axes: list[Index], + consolidate: bool, + refs: list, +) -> BlockManager: + # Assertions disabled for performance (caller is responsible for verifying) + # assert isinstance(axes, list) + # assert all(isinstance(x, Index) for x in axes) + # assert all(isinstance(x, (np.ndarray, ExtensionArray)) for x in arrays) + # assert all(type(x) is not NumpyExtensionArray for x in arrays) + # assert all(x.ndim == 1 for x in arrays) + # assert all(len(x) == len(axes[1]) for x in arrays) + # assert len(arrays) == len(axes[0]) + # These last three are sufficient to allow us to safely pass + # verify_integrity=False below. + + try: + blocks = _form_blocks(arrays, consolidate, refs) + mgr = BlockManager(blocks, axes, verify_integrity=False) + except ValueError as e: + raise_construction_error(len(arrays), arrays[0].shape, axes, e) + if consolidate: + mgr._consolidate_inplace() + return mgr + + +def raise_construction_error( + tot_items: int, + block_shape: Shape, + axes: list[Index], + e: ValueError | None = None, +): + """raise a helpful message about our construction""" + passed = tuple(map(int, [tot_items] + list(block_shape))) + # Correcting the user facing error message during dataframe construction + if len(passed) <= 2: + passed = passed[::-1] + + implied = tuple(len(ax) for ax in axes) + # Correcting the user facing error message during dataframe construction + if len(implied) <= 2: + implied = implied[::-1] + + # We return the exception object instead of raising it so that we + # can raise it in the caller; mypy plays better with that + if passed == implied and e is not None: + raise e + if block_shape[0] == 0: + raise ValueError("Empty data passed with indices specified.") + raise ValueError(f"Shape of passed values is {passed}, indices imply {implied}") + + +# ----------------------------------------------------------------------- + + +def _grouping_func(tup: tuple[int, ArrayLike]) -> tuple[int, DtypeObj]: + dtype = tup[1].dtype + + if is_1d_only_ea_dtype(dtype): + # We know these won't be consolidated, so don't need to group these. + # This avoids expensive comparisons of CategoricalDtype objects + sep = id(dtype) + else: + sep = 0 + + return sep, dtype + + +def _form_blocks(arrays: list[ArrayLike], consolidate: bool, refs: list) -> list[Block]: + tuples = list(enumerate(arrays)) + + if not consolidate: + return _tuples_to_blocks_no_consolidate(tuples, refs) + + # when consolidating, we can ignore refs (either stacking always copies, + # or the EA is already copied in the calling dict_to_mgr) + + # group by dtype + grouper = itertools.groupby(tuples, _grouping_func) + + nbs: list[Block] = [] + for (_, dtype), tup_block in grouper: + block_type = get_block_type(dtype) + + if isinstance(dtype, np.dtype): + is_dtlike = dtype.kind in "mM" + + if issubclass(dtype.type, (str, bytes)): + dtype = np.dtype(object) + + values, placement = _stack_arrays(list(tup_block), dtype) + if is_dtlike: + values = ensure_wrapped_if_datetimelike(values) + blk = block_type(values, placement=BlockPlacement(placement), ndim=2) + nbs.append(blk) + + elif is_1d_only_ea_dtype(dtype): + dtype_blocks = [ + block_type(x[1], placement=BlockPlacement(x[0]), ndim=2) + for x in tup_block + ] + nbs.extend(dtype_blocks) + + else: + dtype_blocks = [ + block_type( + ensure_block_shape(x[1], 2), placement=BlockPlacement(x[0]), ndim=2 + ) + for x in tup_block + ] + nbs.extend(dtype_blocks) + return nbs + + +def _tuples_to_blocks_no_consolidate(tuples, refs) -> list[Block]: + # tuples produced within _form_blocks are of the form (placement, array) + return [ + new_block_2d( + ensure_block_shape(arr, ndim=2), placement=BlockPlacement(i), refs=ref + ) + for ((i, arr), ref) in zip(tuples, refs) + ] + + +def _stack_arrays(tuples, dtype: np.dtype): + placement, arrays = zip(*tuples) + + first = arrays[0] + shape = (len(arrays),) + first.shape + + stacked = np.empty(shape, dtype=dtype) + for i, arr in enumerate(arrays): + stacked[i] = arr + + return stacked, placement + + +def _consolidate(blocks: tuple[Block, ...]) -> tuple[Block, ...]: + """ + Merge blocks having same dtype, exclude non-consolidating blocks + """ + # sort by _can_consolidate, dtype + gkey = lambda x: x._consolidate_key + grouper = itertools.groupby(sorted(blocks, key=gkey), gkey) + + new_blocks: list[Block] = [] + for (_can_consolidate, dtype), group_blocks in grouper: + merged_blocks, _ = _merge_blocks( + list(group_blocks), dtype=dtype, can_consolidate=_can_consolidate + ) + new_blocks = extend_blocks(merged_blocks, new_blocks) + return tuple(new_blocks) + + +def _merge_blocks( + blocks: list[Block], dtype: DtypeObj, can_consolidate: bool +) -> tuple[list[Block], bool]: + if len(blocks) == 1: + return blocks, False + + if can_consolidate: + # TODO: optimization potential in case all mgrs contain slices and + # combination of those slices is a slice, too. + new_mgr_locs = np.concatenate([b.mgr_locs.as_array for b in blocks]) + + new_values: ArrayLike + + if isinstance(blocks[0].dtype, np.dtype): + # error: List comprehension has incompatible type List[Union[ndarray, + # ExtensionArray]]; expected List[Union[complex, generic, + # Sequence[Union[int, float, complex, str, bytes, generic]], + # Sequence[Sequence[Any]], SupportsArray]] + new_values = np.vstack([b.values for b in blocks]) # type: ignore[misc] + else: + bvals = [blk.values for blk in blocks] + bvals2 = cast(Sequence[NDArrayBackedExtensionArray], bvals) + new_values = bvals2[0]._concat_same_type(bvals2, axis=0) + + argsort = np.argsort(new_mgr_locs) + new_values = new_values[argsort] + new_mgr_locs = new_mgr_locs[argsort] + + bp = BlockPlacement(new_mgr_locs) + return [new_block_2d(new_values, placement=bp)], True + + # can't consolidate --> no merge + return blocks, False + + +def _fast_count_smallints(arr: npt.NDArray[np.intp]): + """Faster version of set(arr) for sequences of small numbers.""" + counts = np.bincount(arr) + nz = counts.nonzero()[0] + # Note: list(zip(...) outperforms list(np.c_[nz, counts[nz]]) here, + # in one benchmark by a factor of 11 + return zip(nz, counts[nz]) + + +def _preprocess_slice_or_indexer( + slice_or_indexer: slice | np.ndarray, length: int, allow_fill: bool +): + if isinstance(slice_or_indexer, slice): + return ( + "slice", + slice_or_indexer, + libinternals.slice_len(slice_or_indexer, length), + ) + else: + if ( + not isinstance(slice_or_indexer, np.ndarray) + or slice_or_indexer.dtype.kind != "i" + ): + dtype = getattr(slice_or_indexer, "dtype", None) + raise TypeError(type(slice_or_indexer), dtype) + + indexer = ensure_platform_int(slice_or_indexer) + if not allow_fill: + indexer = maybe_convert_indices(indexer, length) + return "fancy", indexer, len(indexer) + + +def make_na_array(dtype: DtypeObj, shape: Shape, fill_value) -> ArrayLike: + if isinstance(dtype, DatetimeTZDtype): + # NB: exclude e.g. pyarrow[dt64tz] dtypes + ts = Timestamp(fill_value).as_unit(dtype.unit) + i8values = np.full(shape, ts._value) + dt64values = i8values.view(f"M8[{dtype.unit}]") + return DatetimeArray._simple_new(dt64values, dtype=dtype) + + elif is_1d_only_ea_dtype(dtype): + dtype = cast(ExtensionDtype, dtype) + cls = dtype.construct_array_type() + + missing_arr = cls._from_sequence([], dtype=dtype) + ncols, nrows = shape + assert ncols == 1, ncols + empty_arr = -1 * np.ones((nrows,), dtype=np.intp) + return missing_arr.take(empty_arr, allow_fill=True, fill_value=fill_value) + elif isinstance(dtype, ExtensionDtype): + # TODO: no tests get here, a handful would if we disabled + # the dt64tz special-case above (which is faster) + cls = dtype.construct_array_type() + missing_arr = cls._empty(shape=shape, dtype=dtype) + missing_arr[:] = fill_value + return missing_arr + else: + # NB: we should never get here with dtype integer or bool; + # if we did, the missing_arr.fill would cast to gibberish + missing_arr = np.empty(shape, dtype=dtype) + missing_arr.fill(fill_value) + + if dtype.kind in "mM": + missing_arr = ensure_wrapped_if_datetimelike(missing_arr) + return missing_arr diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/internals/ops.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/internals/ops.py new file mode 100644 index 0000000000000000000000000000000000000000..cf9466c0bdf0bf4df623e2d819faf3ea7b36c878 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/internals/ops.py @@ -0,0 +1,154 @@ +from __future__ import annotations + +from typing import ( + TYPE_CHECKING, + NamedTuple, +) + +from pandas.core.dtypes.common import is_1d_only_ea_dtype + +if TYPE_CHECKING: + from collections.abc import Iterator + + from pandas._libs.internals import BlockPlacement + from pandas._typing import ArrayLike + + from pandas.core.internals.blocks import Block + from pandas.core.internals.managers import BlockManager + + +class BlockPairInfo(NamedTuple): + lvals: ArrayLike + rvals: ArrayLike + locs: BlockPlacement + left_ea: bool + right_ea: bool + rblk: Block + + +def _iter_block_pairs( + left: BlockManager, right: BlockManager +) -> Iterator[BlockPairInfo]: + # At this point we have already checked the parent DataFrames for + # assert rframe._indexed_same(lframe) + + for blk in left.blocks: + locs = blk.mgr_locs + blk_vals = blk.values + + left_ea = blk_vals.ndim == 1 + + rblks = right._slice_take_blocks_ax0(locs.indexer, only_slice=True) + + # Assertions are disabled for performance, but should hold: + # if left_ea: + # assert len(locs) == 1, locs + # assert len(rblks) == 1, rblks + # assert rblks[0].shape[0] == 1, rblks[0].shape + + for rblk in rblks: + right_ea = rblk.values.ndim == 1 + + lvals, rvals = _get_same_shape_values(blk, rblk, left_ea, right_ea) + info = BlockPairInfo(lvals, rvals, locs, left_ea, right_ea, rblk) + yield info + + +def operate_blockwise( + left: BlockManager, right: BlockManager, array_op +) -> BlockManager: + # At this point we have already checked the parent DataFrames for + # assert rframe._indexed_same(lframe) + + res_blks: list[Block] = [] + for lvals, rvals, locs, left_ea, right_ea, rblk in _iter_block_pairs(left, right): + res_values = array_op(lvals, rvals) + if ( + left_ea + and not right_ea + and hasattr(res_values, "reshape") + and not is_1d_only_ea_dtype(res_values.dtype) + ): + res_values = res_values.reshape(1, -1) + nbs = rblk._split_op_result(res_values) + + # Assertions are disabled for performance, but should hold: + # if right_ea or left_ea: + # assert len(nbs) == 1 + # else: + # assert res_values.shape == lvals.shape, (res_values.shape, lvals.shape) + + _reset_block_mgr_locs(nbs, locs) + + res_blks.extend(nbs) + + # Assertions are disabled for performance, but should hold: + # slocs = {y for nb in res_blks for y in nb.mgr_locs.as_array} + # nlocs = sum(len(nb.mgr_locs.as_array) for nb in res_blks) + # assert nlocs == len(left.items), (nlocs, len(left.items)) + # assert len(slocs) == nlocs, (len(slocs), nlocs) + # assert slocs == set(range(nlocs)), slocs + + new_mgr = type(right)(tuple(res_blks), axes=right.axes, verify_integrity=False) + return new_mgr + + +def _reset_block_mgr_locs(nbs: list[Block], locs) -> None: + """ + Reset mgr_locs to correspond to our original DataFrame. + """ + for nb in nbs: + nblocs = locs[nb.mgr_locs.indexer] + nb.mgr_locs = nblocs + # Assertions are disabled for performance, but should hold: + # assert len(nblocs) == nb.shape[0], (len(nblocs), nb.shape) + # assert all(x in locs.as_array for x in nb.mgr_locs.as_array) + + +def _get_same_shape_values( + lblk: Block, rblk: Block, left_ea: bool, right_ea: bool +) -> tuple[ArrayLike, ArrayLike]: + """ + Slice lblk.values to align with rblk. Squeeze if we have EAs. + """ + lvals = lblk.values + rvals = rblk.values + + # Require that the indexing into lvals be slice-like + assert rblk.mgr_locs.is_slice_like, rblk.mgr_locs + + # TODO(EA2D): with 2D EAs only this first clause would be needed + if not (left_ea or right_ea): + # error: No overload variant of "__getitem__" of "ExtensionArray" matches + # argument type "Tuple[Union[ndarray, slice], slice]" + lvals = lvals[rblk.mgr_locs.indexer, :] # type: ignore[call-overload] + assert lvals.shape == rvals.shape, (lvals.shape, rvals.shape) + elif left_ea and right_ea: + assert lvals.shape == rvals.shape, (lvals.shape, rvals.shape) + elif right_ea: + # lvals are 2D, rvals are 1D + + # error: No overload variant of "__getitem__" of "ExtensionArray" matches + # argument type "Tuple[Union[ndarray, slice], slice]" + lvals = lvals[rblk.mgr_locs.indexer, :] # type: ignore[call-overload] + assert lvals.shape[0] == 1, lvals.shape + lvals = lvals[0, :] + else: + # lvals are 1D, rvals are 2D + assert rvals.shape[0] == 1, rvals.shape + # error: No overload variant of "__getitem__" of "ExtensionArray" matches + # argument type "Tuple[int, slice]" + rvals = rvals[0, :] # type: ignore[call-overload] + + return lvals, rvals + + +def blockwise_all(left: BlockManager, right: BlockManager, op) -> bool: + """ + Blockwise `all` reduction. + """ + for info in _iter_block_pairs(left, right): + res = op(info.lvals, info.rvals) + if not res: + return False + return True diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/methods/__init__.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/methods/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/methods/describe.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/methods/describe.py new file mode 100644 index 0000000000000000000000000000000000000000..c620bb9d17976a9deeb682ff323476c484f283ff --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/methods/describe.py @@ -0,0 +1,416 @@ +""" +Module responsible for execution of NDFrame.describe() method. + +Method NDFrame.describe() delegates actual execution to function describe_ndframe(). +""" +from __future__ import annotations + +from abc import ( + ABC, + abstractmethod, +) +from typing import ( + TYPE_CHECKING, + Callable, + cast, +) + +import numpy as np + +from pandas._libs.tslibs import Timestamp +from pandas._typing import ( + DtypeObj, + NDFrameT, + npt, +) +from pandas.util._validators import validate_percentile + +from pandas.core.dtypes.common import ( + is_bool_dtype, + is_numeric_dtype, +) +from pandas.core.dtypes.dtypes import ( + ArrowDtype, + DatetimeTZDtype, + ExtensionDtype, +) + +from pandas.core.arrays.floating import Float64Dtype +from pandas.core.reshape.concat import concat + +from pandas.io.formats.format import format_percentiles + +if TYPE_CHECKING: + from collections.abc import ( + Hashable, + Sequence, + ) + + from pandas import ( + DataFrame, + Series, + ) + + +def describe_ndframe( + *, + obj: NDFrameT, + include: str | Sequence[str] | None, + exclude: str | Sequence[str] | None, + percentiles: Sequence[float] | np.ndarray | None, +) -> NDFrameT: + """Describe series or dataframe. + + Called from pandas.core.generic.NDFrame.describe() + + Parameters + ---------- + obj: DataFrame or Series + Either dataframe or series to be described. + include : 'all', list-like of dtypes or None (default), optional + A white list of data types to include in the result. Ignored for ``Series``. + exclude : list-like of dtypes or None (default), optional, + A black list of data types to omit from the result. Ignored for ``Series``. + percentiles : list-like of numbers, optional + The percentiles to include in the output. All should fall between 0 and 1. + The default is ``[.25, .5, .75]``, which returns the 25th, 50th, and + 75th percentiles. + + Returns + ------- + Dataframe or series description. + """ + percentiles = _refine_percentiles(percentiles) + + describer: NDFrameDescriberAbstract + if obj.ndim == 1: + describer = SeriesDescriber( + obj=cast("Series", obj), + ) + else: + describer = DataFrameDescriber( + obj=cast("DataFrame", obj), + include=include, + exclude=exclude, + ) + + result = describer.describe(percentiles=percentiles) + return cast(NDFrameT, result) + + +class NDFrameDescriberAbstract(ABC): + """Abstract class for describing dataframe or series. + + Parameters + ---------- + obj : Series or DataFrame + Object to be described. + """ + + def __init__(self, obj: DataFrame | Series) -> None: + self.obj = obj + + @abstractmethod + def describe(self, percentiles: Sequence[float] | np.ndarray) -> DataFrame | Series: + """Do describe either series or dataframe. + + Parameters + ---------- + percentiles : list-like of numbers + The percentiles to include in the output. + """ + + +class SeriesDescriber(NDFrameDescriberAbstract): + """Class responsible for creating series description.""" + + obj: Series + + def describe(self, percentiles: Sequence[float] | np.ndarray) -> Series: + describe_func = select_describe_func( + self.obj, + ) + return describe_func(self.obj, percentiles) + + +class DataFrameDescriber(NDFrameDescriberAbstract): + """Class responsible for creating dataobj description. + + Parameters + ---------- + obj : DataFrame + DataFrame to be described. + include : 'all', list-like of dtypes or None + A white list of data types to include in the result. + exclude : list-like of dtypes or None + A black list of data types to omit from the result. + """ + + obj: DataFrame + + def __init__( + self, + obj: DataFrame, + *, + include: str | Sequence[str] | None, + exclude: str | Sequence[str] | None, + ) -> None: + self.include = include + self.exclude = exclude + + if obj.ndim == 2 and obj.columns.size == 0: + raise ValueError("Cannot describe a DataFrame without columns") + + super().__init__(obj) + + def describe(self, percentiles: Sequence[float] | np.ndarray) -> DataFrame: + data = self._select_data() + + ldesc: list[Series] = [] + for _, series in data.items(): + describe_func = select_describe_func(series) + ldesc.append(describe_func(series, percentiles)) + + col_names = reorder_columns(ldesc) + d = concat( + [x.reindex(col_names, copy=False) for x in ldesc], + axis=1, + sort=False, + ) + d.columns = data.columns.copy() + return d + + def _select_data(self) -> DataFrame: + """Select columns to be described.""" + if (self.include is None) and (self.exclude is None): + # when some numerics are found, keep only numerics + default_include: list[npt.DTypeLike] = [np.number, "datetime"] + data = self.obj.select_dtypes(include=default_include) + if len(data.columns) == 0: + data = self.obj + elif self.include == "all": + if self.exclude is not None: + msg = "exclude must be None when include is 'all'" + raise ValueError(msg) + data = self.obj + else: + data = self.obj.select_dtypes( + include=self.include, + exclude=self.exclude, + ) + return data + + +def reorder_columns(ldesc: Sequence[Series]) -> list[Hashable]: + """Set a convenient order for rows for display.""" + names: list[Hashable] = [] + seen_names: set[Hashable] = set() + ldesc_indexes = sorted((x.index for x in ldesc), key=len) + for idxnames in ldesc_indexes: + for name in idxnames: + if name not in seen_names: + seen_names.add(name) + names.append(name) + return names + + +def describe_numeric_1d(series: Series, percentiles: Sequence[float]) -> Series: + """Describe series containing numerical data. + + Parameters + ---------- + series : Series + Series to be described. + percentiles : list-like of numbers + The percentiles to include in the output. + """ + from pandas import Series + + formatted_percentiles = format_percentiles(percentiles) + + stat_index = ["count", "mean", "std", "min"] + formatted_percentiles + ["max"] + d = ( + [series.count(), series.mean(), series.std(), series.min()] + + series.quantile(percentiles).tolist() + + [series.max()] + ) + # GH#48340 - always return float on non-complex numeric data + dtype: DtypeObj | None + if isinstance(series.dtype, ExtensionDtype): + if isinstance(series.dtype, ArrowDtype): + if series.dtype.kind == "m": + # GH53001: describe timedeltas with object dtype + dtype = None + else: + import pyarrow as pa + + dtype = ArrowDtype(pa.float64()) + else: + dtype = Float64Dtype() + elif series.dtype.kind in "iufb": + # i.e. numeric but exclude complex dtype + dtype = np.dtype("float") + else: + dtype = None + return Series(d, index=stat_index, name=series.name, dtype=dtype) + + +def describe_categorical_1d( + data: Series, + percentiles_ignored: Sequence[float], +) -> Series: + """Describe series containing categorical data. + + Parameters + ---------- + data : Series + Series to be described. + percentiles_ignored : list-like of numbers + Ignored, but in place to unify interface. + """ + names = ["count", "unique", "top", "freq"] + objcounts = data.value_counts() + count_unique = len(objcounts[objcounts != 0]) + if count_unique > 0: + top, freq = objcounts.index[0], objcounts.iloc[0] + dtype = None + else: + # If the DataFrame is empty, set 'top' and 'freq' to None + # to maintain output shape consistency + top, freq = np.nan, np.nan + dtype = "object" + + result = [data.count(), count_unique, top, freq] + + from pandas import Series + + return Series(result, index=names, name=data.name, dtype=dtype) + + +def describe_timestamp_as_categorical_1d( + data: Series, + percentiles_ignored: Sequence[float], +) -> Series: + """Describe series containing timestamp data treated as categorical. + + Parameters + ---------- + data : Series + Series to be described. + percentiles_ignored : list-like of numbers + Ignored, but in place to unify interface. + """ + names = ["count", "unique"] + objcounts = data.value_counts() + count_unique = len(objcounts[objcounts != 0]) + result: list[float | Timestamp] = [data.count(), count_unique] + dtype = None + if count_unique > 0: + top, freq = objcounts.index[0], objcounts.iloc[0] + tz = data.dt.tz + asint = data.dropna().values.view("i8") + top = Timestamp(top) + if top.tzinfo is not None and tz is not None: + # Don't tz_localize(None) if key is already tz-aware + top = top.tz_convert(tz) + else: + top = top.tz_localize(tz) + names += ["top", "freq", "first", "last"] + result += [ + top, + freq, + Timestamp(asint.min(), tz=tz), + Timestamp(asint.max(), tz=tz), + ] + + # If the DataFrame is empty, set 'top' and 'freq' to None + # to maintain output shape consistency + else: + names += ["top", "freq"] + result += [np.nan, np.nan] + dtype = "object" + + from pandas import Series + + return Series(result, index=names, name=data.name, dtype=dtype) + + +def describe_timestamp_1d(data: Series, percentiles: Sequence[float]) -> Series: + """Describe series containing datetime64 dtype. + + Parameters + ---------- + data : Series + Series to be described. + percentiles : list-like of numbers + The percentiles to include in the output. + """ + # GH-30164 + from pandas import Series + + formatted_percentiles = format_percentiles(percentiles) + + stat_index = ["count", "mean", "min"] + formatted_percentiles + ["max"] + d = ( + [data.count(), data.mean(), data.min()] + + data.quantile(percentiles).tolist() + + [data.max()] + ) + return Series(d, index=stat_index, name=data.name) + + +def select_describe_func( + data: Series, +) -> Callable: + """Select proper function for describing series based on data type. + + Parameters + ---------- + data : Series + Series to be described. + """ + if is_bool_dtype(data.dtype): + return describe_categorical_1d + elif is_numeric_dtype(data): + return describe_numeric_1d + elif data.dtype.kind == "M" or isinstance(data.dtype, DatetimeTZDtype): + return describe_timestamp_1d + elif data.dtype.kind == "m": + return describe_numeric_1d + else: + return describe_categorical_1d + + +def _refine_percentiles( + percentiles: Sequence[float] | np.ndarray | None, +) -> npt.NDArray[np.float64]: + """ + Ensure that percentiles are unique and sorted. + + Parameters + ---------- + percentiles : list-like of numbers, optional + The percentiles to include in the output. + """ + if percentiles is None: + return np.array([0.25, 0.5, 0.75]) + + # explicit conversion of `percentiles` to list + percentiles = list(percentiles) + + # get them all to be in [0, 1] + validate_percentile(percentiles) + + # median should always be included + if 0.5 not in percentiles: + percentiles.append(0.5) + + percentiles = np.asarray(percentiles) + + # sort and check for duplicates + unique_pcts = np.unique(percentiles) + assert percentiles is not None + if len(unique_pcts) < len(percentiles): + raise ValueError("percentiles cannot contain duplicates") + + return unique_pcts diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/methods/selectn.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/methods/selectn.py new file mode 100644 index 0000000000000000000000000000000000000000..a2f8ca94134b897bd3f7824a6cb45ca13516e5ff --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/methods/selectn.py @@ -0,0 +1,269 @@ +""" +Implementation of nlargest and nsmallest. +""" + +from __future__ import annotations + +from collections.abc import ( + Hashable, + Sequence, +) +from typing import ( + TYPE_CHECKING, + cast, + final, +) + +import numpy as np + +from pandas._libs import algos as libalgos + +from pandas.core.dtypes.common import ( + is_bool_dtype, + is_complex_dtype, + is_integer_dtype, + is_list_like, + is_numeric_dtype, + needs_i8_conversion, +) +from pandas.core.dtypes.dtypes import BaseMaskedDtype + +if TYPE_CHECKING: + from pandas._typing import ( + DtypeObj, + IndexLabel, + ) + + from pandas import ( + DataFrame, + Series, + ) + + +class SelectN: + def __init__(self, obj, n: int, keep: str) -> None: + self.obj = obj + self.n = n + self.keep = keep + + if self.keep not in ("first", "last", "all"): + raise ValueError('keep must be either "first", "last" or "all"') + + def compute(self, method: str) -> DataFrame | Series: + raise NotImplementedError + + @final + def nlargest(self): + return self.compute("nlargest") + + @final + def nsmallest(self): + return self.compute("nsmallest") + + @final + @staticmethod + def is_valid_dtype_n_method(dtype: DtypeObj) -> bool: + """ + Helper function to determine if dtype is valid for + nsmallest/nlargest methods + """ + if is_numeric_dtype(dtype): + return not is_complex_dtype(dtype) + return needs_i8_conversion(dtype) + + +class SelectNSeries(SelectN): + """ + Implement n largest/smallest for Series + + Parameters + ---------- + obj : Series + n : int + keep : {'first', 'last'}, default 'first' + + Returns + ------- + nordered : Series + """ + + def compute(self, method: str) -> Series: + from pandas.core.reshape.concat import concat + + n = self.n + dtype = self.obj.dtype + if not self.is_valid_dtype_n_method(dtype): + raise TypeError(f"Cannot use method '{method}' with dtype {dtype}") + + if n <= 0: + return self.obj[[]] + + dropped = self.obj.dropna() + nan_index = self.obj.drop(dropped.index) + + # slow method + if n >= len(self.obj): + ascending = method == "nsmallest" + return self.obj.sort_values(ascending=ascending).head(n) + + # fast method + new_dtype = dropped.dtype + + # Similar to algorithms._ensure_data + arr = dropped._values + if needs_i8_conversion(arr.dtype): + arr = arr.view("i8") + elif isinstance(arr.dtype, BaseMaskedDtype): + arr = arr._data + else: + arr = np.asarray(arr) + if arr.dtype.kind == "b": + arr = arr.view(np.uint8) + + if method == "nlargest": + arr = -arr + if is_integer_dtype(new_dtype): + # GH 21426: ensure reverse ordering at boundaries + arr -= 1 + + elif is_bool_dtype(new_dtype): + # GH 26154: ensure False is smaller than True + arr = 1 - (-arr) + + if self.keep == "last": + arr = arr[::-1] + + nbase = n + narr = len(arr) + n = min(n, narr) + + # arr passed into kth_smallest must be contiguous. We copy + # here because kth_smallest will modify its input + # avoid OOB access with kth_smallest_c when n <= 0 + if len(arr) > 0: + kth_val = libalgos.kth_smallest(arr.copy(order="C"), n - 1) + else: + kth_val = np.nan + (ns,) = np.nonzero(arr <= kth_val) + inds = ns[arr[ns].argsort(kind="mergesort")] + + if self.keep != "all": + inds = inds[:n] + findex = nbase + else: + if len(inds) < nbase <= len(nan_index) + len(inds): + findex = len(nan_index) + len(inds) + else: + findex = len(inds) + + if self.keep == "last": + # reverse indices + inds = narr - 1 - inds + + return concat([dropped.iloc[inds], nan_index]).iloc[:findex] + + +class SelectNFrame(SelectN): + """ + Implement n largest/smallest for DataFrame + + Parameters + ---------- + obj : DataFrame + n : int + keep : {'first', 'last'}, default 'first' + columns : list or str + + Returns + ------- + nordered : DataFrame + """ + + def __init__(self, obj: DataFrame, n: int, keep: str, columns: IndexLabel) -> None: + super().__init__(obj, n, keep) + if not is_list_like(columns) or isinstance(columns, tuple): + columns = [columns] + + columns = cast(Sequence[Hashable], columns) + columns = list(columns) + self.columns = columns + + def compute(self, method: str) -> DataFrame: + from pandas.core.api import Index + + n = self.n + frame = self.obj + columns = self.columns + + for column in columns: + dtype = frame[column].dtype + if not self.is_valid_dtype_n_method(dtype): + raise TypeError( + f"Column {repr(column)} has dtype {dtype}, " + f"cannot use method {repr(method)} with this dtype" + ) + + def get_indexer(current_indexer, other_indexer): + """ + Helper function to concat `current_indexer` and `other_indexer` + depending on `method` + """ + if method == "nsmallest": + return current_indexer.append(other_indexer) + else: + return other_indexer.append(current_indexer) + + # Below we save and reset the index in case index contains duplicates + original_index = frame.index + cur_frame = frame = frame.reset_index(drop=True) + cur_n = n + indexer = Index([], dtype=np.int64) + + for i, column in enumerate(columns): + # For each column we apply method to cur_frame[column]. + # If it's the last column or if we have the number of + # results desired we are done. + # Otherwise there are duplicates of the largest/smallest + # value and we need to look at the rest of the columns + # to determine which of the rows with the largest/smallest + # value in the column to keep. + series = cur_frame[column] + is_last_column = len(columns) - 1 == i + values = getattr(series, method)( + cur_n, keep=self.keep if is_last_column else "all" + ) + + if is_last_column or len(values) <= cur_n: + indexer = get_indexer(indexer, values.index) + break + + # Now find all values which are equal to + # the (nsmallest: largest)/(nlargest: smallest) + # from our series. + border_value = values == values[values.index[-1]] + + # Some of these values are among the top-n + # some aren't. + unsafe_values = values[border_value] + + # These values are definitely among the top-n + safe_values = values[~border_value] + indexer = get_indexer(indexer, safe_values.index) + + # Go on and separate the unsafe_values on the remaining + # columns. + cur_frame = cur_frame.loc[unsafe_values.index] + cur_n = n - len(indexer) + + frame = frame.take(indexer) + + # Restore the index on frame + frame.index = original_index.take(indexer) + + # If there is only one column, the frame is already sorted. + if len(columns) == 1: + return frame + + ascending = method == "nsmallest" + + return frame.sort_values(columns, ascending=ascending, kind="mergesort") diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/methods/to_dict.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/methods/to_dict.py new file mode 100644 index 0000000000000000000000000000000000000000..accbd92a91ed66fe4dfd1e3939fe8c498184285d --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/methods/to_dict.py @@ -0,0 +1,272 @@ +from __future__ import annotations + +from typing import ( + TYPE_CHECKING, + Literal, + overload, +) +import warnings + +import numpy as np + +from pandas._libs import ( + lib, + missing as libmissing, +) +from pandas.util._exceptions import find_stack_level + +from pandas.core.dtypes.cast import maybe_box_native +from pandas.core.dtypes.dtypes import ( + BaseMaskedDtype, + ExtensionDtype, +) + +from pandas.core import common as com + +if TYPE_CHECKING: + from pandas._typing import MutableMappingT + + from pandas import DataFrame + + +@overload +def to_dict( + df: DataFrame, + orient: Literal["dict", "list", "series", "split", "tight", "index"] = ..., + *, + into: type[MutableMappingT] | MutableMappingT, + index: bool = ..., +) -> MutableMappingT: + ... + + +@overload +def to_dict( + df: DataFrame, + orient: Literal["records"], + *, + into: type[MutableMappingT] | MutableMappingT, + index: bool = ..., +) -> list[MutableMappingT]: + ... + + +@overload +def to_dict( + df: DataFrame, + orient: Literal["dict", "list", "series", "split", "tight", "index"] = ..., + *, + into: type[dict] = ..., + index: bool = ..., +) -> dict: + ... + + +@overload +def to_dict( + df: DataFrame, + orient: Literal["records"], + *, + into: type[dict] = ..., + index: bool = ..., +) -> list[dict]: + ... + + +# error: Incompatible default for argument "into" (default has type "type[dict +# [Any, Any]]", argument has type "type[MutableMappingT] | MutableMappingT") +def to_dict( + df: DataFrame, + orient: Literal[ + "dict", "list", "series", "split", "tight", "records", "index" + ] = "dict", + *, + into: type[MutableMappingT] | MutableMappingT = dict, # type: ignore[assignment] + index: bool = True, +) -> MutableMappingT | list[MutableMappingT]: + """ + Convert the DataFrame to a dictionary. + + The type of the key-value pairs can be customized with the parameters + (see below). + + Parameters + ---------- + orient : str {'dict', 'list', 'series', 'split', 'tight', 'records', 'index'} + Determines the type of the values of the dictionary. + + - 'dict' (default) : dict like {column -> {index -> value}} + - 'list' : dict like {column -> [values]} + - 'series' : dict like {column -> Series(values)} + - 'split' : dict like + {'index' -> [index], 'columns' -> [columns], 'data' -> [values]} + - 'tight' : dict like + {'index' -> [index], 'columns' -> [columns], 'data' -> [values], + 'index_names' -> [index.names], 'column_names' -> [column.names]} + - 'records' : list like + [{column -> value}, ... , {column -> value}] + - 'index' : dict like {index -> {column -> value}} + + .. versionadded:: 1.4.0 + 'tight' as an allowed value for the ``orient`` argument + + into : class, default dict + The collections.abc.MutableMapping subclass used for all Mappings + in the return value. Can be the actual class or an empty + instance of the mapping type you want. If you want a + collections.defaultdict, you must pass it initialized. + + index : bool, default True + Whether to include the index item (and index_names item if `orient` + is 'tight') in the returned dictionary. Can only be ``False`` + when `orient` is 'split' or 'tight'. + + .. versionadded:: 2.0.0 + + Returns + ------- + dict, list or collections.abc.Mapping + Return a collections.abc.MutableMapping object representing the + DataFrame. The resulting transformation depends on the `orient` parameter. + """ + if not df.columns.is_unique: + warnings.warn( + "DataFrame columns are not unique, some columns will be omitted.", + UserWarning, + stacklevel=find_stack_level(), + ) + # GH16122 + into_c = com.standardize_mapping(into) + + # error: Incompatible types in assignment (expression has type "str", + # variable has type "Literal['dict', 'list', 'series', 'split', 'tight', + # 'records', 'index']") + orient = orient.lower() # type: ignore[assignment] + + if not index and orient not in ["split", "tight"]: + raise ValueError( + "'index=False' is only valid when 'orient' is 'split' or 'tight'" + ) + + if orient == "series": + # GH46470 Return quickly if orient series to avoid creating dtype objects + return into_c((k, v) for k, v in df.items()) + + box_native_indices = [ + i + for i, col_dtype in enumerate(df.dtypes.values) + if col_dtype == np.dtype(object) or isinstance(col_dtype, ExtensionDtype) + ] + box_na_values = [ + lib.no_default if not isinstance(col_dtype, BaseMaskedDtype) else libmissing.NA + for i, col_dtype in enumerate(df.dtypes.values) + ] + are_all_object_dtype_cols = len(box_native_indices) == len(df.dtypes) + + if orient == "dict": + return into_c((k, v.to_dict(into=into)) for k, v in df.items()) + + elif orient == "list": + object_dtype_indices_as_set: set[int] = set(box_native_indices) + return into_c( + ( + k, + list(map(maybe_box_native, v.to_numpy(na_value=box_na_values[i]))) + if i in object_dtype_indices_as_set + else list(map(maybe_box_native, v.to_numpy())), + ) + for i, (k, v) in enumerate(df.items()) + ) + + elif orient == "split": + data = df._create_data_for_split_and_tight_to_dict( + are_all_object_dtype_cols, box_native_indices + ) + + return into_c( + ((("index", df.index.tolist()),) if index else ()) + + ( + ("columns", df.columns.tolist()), + ("data", data), + ) + ) + + elif orient == "tight": + data = df._create_data_for_split_and_tight_to_dict( + are_all_object_dtype_cols, box_native_indices + ) + + return into_c( + ((("index", df.index.tolist()),) if index else ()) + + ( + ("columns", df.columns.tolist()), + ( + "data", + [ + list(map(maybe_box_native, t)) + for t in df.itertuples(index=False, name=None) + ], + ), + ) + + ((("index_names", list(df.index.names)),) if index else ()) + + (("column_names", list(df.columns.names)),) + ) + + elif orient == "records": + columns = df.columns.tolist() + if are_all_object_dtype_cols: + rows = ( + dict(zip(columns, row)) for row in df.itertuples(index=False, name=None) + ) + return [ + into_c((k, maybe_box_native(v)) for k, v in row.items()) for row in rows + ] + else: + data = [ + into_c(zip(columns, t)) for t in df.itertuples(index=False, name=None) + ] + if box_native_indices: + object_dtype_indices_as_set = set(box_native_indices) + object_dtype_cols = { + col + for i, col in enumerate(df.columns) + if i in object_dtype_indices_as_set + } + for row in data: + for col in object_dtype_cols: + row[col] = maybe_box_native(row[col]) + return data + + elif orient == "index": + if not df.index.is_unique: + raise ValueError("DataFrame index must be unique for orient='index'.") + columns = df.columns.tolist() + if are_all_object_dtype_cols: + return into_c( + (t[0], dict(zip(df.columns, map(maybe_box_native, t[1:])))) + for t in df.itertuples(name=None) + ) + elif box_native_indices: + object_dtype_indices_as_set = set(box_native_indices) + is_object_dtype_by_index = [ + i in object_dtype_indices_as_set for i in range(len(df.columns)) + ] + return into_c( + ( + t[0], + { + columns[i]: maybe_box_native(v) + if is_object_dtype_by_index[i] + else v + for i, v in enumerate(t[1:]) + }, + ) + for t in df.itertuples(name=None) + ) + else: + return into_c( + (t[0], dict(zip(df.columns, t[1:]))) for t in df.itertuples(name=None) + ) + + else: + raise ValueError(f"orient '{orient}' not understood") diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/ops/__init__.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/ops/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..ae889a7fdbc24935c0884e8bbcdf56bde8946460 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/ops/__init__.py @@ -0,0 +1,93 @@ +""" +Arithmetic operations for PandasObjects + +This is not a public API. +""" +from __future__ import annotations + +from pandas.core.ops.array_ops import ( + arithmetic_op, + comp_method_OBJECT_ARRAY, + comparison_op, + fill_binop, + get_array_op, + logical_op, + maybe_prepare_scalar_for_op, +) +from pandas.core.ops.common import ( + get_op_result_name, + unpack_zerodim_and_defer, +) +from pandas.core.ops.docstrings import make_flex_doc +from pandas.core.ops.invalid import invalid_comparison +from pandas.core.ops.mask_ops import ( + kleene_and, + kleene_or, + kleene_xor, +) +from pandas.core.roperator import ( + radd, + rand_, + rdiv, + rdivmod, + rfloordiv, + rmod, + rmul, + ror_, + rpow, + rsub, + rtruediv, + rxor, +) + +# ----------------------------------------------------------------------------- +# constants +ARITHMETIC_BINOPS: set[str] = { + "add", + "sub", + "mul", + "pow", + "mod", + "floordiv", + "truediv", + "divmod", + "radd", + "rsub", + "rmul", + "rpow", + "rmod", + "rfloordiv", + "rtruediv", + "rdivmod", +} + + +__all__ = [ + "ARITHMETIC_BINOPS", + "arithmetic_op", + "comparison_op", + "comp_method_OBJECT_ARRAY", + "invalid_comparison", + "fill_binop", + "kleene_and", + "kleene_or", + "kleene_xor", + "logical_op", + "make_flex_doc", + "radd", + "rand_", + "rdiv", + "rdivmod", + "rfloordiv", + "rmod", + "rmul", + "ror_", + "rpow", + "rsub", + "rtruediv", + "rxor", + "unpack_zerodim_and_defer", + "get_op_result_name", + "maybe_prepare_scalar_for_op", + "get_array_op", +] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/ops/array_ops.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/ops/array_ops.py new file mode 100644 index 0000000000000000000000000000000000000000..4b762a359d321ea61660e78ec63d392f8436939d --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/ops/array_ops.py @@ -0,0 +1,604 @@ +""" +Functions for arithmetic and comparison operations on NumPy arrays and +ExtensionArrays. +""" +from __future__ import annotations + +import datetime +from functools import partial +import operator +from typing import ( + TYPE_CHECKING, + Any, +) +import warnings + +import numpy as np + +from pandas._libs import ( + NaT, + Timedelta, + Timestamp, + lib, + ops as libops, +) +from pandas._libs.tslibs import ( + BaseOffset, + get_supported_dtype, + is_supported_dtype, + is_unitless, +) +from pandas.util._exceptions import find_stack_level + +from pandas.core.dtypes.cast import ( + construct_1d_object_array_from_listlike, + find_common_type, +) +from pandas.core.dtypes.common import ( + ensure_object, + is_bool_dtype, + is_list_like, + is_numeric_v_string_like, + is_object_dtype, + is_scalar, +) +from pandas.core.dtypes.generic import ( + ABCExtensionArray, + ABCIndex, + ABCSeries, +) +from pandas.core.dtypes.missing import ( + isna, + notna, +) + +from pandas.core import roperator +from pandas.core.computation import expressions +from pandas.core.construction import ensure_wrapped_if_datetimelike +from pandas.core.ops import missing +from pandas.core.ops.dispatch import should_extension_dispatch +from pandas.core.ops.invalid import invalid_comparison + +if TYPE_CHECKING: + from pandas._typing import ( + ArrayLike, + Shape, + ) + +# ----------------------------------------------------------------------------- +# Masking NA values and fallbacks for operations numpy does not support + + +def fill_binop(left, right, fill_value): + """ + If a non-None fill_value is given, replace null entries in left and right + with this value, but only in positions where _one_ of left/right is null, + not both. + + Parameters + ---------- + left : array-like + right : array-like + fill_value : object + + Returns + ------- + left : array-like + right : array-like + + Notes + ----- + Makes copies if fill_value is not None and NAs are present. + """ + if fill_value is not None: + left_mask = isna(left) + right_mask = isna(right) + + # one but not both + mask = left_mask ^ right_mask + + if left_mask.any(): + # Avoid making a copy if we can + left = left.copy() + left[left_mask & mask] = fill_value + + if right_mask.any(): + # Avoid making a copy if we can + right = right.copy() + right[right_mask & mask] = fill_value + + return left, right + + +def comp_method_OBJECT_ARRAY(op, x, y): + if isinstance(y, list): + # e.g. test_tuple_categories + y = construct_1d_object_array_from_listlike(y) + + if isinstance(y, (np.ndarray, ABCSeries, ABCIndex)): + if not is_object_dtype(y.dtype): + y = y.astype(np.object_) + + if isinstance(y, (ABCSeries, ABCIndex)): + y = y._values + + if x.shape != y.shape: + raise ValueError("Shapes must match", x.shape, y.shape) + result = libops.vec_compare(x.ravel(), y.ravel(), op) + else: + result = libops.scalar_compare(x.ravel(), y, op) + return result.reshape(x.shape) + + +def _masked_arith_op(x: np.ndarray, y, op): + """ + If the given arithmetic operation fails, attempt it again on + only the non-null elements of the input array(s). + + Parameters + ---------- + x : np.ndarray + y : np.ndarray, Series, Index + op : binary operator + """ + # For Series `x` is 1D so ravel() is a no-op; calling it anyway makes + # the logic valid for both Series and DataFrame ops. + xrav = x.ravel() + + if isinstance(y, np.ndarray): + dtype = find_common_type([x.dtype, y.dtype]) + result = np.empty(x.size, dtype=dtype) + + if len(x) != len(y): + raise ValueError(x.shape, y.shape) + ymask = notna(y) + + # NB: ravel() is only safe since y is ndarray; for e.g. PeriodIndex + # we would get int64 dtype, see GH#19956 + yrav = y.ravel() + mask = notna(xrav) & ymask.ravel() + + # See GH#5284, GH#5035, GH#19448 for historical reference + if mask.any(): + result[mask] = op(xrav[mask], yrav[mask]) + + else: + if not is_scalar(y): + raise TypeError( + f"Cannot broadcast np.ndarray with operand of type { type(y) }" + ) + + # mask is only meaningful for x + result = np.empty(x.size, dtype=x.dtype) + mask = notna(xrav) + + # 1 ** np.nan is 1. So we have to unmask those. + if op is pow: + mask = np.where(x == 1, False, mask) + elif op is roperator.rpow: + mask = np.where(y == 1, False, mask) + + if mask.any(): + result[mask] = op(xrav[mask], y) + + np.putmask(result, ~mask, np.nan) + result = result.reshape(x.shape) # 2D compat + return result + + +def _na_arithmetic_op(left: np.ndarray, right, op, is_cmp: bool = False): + """ + Return the result of evaluating op on the passed in values. + + If native types are not compatible, try coercion to object dtype. + + Parameters + ---------- + left : np.ndarray + right : np.ndarray or scalar + Excludes DataFrame, Series, Index, ExtensionArray. + is_cmp : bool, default False + If this a comparison operation. + + Returns + ------- + array-like + + Raises + ------ + TypeError : invalid operation + """ + if isinstance(right, str): + # can never use numexpr + func = op + else: + func = partial(expressions.evaluate, op) + + try: + result = func(left, right) + except TypeError: + if not is_cmp and ( + left.dtype == object or getattr(right, "dtype", None) == object + ): + # For object dtype, fallback to a masked operation (only operating + # on the non-missing values) + # Don't do this for comparisons, as that will handle complex numbers + # incorrectly, see GH#32047 + result = _masked_arith_op(left, right, op) + else: + raise + + if is_cmp and (is_scalar(result) or result is NotImplemented): + # numpy returned a scalar instead of operating element-wise + # e.g. numeric array vs str + # TODO: can remove this after dropping some future numpy version? + return invalid_comparison(left, right, op) + + return missing.dispatch_fill_zeros(op, left, right, result) + + +def arithmetic_op(left: ArrayLike, right: Any, op): + """ + Evaluate an arithmetic operation `+`, `-`, `*`, `/`, `//`, `%`, `**`, ... + + Note: the caller is responsible for ensuring that numpy warnings are + suppressed (with np.errstate(all="ignore")) if needed. + + Parameters + ---------- + left : np.ndarray or ExtensionArray + right : object + Cannot be a DataFrame or Index. Series is *not* excluded. + op : {operator.add, operator.sub, ...} + Or one of the reversed variants from roperator. + + Returns + ------- + ndarray or ExtensionArray + Or a 2-tuple of these in the case of divmod or rdivmod. + """ + # NB: We assume that extract_array and ensure_wrapped_if_datetimelike + # have already been called on `left` and `right`, + # and `maybe_prepare_scalar_for_op` has already been called on `right` + # We need to special-case datetime64/timedelta64 dtypes (e.g. because numpy + # casts integer dtypes to timedelta64 when operating with timedelta64 - GH#22390) + + if ( + should_extension_dispatch(left, right) + or isinstance(right, (Timedelta, BaseOffset, Timestamp)) + or right is NaT + ): + # Timedelta/Timestamp and other custom scalars are included in the check + # because numexpr will fail on it, see GH#31457 + res_values = op(left, right) + else: + # TODO we should handle EAs consistently and move this check before the if/else + # (https://github.com/pandas-dev/pandas/issues/41165) + # error: Argument 2 to "_bool_arith_check" has incompatible type + # "Union[ExtensionArray, ndarray[Any, Any]]"; expected "ndarray[Any, Any]" + _bool_arith_check(op, left, right) # type: ignore[arg-type] + + # error: Argument 1 to "_na_arithmetic_op" has incompatible type + # "Union[ExtensionArray, ndarray[Any, Any]]"; expected "ndarray[Any, Any]" + res_values = _na_arithmetic_op(left, right, op) # type: ignore[arg-type] + + return res_values + + +def comparison_op(left: ArrayLike, right: Any, op) -> ArrayLike: + """ + Evaluate a comparison operation `=`, `!=`, `>=`, `>`, `<=`, or `<`. + + Note: the caller is responsible for ensuring that numpy warnings are + suppressed (with np.errstate(all="ignore")) if needed. + + Parameters + ---------- + left : np.ndarray or ExtensionArray + right : object + Cannot be a DataFrame, Series, or Index. + op : {operator.eq, operator.ne, operator.gt, operator.ge, operator.lt, operator.le} + + Returns + ------- + ndarray or ExtensionArray + """ + # NB: We assume extract_array has already been called on left and right + lvalues = ensure_wrapped_if_datetimelike(left) + rvalues = ensure_wrapped_if_datetimelike(right) + + rvalues = lib.item_from_zerodim(rvalues) + if isinstance(rvalues, list): + # We don't catch tuple here bc we may be comparing e.g. MultiIndex + # to a tuple that represents a single entry, see test_compare_tuple_strs + rvalues = np.asarray(rvalues) + + if isinstance(rvalues, (np.ndarray, ABCExtensionArray)): + # TODO: make this treatment consistent across ops and classes. + # We are not catching all listlikes here (e.g. frozenset, tuple) + # The ambiguous case is object-dtype. See GH#27803 + if len(lvalues) != len(rvalues): + raise ValueError( + "Lengths must match to compare", lvalues.shape, rvalues.shape + ) + + if should_extension_dispatch(lvalues, rvalues) or ( + (isinstance(rvalues, (Timedelta, BaseOffset, Timestamp)) or right is NaT) + and lvalues.dtype != object + ): + # Call the method on lvalues + res_values = op(lvalues, rvalues) + + elif is_scalar(rvalues) and isna(rvalues): # TODO: but not pd.NA? + # numpy does not like comparisons vs None + if op is operator.ne: + res_values = np.ones(lvalues.shape, dtype=bool) + else: + res_values = np.zeros(lvalues.shape, dtype=bool) + + elif is_numeric_v_string_like(lvalues, rvalues): + # GH#36377 going through the numexpr path would incorrectly raise + return invalid_comparison(lvalues, rvalues, op) + + elif lvalues.dtype == object or isinstance(rvalues, str): + res_values = comp_method_OBJECT_ARRAY(op, lvalues, rvalues) + + else: + res_values = _na_arithmetic_op(lvalues, rvalues, op, is_cmp=True) + + return res_values + + +def na_logical_op(x: np.ndarray, y, op): + try: + # For exposition, write: + # yarr = isinstance(y, np.ndarray) + # yint = is_integer(y) or (yarr and y.dtype.kind == "i") + # ybool = is_bool(y) or (yarr and y.dtype.kind == "b") + # xint = x.dtype.kind == "i" + # xbool = x.dtype.kind == "b" + # Then Cases where this goes through without raising include: + # (xint or xbool) and (yint or bool) + result = op(x, y) + except TypeError: + if isinstance(y, np.ndarray): + # bool-bool dtype operations should be OK, should not get here + assert not (x.dtype.kind == "b" and y.dtype.kind == "b") + x = ensure_object(x) + y = ensure_object(y) + result = libops.vec_binop(x.ravel(), y.ravel(), op) + else: + # let null fall thru + assert lib.is_scalar(y) + if not isna(y): + y = bool(y) + try: + result = libops.scalar_binop(x, y, op) + except ( + TypeError, + ValueError, + AttributeError, + OverflowError, + NotImplementedError, + ) as err: + typ = type(y).__name__ + raise TypeError( + f"Cannot perform '{op.__name__}' with a dtyped [{x.dtype}] array " + f"and scalar of type [{typ}]" + ) from err + + return result.reshape(x.shape) + + +def logical_op(left: ArrayLike, right: Any, op) -> ArrayLike: + """ + Evaluate a logical operation `|`, `&`, or `^`. + + Parameters + ---------- + left : np.ndarray or ExtensionArray + right : object + Cannot be a DataFrame, Series, or Index. + op : {operator.and_, operator.or_, operator.xor} + Or one of the reversed variants from roperator. + + Returns + ------- + ndarray or ExtensionArray + """ + + def fill_bool(x, left=None): + # if `left` is specifically not-boolean, we do not cast to bool + if x.dtype.kind in "cfO": + # dtypes that can hold NA + mask = isna(x) + if mask.any(): + x = x.astype(object) + x[mask] = False + + if left is None or left.dtype.kind == "b": + x = x.astype(bool) + return x + + right = lib.item_from_zerodim(right) + if is_list_like(right) and not hasattr(right, "dtype"): + # e.g. list, tuple + warnings.warn( + "Logical ops (and, or, xor) between Pandas objects and dtype-less " + "sequences (e.g. list, tuple) are deprecated and will raise in a " + "future version. Wrap the object in a Series, Index, or np.array " + "before operating instead.", + FutureWarning, + stacklevel=find_stack_level(), + ) + right = construct_1d_object_array_from_listlike(right) + + # NB: We assume extract_array has already been called on left and right + lvalues = ensure_wrapped_if_datetimelike(left) + rvalues = right + + if should_extension_dispatch(lvalues, rvalues): + # Call the method on lvalues + res_values = op(lvalues, rvalues) + + else: + if isinstance(rvalues, np.ndarray): + is_other_int_dtype = rvalues.dtype.kind in "iu" + if not is_other_int_dtype: + rvalues = fill_bool(rvalues, lvalues) + + else: + # i.e. scalar + is_other_int_dtype = lib.is_integer(rvalues) + + res_values = na_logical_op(lvalues, rvalues, op) + + # For int vs int `^`, `|`, `&` are bitwise operators and return + # integer dtypes. Otherwise these are boolean ops + if not (left.dtype.kind in "iu" and is_other_int_dtype): + res_values = fill_bool(res_values) + + return res_values + + +def get_array_op(op): + """ + Return a binary array operation corresponding to the given operator op. + + Parameters + ---------- + op : function + Binary operator from operator or roperator module. + + Returns + ------- + functools.partial + """ + if isinstance(op, partial): + # We get here via dispatch_to_series in DataFrame case + # e.g. test_rolling_consistency_var_debiasing_factors + return op + + op_name = op.__name__.strip("_").lstrip("r") + if op_name == "arith_op": + # Reached via DataFrame._combine_frame i.e. flex methods + # e.g. test_df_add_flex_filled_mixed_dtypes + return op + + if op_name in {"eq", "ne", "lt", "le", "gt", "ge"}: + return partial(comparison_op, op=op) + elif op_name in {"and", "or", "xor", "rand", "ror", "rxor"}: + return partial(logical_op, op=op) + elif op_name in { + "add", + "sub", + "mul", + "truediv", + "floordiv", + "mod", + "divmod", + "pow", + }: + return partial(arithmetic_op, op=op) + else: + raise NotImplementedError(op_name) + + +def maybe_prepare_scalar_for_op(obj, shape: Shape): + """ + Cast non-pandas objects to pandas types to unify behavior of arithmetic + and comparison operations. + + Parameters + ---------- + obj: object + shape : tuple[int] + + Returns + ------- + out : object + + Notes + ----- + Be careful to call this *after* determining the `name` attribute to be + attached to the result of the arithmetic operation. + """ + if type(obj) is datetime.timedelta: + # GH#22390 cast up to Timedelta to rely on Timedelta + # implementation; otherwise operation against numeric-dtype + # raises TypeError + return Timedelta(obj) + elif type(obj) is datetime.datetime: + # cast up to Timestamp to rely on Timestamp implementation, see Timedelta above + return Timestamp(obj) + elif isinstance(obj, np.datetime64): + # GH#28080 numpy casts integer-dtype to datetime64 when doing + # array[int] + datetime64, which we do not allow + if isna(obj): + from pandas.core.arrays import DatetimeArray + + # Avoid possible ambiguities with pd.NaT + # GH 52295 + if is_unitless(obj.dtype): + obj = obj.astype("datetime64[ns]") + elif not is_supported_dtype(obj.dtype): + new_dtype = get_supported_dtype(obj.dtype) + obj = obj.astype(new_dtype) + right = np.broadcast_to(obj, shape) + return DatetimeArray._simple_new(right, dtype=right.dtype) + + return Timestamp(obj) + + elif isinstance(obj, np.timedelta64): + if isna(obj): + from pandas.core.arrays import TimedeltaArray + + # wrapping timedelta64("NaT") in Timedelta returns NaT, + # which would incorrectly be treated as a datetime-NaT, so + # we broadcast and wrap in a TimedeltaArray + # GH 52295 + if is_unitless(obj.dtype): + obj = obj.astype("timedelta64[ns]") + elif not is_supported_dtype(obj.dtype): + new_dtype = get_supported_dtype(obj.dtype) + obj = obj.astype(new_dtype) + right = np.broadcast_to(obj, shape) + return TimedeltaArray._simple_new(right, dtype=right.dtype) + + # In particular non-nanosecond timedelta64 needs to be cast to + # nanoseconds, or else we get undesired behavior like + # np.timedelta64(3, 'D') / 2 == np.timedelta64(1, 'D') + return Timedelta(obj) + + # We want NumPy numeric scalars to behave like Python scalars + # post NEP 50 + elif isinstance(obj, np.integer): + return int(obj) + + elif isinstance(obj, np.floating): + return float(obj) + + return obj + + +_BOOL_OP_NOT_ALLOWED = { + operator.truediv, + roperator.rtruediv, + operator.floordiv, + roperator.rfloordiv, + operator.pow, + roperator.rpow, +} + + +def _bool_arith_check(op, a: np.ndarray, b): + """ + In contrast to numpy, pandas raises an error for certain operations + with booleans. + """ + if op in _BOOL_OP_NOT_ALLOWED: + if a.dtype.kind == "b" and (is_bool_dtype(b) or lib.is_bool(b)): + op_name = op.__name__.strip("_").lstrip("r") + raise NotImplementedError( + f"operator '{op_name}' not implemented for bool dtypes" + ) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/ops/common.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/ops/common.py new file mode 100644 index 0000000000000000000000000000000000000000..559977bacf881552d546e7704d4cf4b12b4a32fe --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/ops/common.py @@ -0,0 +1,146 @@ +""" +Boilerplate functions used in defining binary operations. +""" +from __future__ import annotations + +from functools import wraps +from typing import ( + TYPE_CHECKING, + Callable, +) + +from pandas._libs.lib import item_from_zerodim +from pandas._libs.missing import is_matching_na + +from pandas.core.dtypes.generic import ( + ABCIndex, + ABCSeries, +) + +if TYPE_CHECKING: + from pandas._typing import F + + +def unpack_zerodim_and_defer(name: str) -> Callable[[F], F]: + """ + Boilerplate for pandas conventions in arithmetic and comparison methods. + + Parameters + ---------- + name : str + + Returns + ------- + decorator + """ + + def wrapper(method: F) -> F: + return _unpack_zerodim_and_defer(method, name) + + return wrapper + + +def _unpack_zerodim_and_defer(method, name: str): + """ + Boilerplate for pandas conventions in arithmetic and comparison methods. + + Ensure method returns NotImplemented when operating against "senior" + classes. Ensure zero-dimensional ndarrays are always unpacked. + + Parameters + ---------- + method : binary method + name : str + + Returns + ------- + method + """ + stripped_name = name.removeprefix("__").removesuffix("__") + is_cmp = stripped_name in {"eq", "ne", "lt", "le", "gt", "ge"} + + @wraps(method) + def new_method(self, other): + if is_cmp and isinstance(self, ABCIndex) and isinstance(other, ABCSeries): + # For comparison ops, Index does *not* defer to Series + pass + else: + prio = getattr(other, "__pandas_priority__", None) + if prio is not None: + if prio > self.__pandas_priority__: + # e.g. other is DataFrame while self is Index/Series/EA + return NotImplemented + + other = item_from_zerodim(other) + + return method(self, other) + + return new_method + + +def get_op_result_name(left, right): + """ + Find the appropriate name to pin to an operation result. This result + should always be either an Index or a Series. + + Parameters + ---------- + left : {Series, Index} + right : object + + Returns + ------- + name : object + Usually a string + """ + if isinstance(right, (ABCSeries, ABCIndex)): + name = _maybe_match_name(left, right) + else: + name = left.name + return name + + +def _maybe_match_name(a, b): + """ + Try to find a name to attach to the result of an operation between + a and b. If only one of these has a `name` attribute, return that + name. Otherwise return a consensus name if they match or None if + they have different names. + + Parameters + ---------- + a : object + b : object + + Returns + ------- + name : str or None + + See Also + -------- + pandas.core.common.consensus_name_attr + """ + a_has = hasattr(a, "name") + b_has = hasattr(b, "name") + if a_has and b_has: + try: + if a.name == b.name: + return a.name + elif is_matching_na(a.name, b.name): + # e.g. both are np.nan + return a.name + else: + return None + except TypeError: + # pd.NA + if is_matching_na(a.name, b.name): + return a.name + return None + except ValueError: + # e.g. np.int64(1) vs (np.int64(1), np.int64(2)) + return None + elif a_has: + return a.name + elif b_has: + return b.name + return None diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/ops/dispatch.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/ops/dispatch.py new file mode 100644 index 0000000000000000000000000000000000000000..a939fdd3d041e9f99dde7ea40fd7aa0572d0d9b7 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/ops/dispatch.py @@ -0,0 +1,30 @@ +""" +Functions for defining unary operations. +""" +from __future__ import annotations + +from typing import ( + TYPE_CHECKING, + Any, +) + +from pandas.core.dtypes.generic import ABCExtensionArray + +if TYPE_CHECKING: + from pandas._typing import ArrayLike + + +def should_extension_dispatch(left: ArrayLike, right: Any) -> bool: + """ + Identify cases where Series operation should dispatch to ExtensionArray method. + + Parameters + ---------- + left : np.ndarray or ExtensionArray + right : object + + Returns + ------- + bool + """ + return isinstance(left, ABCExtensionArray) or isinstance(right, ABCExtensionArray) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/ops/docstrings.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/ops/docstrings.py new file mode 100644 index 0000000000000000000000000000000000000000..bd2e532536d8491af44631e52982217a04ef5b17 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/ops/docstrings.py @@ -0,0 +1,772 @@ +""" +Templating for ops docstrings +""" +from __future__ import annotations + + +def make_flex_doc(op_name: str, typ: str) -> str: + """ + Make the appropriate substitutions for the given operation and class-typ + into either _flex_doc_SERIES or _flex_doc_FRAME to return the docstring + to attach to a generated method. + + Parameters + ---------- + op_name : str {'__add__', '__sub__', ... '__eq__', '__ne__', ...} + typ : str {series, 'dataframe']} + + Returns + ------- + doc : str + """ + op_name = op_name.replace("__", "") + op_desc = _op_descriptions[op_name] + + op_desc_op = op_desc["op"] + assert op_desc_op is not None # for mypy + if op_name.startswith("r"): + equiv = f"other {op_desc_op} {typ}" + elif op_name == "divmod": + equiv = f"{op_name}({typ}, other)" + else: + equiv = f"{typ} {op_desc_op} other" + + if typ == "series": + base_doc = _flex_doc_SERIES + if op_desc["reverse"]: + base_doc += _see_also_reverse_SERIES.format( + reverse=op_desc["reverse"], see_also_desc=op_desc["see_also_desc"] + ) + doc_no_examples = base_doc.format( + desc=op_desc["desc"], + op_name=op_name, + equiv=equiv, + series_returns=op_desc["series_returns"], + ) + ser_example = op_desc["series_examples"] + if ser_example: + doc = doc_no_examples + ser_example + else: + doc = doc_no_examples + elif typ == "dataframe": + if op_name in ["eq", "ne", "le", "lt", "ge", "gt"]: + base_doc = _flex_comp_doc_FRAME + doc = _flex_comp_doc_FRAME.format( + op_name=op_name, + desc=op_desc["desc"], + ) + else: + base_doc = _flex_doc_FRAME + doc = base_doc.format( + desc=op_desc["desc"], + op_name=op_name, + equiv=equiv, + reverse=op_desc["reverse"], + ) + else: + raise AssertionError("Invalid typ argument.") + return doc + + +_common_examples_algebra_SERIES = """ +Examples +-------- +>>> a = pd.Series([1, 1, 1, np.nan], index=['a', 'b', 'c', 'd']) +>>> a +a 1.0 +b 1.0 +c 1.0 +d NaN +dtype: float64 +>>> b = pd.Series([1, np.nan, 1, np.nan], index=['a', 'b', 'd', 'e']) +>>> b +a 1.0 +b NaN +d 1.0 +e NaN +dtype: float64""" + +_common_examples_comparison_SERIES = """ +Examples +-------- +>>> a = pd.Series([1, 1, 1, np.nan, 1], index=['a', 'b', 'c', 'd', 'e']) +>>> a +a 1.0 +b 1.0 +c 1.0 +d NaN +e 1.0 +dtype: float64 +>>> b = pd.Series([0, 1, 2, np.nan, 1], index=['a', 'b', 'c', 'd', 'f']) +>>> b +a 0.0 +b 1.0 +c 2.0 +d NaN +f 1.0 +dtype: float64""" + +_add_example_SERIES = ( + _common_examples_algebra_SERIES + + """ +>>> a.add(b, fill_value=0) +a 2.0 +b 1.0 +c 1.0 +d 1.0 +e NaN +dtype: float64 +""" +) + +_sub_example_SERIES = ( + _common_examples_algebra_SERIES + + """ +>>> a.subtract(b, fill_value=0) +a 0.0 +b 1.0 +c 1.0 +d -1.0 +e NaN +dtype: float64 +""" +) + +_mul_example_SERIES = ( + _common_examples_algebra_SERIES + + """ +>>> a.multiply(b, fill_value=0) +a 1.0 +b 0.0 +c 0.0 +d 0.0 +e NaN +dtype: float64 +""" +) + +_div_example_SERIES = ( + _common_examples_algebra_SERIES + + """ +>>> a.divide(b, fill_value=0) +a 1.0 +b inf +c inf +d 0.0 +e NaN +dtype: float64 +""" +) + +_floordiv_example_SERIES = ( + _common_examples_algebra_SERIES + + """ +>>> a.floordiv(b, fill_value=0) +a 1.0 +b inf +c inf +d 0.0 +e NaN +dtype: float64 +""" +) + +_divmod_example_SERIES = ( + _common_examples_algebra_SERIES + + """ +>>> a.divmod(b, fill_value=0) +(a 1.0 + b inf + c inf + d 0.0 + e NaN + dtype: float64, + a 0.0 + b NaN + c NaN + d 0.0 + e NaN + dtype: float64) +""" +) + +_mod_example_SERIES = ( + _common_examples_algebra_SERIES + + """ +>>> a.mod(b, fill_value=0) +a 0.0 +b NaN +c NaN +d 0.0 +e NaN +dtype: float64 +""" +) +_pow_example_SERIES = ( + _common_examples_algebra_SERIES + + """ +>>> a.pow(b, fill_value=0) +a 1.0 +b 1.0 +c 1.0 +d 0.0 +e NaN +dtype: float64 +""" +) + +_ne_example_SERIES = ( + _common_examples_algebra_SERIES + + """ +>>> a.ne(b, fill_value=0) +a False +b True +c True +d True +e True +dtype: bool +""" +) + +_eq_example_SERIES = ( + _common_examples_algebra_SERIES + + """ +>>> a.eq(b, fill_value=0) +a True +b False +c False +d False +e False +dtype: bool +""" +) + +_lt_example_SERIES = ( + _common_examples_comparison_SERIES + + """ +>>> a.lt(b, fill_value=0) +a False +b False +c True +d False +e False +f True +dtype: bool +""" +) + +_le_example_SERIES = ( + _common_examples_comparison_SERIES + + """ +>>> a.le(b, fill_value=0) +a False +b True +c True +d False +e False +f True +dtype: bool +""" +) + +_gt_example_SERIES = ( + _common_examples_comparison_SERIES + + """ +>>> a.gt(b, fill_value=0) +a True +b False +c False +d False +e True +f False +dtype: bool +""" +) + +_ge_example_SERIES = ( + _common_examples_comparison_SERIES + + """ +>>> a.ge(b, fill_value=0) +a True +b True +c False +d False +e True +f False +dtype: bool +""" +) + +_returns_series = """Series\n The result of the operation.""" + +_returns_tuple = """2-Tuple of Series\n The result of the operation.""" + +_op_descriptions: dict[str, dict[str, str | None]] = { + # Arithmetic Operators + "add": { + "op": "+", + "desc": "Addition", + "reverse": "radd", + "series_examples": _add_example_SERIES, + "series_returns": _returns_series, + }, + "sub": { + "op": "-", + "desc": "Subtraction", + "reverse": "rsub", + "series_examples": _sub_example_SERIES, + "series_returns": _returns_series, + }, + "mul": { + "op": "*", + "desc": "Multiplication", + "reverse": "rmul", + "series_examples": _mul_example_SERIES, + "series_returns": _returns_series, + "df_examples": None, + }, + "mod": { + "op": "%", + "desc": "Modulo", + "reverse": "rmod", + "series_examples": _mod_example_SERIES, + "series_returns": _returns_series, + }, + "pow": { + "op": "**", + "desc": "Exponential power", + "reverse": "rpow", + "series_examples": _pow_example_SERIES, + "series_returns": _returns_series, + "df_examples": None, + }, + "truediv": { + "op": "/", + "desc": "Floating division", + "reverse": "rtruediv", + "series_examples": _div_example_SERIES, + "series_returns": _returns_series, + "df_examples": None, + }, + "floordiv": { + "op": "//", + "desc": "Integer division", + "reverse": "rfloordiv", + "series_examples": _floordiv_example_SERIES, + "series_returns": _returns_series, + "df_examples": None, + }, + "divmod": { + "op": "divmod", + "desc": "Integer division and modulo", + "reverse": "rdivmod", + "series_examples": _divmod_example_SERIES, + "series_returns": _returns_tuple, + "df_examples": None, + }, + # Comparison Operators + "eq": { + "op": "==", + "desc": "Equal to", + "reverse": None, + "series_examples": _eq_example_SERIES, + "series_returns": _returns_series, + }, + "ne": { + "op": "!=", + "desc": "Not equal to", + "reverse": None, + "series_examples": _ne_example_SERIES, + "series_returns": _returns_series, + }, + "lt": { + "op": "<", + "desc": "Less than", + "reverse": None, + "series_examples": _lt_example_SERIES, + "series_returns": _returns_series, + }, + "le": { + "op": "<=", + "desc": "Less than or equal to", + "reverse": None, + "series_examples": _le_example_SERIES, + "series_returns": _returns_series, + }, + "gt": { + "op": ">", + "desc": "Greater than", + "reverse": None, + "series_examples": _gt_example_SERIES, + "series_returns": _returns_series, + }, + "ge": { + "op": ">=", + "desc": "Greater than or equal to", + "reverse": None, + "series_examples": _ge_example_SERIES, + "series_returns": _returns_series, + }, +} + +_py_num_ref = """see + `Python documentation + `_ + for more details""" +_op_names = list(_op_descriptions.keys()) +for key in _op_names: + reverse_op = _op_descriptions[key]["reverse"] + if reverse_op is not None: + _op_descriptions[reverse_op] = _op_descriptions[key].copy() + _op_descriptions[reverse_op]["reverse"] = key + _op_descriptions[key][ + "see_also_desc" + ] = f"Reverse of the {_op_descriptions[key]['desc']} operator, {_py_num_ref}" + _op_descriptions[reverse_op][ + "see_also_desc" + ] = f"Element-wise {_op_descriptions[key]['desc']}, {_py_num_ref}" + +_flex_doc_SERIES = """ +Return {desc} of series and other, element-wise (binary operator `{op_name}`). + +Equivalent to ``{equiv}``, but with support to substitute a fill_value for +missing data in either one of the inputs. + +Parameters +---------- +other : Series or scalar value +level : int or name + Broadcast across a level, matching Index values on the + passed MultiIndex level. +fill_value : None or float value, default None (NaN) + Fill existing missing (NaN) values, and any new element needed for + successful Series alignment, with this value before computation. + If data in both corresponding Series locations is missing + the result of filling (at that location) will be missing. +axis : {{0 or 'index'}} + Unused. Parameter needed for compatibility with DataFrame. + +Returns +------- +{series_returns} +""" + +_see_also_reverse_SERIES = """ +See Also +-------- +Series.{reverse} : {see_also_desc}. +""" + +_flex_doc_FRAME = """ +Get {desc} of dataframe and other, element-wise (binary operator `{op_name}`). + +Equivalent to ``{equiv}``, but with support to substitute a fill_value +for missing data in one of the inputs. With reverse version, `{reverse}`. + +Among flexible wrappers (`add`, `sub`, `mul`, `div`, `floordiv`, `mod`, `pow`) to +arithmetic operators: `+`, `-`, `*`, `/`, `//`, `%`, `**`. + +Parameters +---------- +other : scalar, sequence, Series, dict or DataFrame + Any single or multiple element data structure, or list-like object. +axis : {{0 or 'index', 1 or 'columns'}} + Whether to compare by the index (0 or 'index') or columns. + (1 or 'columns'). For Series input, axis to match Series index on. +level : int or label + Broadcast across a level, matching Index values on the + passed MultiIndex level. +fill_value : float or None, default None + Fill existing missing (NaN) values, and any new element needed for + successful DataFrame alignment, with this value before computation. + If data in both corresponding DataFrame locations is missing + the result will be missing. + +Returns +------- +DataFrame + Result of the arithmetic operation. + +See Also +-------- +DataFrame.add : Add DataFrames. +DataFrame.sub : Subtract DataFrames. +DataFrame.mul : Multiply DataFrames. +DataFrame.div : Divide DataFrames (float division). +DataFrame.truediv : Divide DataFrames (float division). +DataFrame.floordiv : Divide DataFrames (integer division). +DataFrame.mod : Calculate modulo (remainder after division). +DataFrame.pow : Calculate exponential power. + +Notes +----- +Mismatched indices will be unioned together. + +Examples +-------- +>>> df = pd.DataFrame({{'angles': [0, 3, 4], +... 'degrees': [360, 180, 360]}}, +... index=['circle', 'triangle', 'rectangle']) +>>> df + angles degrees +circle 0 360 +triangle 3 180 +rectangle 4 360 + +Add a scalar with operator version which return the same +results. + +>>> df + 1 + angles degrees +circle 1 361 +triangle 4 181 +rectangle 5 361 + +>>> df.add(1) + angles degrees +circle 1 361 +triangle 4 181 +rectangle 5 361 + +Divide by constant with reverse version. + +>>> df.div(10) + angles degrees +circle 0.0 36.0 +triangle 0.3 18.0 +rectangle 0.4 36.0 + +>>> df.rdiv(10) + angles degrees +circle inf 0.027778 +triangle 3.333333 0.055556 +rectangle 2.500000 0.027778 + +Subtract a list and Series by axis with operator version. + +>>> df - [1, 2] + angles degrees +circle -1 358 +triangle 2 178 +rectangle 3 358 + +>>> df.sub([1, 2], axis='columns') + angles degrees +circle -1 358 +triangle 2 178 +rectangle 3 358 + +>>> df.sub(pd.Series([1, 1, 1], index=['circle', 'triangle', 'rectangle']), +... axis='index') + angles degrees +circle -1 359 +triangle 2 179 +rectangle 3 359 + +Multiply a dictionary by axis. + +>>> df.mul({{'angles': 0, 'degrees': 2}}) + angles degrees +circle 0 720 +triangle 0 360 +rectangle 0 720 + +>>> df.mul({{'circle': 0, 'triangle': 2, 'rectangle': 3}}, axis='index') + angles degrees +circle 0 0 +triangle 6 360 +rectangle 12 1080 + +Multiply a DataFrame of different shape with operator version. + +>>> other = pd.DataFrame({{'angles': [0, 3, 4]}}, +... index=['circle', 'triangle', 'rectangle']) +>>> other + angles +circle 0 +triangle 3 +rectangle 4 + +>>> df * other + angles degrees +circle 0 NaN +triangle 9 NaN +rectangle 16 NaN + +>>> df.mul(other, fill_value=0) + angles degrees +circle 0 0.0 +triangle 9 0.0 +rectangle 16 0.0 + +Divide by a MultiIndex by level. + +>>> df_multindex = pd.DataFrame({{'angles': [0, 3, 4, 4, 5, 6], +... 'degrees': [360, 180, 360, 360, 540, 720]}}, +... index=[['A', 'A', 'A', 'B', 'B', 'B'], +... ['circle', 'triangle', 'rectangle', +... 'square', 'pentagon', 'hexagon']]) +>>> df_multindex + angles degrees +A circle 0 360 + triangle 3 180 + rectangle 4 360 +B square 4 360 + pentagon 5 540 + hexagon 6 720 + +>>> df.div(df_multindex, level=1, fill_value=0) + angles degrees +A circle NaN 1.0 + triangle 1.0 1.0 + rectangle 1.0 1.0 +B square 0.0 0.0 + pentagon 0.0 0.0 + hexagon 0.0 0.0 +""" + +_flex_comp_doc_FRAME = """ +Get {desc} of dataframe and other, element-wise (binary operator `{op_name}`). + +Among flexible wrappers (`eq`, `ne`, `le`, `lt`, `ge`, `gt`) to comparison +operators. + +Equivalent to `==`, `!=`, `<=`, `<`, `>=`, `>` with support to choose axis +(rows or columns) and level for comparison. + +Parameters +---------- +other : scalar, sequence, Series, or DataFrame + Any single or multiple element data structure, or list-like object. +axis : {{0 or 'index', 1 or 'columns'}}, default 'columns' + Whether to compare by the index (0 or 'index') or columns + (1 or 'columns'). +level : int or label + Broadcast across a level, matching Index values on the passed + MultiIndex level. + +Returns +------- +DataFrame of bool + Result of the comparison. + +See Also +-------- +DataFrame.eq : Compare DataFrames for equality elementwise. +DataFrame.ne : Compare DataFrames for inequality elementwise. +DataFrame.le : Compare DataFrames for less than inequality + or equality elementwise. +DataFrame.lt : Compare DataFrames for strictly less than + inequality elementwise. +DataFrame.ge : Compare DataFrames for greater than inequality + or equality elementwise. +DataFrame.gt : Compare DataFrames for strictly greater than + inequality elementwise. + +Notes +----- +Mismatched indices will be unioned together. +`NaN` values are considered different (i.e. `NaN` != `NaN`). + +Examples +-------- +>>> df = pd.DataFrame({{'cost': [250, 150, 100], +... 'revenue': [100, 250, 300]}}, +... index=['A', 'B', 'C']) +>>> df + cost revenue +A 250 100 +B 150 250 +C 100 300 + +Comparison with a scalar, using either the operator or method: + +>>> df == 100 + cost revenue +A False True +B False False +C True False + +>>> df.eq(100) + cost revenue +A False True +B False False +C True False + +When `other` is a :class:`Series`, the columns of a DataFrame are aligned +with the index of `other` and broadcast: + +>>> df != pd.Series([100, 250], index=["cost", "revenue"]) + cost revenue +A True True +B True False +C False True + +Use the method to control the broadcast axis: + +>>> df.ne(pd.Series([100, 300], index=["A", "D"]), axis='index') + cost revenue +A True False +B True True +C True True +D True True + +When comparing to an arbitrary sequence, the number of columns must +match the number elements in `other`: + +>>> df == [250, 100] + cost revenue +A True True +B False False +C False False + +Use the method to control the axis: + +>>> df.eq([250, 250, 100], axis='index') + cost revenue +A True False +B False True +C True False + +Compare to a DataFrame of different shape. + +>>> other = pd.DataFrame({{'revenue': [300, 250, 100, 150]}}, +... index=['A', 'B', 'C', 'D']) +>>> other + revenue +A 300 +B 250 +C 100 +D 150 + +>>> df.gt(other) + cost revenue +A False False +B False False +C False True +D False False + +Compare to a MultiIndex by level. + +>>> df_multindex = pd.DataFrame({{'cost': [250, 150, 100, 150, 300, 220], +... 'revenue': [100, 250, 300, 200, 175, 225]}}, +... index=[['Q1', 'Q1', 'Q1', 'Q2', 'Q2', 'Q2'], +... ['A', 'B', 'C', 'A', 'B', 'C']]) +>>> df_multindex + cost revenue +Q1 A 250 100 + B 150 250 + C 100 300 +Q2 A 150 200 + B 300 175 + C 220 225 + +>>> df.le(df_multindex, level=1) + cost revenue +Q1 A True True + B True True + C True True +Q2 A False True + B True False + C True False +""" diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/ops/invalid.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/ops/invalid.py new file mode 100644 index 0000000000000000000000000000000000000000..e5ae6d359ac2205b01706211382d116b29176c7a --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/ops/invalid.py @@ -0,0 +1,62 @@ +""" +Templates for invalid operations. +""" +from __future__ import annotations + +import operator +from typing import TYPE_CHECKING + +import numpy as np + +if TYPE_CHECKING: + from pandas._typing import npt + + +def invalid_comparison(left, right, op) -> npt.NDArray[np.bool_]: + """ + If a comparison has mismatched types and is not necessarily meaningful, + follow python3 conventions by: + + - returning all-False for equality + - returning all-True for inequality + - raising TypeError otherwise + + Parameters + ---------- + left : array-like + right : scalar, array-like + op : operator.{eq, ne, lt, le, gt} + + Raises + ------ + TypeError : on inequality comparisons + """ + if op is operator.eq: + res_values = np.zeros(left.shape, dtype=bool) + elif op is operator.ne: + res_values = np.ones(left.shape, dtype=bool) + else: + typ = type(right).__name__ + raise TypeError(f"Invalid comparison between dtype={left.dtype} and {typ}") + return res_values + + +def make_invalid_op(name: str): + """ + Return a binary method that always raises a TypeError. + + Parameters + ---------- + name : str + + Returns + ------- + invalid_op : function + """ + + def invalid_op(self, other=None): + typ = type(self).__name__ + raise TypeError(f"cannot perform {name} with this index type: {typ}") + + invalid_op.__name__ = name + return invalid_op diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/ops/mask_ops.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/ops/mask_ops.py new file mode 100644 index 0000000000000000000000000000000000000000..adc1f63c568bf579f31b13446f4614435d443df1 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/ops/mask_ops.py @@ -0,0 +1,189 @@ +""" +Ops for masked arrays. +""" +from __future__ import annotations + +import numpy as np + +from pandas._libs import ( + lib, + missing as libmissing, +) + + +def kleene_or( + left: bool | np.ndarray | libmissing.NAType, + right: bool | np.ndarray | libmissing.NAType, + left_mask: np.ndarray | None, + right_mask: np.ndarray | None, +): + """ + Boolean ``or`` using Kleene logic. + + Values are NA where we have ``NA | NA`` or ``NA | False``. + ``NA | True`` is considered True. + + Parameters + ---------- + left, right : ndarray, NA, or bool + The values of the array. + left_mask, right_mask : ndarray, optional + The masks. Only one of these may be None, which implies that + the associated `left` or `right` value is a scalar. + + Returns + ------- + result, mask: ndarray[bool] + The result of the logical or, and the new mask. + """ + # To reduce the number of cases, we ensure that `left` & `left_mask` + # always come from an array, not a scalar. This is safe, since + # A | B == B | A + if left_mask is None: + return kleene_or(right, left, right_mask, left_mask) + + if not isinstance(left, np.ndarray): + raise TypeError("Either `left` or `right` need to be a np.ndarray.") + + raise_for_nan(right, method="or") + + if right is libmissing.NA: + result = left.copy() + else: + result = left | right + + if right_mask is not None: + # output is unknown where (False & NA), (NA & False), (NA & NA) + left_false = ~(left | left_mask) + right_false = ~(right | right_mask) + mask = ( + (left_false & right_mask) + | (right_false & left_mask) + | (left_mask & right_mask) + ) + else: + if right is True: + mask = np.zeros_like(left_mask) + elif right is libmissing.NA: + mask = (~left & ~left_mask) | left_mask + else: + # False + mask = left_mask.copy() + + return result, mask + + +def kleene_xor( + left: bool | np.ndarray | libmissing.NAType, + right: bool | np.ndarray | libmissing.NAType, + left_mask: np.ndarray | None, + right_mask: np.ndarray | None, +): + """ + Boolean ``xor`` using Kleene logic. + + This is the same as ``or``, with the following adjustments + + * True, True -> False + * True, NA -> NA + + Parameters + ---------- + left, right : ndarray, NA, or bool + The values of the array. + left_mask, right_mask : ndarray, optional + The masks. Only one of these may be None, which implies that + the associated `left` or `right` value is a scalar. + + Returns + ------- + result, mask: ndarray[bool] + The result of the logical xor, and the new mask. + """ + # To reduce the number of cases, we ensure that `left` & `left_mask` + # always come from an array, not a scalar. This is safe, since + # A ^ B == B ^ A + if left_mask is None: + return kleene_xor(right, left, right_mask, left_mask) + + if not isinstance(left, np.ndarray): + raise TypeError("Either `left` or `right` need to be a np.ndarray.") + + raise_for_nan(right, method="xor") + if right is libmissing.NA: + result = np.zeros_like(left) + else: + result = left ^ right + + if right_mask is None: + if right is libmissing.NA: + mask = np.ones_like(left_mask) + else: + mask = left_mask.copy() + else: + mask = left_mask | right_mask + + return result, mask + + +def kleene_and( + left: bool | libmissing.NAType | np.ndarray, + right: bool | libmissing.NAType | np.ndarray, + left_mask: np.ndarray | None, + right_mask: np.ndarray | None, +): + """ + Boolean ``and`` using Kleene logic. + + Values are ``NA`` for ``NA & NA`` or ``True & NA``. + + Parameters + ---------- + left, right : ndarray, NA, or bool + The values of the array. + left_mask, right_mask : ndarray, optional + The masks. Only one of these may be None, which implies that + the associated `left` or `right` value is a scalar. + + Returns + ------- + result, mask: ndarray[bool] + The result of the logical xor, and the new mask. + """ + # To reduce the number of cases, we ensure that `left` & `left_mask` + # always come from an array, not a scalar. This is safe, since + # A & B == B & A + if left_mask is None: + return kleene_and(right, left, right_mask, left_mask) + + if not isinstance(left, np.ndarray): + raise TypeError("Either `left` or `right` need to be a np.ndarray.") + raise_for_nan(right, method="and") + + if right is libmissing.NA: + result = np.zeros_like(left) + else: + result = left & right + + if right_mask is None: + # Scalar `right` + if right is libmissing.NA: + mask = (left & ~left_mask) | left_mask + + else: + mask = left_mask.copy() + if right is False: + # unmask everything + mask[:] = False + else: + # unmask where either left or right is False + left_false = ~(left | left_mask) + right_false = ~(right | right_mask) + mask = (left_mask & ~right_false) | (right_mask & ~left_false) + + return result, mask + + +def raise_for_nan(value, method: str) -> None: + if lib.is_float(value) and np.isnan(value): + raise ValueError(f"Cannot perform logical '{method}' with floating NaN") diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/ops/missing.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/ops/missing.py new file mode 100644 index 0000000000000000000000000000000000000000..fc685935a35fceab74012912d3c3cae65b9c1818 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/ops/missing.py @@ -0,0 +1,176 @@ +""" +Missing data handling for arithmetic operations. + +In particular, pandas conventions regarding division by zero differ +from numpy in the following ways: + 1) np.array([-1, 0, 1], dtype=dtype1) // np.array([0, 0, 0], dtype=dtype2) + gives [nan, nan, nan] for most dtype combinations, and [0, 0, 0] for + the remaining pairs + (the remaining being dtype1==dtype2==intN and dtype==dtype2==uintN). + + pandas convention is to return [-inf, nan, inf] for all dtype + combinations. + + Note: the numpy behavior described here is py3-specific. + + 2) np.array([-1, 0, 1], dtype=dtype1) % np.array([0, 0, 0], dtype=dtype2) + gives precisely the same results as the // operation. + + pandas convention is to return [nan, nan, nan] for all dtype + combinations. + + 3) divmod behavior consistent with 1) and 2). +""" +from __future__ import annotations + +import operator + +import numpy as np + +from pandas.core import roperator + + +def _fill_zeros(result: np.ndarray, x, y): + """ + If this is a reversed op, then flip x,y + + If we have an integer value (or array in y) + and we have 0's, fill them with np.nan, + return the result. + + Mask the nan's from x. + """ + if result.dtype.kind == "f": + return result + + is_variable_type = hasattr(y, "dtype") + is_scalar_type = not isinstance(y, np.ndarray) + + if not is_variable_type and not is_scalar_type: + # e.g. test_series_ops_name_retention with mod we get here with list/tuple + return result + + if is_scalar_type: + y = np.array(y) + + if y.dtype.kind in "iu": + ymask = y == 0 + if ymask.any(): + # GH#7325, mask and nans must be broadcastable + mask = ymask & ~np.isnan(result) + + # GH#9308 doing ravel on result and mask can improve putmask perf, + # but can also make unwanted copies. + result = result.astype("float64", copy=False) + + np.putmask(result, mask, np.nan) + + return result + + +def mask_zero_div_zero(x, y, result: np.ndarray) -> np.ndarray: + """ + Set results of 0 // 0 to np.nan, regardless of the dtypes + of the numerator or the denominator. + + Parameters + ---------- + x : ndarray + y : ndarray + result : ndarray + + Returns + ------- + ndarray + The filled result. + + Examples + -------- + >>> x = np.array([1, 0, -1], dtype=np.int64) + >>> x + array([ 1, 0, -1]) + >>> y = 0 # int 0; numpy behavior is different with float + >>> result = x // y + >>> result # raw numpy result does not fill division by zero + array([0, 0, 0]) + >>> mask_zero_div_zero(x, y, result) + array([ inf, nan, -inf]) + """ + + if not hasattr(y, "dtype"): + # e.g. scalar, tuple + y = np.array(y) + if not hasattr(x, "dtype"): + # e.g scalar, tuple + x = np.array(x) + + zmask = y == 0 + + if zmask.any(): + # Flip sign if necessary for -0.0 + zneg_mask = zmask & np.signbit(y) + zpos_mask = zmask & ~zneg_mask + + x_lt0 = x < 0 + x_gt0 = x > 0 + nan_mask = zmask & (x == 0) + neginf_mask = (zpos_mask & x_lt0) | (zneg_mask & x_gt0) + posinf_mask = (zpos_mask & x_gt0) | (zneg_mask & x_lt0) + + if nan_mask.any() or neginf_mask.any() or posinf_mask.any(): + # Fill negative/0 with -inf, positive/0 with +inf, 0/0 with NaN + result = result.astype("float64", copy=False) + + result[nan_mask] = np.nan + result[posinf_mask] = np.inf + result[neginf_mask] = -np.inf + + return result + + +def dispatch_fill_zeros(op, left, right, result): + """ + Call _fill_zeros with the appropriate fill value depending on the operation, + with special logic for divmod and rdivmod. + + Parameters + ---------- + op : function (operator.add, operator.div, ...) + left : object (np.ndarray for non-reversed ops) + We have excluded ExtensionArrays here + right : object (np.ndarray for reversed ops) + We have excluded ExtensionArrays here + result : ndarray + + Returns + ------- + result : np.ndarray + + Notes + ----- + For divmod and rdivmod, the `result` parameter and returned `result` + is a 2-tuple of ndarray objects. + """ + if op is divmod: + result = ( + mask_zero_div_zero(left, right, result[0]), + _fill_zeros(result[1], left, right), + ) + elif op is roperator.rdivmod: + result = ( + mask_zero_div_zero(right, left, result[0]), + _fill_zeros(result[1], right, left), + ) + elif op is operator.floordiv: + # Note: no need to do this for truediv; in py3 numpy behaves the way + # we want. + result = mask_zero_div_zero(left, right, result) + elif op is roperator.rfloordiv: + # Note: no need to do this for rtruediv; in py3 numpy behaves the way + # we want. + result = mask_zero_div_zero(right, left, result) + elif op is operator.mod: + result = _fill_zeros(result, left, right) + elif op is roperator.rmod: + result = _fill_zeros(result, right, left) + return result diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/reshape/__init__.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/reshape/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/reshape/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/reshape/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..3dfe83fe721be809554a7689937c6d2032e9c2bd Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/reshape/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/reshape/__pycache__/api.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/reshape/__pycache__/api.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..823d1773b557c0e87e4638f5b69ab0fa24dd55f9 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/reshape/__pycache__/api.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/reshape/__pycache__/concat.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/reshape/__pycache__/concat.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c4be9971b14dfa2a4d66bdf603357a26c74e877d Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/reshape/__pycache__/concat.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/reshape/__pycache__/encoding.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/reshape/__pycache__/encoding.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..816dd384641450c62eb54a11d958b11eed0ea58c Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/reshape/__pycache__/encoding.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/reshape/__pycache__/melt.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/reshape/__pycache__/melt.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..06916d5349970d0cdd93c61746320d83bb50855b Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/reshape/__pycache__/melt.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/reshape/__pycache__/merge.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/reshape/__pycache__/merge.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f8f50a723ffd9523a222d4e03cdd96ba42d3fa08 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/reshape/__pycache__/merge.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/reshape/__pycache__/pivot.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/reshape/__pycache__/pivot.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ef5125535651e71c6338a5fafc5648d3af1fc955 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/reshape/__pycache__/pivot.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/reshape/__pycache__/reshape.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/reshape/__pycache__/reshape.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b2f820047a9a1c06e52394eb7f22174fa9f706b6 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/reshape/__pycache__/reshape.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/reshape/__pycache__/tile.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/reshape/__pycache__/tile.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1faad294ec9b0e5fb938a67d30c4168ed936a736 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/reshape/__pycache__/tile.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/reshape/__pycache__/util.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/reshape/__pycache__/util.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..729c16a421c673bb64857bea73243b3ab09ffabb Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/reshape/__pycache__/util.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/reshape/api.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/reshape/api.py new file mode 100644 index 0000000000000000000000000000000000000000..b1884c497f0ad7000351e131ef11dadab4a7c700 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/reshape/api.py @@ -0,0 +1,41 @@ +from pandas.core.reshape.concat import concat +from pandas.core.reshape.encoding import ( + from_dummies, + get_dummies, +) +from pandas.core.reshape.melt import ( + lreshape, + melt, + wide_to_long, +) +from pandas.core.reshape.merge import ( + merge, + merge_asof, + merge_ordered, +) +from pandas.core.reshape.pivot import ( + crosstab, + pivot, + pivot_table, +) +from pandas.core.reshape.tile import ( + cut, + qcut, +) + +__all__ = [ + "concat", + "crosstab", + "cut", + "from_dummies", + "get_dummies", + "lreshape", + "melt", + "merge", + "merge_asof", + "merge_ordered", + "pivot", + "pivot_table", + "qcut", + "wide_to_long", +] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/reshape/concat.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/reshape/concat.py new file mode 100644 index 0000000000000000000000000000000000000000..dc18bb65b35bcfa5c3789b35e7d41690923b50a7 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/reshape/concat.py @@ -0,0 +1,894 @@ +""" +Concat routines. +""" +from __future__ import annotations + +from collections import abc +from typing import ( + TYPE_CHECKING, + Callable, + Literal, + cast, + overload, +) +import warnings + +import numpy as np + +from pandas._config import using_copy_on_write + +from pandas.util._decorators import cache_readonly +from pandas.util._exceptions import find_stack_level + +from pandas.core.dtypes.common import ( + is_bool, + is_iterator, +) +from pandas.core.dtypes.concat import concat_compat +from pandas.core.dtypes.generic import ( + ABCDataFrame, + ABCSeries, +) +from pandas.core.dtypes.missing import isna + +from pandas.core.arrays.categorical import ( + factorize_from_iterable, + factorize_from_iterables, +) +import pandas.core.common as com +from pandas.core.indexes.api import ( + Index, + MultiIndex, + all_indexes_same, + default_index, + ensure_index, + get_objs_combined_axis, + get_unanimous_names, +) +from pandas.core.internals import concatenate_managers + +if TYPE_CHECKING: + from collections.abc import ( + Hashable, + Iterable, + Mapping, + ) + + from pandas._typing import ( + Axis, + AxisInt, + HashableT, + ) + + from pandas import ( + DataFrame, + Series, + ) + +# --------------------------------------------------------------------- +# Concatenate DataFrame objects + + +@overload +def concat( + objs: Iterable[DataFrame] | Mapping[HashableT, DataFrame], + *, + axis: Literal[0, "index"] = ..., + join: str = ..., + ignore_index: bool = ..., + keys: Iterable[Hashable] | None = ..., + levels=..., + names: list[HashableT] | None = ..., + verify_integrity: bool = ..., + sort: bool = ..., + copy: bool | None = ..., +) -> DataFrame: + ... + + +@overload +def concat( + objs: Iterable[Series] | Mapping[HashableT, Series], + *, + axis: Literal[0, "index"] = ..., + join: str = ..., + ignore_index: bool = ..., + keys: Iterable[Hashable] | None = ..., + levels=..., + names: list[HashableT] | None = ..., + verify_integrity: bool = ..., + sort: bool = ..., + copy: bool | None = ..., +) -> Series: + ... + + +@overload +def concat( + objs: Iterable[Series | DataFrame] | Mapping[HashableT, Series | DataFrame], + *, + axis: Literal[0, "index"] = ..., + join: str = ..., + ignore_index: bool = ..., + keys: Iterable[Hashable] | None = ..., + levels=..., + names: list[HashableT] | None = ..., + verify_integrity: bool = ..., + sort: bool = ..., + copy: bool | None = ..., +) -> DataFrame | Series: + ... + + +@overload +def concat( + objs: Iterable[Series | DataFrame] | Mapping[HashableT, Series | DataFrame], + *, + axis: Literal[1, "columns"], + join: str = ..., + ignore_index: bool = ..., + keys: Iterable[Hashable] | None = ..., + levels=..., + names: list[HashableT] | None = ..., + verify_integrity: bool = ..., + sort: bool = ..., + copy: bool | None = ..., +) -> DataFrame: + ... + + +@overload +def concat( + objs: Iterable[Series | DataFrame] | Mapping[HashableT, Series | DataFrame], + *, + axis: Axis = ..., + join: str = ..., + ignore_index: bool = ..., + keys: Iterable[Hashable] | None = ..., + levels=..., + names: list[HashableT] | None = ..., + verify_integrity: bool = ..., + sort: bool = ..., + copy: bool | None = ..., +) -> DataFrame | Series: + ... + + +def concat( + objs: Iterable[Series | DataFrame] | Mapping[HashableT, Series | DataFrame], + *, + axis: Axis = 0, + join: str = "outer", + ignore_index: bool = False, + keys: Iterable[Hashable] | None = None, + levels=None, + names: list[HashableT] | None = None, + verify_integrity: bool = False, + sort: bool = False, + copy: bool | None = None, +) -> DataFrame | Series: + """ + Concatenate pandas objects along a particular axis. + + Allows optional set logic along the other axes. + + Can also add a layer of hierarchical indexing on the concatenation axis, + which may be useful if the labels are the same (or overlapping) on + the passed axis number. + + Parameters + ---------- + objs : a sequence or mapping of Series or DataFrame objects + If a mapping is passed, the sorted keys will be used as the `keys` + argument, unless it is passed, in which case the values will be + selected (see below). Any None objects will be dropped silently unless + they are all None in which case a ValueError will be raised. + axis : {0/'index', 1/'columns'}, default 0 + The axis to concatenate along. + join : {'inner', 'outer'}, default 'outer' + How to handle indexes on other axis (or axes). + ignore_index : bool, default False + If True, do not use the index values along the concatenation axis. The + resulting axis will be labeled 0, ..., n - 1. This is useful if you are + concatenating objects where the concatenation axis does not have + meaningful indexing information. Note the index values on the other + axes are still respected in the join. + keys : sequence, default None + If multiple levels passed, should contain tuples. Construct + hierarchical index using the passed keys as the outermost level. + levels : list of sequences, default None + Specific levels (unique values) to use for constructing a + MultiIndex. Otherwise they will be inferred from the keys. + names : list, default None + Names for the levels in the resulting hierarchical index. + verify_integrity : bool, default False + Check whether the new concatenated axis contains duplicates. This can + be very expensive relative to the actual data concatenation. + sort : bool, default False + Sort non-concatenation axis if it is not already aligned. One exception to + this is when the non-concatentation axis is a DatetimeIndex and join='outer' + and the axis is not already aligned. In that case, the non-concatenation + axis is always sorted lexicographically. + copy : bool, default True + If False, do not copy data unnecessarily. + + Returns + ------- + object, type of objs + When concatenating all ``Series`` along the index (axis=0), a + ``Series`` is returned. When ``objs`` contains at least one + ``DataFrame``, a ``DataFrame`` is returned. When concatenating along + the columns (axis=1), a ``DataFrame`` is returned. + + See Also + -------- + DataFrame.join : Join DataFrames using indexes. + DataFrame.merge : Merge DataFrames by indexes or columns. + + Notes + ----- + The keys, levels, and names arguments are all optional. + + A walkthrough of how this method fits in with other tools for combining + pandas objects can be found `here + `__. + + It is not recommended to build DataFrames by adding single rows in a + for loop. Build a list of rows and make a DataFrame in a single concat. + + Examples + -------- + Combine two ``Series``. + + >>> s1 = pd.Series(['a', 'b']) + >>> s2 = pd.Series(['c', 'd']) + >>> pd.concat([s1, s2]) + 0 a + 1 b + 0 c + 1 d + dtype: object + + Clear the existing index and reset it in the result + by setting the ``ignore_index`` option to ``True``. + + >>> pd.concat([s1, s2], ignore_index=True) + 0 a + 1 b + 2 c + 3 d + dtype: object + + Add a hierarchical index at the outermost level of + the data with the ``keys`` option. + + >>> pd.concat([s1, s2], keys=['s1', 's2']) + s1 0 a + 1 b + s2 0 c + 1 d + dtype: object + + Label the index keys you create with the ``names`` option. + + >>> pd.concat([s1, s2], keys=['s1', 's2'], + ... names=['Series name', 'Row ID']) + Series name Row ID + s1 0 a + 1 b + s2 0 c + 1 d + dtype: object + + Combine two ``DataFrame`` objects with identical columns. + + >>> df1 = pd.DataFrame([['a', 1], ['b', 2]], + ... columns=['letter', 'number']) + >>> df1 + letter number + 0 a 1 + 1 b 2 + >>> df2 = pd.DataFrame([['c', 3], ['d', 4]], + ... columns=['letter', 'number']) + >>> df2 + letter number + 0 c 3 + 1 d 4 + >>> pd.concat([df1, df2]) + letter number + 0 a 1 + 1 b 2 + 0 c 3 + 1 d 4 + + Combine ``DataFrame`` objects with overlapping columns + and return everything. Columns outside the intersection will + be filled with ``NaN`` values. + + >>> df3 = pd.DataFrame([['c', 3, 'cat'], ['d', 4, 'dog']], + ... columns=['letter', 'number', 'animal']) + >>> df3 + letter number animal + 0 c 3 cat + 1 d 4 dog + >>> pd.concat([df1, df3], sort=False) + letter number animal + 0 a 1 NaN + 1 b 2 NaN + 0 c 3 cat + 1 d 4 dog + + Combine ``DataFrame`` objects with overlapping columns + and return only those that are shared by passing ``inner`` to + the ``join`` keyword argument. + + >>> pd.concat([df1, df3], join="inner") + letter number + 0 a 1 + 1 b 2 + 0 c 3 + 1 d 4 + + Combine ``DataFrame`` objects horizontally along the x axis by + passing in ``axis=1``. + + >>> df4 = pd.DataFrame([['bird', 'polly'], ['monkey', 'george']], + ... columns=['animal', 'name']) + >>> pd.concat([df1, df4], axis=1) + letter number animal name + 0 a 1 bird polly + 1 b 2 monkey george + + Prevent the result from including duplicate index values with the + ``verify_integrity`` option. + + >>> df5 = pd.DataFrame([1], index=['a']) + >>> df5 + 0 + a 1 + >>> df6 = pd.DataFrame([2], index=['a']) + >>> df6 + 0 + a 2 + >>> pd.concat([df5, df6], verify_integrity=True) + Traceback (most recent call last): + ... + ValueError: Indexes have overlapping values: ['a'] + + Append a single row to the end of a ``DataFrame`` object. + + >>> df7 = pd.DataFrame({'a': 1, 'b': 2}, index=[0]) + >>> df7 + a b + 0 1 2 + >>> new_row = pd.Series({'a': 3, 'b': 4}) + >>> new_row + a 3 + b 4 + dtype: int64 + >>> pd.concat([df7, new_row.to_frame().T], ignore_index=True) + a b + 0 1 2 + 1 3 4 + """ + if copy is None: + if using_copy_on_write(): + copy = False + else: + copy = True + elif copy and using_copy_on_write(): + copy = False + + op = _Concatenator( + objs, + axis=axis, + ignore_index=ignore_index, + join=join, + keys=keys, + levels=levels, + names=names, + verify_integrity=verify_integrity, + copy=copy, + sort=sort, + ) + + return op.get_result() + + +class _Concatenator: + """ + Orchestrates a concatenation operation for BlockManagers + """ + + sort: bool + + def __init__( + self, + objs: Iterable[Series | DataFrame] | Mapping[HashableT, Series | DataFrame], + axis: Axis = 0, + join: str = "outer", + keys: Iterable[Hashable] | None = None, + levels=None, + names: list[HashableT] | None = None, + ignore_index: bool = False, + verify_integrity: bool = False, + copy: bool = True, + sort: bool = False, + ) -> None: + if isinstance(objs, (ABCSeries, ABCDataFrame, str)): + raise TypeError( + "first argument must be an iterable of pandas " + f'objects, you passed an object of type "{type(objs).__name__}"' + ) + + if join == "outer": + self.intersect = False + elif join == "inner": + self.intersect = True + else: # pragma: no cover + raise ValueError( + "Only can inner (intersect) or outer (union) join the other axis" + ) + + if not is_bool(sort): + raise ValueError( + f"The 'sort' keyword only accepts boolean values; {sort} was passed." + ) + # Incompatible types in assignment (expression has type "Union[bool, bool_]", + # variable has type "bool") + self.sort = sort # type: ignore[assignment] + + self.ignore_index = ignore_index + self.verify_integrity = verify_integrity + self.copy = copy + + objs, keys = self._clean_keys_and_objs(objs, keys) + + # figure out what our result ndim is going to be + ndims = self._get_ndims(objs) + sample, objs = self._get_sample_object(objs, ndims, keys, names, levels) + + # Standardize axis parameter to int + if sample.ndim == 1: + from pandas import DataFrame + + axis = DataFrame._get_axis_number(axis) + self._is_frame = False + self._is_series = True + else: + axis = sample._get_axis_number(axis) + self._is_frame = True + self._is_series = False + + # Need to flip BlockManager axis in the DataFrame special case + axis = sample._get_block_manager_axis(axis) + + # if we have mixed ndims, then convert to highest ndim + # creating column numbers as needed + if len(ndims) > 1: + objs = self._sanitize_mixed_ndim(objs, sample, ignore_index, axis) + + self.objs = objs + + # note: this is the BlockManager axis (since DataFrame is transposed) + self.bm_axis = axis + self.axis = 1 - self.bm_axis if self._is_frame else 0 + self.keys = keys + self.names = names or getattr(keys, "names", None) + self.levels = levels + + def _get_ndims(self, objs: list[Series | DataFrame]) -> set[int]: + # figure out what our result ndim is going to be + ndims = set() + for obj in objs: + if not isinstance(obj, (ABCSeries, ABCDataFrame)): + msg = ( + f"cannot concatenate object of type '{type(obj)}'; " + "only Series and DataFrame objs are valid" + ) + raise TypeError(msg) + + ndims.add(obj.ndim) + return ndims + + def _clean_keys_and_objs( + self, + objs: Iterable[Series | DataFrame] | Mapping[HashableT, Series | DataFrame], + keys, + ) -> tuple[list[Series | DataFrame], Index | None]: + if isinstance(objs, abc.Mapping): + if keys is None: + keys = list(objs.keys()) + objs_list = [objs[k] for k in keys] + else: + objs_list = list(objs) + + if len(objs_list) == 0: + raise ValueError("No objects to concatenate") + + if keys is None: + objs_list = list(com.not_none(*objs_list)) + else: + # GH#1649 + clean_keys = [] + clean_objs = [] + if is_iterator(keys): + keys = list(keys) + if len(keys) != len(objs_list): + # GH#43485 + warnings.warn( + "The behavior of pd.concat with len(keys) != len(objs) is " + "deprecated. In a future version this will raise instead of " + "truncating to the smaller of the two sequences", + FutureWarning, + stacklevel=find_stack_level(), + ) + for k, v in zip(keys, objs_list): + if v is None: + continue + clean_keys.append(k) + clean_objs.append(v) + objs_list = clean_objs + + if isinstance(keys, MultiIndex): + # TODO: retain levels? + keys = type(keys).from_tuples(clean_keys, names=keys.names) + else: + name = getattr(keys, "name", None) + keys = Index(clean_keys, name=name, dtype=getattr(keys, "dtype", None)) + + if len(objs_list) == 0: + raise ValueError("All objects passed were None") + + return objs_list, keys + + def _get_sample_object( + self, + objs: list[Series | DataFrame], + ndims: set[int], + keys, + names, + levels, + ) -> tuple[Series | DataFrame, list[Series | DataFrame]]: + # get the sample + # want the highest ndim that we have, and must be non-empty + # unless all objs are empty + sample: Series | DataFrame | None = None + if len(ndims) > 1: + max_ndim = max(ndims) + for obj in objs: + if obj.ndim == max_ndim and np.sum(obj.shape): + sample = obj + break + + else: + # filter out the empties if we have not multi-index possibilities + # note to keep empty Series as it affect to result columns / name + non_empties = [obj for obj in objs if sum(obj.shape) > 0 or obj.ndim == 1] + + if len(non_empties) and ( + keys is None and names is None and levels is None and not self.intersect + ): + objs = non_empties + sample = objs[0] + + if sample is None: + sample = objs[0] + return sample, objs + + def _sanitize_mixed_ndim( + self, + objs: list[Series | DataFrame], + sample: Series | DataFrame, + ignore_index: bool, + axis: AxisInt, + ) -> list[Series | DataFrame]: + # if we have mixed ndims, then convert to highest ndim + # creating column numbers as needed + + new_objs = [] + + current_column = 0 + max_ndim = sample.ndim + for obj in objs: + ndim = obj.ndim + if ndim == max_ndim: + pass + + elif ndim != max_ndim - 1: + raise ValueError( + "cannot concatenate unaligned mixed dimensional NDFrame objects" + ) + + else: + name = getattr(obj, "name", None) + if ignore_index or name is None: + if axis == 1: + # doing a row-wise concatenation so need everything + # to line up + name = 0 + else: + # doing a column-wise concatenation so need series + # to have unique names + name = current_column + current_column += 1 + + obj = sample._constructor({name: obj}, copy=False) + + new_objs.append(obj) + + return new_objs + + def get_result(self): + cons: Callable[..., DataFrame | Series] + sample: DataFrame | Series + + # series only + if self._is_series: + sample = cast("Series", self.objs[0]) + + # stack blocks + if self.bm_axis == 0: + name = com.consensus_name_attr(self.objs) + cons = sample._constructor + + arrs = [ser._values for ser in self.objs] + + res = concat_compat(arrs, axis=0) + + new_index: Index + if self.ignore_index: + # We can avoid surprisingly-expensive _get_concat_axis + new_index = default_index(len(res)) + else: + new_index = self.new_axes[0] + + mgr = type(sample._mgr).from_array(res, index=new_index) + + result = sample._constructor_from_mgr(mgr, axes=mgr.axes) + result._name = name + return result.__finalize__(self, method="concat") + + # combine as columns in a frame + else: + data = dict(zip(range(len(self.objs)), self.objs)) + + # GH28330 Preserves subclassed objects through concat + cons = sample._constructor_expanddim + + index, columns = self.new_axes + df = cons(data, index=index, copy=self.copy) + df.columns = columns + return df.__finalize__(self, method="concat") + + # combine block managers + else: + sample = cast("DataFrame", self.objs[0]) + + mgrs_indexers = [] + for obj in self.objs: + indexers = {} + for ax, new_labels in enumerate(self.new_axes): + # ::-1 to convert BlockManager ax to DataFrame ax + if ax == self.bm_axis: + # Suppress reindexing on concat axis + continue + + # 1-ax to convert BlockManager axis to DataFrame axis + obj_labels = obj.axes[1 - ax] + if not new_labels.equals(obj_labels): + indexers[ax] = obj_labels.get_indexer(new_labels) + + mgrs_indexers.append((obj._mgr, indexers)) + + new_data = concatenate_managers( + mgrs_indexers, self.new_axes, concat_axis=self.bm_axis, copy=self.copy + ) + if not self.copy and not using_copy_on_write(): + new_data._consolidate_inplace() + + out = sample._constructor_from_mgr(new_data, axes=new_data.axes) + return out.__finalize__(self, method="concat") + + def _get_result_dim(self) -> int: + if self._is_series and self.bm_axis == 1: + return 2 + else: + return self.objs[0].ndim + + @cache_readonly + def new_axes(self) -> list[Index]: + ndim = self._get_result_dim() + return [ + self._get_concat_axis if i == self.bm_axis else self._get_comb_axis(i) + for i in range(ndim) + ] + + def _get_comb_axis(self, i: AxisInt) -> Index: + data_axis = self.objs[0]._get_block_manager_axis(i) + return get_objs_combined_axis( + self.objs, + axis=data_axis, + intersect=self.intersect, + sort=self.sort, + copy=self.copy, + ) + + @cache_readonly + def _get_concat_axis(self) -> Index: + """ + Return index to be used along concatenation axis. + """ + if self._is_series: + if self.bm_axis == 0: + indexes = [x.index for x in self.objs] + elif self.ignore_index: + idx = default_index(len(self.objs)) + return idx + elif self.keys is None: + names: list[Hashable] = [None] * len(self.objs) + num = 0 + has_names = False + for i, x in enumerate(self.objs): + if x.ndim != 1: + raise TypeError( + f"Cannot concatenate type 'Series' with " + f"object of type '{type(x).__name__}'" + ) + if x.name is not None: + names[i] = x.name + has_names = True + else: + names[i] = num + num += 1 + if has_names: + return Index(names) + else: + return default_index(len(self.objs)) + else: + return ensure_index(self.keys).set_names(self.names) + else: + indexes = [x.axes[self.axis] for x in self.objs] + + if self.ignore_index: + idx = default_index(sum(len(i) for i in indexes)) + return idx + + if self.keys is None: + if self.levels is not None: + raise ValueError("levels supported only when keys is not None") + concat_axis = _concat_indexes(indexes) + else: + concat_axis = _make_concat_multiindex( + indexes, self.keys, self.levels, self.names + ) + + self._maybe_check_integrity(concat_axis) + + return concat_axis + + def _maybe_check_integrity(self, concat_index: Index): + if self.verify_integrity: + if not concat_index.is_unique: + overlap = concat_index[concat_index.duplicated()].unique() + raise ValueError(f"Indexes have overlapping values: {overlap}") + + +def _concat_indexes(indexes) -> Index: + return indexes[0].append(indexes[1:]) + + +def _make_concat_multiindex(indexes, keys, levels=None, names=None) -> MultiIndex: + if (levels is None and isinstance(keys[0], tuple)) or ( + levels is not None and len(levels) > 1 + ): + zipped = list(zip(*keys)) + if names is None: + names = [None] * len(zipped) + + if levels is None: + _, levels = factorize_from_iterables(zipped) + else: + levels = [ensure_index(x) for x in levels] + else: + zipped = [keys] + if names is None: + names = [None] + + if levels is None: + levels = [ensure_index(keys).unique()] + else: + levels = [ensure_index(x) for x in levels] + + for level in levels: + if not level.is_unique: + raise ValueError(f"Level values not unique: {level.tolist()}") + + if not all_indexes_same(indexes) or not all(level.is_unique for level in levels): + codes_list = [] + + # things are potentially different sizes, so compute the exact codes + # for each level and pass those to MultiIndex.from_arrays + + for hlevel, level in zip(zipped, levels): + to_concat = [] + if isinstance(hlevel, Index) and hlevel.equals(level): + lens = [len(idx) for idx in indexes] + codes_list.append(np.repeat(np.arange(len(hlevel)), lens)) + else: + for key, index in zip(hlevel, indexes): + # Find matching codes, include matching nan values as equal. + mask = (isna(level) & isna(key)) | (level == key) + if not mask.any(): + raise ValueError(f"Key {key} not in level {level}") + i = np.nonzero(mask)[0][0] + + to_concat.append(np.repeat(i, len(index))) + codes_list.append(np.concatenate(to_concat)) + + concat_index = _concat_indexes(indexes) + + # these go at the end + if isinstance(concat_index, MultiIndex): + levels.extend(concat_index.levels) + codes_list.extend(concat_index.codes) + else: + codes, categories = factorize_from_iterable(concat_index) + levels.append(categories) + codes_list.append(codes) + + if len(names) == len(levels): + names = list(names) + else: + # make sure that all of the passed indices have the same nlevels + if not len({idx.nlevels for idx in indexes}) == 1: + raise AssertionError( + "Cannot concat indices that do not have the same number of levels" + ) + + # also copies + names = list(names) + list(get_unanimous_names(*indexes)) + + return MultiIndex( + levels=levels, codes=codes_list, names=names, verify_integrity=False + ) + + new_index = indexes[0] + n = len(new_index) + kpieces = len(indexes) + + # also copies + new_names = list(names) + new_levels = list(levels) + + # construct codes + new_codes = [] + + # do something a bit more speedy + + for hlevel, level in zip(zipped, levels): + hlevel_index = ensure_index(hlevel) + mapped = level.get_indexer(hlevel_index) + + mask = mapped == -1 + if mask.any(): + raise ValueError( + f"Values not found in passed level: {hlevel_index[mask]!s}" + ) + + new_codes.append(np.repeat(mapped, n)) + + if isinstance(new_index, MultiIndex): + new_levels.extend(new_index.levels) + new_codes.extend([np.tile(lab, kpieces) for lab in new_index.codes]) + else: + new_levels.append(new_index.unique()) + single_codes = new_index.unique().get_indexer(new_index) + new_codes.append(np.tile(single_codes, kpieces)) + + if len(new_names) < len(new_levels): + new_names.extend(new_index.names) + + return MultiIndex( + levels=new_levels, codes=new_codes, names=new_names, verify_integrity=False + ) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/reshape/encoding.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/reshape/encoding.py new file mode 100644 index 0000000000000000000000000000000000000000..3ed67bb7b7c02509f0bf493c5116258f67dc0ef7 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/reshape/encoding.py @@ -0,0 +1,570 @@ +from __future__ import annotations + +from collections import defaultdict +from collections.abc import ( + Hashable, + Iterable, +) +import itertools +from typing import ( + TYPE_CHECKING, + cast, +) + +import numpy as np + +from pandas._libs.sparse import IntIndex + +from pandas.core.dtypes.common import ( + is_integer_dtype, + is_list_like, + is_object_dtype, + pandas_dtype, +) +from pandas.core.dtypes.dtypes import ( + ArrowDtype, + CategoricalDtype, +) + +from pandas.core.arrays import SparseArray +from pandas.core.arrays.categorical import factorize_from_iterable +from pandas.core.arrays.string_ import StringDtype +from pandas.core.frame import DataFrame +from pandas.core.indexes.api import ( + Index, + default_index, +) +from pandas.core.series import Series + +if TYPE_CHECKING: + from pandas._typing import NpDtype + + +def get_dummies( + data, + prefix=None, + prefix_sep: str | Iterable[str] | dict[str, str] = "_", + dummy_na: bool = False, + columns=None, + sparse: bool = False, + drop_first: bool = False, + dtype: NpDtype | None = None, +) -> DataFrame: + """ + Convert categorical variable into dummy/indicator variables. + + Each variable is converted in as many 0/1 variables as there are different + values. Columns in the output are each named after a value; if the input is + a DataFrame, the name of the original variable is prepended to the value. + + Parameters + ---------- + data : array-like, Series, or DataFrame + Data of which to get dummy indicators. + prefix : str, list of str, or dict of str, default None + String to append DataFrame column names. + Pass a list with length equal to the number of columns + when calling get_dummies on a DataFrame. Alternatively, `prefix` + can be a dictionary mapping column names to prefixes. + prefix_sep : str, default '_' + If appending prefix, separator/delimiter to use. Or pass a + list or dictionary as with `prefix`. + dummy_na : bool, default False + Add a column to indicate NaNs, if False NaNs are ignored. + columns : list-like, default None + Column names in the DataFrame to be encoded. + If `columns` is None then all the columns with + `object`, `string`, or `category` dtype will be converted. + sparse : bool, default False + Whether the dummy-encoded columns should be backed by + a :class:`SparseArray` (True) or a regular NumPy array (False). + drop_first : bool, default False + Whether to get k-1 dummies out of k categorical levels by removing the + first level. + dtype : dtype, default bool + Data type for new columns. Only a single dtype is allowed. + + Returns + ------- + DataFrame + Dummy-coded data. If `data` contains other columns than the + dummy-coded one(s), these will be prepended, unaltered, to the result. + + See Also + -------- + Series.str.get_dummies : Convert Series of strings to dummy codes. + :func:`~pandas.from_dummies` : Convert dummy codes to categorical ``DataFrame``. + + Notes + ----- + Reference :ref:`the user guide ` for more examples. + + Examples + -------- + >>> s = pd.Series(list('abca')) + + >>> pd.get_dummies(s) + a b c + 0 True False False + 1 False True False + 2 False False True + 3 True False False + + >>> s1 = ['a', 'b', np.nan] + + >>> pd.get_dummies(s1) + a b + 0 True False + 1 False True + 2 False False + + >>> pd.get_dummies(s1, dummy_na=True) + a b NaN + 0 True False False + 1 False True False + 2 False False True + + >>> df = pd.DataFrame({'A': ['a', 'b', 'a'], 'B': ['b', 'a', 'c'], + ... 'C': [1, 2, 3]}) + + >>> pd.get_dummies(df, prefix=['col1', 'col2']) + C col1_a col1_b col2_a col2_b col2_c + 0 1 True False False True False + 1 2 False True True False False + 2 3 True False False False True + + >>> pd.get_dummies(pd.Series(list('abcaa'))) + a b c + 0 True False False + 1 False True False + 2 False False True + 3 True False False + 4 True False False + + >>> pd.get_dummies(pd.Series(list('abcaa')), drop_first=True) + b c + 0 False False + 1 True False + 2 False True + 3 False False + 4 False False + + >>> pd.get_dummies(pd.Series(list('abc')), dtype=float) + a b c + 0 1.0 0.0 0.0 + 1 0.0 1.0 0.0 + 2 0.0 0.0 1.0 + """ + from pandas.core.reshape.concat import concat + + dtypes_to_encode = ["object", "string", "category"] + + if isinstance(data, DataFrame): + # determine columns being encoded + if columns is None: + data_to_encode = data.select_dtypes(include=dtypes_to_encode) + elif not is_list_like(columns): + raise TypeError("Input must be a list-like for parameter `columns`") + else: + data_to_encode = data[columns] + + # validate prefixes and separator to avoid silently dropping cols + def check_len(item, name: str): + if is_list_like(item): + if not len(item) == data_to_encode.shape[1]: + len_msg = ( + f"Length of '{name}' ({len(item)}) did not match the " + "length of the columns being encoded " + f"({data_to_encode.shape[1]})." + ) + raise ValueError(len_msg) + + check_len(prefix, "prefix") + check_len(prefix_sep, "prefix_sep") + + if isinstance(prefix, str): + prefix = itertools.cycle([prefix]) + if isinstance(prefix, dict): + prefix = [prefix[col] for col in data_to_encode.columns] + + if prefix is None: + prefix = data_to_encode.columns + + # validate separators + if isinstance(prefix_sep, str): + prefix_sep = itertools.cycle([prefix_sep]) + elif isinstance(prefix_sep, dict): + prefix_sep = [prefix_sep[col] for col in data_to_encode.columns] + + with_dummies: list[DataFrame] + if data_to_encode.shape == data.shape: + # Encoding the entire df, do not prepend any dropped columns + with_dummies = [] + elif columns is not None: + # Encoding only cols specified in columns. Get all cols not in + # columns to prepend to result. + with_dummies = [data.drop(columns, axis=1)] + else: + # Encoding only object and category dtype columns. Get remaining + # columns to prepend to result. + with_dummies = [data.select_dtypes(exclude=dtypes_to_encode)] + + for col, pre, sep in zip(data_to_encode.items(), prefix, prefix_sep): + # col is (column_name, column), use just column data here + dummy = _get_dummies_1d( + col[1], + prefix=pre, + prefix_sep=sep, + dummy_na=dummy_na, + sparse=sparse, + drop_first=drop_first, + dtype=dtype, + ) + with_dummies.append(dummy) + result = concat(with_dummies, axis=1) + else: + result = _get_dummies_1d( + data, + prefix, + prefix_sep, + dummy_na, + sparse=sparse, + drop_first=drop_first, + dtype=dtype, + ) + return result + + +def _get_dummies_1d( + data, + prefix, + prefix_sep: str | Iterable[str] | dict[str, str] = "_", + dummy_na: bool = False, + sparse: bool = False, + drop_first: bool = False, + dtype: NpDtype | None = None, +) -> DataFrame: + from pandas.core.reshape.concat import concat + + # Series avoids inconsistent NaN handling + codes, levels = factorize_from_iterable(Series(data, copy=False)) + + if dtype is None and hasattr(data, "dtype"): + input_dtype = data.dtype + if isinstance(input_dtype, CategoricalDtype): + input_dtype = input_dtype.categories.dtype + + if isinstance(input_dtype, ArrowDtype): + import pyarrow as pa + + dtype = ArrowDtype(pa.bool_()) # type: ignore[assignment] + elif ( + isinstance(input_dtype, StringDtype) + and input_dtype.storage != "pyarrow_numpy" + ): + dtype = pandas_dtype("boolean") # type: ignore[assignment] + else: + dtype = np.dtype(bool) + elif dtype is None: + dtype = np.dtype(bool) + + _dtype = pandas_dtype(dtype) + + if is_object_dtype(_dtype): + raise ValueError("dtype=object is not a valid dtype for get_dummies") + + def get_empty_frame(data) -> DataFrame: + index: Index | np.ndarray + if isinstance(data, Series): + index = data.index + else: + index = default_index(len(data)) + return DataFrame(index=index) + + # if all NaN + if not dummy_na and len(levels) == 0: + return get_empty_frame(data) + + codes = codes.copy() + if dummy_na: + codes[codes == -1] = len(levels) + levels = levels.insert(len(levels), np.nan) + + # if dummy_na, we just fake a nan level. drop_first will drop it again + if drop_first and len(levels) == 1: + return get_empty_frame(data) + + number_of_cols = len(levels) + + if prefix is None: + dummy_cols = levels + else: + dummy_cols = Index([f"{prefix}{prefix_sep}{level}" for level in levels]) + + index: Index | None + if isinstance(data, Series): + index = data.index + else: + index = None + + if sparse: + fill_value: bool | float + if is_integer_dtype(dtype): + fill_value = 0 + elif dtype == np.dtype(bool): + fill_value = False + else: + fill_value = 0.0 + + sparse_series = [] + N = len(data) + sp_indices: list[list] = [[] for _ in range(len(dummy_cols))] + mask = codes != -1 + codes = codes[mask] + n_idx = np.arange(N)[mask] + + for ndx, code in zip(n_idx, codes): + sp_indices[code].append(ndx) + + if drop_first: + # remove first categorical level to avoid perfect collinearity + # GH12042 + sp_indices = sp_indices[1:] + dummy_cols = dummy_cols[1:] + for col, ixs in zip(dummy_cols, sp_indices): + sarr = SparseArray( + np.ones(len(ixs), dtype=dtype), + sparse_index=IntIndex(N, ixs), + fill_value=fill_value, + dtype=dtype, + ) + sparse_series.append(Series(data=sarr, index=index, name=col, copy=False)) + + return concat(sparse_series, axis=1, copy=False) + + else: + # ensure ndarray layout is column-major + shape = len(codes), number_of_cols + dummy_dtype: NpDtype + if isinstance(_dtype, np.dtype): + dummy_dtype = _dtype + else: + dummy_dtype = np.bool_ + dummy_mat = np.zeros(shape=shape, dtype=dummy_dtype, order="F") + dummy_mat[np.arange(len(codes)), codes] = 1 + + if not dummy_na: + # reset NaN GH4446 + dummy_mat[codes == -1] = 0 + + if drop_first: + # remove first GH12042 + dummy_mat = dummy_mat[:, 1:] + dummy_cols = dummy_cols[1:] + return DataFrame(dummy_mat, index=index, columns=dummy_cols, dtype=_dtype) + + +def from_dummies( + data: DataFrame, + sep: None | str = None, + default_category: None | Hashable | dict[str, Hashable] = None, +) -> DataFrame: + """ + Create a categorical ``DataFrame`` from a ``DataFrame`` of dummy variables. + + Inverts the operation performed by :func:`~pandas.get_dummies`. + + .. versionadded:: 1.5.0 + + Parameters + ---------- + data : DataFrame + Data which contains dummy-coded variables in form of integer columns of + 1's and 0's. + sep : str, default None + Separator used in the column names of the dummy categories they are + character indicating the separation of the categorical names from the prefixes. + For example, if your column names are 'prefix_A' and 'prefix_B', + you can strip the underscore by specifying sep='_'. + default_category : None, Hashable or dict of Hashables, default None + The default category is the implied category when a value has none of the + listed categories specified with a one, i.e. if all dummies in a row are + zero. Can be a single value for all variables or a dict directly mapping + the default categories to a prefix of a variable. + + Returns + ------- + DataFrame + Categorical data decoded from the dummy input-data. + + Raises + ------ + ValueError + * When the input ``DataFrame`` ``data`` contains NA values. + * When the input ``DataFrame`` ``data`` contains column names with separators + that do not match the separator specified with ``sep``. + * When a ``dict`` passed to ``default_category`` does not include an implied + category for each prefix. + * When a value in ``data`` has more than one category assigned to it. + * When ``default_category=None`` and a value in ``data`` has no category + assigned to it. + TypeError + * When the input ``data`` is not of type ``DataFrame``. + * When the input ``DataFrame`` ``data`` contains non-dummy data. + * When the passed ``sep`` is of a wrong data type. + * When the passed ``default_category`` is of a wrong data type. + + See Also + -------- + :func:`~pandas.get_dummies` : Convert ``Series`` or ``DataFrame`` to dummy codes. + :class:`~pandas.Categorical` : Represent a categorical variable in classic. + + Notes + ----- + The columns of the passed dummy data should only include 1's and 0's, + or boolean values. + + Examples + -------- + >>> df = pd.DataFrame({"a": [1, 0, 0, 1], "b": [0, 1, 0, 0], + ... "c": [0, 0, 1, 0]}) + + >>> df + a b c + 0 1 0 0 + 1 0 1 0 + 2 0 0 1 + 3 1 0 0 + + >>> pd.from_dummies(df) + 0 a + 1 b + 2 c + 3 a + + >>> df = pd.DataFrame({"col1_a": [1, 0, 1], "col1_b": [0, 1, 0], + ... "col2_a": [0, 1, 0], "col2_b": [1, 0, 0], + ... "col2_c": [0, 0, 1]}) + + >>> df + col1_a col1_b col2_a col2_b col2_c + 0 1 0 0 1 0 + 1 0 1 1 0 0 + 2 1 0 0 0 1 + + >>> pd.from_dummies(df, sep="_") + col1 col2 + 0 a b + 1 b a + 2 a c + + >>> df = pd.DataFrame({"col1_a": [1, 0, 0], "col1_b": [0, 1, 0], + ... "col2_a": [0, 1, 0], "col2_b": [1, 0, 0], + ... "col2_c": [0, 0, 0]}) + + >>> df + col1_a col1_b col2_a col2_b col2_c + 0 1 0 0 1 0 + 1 0 1 1 0 0 + 2 0 0 0 0 0 + + >>> pd.from_dummies(df, sep="_", default_category={"col1": "d", "col2": "e"}) + col1 col2 + 0 a b + 1 b a + 2 d e + """ + from pandas.core.reshape.concat import concat + + if not isinstance(data, DataFrame): + raise TypeError( + "Expected 'data' to be a 'DataFrame'; " + f"Received 'data' of type: {type(data).__name__}" + ) + + col_isna_mask = cast(Series, data.isna().any()) + + if col_isna_mask.any(): + raise ValueError( + "Dummy DataFrame contains NA value in column: " + f"'{col_isna_mask.idxmax()}'" + ) + + # index data with a list of all columns that are dummies + try: + data_to_decode = data.astype("boolean", copy=False) + except TypeError: + raise TypeError("Passed DataFrame contains non-dummy data") + + # collect prefixes and get lists to slice data for each prefix + variables_slice = defaultdict(list) + if sep is None: + variables_slice[""] = list(data.columns) + elif isinstance(sep, str): + for col in data_to_decode.columns: + prefix = col.split(sep)[0] + if len(prefix) == len(col): + raise ValueError(f"Separator not specified for column: {col}") + variables_slice[prefix].append(col) + else: + raise TypeError( + "Expected 'sep' to be of type 'str' or 'None'; " + f"Received 'sep' of type: {type(sep).__name__}" + ) + + if default_category is not None: + if isinstance(default_category, dict): + if not len(default_category) == len(variables_slice): + len_msg = ( + f"Length of 'default_category' ({len(default_category)}) " + f"did not match the length of the columns being encoded " + f"({len(variables_slice)})" + ) + raise ValueError(len_msg) + elif isinstance(default_category, Hashable): + default_category = dict( + zip(variables_slice, [default_category] * len(variables_slice)) + ) + else: + raise TypeError( + "Expected 'default_category' to be of type " + "'None', 'Hashable', or 'dict'; " + "Received 'default_category' of type: " + f"{type(default_category).__name__}" + ) + + cat_data = {} + for prefix, prefix_slice in variables_slice.items(): + if sep is None: + cats = prefix_slice.copy() + else: + cats = [col[len(prefix + sep) :] for col in prefix_slice] + assigned = data_to_decode.loc[:, prefix_slice].sum(axis=1) + if any(assigned > 1): + raise ValueError( + "Dummy DataFrame contains multi-assignment(s); " + f"First instance in row: {assigned.idxmax()}" + ) + if any(assigned == 0): + if isinstance(default_category, dict): + cats.append(default_category[prefix]) + else: + raise ValueError( + "Dummy DataFrame contains unassigned value(s); " + f"First instance in row: {assigned.idxmin()}" + ) + data_slice = concat( + (data_to_decode.loc[:, prefix_slice], assigned == 0), axis=1 + ) + else: + data_slice = data_to_decode.loc[:, prefix_slice] + cats_array = data._constructor_sliced(cats, dtype=data.columns.dtype) + # get indices of True entries along axis=1 + true_values = data_slice.idxmax(axis=1) + indexer = data_slice.columns.get_indexer_for(true_values) + cat_data[prefix] = cats_array.take(indexer).set_axis(data.index) + + result = DataFrame(cat_data) + if sep is not None: + result.columns = result.columns.astype(data.columns.dtype) + return result diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/reshape/melt.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/reshape/melt.py new file mode 100644 index 0000000000000000000000000000000000000000..e54f847895f1a42cf1782392da684f2bcfa7e81c --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/reshape/melt.py @@ -0,0 +1,512 @@ +from __future__ import annotations + +import re +from typing import TYPE_CHECKING + +import numpy as np + +from pandas.util._decorators import Appender + +from pandas.core.dtypes.common import is_list_like +from pandas.core.dtypes.concat import concat_compat +from pandas.core.dtypes.missing import notna + +import pandas.core.algorithms as algos +from pandas.core.indexes.api import MultiIndex +from pandas.core.reshape.concat import concat +from pandas.core.reshape.util import tile_compat +from pandas.core.shared_docs import _shared_docs +from pandas.core.tools.numeric import to_numeric + +if TYPE_CHECKING: + from collections.abc import Hashable + + from pandas._typing import AnyArrayLike + + from pandas import DataFrame + + +def ensure_list_vars(arg_vars, variable: str, columns) -> list: + if arg_vars is not None: + if not is_list_like(arg_vars): + return [arg_vars] + elif isinstance(columns, MultiIndex) and not isinstance(arg_vars, list): + raise ValueError( + f"{variable} must be a list of tuples when columns are a MultiIndex" + ) + else: + return list(arg_vars) + else: + return [] + + +@Appender(_shared_docs["melt"] % {"caller": "pd.melt(df, ", "other": "DataFrame.melt"}) +def melt( + frame: DataFrame, + id_vars=None, + value_vars=None, + var_name=None, + value_name: Hashable = "value", + col_level=None, + ignore_index: bool = True, +) -> DataFrame: + if value_name in frame.columns: + raise ValueError( + f"value_name ({value_name}) cannot match an element in " + "the DataFrame columns." + ) + id_vars = ensure_list_vars(id_vars, "id_vars", frame.columns) + value_vars_was_not_none = value_vars is not None + value_vars = ensure_list_vars(value_vars, "value_vars", frame.columns) + + if id_vars or value_vars: + if col_level is not None: + level = frame.columns.get_level_values(col_level) + else: + level = frame.columns + labels = id_vars + value_vars + idx = level.get_indexer_for(labels) + missing = idx == -1 + if missing.any(): + missing_labels = [ + lab for lab, not_found in zip(labels, missing) if not_found + ] + raise KeyError( + "The following id_vars or value_vars are not present in " + f"the DataFrame: {missing_labels}" + ) + if value_vars_was_not_none: + frame = frame.iloc[:, algos.unique(idx)] + else: + frame = frame.copy() + else: + frame = frame.copy() + + if col_level is not None: # allow list or other? + # frame is a copy + frame.columns = frame.columns.get_level_values(col_level) + + if var_name is None: + if isinstance(frame.columns, MultiIndex): + if len(frame.columns.names) == len(set(frame.columns.names)): + var_name = frame.columns.names + else: + var_name = [f"variable_{i}" for i in range(len(frame.columns.names))] + else: + var_name = [ + frame.columns.name if frame.columns.name is not None else "variable" + ] + elif is_list_like(var_name): + raise ValueError(f"{var_name=} must be a scalar.") + else: + var_name = [var_name] + + num_rows, K = frame.shape + num_cols_adjusted = K - len(id_vars) + + mdata: dict[Hashable, AnyArrayLike] = {} + for col in id_vars: + id_data = frame.pop(col) + if not isinstance(id_data.dtype, np.dtype): + # i.e. ExtensionDtype + if num_cols_adjusted > 0: + mdata[col] = concat([id_data] * num_cols_adjusted, ignore_index=True) + else: + # We can't concat empty list. (GH 46044) + mdata[col] = type(id_data)([], name=id_data.name, dtype=id_data.dtype) + else: + mdata[col] = np.tile(id_data._values, num_cols_adjusted) + + mcolumns = id_vars + var_name + [value_name] + + if frame.shape[1] > 0 and not any( + not isinstance(dt, np.dtype) and dt._supports_2d for dt in frame.dtypes + ): + mdata[value_name] = concat( + [frame.iloc[:, i] for i in range(frame.shape[1])] + ).values + else: + mdata[value_name] = frame._values.ravel("F") + for i, col in enumerate(var_name): + mdata[col] = frame.columns._get_level_values(i).repeat(num_rows) + + result = frame._constructor(mdata, columns=mcolumns) + + if not ignore_index: + result.index = tile_compat(frame.index, num_cols_adjusted) + + return result + + +def lreshape(data: DataFrame, groups: dict, dropna: bool = True) -> DataFrame: + """ + Reshape wide-format data to long. Generalized inverse of DataFrame.pivot. + + Accepts a dictionary, ``groups``, in which each key is a new column name + and each value is a list of old column names that will be "melted" under + the new column name as part of the reshape. + + Parameters + ---------- + data : DataFrame + The wide-format DataFrame. + groups : dict + {new_name : list_of_columns}. + dropna : bool, default True + Do not include columns whose entries are all NaN. + + Returns + ------- + DataFrame + Reshaped DataFrame. + + See Also + -------- + melt : Unpivot a DataFrame from wide to long format, optionally leaving + identifiers set. + pivot : Create a spreadsheet-style pivot table as a DataFrame. + DataFrame.pivot : Pivot without aggregation that can handle + non-numeric data. + DataFrame.pivot_table : Generalization of pivot that can handle + duplicate values for one index/column pair. + DataFrame.unstack : Pivot based on the index values instead of a + column. + wide_to_long : Wide panel to long format. Less flexible but more + user-friendly than melt. + + Examples + -------- + >>> data = pd.DataFrame({'hr1': [514, 573], 'hr2': [545, 526], + ... 'team': ['Red Sox', 'Yankees'], + ... 'year1': [2007, 2007], 'year2': [2008, 2008]}) + >>> data + hr1 hr2 team year1 year2 + 0 514 545 Red Sox 2007 2008 + 1 573 526 Yankees 2007 2008 + + >>> pd.lreshape(data, {'year': ['year1', 'year2'], 'hr': ['hr1', 'hr2']}) + team year hr + 0 Red Sox 2007 514 + 1 Yankees 2007 573 + 2 Red Sox 2008 545 + 3 Yankees 2008 526 + """ + mdata = {} + pivot_cols = [] + all_cols: set[Hashable] = set() + K = len(next(iter(groups.values()))) + for target, names in groups.items(): + if len(names) != K: + raise ValueError("All column lists must be same length") + to_concat = [data[col]._values for col in names] + + mdata[target] = concat_compat(to_concat) + pivot_cols.append(target) + all_cols = all_cols.union(names) + + id_cols = list(data.columns.difference(all_cols)) + for col in id_cols: + mdata[col] = np.tile(data[col]._values, K) + + if dropna: + mask = np.ones(len(mdata[pivot_cols[0]]), dtype=bool) + for c in pivot_cols: + mask &= notna(mdata[c]) + if not mask.all(): + mdata = {k: v[mask] for k, v in mdata.items()} + + return data._constructor(mdata, columns=id_cols + pivot_cols) + + +def wide_to_long( + df: DataFrame, stubnames, i, j, sep: str = "", suffix: str = r"\d+" +) -> DataFrame: + r""" + Unpivot a DataFrame from wide to long format. + + Less flexible but more user-friendly than melt. + + With stubnames ['A', 'B'], this function expects to find one or more + group of columns with format + A-suffix1, A-suffix2,..., B-suffix1, B-suffix2,... + You specify what you want to call this suffix in the resulting long format + with `j` (for example `j='year'`) + + Each row of these wide variables are assumed to be uniquely identified by + `i` (can be a single column name or a list of column names) + + All remaining variables in the data frame are left intact. + + Parameters + ---------- + df : DataFrame + The wide-format DataFrame. + stubnames : str or list-like + The stub name(s). The wide format variables are assumed to + start with the stub names. + i : str or list-like + Column(s) to use as id variable(s). + j : str + The name of the sub-observation variable. What you wish to name your + suffix in the long format. + sep : str, default "" + A character indicating the separation of the variable names + in the wide format, to be stripped from the names in the long format. + For example, if your column names are A-suffix1, A-suffix2, you + can strip the hyphen by specifying `sep='-'`. + suffix : str, default '\\d+' + A regular expression capturing the wanted suffixes. '\\d+' captures + numeric suffixes. Suffixes with no numbers could be specified with the + negated character class '\\D+'. You can also further disambiguate + suffixes, for example, if your wide variables are of the form A-one, + B-two,.., and you have an unrelated column A-rating, you can ignore the + last one by specifying `suffix='(!?one|two)'`. When all suffixes are + numeric, they are cast to int64/float64. + + Returns + ------- + DataFrame + A DataFrame that contains each stub name as a variable, with new index + (i, j). + + See Also + -------- + melt : Unpivot a DataFrame from wide to long format, optionally leaving + identifiers set. + pivot : Create a spreadsheet-style pivot table as a DataFrame. + DataFrame.pivot : Pivot without aggregation that can handle + non-numeric data. + DataFrame.pivot_table : Generalization of pivot that can handle + duplicate values for one index/column pair. + DataFrame.unstack : Pivot based on the index values instead of a + column. + + Notes + ----- + All extra variables are left untouched. This simply uses + `pandas.melt` under the hood, but is hard-coded to "do the right thing" + in a typical case. + + Examples + -------- + >>> np.random.seed(123) + >>> df = pd.DataFrame({"A1970" : {0 : "a", 1 : "b", 2 : "c"}, + ... "A1980" : {0 : "d", 1 : "e", 2 : "f"}, + ... "B1970" : {0 : 2.5, 1 : 1.2, 2 : .7}, + ... "B1980" : {0 : 3.2, 1 : 1.3, 2 : .1}, + ... "X" : dict(zip(range(3), np.random.randn(3))) + ... }) + >>> df["id"] = df.index + >>> df + A1970 A1980 B1970 B1980 X id + 0 a d 2.5 3.2 -1.085631 0 + 1 b e 1.2 1.3 0.997345 1 + 2 c f 0.7 0.1 0.282978 2 + >>> pd.wide_to_long(df, ["A", "B"], i="id", j="year") + ... # doctest: +NORMALIZE_WHITESPACE + X A B + id year + 0 1970 -1.085631 a 2.5 + 1 1970 0.997345 b 1.2 + 2 1970 0.282978 c 0.7 + 0 1980 -1.085631 d 3.2 + 1 1980 0.997345 e 1.3 + 2 1980 0.282978 f 0.1 + + With multiple id columns + + >>> df = pd.DataFrame({ + ... 'famid': [1, 1, 1, 2, 2, 2, 3, 3, 3], + ... 'birth': [1, 2, 3, 1, 2, 3, 1, 2, 3], + ... 'ht1': [2.8, 2.9, 2.2, 2, 1.8, 1.9, 2.2, 2.3, 2.1], + ... 'ht2': [3.4, 3.8, 2.9, 3.2, 2.8, 2.4, 3.3, 3.4, 2.9] + ... }) + >>> df + famid birth ht1 ht2 + 0 1 1 2.8 3.4 + 1 1 2 2.9 3.8 + 2 1 3 2.2 2.9 + 3 2 1 2.0 3.2 + 4 2 2 1.8 2.8 + 5 2 3 1.9 2.4 + 6 3 1 2.2 3.3 + 7 3 2 2.3 3.4 + 8 3 3 2.1 2.9 + >>> l = pd.wide_to_long(df, stubnames='ht', i=['famid', 'birth'], j='age') + >>> l + ... # doctest: +NORMALIZE_WHITESPACE + ht + famid birth age + 1 1 1 2.8 + 2 3.4 + 2 1 2.9 + 2 3.8 + 3 1 2.2 + 2 2.9 + 2 1 1 2.0 + 2 3.2 + 2 1 1.8 + 2 2.8 + 3 1 1.9 + 2 2.4 + 3 1 1 2.2 + 2 3.3 + 2 1 2.3 + 2 3.4 + 3 1 2.1 + 2 2.9 + + Going from long back to wide just takes some creative use of `unstack` + + >>> w = l.unstack() + >>> w.columns = w.columns.map('{0[0]}{0[1]}'.format) + >>> w.reset_index() + famid birth ht1 ht2 + 0 1 1 2.8 3.4 + 1 1 2 2.9 3.8 + 2 1 3 2.2 2.9 + 3 2 1 2.0 3.2 + 4 2 2 1.8 2.8 + 5 2 3 1.9 2.4 + 6 3 1 2.2 3.3 + 7 3 2 2.3 3.4 + 8 3 3 2.1 2.9 + + Less wieldy column names are also handled + + >>> np.random.seed(0) + >>> df = pd.DataFrame({'A(weekly)-2010': np.random.rand(3), + ... 'A(weekly)-2011': np.random.rand(3), + ... 'B(weekly)-2010': np.random.rand(3), + ... 'B(weekly)-2011': np.random.rand(3), + ... 'X' : np.random.randint(3, size=3)}) + >>> df['id'] = df.index + >>> df # doctest: +NORMALIZE_WHITESPACE, +ELLIPSIS + A(weekly)-2010 A(weekly)-2011 B(weekly)-2010 B(weekly)-2011 X id + 0 0.548814 0.544883 0.437587 0.383442 0 0 + 1 0.715189 0.423655 0.891773 0.791725 1 1 + 2 0.602763 0.645894 0.963663 0.528895 1 2 + + >>> pd.wide_to_long(df, ['A(weekly)', 'B(weekly)'], i='id', + ... j='year', sep='-') + ... # doctest: +NORMALIZE_WHITESPACE + X A(weekly) B(weekly) + id year + 0 2010 0 0.548814 0.437587 + 1 2010 1 0.715189 0.891773 + 2 2010 1 0.602763 0.963663 + 0 2011 0 0.544883 0.383442 + 1 2011 1 0.423655 0.791725 + 2 2011 1 0.645894 0.528895 + + If we have many columns, we could also use a regex to find our + stubnames and pass that list on to wide_to_long + + >>> stubnames = sorted( + ... set([match[0] for match in df.columns.str.findall( + ... r'[A-B]\(.*\)').values if match != []]) + ... ) + >>> list(stubnames) + ['A(weekly)', 'B(weekly)'] + + All of the above examples have integers as suffixes. It is possible to + have non-integers as suffixes. + + >>> df = pd.DataFrame({ + ... 'famid': [1, 1, 1, 2, 2, 2, 3, 3, 3], + ... 'birth': [1, 2, 3, 1, 2, 3, 1, 2, 3], + ... 'ht_one': [2.8, 2.9, 2.2, 2, 1.8, 1.9, 2.2, 2.3, 2.1], + ... 'ht_two': [3.4, 3.8, 2.9, 3.2, 2.8, 2.4, 3.3, 3.4, 2.9] + ... }) + >>> df + famid birth ht_one ht_two + 0 1 1 2.8 3.4 + 1 1 2 2.9 3.8 + 2 1 3 2.2 2.9 + 3 2 1 2.0 3.2 + 4 2 2 1.8 2.8 + 5 2 3 1.9 2.4 + 6 3 1 2.2 3.3 + 7 3 2 2.3 3.4 + 8 3 3 2.1 2.9 + + >>> l = pd.wide_to_long(df, stubnames='ht', i=['famid', 'birth'], j='age', + ... sep='_', suffix=r'\w+') + >>> l + ... # doctest: +NORMALIZE_WHITESPACE + ht + famid birth age + 1 1 one 2.8 + two 3.4 + 2 one 2.9 + two 3.8 + 3 one 2.2 + two 2.9 + 2 1 one 2.0 + two 3.2 + 2 one 1.8 + two 2.8 + 3 one 1.9 + two 2.4 + 3 1 one 2.2 + two 3.3 + 2 one 2.3 + two 3.4 + 3 one 2.1 + two 2.9 + """ + + def get_var_names(df, stub: str, sep: str, suffix: str): + regex = rf"^{re.escape(stub)}{re.escape(sep)}{suffix}$" + return df.columns[df.columns.str.match(regex)] + + def melt_stub(df, stub: str, i, j, value_vars, sep: str): + newdf = melt( + df, + id_vars=i, + value_vars=value_vars, + value_name=stub.rstrip(sep), + var_name=j, + ) + newdf[j] = newdf[j].str.replace(re.escape(stub + sep), "", regex=True) + + # GH17627 Cast numerics suffixes to int/float + try: + newdf[j] = to_numeric(newdf[j]) + except (TypeError, ValueError, OverflowError): + # TODO: anything else to catch? + pass + + return newdf.set_index(i + [j]) + + if not is_list_like(stubnames): + stubnames = [stubnames] + else: + stubnames = list(stubnames) + + if df.columns.isin(stubnames).any(): + raise ValueError("stubname can't be identical to a column name") + + if not is_list_like(i): + i = [i] + else: + i = list(i) + + if df[i].duplicated().any(): + raise ValueError("the id variables need to uniquely identify each row") + + _melted = [] + value_vars_flattened = [] + for stub in stubnames: + value_var = get_var_names(df, stub, sep, suffix) + value_vars_flattened.extend(value_var) + _melted.append(melt_stub(df, stub, i, j, value_var, sep)) + + melted = concat(_melted, axis=1) + id_vars = df.columns.difference(value_vars_flattened) + new = df[id_vars] + + if len(i) == 1: + return new.set_index(i).join(melted) + else: + return new.merge(melted.reset_index(), on=i).set_index(i + [j]) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/reshape/merge.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/reshape/merge.py new file mode 100644 index 0000000000000000000000000000000000000000..646f40f6141d836a8ecf5354e56203010f00f6d5 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/reshape/merge.py @@ -0,0 +1,2762 @@ +""" +SQL-style merge routines +""" +from __future__ import annotations + +from collections.abc import ( + Hashable, + Sequence, +) +import datetime +from functools import partial +from typing import ( + TYPE_CHECKING, + Literal, + cast, + final, +) +import uuid +import warnings + +import numpy as np + +from pandas._libs import ( + Timedelta, + hashtable as libhashtable, + join as libjoin, + lib, +) +from pandas._libs.lib import is_range_indexer +from pandas._typing import ( + AnyArrayLike, + ArrayLike, + IndexLabel, + JoinHow, + MergeHow, + Shape, + Suffixes, + npt, +) +from pandas.errors import MergeError +from pandas.util._decorators import ( + Appender, + Substitution, + cache_readonly, +) +from pandas.util._exceptions import find_stack_level + +from pandas.core.dtypes.base import ExtensionDtype +from pandas.core.dtypes.cast import find_common_type +from pandas.core.dtypes.common import ( + ensure_int64, + ensure_object, + is_bool, + is_bool_dtype, + is_float_dtype, + is_integer, + is_integer_dtype, + is_list_like, + is_number, + is_numeric_dtype, + is_object_dtype, + is_string_dtype, + needs_i8_conversion, +) +from pandas.core.dtypes.dtypes import ( + CategoricalDtype, + DatetimeTZDtype, +) +from pandas.core.dtypes.generic import ( + ABCDataFrame, + ABCSeries, +) +from pandas.core.dtypes.missing import ( + isna, + na_value_for_dtype, +) + +from pandas import ( + ArrowDtype, + Categorical, + Index, + MultiIndex, + Series, +) +import pandas.core.algorithms as algos +from pandas.core.arrays import ( + ArrowExtensionArray, + BaseMaskedArray, + ExtensionArray, +) +from pandas.core.arrays.string_ import StringDtype +import pandas.core.common as com +from pandas.core.construction import ( + ensure_wrapped_if_datetimelike, + extract_array, +) +from pandas.core.frame import _merge_doc +from pandas.core.indexes.api import default_index +from pandas.core.sorting import ( + get_group_index, + is_int64_overflow_possible, +) + +if TYPE_CHECKING: + from pandas import DataFrame + from pandas.core import groupby + from pandas.core.arrays import DatetimeArray + from pandas.core.indexes.frozen import FrozenList + +_factorizers = { + np.int64: libhashtable.Int64Factorizer, + np.longlong: libhashtable.Int64Factorizer, + np.int32: libhashtable.Int32Factorizer, + np.int16: libhashtable.Int16Factorizer, + np.int8: libhashtable.Int8Factorizer, + np.uint64: libhashtable.UInt64Factorizer, + np.uint32: libhashtable.UInt32Factorizer, + np.uint16: libhashtable.UInt16Factorizer, + np.uint8: libhashtable.UInt8Factorizer, + np.bool_: libhashtable.UInt8Factorizer, + np.float64: libhashtable.Float64Factorizer, + np.float32: libhashtable.Float32Factorizer, + np.complex64: libhashtable.Complex64Factorizer, + np.complex128: libhashtable.Complex128Factorizer, + np.object_: libhashtable.ObjectFactorizer, +} + +# See https://github.com/pandas-dev/pandas/issues/52451 +if np.intc is not np.int32: + _factorizers[np.intc] = libhashtable.Int64Factorizer + +_known = (np.ndarray, ExtensionArray, Index, ABCSeries) + + +@Substitution("\nleft : DataFrame or named Series") +@Appender(_merge_doc, indents=0) +def merge( + left: DataFrame | Series, + right: DataFrame | Series, + how: MergeHow = "inner", + on: IndexLabel | AnyArrayLike | None = None, + left_on: IndexLabel | AnyArrayLike | None = None, + right_on: IndexLabel | AnyArrayLike | None = None, + left_index: bool = False, + right_index: bool = False, + sort: bool = False, + suffixes: Suffixes = ("_x", "_y"), + copy: bool | None = None, + indicator: str | bool = False, + validate: str | None = None, +) -> DataFrame: + left_df = _validate_operand(left) + right_df = _validate_operand(right) + if how == "cross": + return _cross_merge( + left_df, + right_df, + on=on, + left_on=left_on, + right_on=right_on, + left_index=left_index, + right_index=right_index, + sort=sort, + suffixes=suffixes, + indicator=indicator, + validate=validate, + copy=copy, + ) + else: + op = _MergeOperation( + left_df, + right_df, + how=how, + on=on, + left_on=left_on, + right_on=right_on, + left_index=left_index, + right_index=right_index, + sort=sort, + suffixes=suffixes, + indicator=indicator, + validate=validate, + ) + return op.get_result(copy=copy) + + +def _cross_merge( + left: DataFrame, + right: DataFrame, + on: IndexLabel | AnyArrayLike | None = None, + left_on: IndexLabel | AnyArrayLike | None = None, + right_on: IndexLabel | AnyArrayLike | None = None, + left_index: bool = False, + right_index: bool = False, + sort: bool = False, + suffixes: Suffixes = ("_x", "_y"), + copy: bool | None = None, + indicator: str | bool = False, + validate: str | None = None, +) -> DataFrame: + """ + See merge.__doc__ with how='cross' + """ + + if ( + left_index + or right_index + or right_on is not None + or left_on is not None + or on is not None + ): + raise MergeError( + "Can not pass on, right_on, left_on or set right_index=True or " + "left_index=True" + ) + + cross_col = f"_cross_{uuid.uuid4()}" + left = left.assign(**{cross_col: 1}) + right = right.assign(**{cross_col: 1}) + + left_on = right_on = [cross_col] + + res = merge( + left, + right, + how="inner", + on=on, + left_on=left_on, + right_on=right_on, + left_index=left_index, + right_index=right_index, + sort=sort, + suffixes=suffixes, + indicator=indicator, + validate=validate, + copy=copy, + ) + del res[cross_col] + return res + + +def _groupby_and_merge( + by, left: DataFrame | Series, right: DataFrame | Series, merge_pieces +): + """ + groupby & merge; we are always performing a left-by type operation + + Parameters + ---------- + by: field to group + left: DataFrame + right: DataFrame + merge_pieces: function for merging + """ + pieces = [] + if not isinstance(by, (list, tuple)): + by = [by] + + lby = left.groupby(by, sort=False) + rby: groupby.DataFrameGroupBy | groupby.SeriesGroupBy | None = None + + # if we can groupby the rhs + # then we can get vastly better perf + if all(item in right.columns for item in by): + rby = right.groupby(by, sort=False) + + for key, lhs in lby._grouper.get_iterator(lby._selected_obj, axis=lby.axis): + if rby is None: + rhs = right + else: + try: + rhs = right.take(rby.indices[key]) + except KeyError: + # key doesn't exist in left + lcols = lhs.columns.tolist() + cols = lcols + [r for r in right.columns if r not in set(lcols)] + merged = lhs.reindex(columns=cols) + merged.index = range(len(merged)) + pieces.append(merged) + continue + + merged = merge_pieces(lhs, rhs) + + # make sure join keys are in the merged + # TODO, should merge_pieces do this? + merged[by] = key + + pieces.append(merged) + + # preserve the original order + # if we have a missing piece this can be reset + from pandas.core.reshape.concat import concat + + result = concat(pieces, ignore_index=True) + result = result.reindex(columns=pieces[0].columns, copy=False) + return result, lby + + +def merge_ordered( + left: DataFrame | Series, + right: DataFrame | Series, + on: IndexLabel | None = None, + left_on: IndexLabel | None = None, + right_on: IndexLabel | None = None, + left_by=None, + right_by=None, + fill_method: str | None = None, + suffixes: Suffixes = ("_x", "_y"), + how: JoinHow = "outer", +) -> DataFrame: + """ + Perform a merge for ordered data with optional filling/interpolation. + + Designed for ordered data like time series data. Optionally + perform group-wise merge (see examples). + + Parameters + ---------- + left : DataFrame or named Series + right : DataFrame or named Series + on : label or list + Field names to join on. Must be found in both DataFrames. + left_on : label or list, or array-like + Field names to join on in left DataFrame. Can be a vector or list of + vectors of the length of the DataFrame to use a particular vector as + the join key instead of columns. + right_on : label or list, or array-like + Field names to join on in right DataFrame or vector/list of vectors per + left_on docs. + left_by : column name or list of column names + Group left DataFrame by group columns and merge piece by piece with + right DataFrame. Must be None if either left or right are a Series. + right_by : column name or list of column names + Group right DataFrame by group columns and merge piece by piece with + left DataFrame. Must be None if either left or right are a Series. + fill_method : {'ffill', None}, default None + Interpolation method for data. + suffixes : list-like, default is ("_x", "_y") + A length-2 sequence where each element is optionally a string + indicating the suffix to add to overlapping column names in + `left` and `right` respectively. Pass a value of `None` instead + of a string to indicate that the column name from `left` or + `right` should be left as-is, with no suffix. At least one of the + values must not be None. + + how : {'left', 'right', 'outer', 'inner'}, default 'outer' + * left: use only keys from left frame (SQL: left outer join) + * right: use only keys from right frame (SQL: right outer join) + * outer: use union of keys from both frames (SQL: full outer join) + * inner: use intersection of keys from both frames (SQL: inner join). + + Returns + ------- + DataFrame + The merged DataFrame output type will be the same as + 'left', if it is a subclass of DataFrame. + + See Also + -------- + merge : Merge with a database-style join. + merge_asof : Merge on nearest keys. + + Examples + -------- + >>> from pandas import merge_ordered + >>> df1 = pd.DataFrame( + ... { + ... "key": ["a", "c", "e", "a", "c", "e"], + ... "lvalue": [1, 2, 3, 1, 2, 3], + ... "group": ["a", "a", "a", "b", "b", "b"] + ... } + ... ) + >>> df1 + key lvalue group + 0 a 1 a + 1 c 2 a + 2 e 3 a + 3 a 1 b + 4 c 2 b + 5 e 3 b + + >>> df2 = pd.DataFrame({"key": ["b", "c", "d"], "rvalue": [1, 2, 3]}) + >>> df2 + key rvalue + 0 b 1 + 1 c 2 + 2 d 3 + + >>> merge_ordered(df1, df2, fill_method="ffill", left_by="group") + key lvalue group rvalue + 0 a 1 a NaN + 1 b 1 a 1.0 + 2 c 2 a 2.0 + 3 d 2 a 3.0 + 4 e 3 a 3.0 + 5 a 1 b NaN + 6 b 1 b 1.0 + 7 c 2 b 2.0 + 8 d 2 b 3.0 + 9 e 3 b 3.0 + """ + + def _merger(x, y) -> DataFrame: + # perform the ordered merge operation + op = _OrderedMerge( + x, + y, + on=on, + left_on=left_on, + right_on=right_on, + suffixes=suffixes, + fill_method=fill_method, + how=how, + ) + return op.get_result() + + if left_by is not None and right_by is not None: + raise ValueError("Can only group either left or right frames") + if left_by is not None: + if isinstance(left_by, str): + left_by = [left_by] + check = set(left_by).difference(left.columns) + if len(check) != 0: + raise KeyError(f"{check} not found in left columns") + result, _ = _groupby_and_merge(left_by, left, right, lambda x, y: _merger(x, y)) + elif right_by is not None: + if isinstance(right_by, str): + right_by = [right_by] + check = set(right_by).difference(right.columns) + if len(check) != 0: + raise KeyError(f"{check} not found in right columns") + result, _ = _groupby_and_merge( + right_by, right, left, lambda x, y: _merger(y, x) + ) + else: + result = _merger(left, right) + return result + + +def merge_asof( + left: DataFrame | Series, + right: DataFrame | Series, + on: IndexLabel | None = None, + left_on: IndexLabel | None = None, + right_on: IndexLabel | None = None, + left_index: bool = False, + right_index: bool = False, + by=None, + left_by=None, + right_by=None, + suffixes: Suffixes = ("_x", "_y"), + tolerance: int | Timedelta | None = None, + allow_exact_matches: bool = True, + direction: str = "backward", +) -> DataFrame: + """ + Perform a merge by key distance. + + This is similar to a left-join except that we match on nearest + key rather than equal keys. Both DataFrames must be sorted by the key. + + For each row in the left DataFrame: + + - A "backward" search selects the last row in the right DataFrame whose + 'on' key is less than or equal to the left's key. + + - A "forward" search selects the first row in the right DataFrame whose + 'on' key is greater than or equal to the left's key. + + - A "nearest" search selects the row in the right DataFrame whose 'on' + key is closest in absolute distance to the left's key. + + Optionally match on equivalent keys with 'by' before searching with 'on'. + + Parameters + ---------- + left : DataFrame or named Series + right : DataFrame or named Series + on : label + Field name to join on. Must be found in both DataFrames. + The data MUST be ordered. Furthermore this must be a numeric column, + such as datetimelike, integer, or float. On or left_on/right_on + must be given. + left_on : label + Field name to join on in left DataFrame. + right_on : label + Field name to join on in right DataFrame. + left_index : bool + Use the index of the left DataFrame as the join key. + right_index : bool + Use the index of the right DataFrame as the join key. + by : column name or list of column names + Match on these columns before performing merge operation. + left_by : column name + Field names to match on in the left DataFrame. + right_by : column name + Field names to match on in the right DataFrame. + suffixes : 2-length sequence (tuple, list, ...) + Suffix to apply to overlapping column names in the left and right + side, respectively. + tolerance : int or Timedelta, optional, default None + Select asof tolerance within this range; must be compatible + with the merge index. + allow_exact_matches : bool, default True + + - If True, allow matching with the same 'on' value + (i.e. less-than-or-equal-to / greater-than-or-equal-to) + - If False, don't match the same 'on' value + (i.e., strictly less-than / strictly greater-than). + + direction : 'backward' (default), 'forward', or 'nearest' + Whether to search for prior, subsequent, or closest matches. + + Returns + ------- + DataFrame + + See Also + -------- + merge : Merge with a database-style join. + merge_ordered : Merge with optional filling/interpolation. + + Examples + -------- + >>> left = pd.DataFrame({"a": [1, 5, 10], "left_val": ["a", "b", "c"]}) + >>> left + a left_val + 0 1 a + 1 5 b + 2 10 c + + >>> right = pd.DataFrame({"a": [1, 2, 3, 6, 7], "right_val": [1, 2, 3, 6, 7]}) + >>> right + a right_val + 0 1 1 + 1 2 2 + 2 3 3 + 3 6 6 + 4 7 7 + + >>> pd.merge_asof(left, right, on="a") + a left_val right_val + 0 1 a 1 + 1 5 b 3 + 2 10 c 7 + + >>> pd.merge_asof(left, right, on="a", allow_exact_matches=False) + a left_val right_val + 0 1 a NaN + 1 5 b 3.0 + 2 10 c 7.0 + + >>> pd.merge_asof(left, right, on="a", direction="forward") + a left_val right_val + 0 1 a 1.0 + 1 5 b 6.0 + 2 10 c NaN + + >>> pd.merge_asof(left, right, on="a", direction="nearest") + a left_val right_val + 0 1 a 1 + 1 5 b 6 + 2 10 c 7 + + We can use indexed DataFrames as well. + + >>> left = pd.DataFrame({"left_val": ["a", "b", "c"]}, index=[1, 5, 10]) + >>> left + left_val + 1 a + 5 b + 10 c + + >>> right = pd.DataFrame({"right_val": [1, 2, 3, 6, 7]}, index=[1, 2, 3, 6, 7]) + >>> right + right_val + 1 1 + 2 2 + 3 3 + 6 6 + 7 7 + + >>> pd.merge_asof(left, right, left_index=True, right_index=True) + left_val right_val + 1 a 1 + 5 b 3 + 10 c 7 + + Here is a real-world times-series example + + >>> quotes = pd.DataFrame( + ... { + ... "time": [ + ... pd.Timestamp("2016-05-25 13:30:00.023"), + ... pd.Timestamp("2016-05-25 13:30:00.023"), + ... pd.Timestamp("2016-05-25 13:30:00.030"), + ... pd.Timestamp("2016-05-25 13:30:00.041"), + ... pd.Timestamp("2016-05-25 13:30:00.048"), + ... pd.Timestamp("2016-05-25 13:30:00.049"), + ... pd.Timestamp("2016-05-25 13:30:00.072"), + ... pd.Timestamp("2016-05-25 13:30:00.075") + ... ], + ... "ticker": [ + ... "GOOG", + ... "MSFT", + ... "MSFT", + ... "MSFT", + ... "GOOG", + ... "AAPL", + ... "GOOG", + ... "MSFT" + ... ], + ... "bid": [720.50, 51.95, 51.97, 51.99, 720.50, 97.99, 720.50, 52.01], + ... "ask": [720.93, 51.96, 51.98, 52.00, 720.93, 98.01, 720.88, 52.03] + ... } + ... ) + >>> quotes + time ticker bid ask + 0 2016-05-25 13:30:00.023 GOOG 720.50 720.93 + 1 2016-05-25 13:30:00.023 MSFT 51.95 51.96 + 2 2016-05-25 13:30:00.030 MSFT 51.97 51.98 + 3 2016-05-25 13:30:00.041 MSFT 51.99 52.00 + 4 2016-05-25 13:30:00.048 GOOG 720.50 720.93 + 5 2016-05-25 13:30:00.049 AAPL 97.99 98.01 + 6 2016-05-25 13:30:00.072 GOOG 720.50 720.88 + 7 2016-05-25 13:30:00.075 MSFT 52.01 52.03 + + >>> trades = pd.DataFrame( + ... { + ... "time": [ + ... pd.Timestamp("2016-05-25 13:30:00.023"), + ... pd.Timestamp("2016-05-25 13:30:00.038"), + ... pd.Timestamp("2016-05-25 13:30:00.048"), + ... pd.Timestamp("2016-05-25 13:30:00.048"), + ... pd.Timestamp("2016-05-25 13:30:00.048") + ... ], + ... "ticker": ["MSFT", "MSFT", "GOOG", "GOOG", "AAPL"], + ... "price": [51.95, 51.95, 720.77, 720.92, 98.0], + ... "quantity": [75, 155, 100, 100, 100] + ... } + ... ) + >>> trades + time ticker price quantity + 0 2016-05-25 13:30:00.023 MSFT 51.95 75 + 1 2016-05-25 13:30:00.038 MSFT 51.95 155 + 2 2016-05-25 13:30:00.048 GOOG 720.77 100 + 3 2016-05-25 13:30:00.048 GOOG 720.92 100 + 4 2016-05-25 13:30:00.048 AAPL 98.00 100 + + By default we are taking the asof of the quotes + + >>> pd.merge_asof(trades, quotes, on="time", by="ticker") + time ticker price quantity bid ask + 0 2016-05-25 13:30:00.023 MSFT 51.95 75 51.95 51.96 + 1 2016-05-25 13:30:00.038 MSFT 51.95 155 51.97 51.98 + 2 2016-05-25 13:30:00.048 GOOG 720.77 100 720.50 720.93 + 3 2016-05-25 13:30:00.048 GOOG 720.92 100 720.50 720.93 + 4 2016-05-25 13:30:00.048 AAPL 98.00 100 NaN NaN + + We only asof within 2ms between the quote time and the trade time + + >>> pd.merge_asof( + ... trades, quotes, on="time", by="ticker", tolerance=pd.Timedelta("2ms") + ... ) + time ticker price quantity bid ask + 0 2016-05-25 13:30:00.023 MSFT 51.95 75 51.95 51.96 + 1 2016-05-25 13:30:00.038 MSFT 51.95 155 NaN NaN + 2 2016-05-25 13:30:00.048 GOOG 720.77 100 720.50 720.93 + 3 2016-05-25 13:30:00.048 GOOG 720.92 100 720.50 720.93 + 4 2016-05-25 13:30:00.048 AAPL 98.00 100 NaN NaN + + We only asof within 10ms between the quote time and the trade time + and we exclude exact matches on time. However *prior* data will + propagate forward + + >>> pd.merge_asof( + ... trades, + ... quotes, + ... on="time", + ... by="ticker", + ... tolerance=pd.Timedelta("10ms"), + ... allow_exact_matches=False + ... ) + time ticker price quantity bid ask + 0 2016-05-25 13:30:00.023 MSFT 51.95 75 NaN NaN + 1 2016-05-25 13:30:00.038 MSFT 51.95 155 51.97 51.98 + 2 2016-05-25 13:30:00.048 GOOG 720.77 100 NaN NaN + 3 2016-05-25 13:30:00.048 GOOG 720.92 100 NaN NaN + 4 2016-05-25 13:30:00.048 AAPL 98.00 100 NaN NaN + """ + op = _AsOfMerge( + left, + right, + on=on, + left_on=left_on, + right_on=right_on, + left_index=left_index, + right_index=right_index, + by=by, + left_by=left_by, + right_by=right_by, + suffixes=suffixes, + how="asof", + tolerance=tolerance, + allow_exact_matches=allow_exact_matches, + direction=direction, + ) + return op.get_result() + + +# TODO: transformations?? +# TODO: only copy DataFrames when modification necessary +class _MergeOperation: + """ + Perform a database (SQL) merge operation between two DataFrame or Series + objects using either columns as keys or their row indexes + """ + + _merge_type = "merge" + how: JoinHow | Literal["asof"] + on: IndexLabel | None + # left_on/right_on may be None when passed, but in validate_specification + # get replaced with non-None. + left_on: Sequence[Hashable | AnyArrayLike] + right_on: Sequence[Hashable | AnyArrayLike] + left_index: bool + right_index: bool + sort: bool + suffixes: Suffixes + copy: bool + indicator: str | bool + validate: str | None + join_names: list[Hashable] + right_join_keys: list[ArrayLike] + left_join_keys: list[ArrayLike] + + def __init__( + self, + left: DataFrame | Series, + right: DataFrame | Series, + how: JoinHow | Literal["asof"] = "inner", + on: IndexLabel | AnyArrayLike | None = None, + left_on: IndexLabel | AnyArrayLike | None = None, + right_on: IndexLabel | AnyArrayLike | None = None, + left_index: bool = False, + right_index: bool = False, + sort: bool = True, + suffixes: Suffixes = ("_x", "_y"), + indicator: str | bool = False, + validate: str | None = None, + ) -> None: + _left = _validate_operand(left) + _right = _validate_operand(right) + self.left = self.orig_left = _left + self.right = self.orig_right = _right + self.how = how + + self.on = com.maybe_make_list(on) + + self.suffixes = suffixes + self.sort = sort or how == "outer" + + self.left_index = left_index + self.right_index = right_index + + self.indicator = indicator + + if not is_bool(left_index): + raise ValueError( + f"left_index parameter must be of type bool, not {type(left_index)}" + ) + if not is_bool(right_index): + raise ValueError( + f"right_index parameter must be of type bool, not {type(right_index)}" + ) + + # GH 40993: raise when merging between different levels; enforced in 2.0 + if _left.columns.nlevels != _right.columns.nlevels: + msg = ( + "Not allowed to merge between different levels. " + f"({_left.columns.nlevels} levels on the left, " + f"{_right.columns.nlevels} on the right)" + ) + raise MergeError(msg) + + self.left_on, self.right_on = self._validate_left_right_on(left_on, right_on) + + ( + self.left_join_keys, + self.right_join_keys, + self.join_names, + left_drop, + right_drop, + ) = self._get_merge_keys() + + if left_drop: + self.left = self.left._drop_labels_or_levels(left_drop) + + if right_drop: + self.right = self.right._drop_labels_or_levels(right_drop) + + self._maybe_require_matching_dtypes(self.left_join_keys, self.right_join_keys) + self._validate_tolerance(self.left_join_keys) + + # validate the merge keys dtypes. We may need to coerce + # to avoid incompatible dtypes + self._maybe_coerce_merge_keys() + + # If argument passed to validate, + # check if columns specified as unique + # are in fact unique. + if validate is not None: + self._validate_validate_kwd(validate) + + def _maybe_require_matching_dtypes( + self, left_join_keys: list[ArrayLike], right_join_keys: list[ArrayLike] + ) -> None: + # Overridden by AsOfMerge + pass + + def _validate_tolerance(self, left_join_keys: list[ArrayLike]) -> None: + # Overridden by AsOfMerge + pass + + @final + def _reindex_and_concat( + self, + join_index: Index, + left_indexer: npt.NDArray[np.intp] | None, + right_indexer: npt.NDArray[np.intp] | None, + copy: bool | None, + ) -> DataFrame: + """ + reindex along index and concat along columns. + """ + # Take views so we do not alter the originals + left = self.left[:] + right = self.right[:] + + llabels, rlabels = _items_overlap_with_suffix( + self.left._info_axis, self.right._info_axis, self.suffixes + ) + + if left_indexer is not None and not is_range_indexer(left_indexer, len(left)): + # Pinning the index here (and in the right code just below) is not + # necessary, but makes the `.take` more performant if we have e.g. + # a MultiIndex for left.index. + lmgr = left._mgr.reindex_indexer( + join_index, + left_indexer, + axis=1, + copy=False, + only_slice=True, + allow_dups=True, + use_na_proxy=True, + ) + left = left._constructor_from_mgr(lmgr, axes=lmgr.axes) + left.index = join_index + + if right_indexer is not None and not is_range_indexer( + right_indexer, len(right) + ): + rmgr = right._mgr.reindex_indexer( + join_index, + right_indexer, + axis=1, + copy=False, + only_slice=True, + allow_dups=True, + use_na_proxy=True, + ) + right = right._constructor_from_mgr(rmgr, axes=rmgr.axes) + right.index = join_index + + from pandas import concat + + left.columns = llabels + right.columns = rlabels + result = concat([left, right], axis=1, copy=copy) + return result + + def get_result(self, copy: bool | None = True) -> DataFrame: + if self.indicator: + self.left, self.right = self._indicator_pre_merge(self.left, self.right) + + join_index, left_indexer, right_indexer = self._get_join_info() + + result = self._reindex_and_concat( + join_index, left_indexer, right_indexer, copy=copy + ) + result = result.__finalize__(self, method=self._merge_type) + + if self.indicator: + result = self._indicator_post_merge(result) + + self._maybe_add_join_keys(result, left_indexer, right_indexer) + + self._maybe_restore_index_levels(result) + + return result.__finalize__(self, method="merge") + + @final + @cache_readonly + def _indicator_name(self) -> str | None: + if isinstance(self.indicator, str): + return self.indicator + elif isinstance(self.indicator, bool): + return "_merge" if self.indicator else None + else: + raise ValueError( + "indicator option can only accept boolean or string arguments" + ) + + @final + def _indicator_pre_merge( + self, left: DataFrame, right: DataFrame + ) -> tuple[DataFrame, DataFrame]: + columns = left.columns.union(right.columns) + + for i in ["_left_indicator", "_right_indicator"]: + if i in columns: + raise ValueError( + "Cannot use `indicator=True` option when " + f"data contains a column named {i}" + ) + if self._indicator_name in columns: + raise ValueError( + "Cannot use name of an existing column for indicator column" + ) + + left = left.copy() + right = right.copy() + + left["_left_indicator"] = 1 + left["_left_indicator"] = left["_left_indicator"].astype("int8") + + right["_right_indicator"] = 2 + right["_right_indicator"] = right["_right_indicator"].astype("int8") + + return left, right + + @final + def _indicator_post_merge(self, result: DataFrame) -> DataFrame: + result["_left_indicator"] = result["_left_indicator"].fillna(0) + result["_right_indicator"] = result["_right_indicator"].fillna(0) + + result[self._indicator_name] = Categorical( + (result["_left_indicator"] + result["_right_indicator"]), + categories=[1, 2, 3], + ) + result[self._indicator_name] = result[ + self._indicator_name + ].cat.rename_categories(["left_only", "right_only", "both"]) + + result = result.drop(labels=["_left_indicator", "_right_indicator"], axis=1) + return result + + @final + def _maybe_restore_index_levels(self, result: DataFrame) -> None: + """ + Restore index levels specified as `on` parameters + + Here we check for cases where `self.left_on` and `self.right_on` pairs + each reference an index level in their respective DataFrames. The + joined columns corresponding to these pairs are then restored to the + index of `result`. + + **Note:** This method has side effects. It modifies `result` in-place + + Parameters + ---------- + result: DataFrame + merge result + + Returns + ------- + None + """ + names_to_restore = [] + for name, left_key, right_key in zip( + self.join_names, self.left_on, self.right_on + ): + if ( + # Argument 1 to "_is_level_reference" of "NDFrame" has incompatible + # type "Union[Hashable, ExtensionArray, Index, Series]"; expected + # "Hashable" + self.orig_left._is_level_reference(left_key) # type: ignore[arg-type] + # Argument 1 to "_is_level_reference" of "NDFrame" has incompatible + # type "Union[Hashable, ExtensionArray, Index, Series]"; expected + # "Hashable" + and self.orig_right._is_level_reference( + right_key # type: ignore[arg-type] + ) + and left_key == right_key + and name not in result.index.names + ): + names_to_restore.append(name) + + if names_to_restore: + result.set_index(names_to_restore, inplace=True) + + @final + def _maybe_add_join_keys( + self, + result: DataFrame, + left_indexer: npt.NDArray[np.intp] | None, + right_indexer: npt.NDArray[np.intp] | None, + ) -> None: + left_has_missing = None + right_has_missing = None + + assert all(isinstance(x, _known) for x in self.left_join_keys) + + keys = zip(self.join_names, self.left_on, self.right_on) + for i, (name, lname, rname) in enumerate(keys): + if not _should_fill(lname, rname): + continue + + take_left, take_right = None, None + + if name in result: + if left_indexer is not None or right_indexer is not None: + if name in self.left: + if left_has_missing is None: + left_has_missing = ( + False + if left_indexer is None + else (left_indexer == -1).any() + ) + + if left_has_missing: + take_right = self.right_join_keys[i] + + if result[name].dtype != self.left[name].dtype: + take_left = self.left[name]._values + + elif name in self.right: + if right_has_missing is None: + right_has_missing = ( + False + if right_indexer is None + else (right_indexer == -1).any() + ) + + if right_has_missing: + take_left = self.left_join_keys[i] + + if result[name].dtype != self.right[name].dtype: + take_right = self.right[name]._values + + else: + take_left = self.left_join_keys[i] + take_right = self.right_join_keys[i] + + if take_left is not None or take_right is not None: + if take_left is None: + lvals = result[name]._values + elif left_indexer is None: + lvals = take_left + else: + # TODO: can we pin down take_left's type earlier? + take_left = extract_array(take_left, extract_numpy=True) + lfill = na_value_for_dtype(take_left.dtype) + lvals = algos.take_nd(take_left, left_indexer, fill_value=lfill) + + if take_right is None: + rvals = result[name]._values + elif right_indexer is None: + rvals = take_right + else: + # TODO: can we pin down take_right's type earlier? + taker = extract_array(take_right, extract_numpy=True) + rfill = na_value_for_dtype(taker.dtype) + rvals = algos.take_nd(taker, right_indexer, fill_value=rfill) + + # if we have an all missing left_indexer + # make sure to just use the right values or vice-versa + if left_indexer is not None and (left_indexer == -1).all(): + key_col = Index(rvals) + result_dtype = rvals.dtype + elif right_indexer is not None and (right_indexer == -1).all(): + key_col = Index(lvals) + result_dtype = lvals.dtype + else: + key_col = Index(lvals) + if left_indexer is not None: + mask_left = left_indexer == -1 + key_col = key_col.where(~mask_left, rvals) + result_dtype = find_common_type([lvals.dtype, rvals.dtype]) + if ( + lvals.dtype.kind == "M" + and rvals.dtype.kind == "M" + and result_dtype.kind == "O" + ): + # TODO(non-nano) Workaround for common_type not dealing + # with different resolutions + result_dtype = key_col.dtype + + if result._is_label_reference(name): + result[name] = result._constructor_sliced( + key_col, dtype=result_dtype, index=result.index + ) + elif result._is_level_reference(name): + if isinstance(result.index, MultiIndex): + key_col.name = name + idx_list = [ + result.index.get_level_values(level_name) + if level_name != name + else key_col + for level_name in result.index.names + ] + + result.set_index(idx_list, inplace=True) + else: + result.index = Index(key_col, name=name) + else: + result.insert(i, name or f"key_{i}", key_col) + + def _get_join_indexers( + self, + ) -> tuple[npt.NDArray[np.intp] | None, npt.NDArray[np.intp] | None]: + """return the join indexers""" + # make mypy happy + assert self.how != "asof" + return get_join_indexers( + self.left_join_keys, self.right_join_keys, sort=self.sort, how=self.how + ) + + @final + def _get_join_info( + self, + ) -> tuple[Index, npt.NDArray[np.intp] | None, npt.NDArray[np.intp] | None]: + left_ax = self.left.index + right_ax = self.right.index + + if self.left_index and self.right_index and self.how != "asof": + join_index, left_indexer, right_indexer = left_ax.join( + right_ax, how=self.how, return_indexers=True, sort=self.sort + ) + + elif self.right_index and self.how == "left": + join_index, left_indexer, right_indexer = _left_join_on_index( + left_ax, right_ax, self.left_join_keys, sort=self.sort + ) + + elif self.left_index and self.how == "right": + join_index, right_indexer, left_indexer = _left_join_on_index( + right_ax, left_ax, self.right_join_keys, sort=self.sort + ) + else: + (left_indexer, right_indexer) = self._get_join_indexers() + + if self.right_index: + if len(self.left) > 0: + join_index = self._create_join_index( + left_ax, + right_ax, + left_indexer, + how="right", + ) + elif right_indexer is None: + join_index = right_ax.copy() + else: + join_index = right_ax.take(right_indexer) + elif self.left_index: + if self.how == "asof": + # GH#33463 asof should always behave like a left merge + join_index = self._create_join_index( + left_ax, + right_ax, + left_indexer, + how="left", + ) + + elif len(self.right) > 0: + join_index = self._create_join_index( + right_ax, + left_ax, + right_indexer, + how="left", + ) + elif left_indexer is None: + join_index = left_ax.copy() + else: + join_index = left_ax.take(left_indexer) + else: + n = len(left_ax) if left_indexer is None else len(left_indexer) + join_index = default_index(n) + + return join_index, left_indexer, right_indexer + + @final + def _create_join_index( + self, + index: Index, + other_index: Index, + indexer: npt.NDArray[np.intp] | None, + how: JoinHow = "left", + ) -> Index: + """ + Create a join index by rearranging one index to match another + + Parameters + ---------- + index : Index + index being rearranged + other_index : Index + used to supply values not found in index + indexer : np.ndarray[np.intp] or None + how to rearrange index + how : str + Replacement is only necessary if indexer based on other_index. + + Returns + ------- + Index + """ + if self.how in (how, "outer") and not isinstance(other_index, MultiIndex): + # if final index requires values in other_index but not target + # index, indexer may hold missing (-1) values, causing Index.take + # to take the final value in target index. So, we set the last + # element to be the desired fill value. We do not use allow_fill + # and fill_value because it throws a ValueError on integer indices + mask = indexer == -1 + if np.any(mask): + fill_value = na_value_for_dtype(index.dtype, compat=False) + index = index.append(Index([fill_value])) + if indexer is None: + return index.copy() + return index.take(indexer) + + @final + def _get_merge_keys( + self, + ) -> tuple[ + list[ArrayLike], + list[ArrayLike], + list[Hashable], + list[Hashable], + list[Hashable], + ]: + """ + Returns + ------- + left_keys, right_keys, join_names, left_drop, right_drop + """ + left_keys: list[ArrayLike] = [] + right_keys: list[ArrayLike] = [] + join_names: list[Hashable] = [] + right_drop: list[Hashable] = [] + left_drop: list[Hashable] = [] + + left, right = self.left, self.right + + is_lkey = lambda x: isinstance(x, _known) and len(x) == len(left) + is_rkey = lambda x: isinstance(x, _known) and len(x) == len(right) + + # Note that pd.merge_asof() has separate 'on' and 'by' parameters. A + # user could, for example, request 'left_index' and 'left_by'. In a + # regular pd.merge(), users cannot specify both 'left_index' and + # 'left_on'. (Instead, users have a MultiIndex). That means the + # self.left_on in this function is always empty in a pd.merge(), but + # a pd.merge_asof(left_index=True, left_by=...) will result in a + # self.left_on array with a None in the middle of it. This requires + # a work-around as designated in the code below. + # See _validate_left_right_on() for where this happens. + + # ugh, spaghetti re #733 + if _any(self.left_on) and _any(self.right_on): + for lk, rk in zip(self.left_on, self.right_on): + lk = extract_array(lk, extract_numpy=True) + rk = extract_array(rk, extract_numpy=True) + if is_lkey(lk): + lk = cast(ArrayLike, lk) + left_keys.append(lk) + if is_rkey(rk): + rk = cast(ArrayLike, rk) + right_keys.append(rk) + join_names.append(None) # what to do? + else: + # Then we're either Hashable or a wrong-length arraylike, + # the latter of which will raise + rk = cast(Hashable, rk) + if rk is not None: + right_keys.append(right._get_label_or_level_values(rk)) + join_names.append(rk) + else: + # work-around for merge_asof(right_index=True) + right_keys.append(right.index._values) + join_names.append(right.index.name) + else: + if not is_rkey(rk): + # Then we're either Hashable or a wrong-length arraylike, + # the latter of which will raise + rk = cast(Hashable, rk) + if rk is not None: + right_keys.append(right._get_label_or_level_values(rk)) + else: + # work-around for merge_asof(right_index=True) + right_keys.append(right.index._values) + if lk is not None and lk == rk: # FIXME: what about other NAs? + right_drop.append(rk) + else: + rk = cast(ArrayLike, rk) + right_keys.append(rk) + if lk is not None: + # Then we're either Hashable or a wrong-length arraylike, + # the latter of which will raise + lk = cast(Hashable, lk) + left_keys.append(left._get_label_or_level_values(lk)) + join_names.append(lk) + else: + # work-around for merge_asof(left_index=True) + left_keys.append(left.index._values) + join_names.append(left.index.name) + elif _any(self.left_on): + for k in self.left_on: + if is_lkey(k): + k = extract_array(k, extract_numpy=True) + k = cast(ArrayLike, k) + left_keys.append(k) + join_names.append(None) + else: + # Then we're either Hashable or a wrong-length arraylike, + # the latter of which will raise + k = cast(Hashable, k) + left_keys.append(left._get_label_or_level_values(k)) + join_names.append(k) + if isinstance(self.right.index, MultiIndex): + right_keys = [ + lev._values.take(lev_codes) + for lev, lev_codes in zip( + self.right.index.levels, self.right.index.codes + ) + ] + else: + right_keys = [self.right.index._values] + elif _any(self.right_on): + for k in self.right_on: + k = extract_array(k, extract_numpy=True) + if is_rkey(k): + k = cast(ArrayLike, k) + right_keys.append(k) + join_names.append(None) + else: + # Then we're either Hashable or a wrong-length arraylike, + # the latter of which will raise + k = cast(Hashable, k) + right_keys.append(right._get_label_or_level_values(k)) + join_names.append(k) + if isinstance(self.left.index, MultiIndex): + left_keys = [ + lev._values.take(lev_codes) + for lev, lev_codes in zip( + self.left.index.levels, self.left.index.codes + ) + ] + else: + left_keys = [self.left.index._values] + + return left_keys, right_keys, join_names, left_drop, right_drop + + @final + def _maybe_coerce_merge_keys(self) -> None: + # we have valid merges but we may have to further + # coerce these if they are originally incompatible types + # + # for example if these are categorical, but are not dtype_equal + # or if we have object and integer dtypes + + for lk, rk, name in zip( + self.left_join_keys, self.right_join_keys, self.join_names + ): + if (len(lk) and not len(rk)) or (not len(lk) and len(rk)): + continue + + lk = extract_array(lk, extract_numpy=True) + rk = extract_array(rk, extract_numpy=True) + + lk_is_cat = isinstance(lk.dtype, CategoricalDtype) + rk_is_cat = isinstance(rk.dtype, CategoricalDtype) + lk_is_object_or_string = is_object_dtype(lk.dtype) or is_string_dtype( + lk.dtype + ) + rk_is_object_or_string = is_object_dtype(rk.dtype) or is_string_dtype( + rk.dtype + ) + + # if either left or right is a categorical + # then the must match exactly in categories & ordered + if lk_is_cat and rk_is_cat: + lk = cast(Categorical, lk) + rk = cast(Categorical, rk) + if lk._categories_match_up_to_permutation(rk): + continue + + elif lk_is_cat or rk_is_cat: + pass + + elif lk.dtype == rk.dtype: + continue + + msg = ( + f"You are trying to merge on {lk.dtype} and {rk.dtype} columns " + f"for key '{name}'. If you wish to proceed you should use pd.concat" + ) + + # if we are numeric, then allow differing + # kinds to proceed, eg. int64 and int8, int and float + # further if we are object, but we infer to + # the same, then proceed + if is_numeric_dtype(lk.dtype) and is_numeric_dtype(rk.dtype): + if lk.dtype.kind == rk.dtype.kind: + continue + + if isinstance(lk.dtype, ExtensionDtype) and not isinstance( + rk.dtype, ExtensionDtype + ): + ct = find_common_type([lk.dtype, rk.dtype]) + if isinstance(ct, ExtensionDtype): + com_cls = ct.construct_array_type() + rk = com_cls._from_sequence(rk, dtype=ct, copy=False) + else: + rk = rk.astype(ct) + elif isinstance(rk.dtype, ExtensionDtype): + ct = find_common_type([lk.dtype, rk.dtype]) + if isinstance(ct, ExtensionDtype): + com_cls = ct.construct_array_type() + lk = com_cls._from_sequence(lk, dtype=ct, copy=False) + else: + lk = lk.astype(ct) + + # check whether ints and floats + if is_integer_dtype(rk.dtype) and is_float_dtype(lk.dtype): + # GH 47391 numpy > 1.24 will raise a RuntimeError for nan -> int + with np.errstate(invalid="ignore"): + # error: Argument 1 to "astype" of "ndarray" has incompatible + # type "Union[ExtensionDtype, Any, dtype[Any]]"; expected + # "Union[dtype[Any], Type[Any], _SupportsDType[dtype[Any]]]" + casted = lk.astype(rk.dtype) # type: ignore[arg-type] + + mask = ~np.isnan(lk) + match = lk == casted + if not match[mask].all(): + warnings.warn( + "You are merging on int and float " + "columns where the float values " + "are not equal to their int representation.", + UserWarning, + stacklevel=find_stack_level(), + ) + continue + + if is_float_dtype(rk.dtype) and is_integer_dtype(lk.dtype): + # GH 47391 numpy > 1.24 will raise a RuntimeError for nan -> int + with np.errstate(invalid="ignore"): + # error: Argument 1 to "astype" of "ndarray" has incompatible + # type "Union[ExtensionDtype, Any, dtype[Any]]"; expected + # "Union[dtype[Any], Type[Any], _SupportsDType[dtype[Any]]]" + casted = rk.astype(lk.dtype) # type: ignore[arg-type] + + mask = ~np.isnan(rk) + match = rk == casted + if not match[mask].all(): + warnings.warn( + "You are merging on int and float " + "columns where the float values " + "are not equal to their int representation.", + UserWarning, + stacklevel=find_stack_level(), + ) + continue + + # let's infer and see if we are ok + if lib.infer_dtype(lk, skipna=False) == lib.infer_dtype( + rk, skipna=False + ): + continue + + # Check if we are trying to merge on obviously + # incompatible dtypes GH 9780, GH 15800 + + # bool values are coerced to object + elif (lk_is_object_or_string and is_bool_dtype(rk.dtype)) or ( + is_bool_dtype(lk.dtype) and rk_is_object_or_string + ): + pass + + # object values are allowed to be merged + elif (lk_is_object_or_string and is_numeric_dtype(rk.dtype)) or ( + is_numeric_dtype(lk.dtype) and rk_is_object_or_string + ): + inferred_left = lib.infer_dtype(lk, skipna=False) + inferred_right = lib.infer_dtype(rk, skipna=False) + bool_types = ["integer", "mixed-integer", "boolean", "empty"] + string_types = ["string", "unicode", "mixed", "bytes", "empty"] + + # inferred bool + if inferred_left in bool_types and inferred_right in bool_types: + pass + + # unless we are merging non-string-like with string-like + elif ( + inferred_left in string_types and inferred_right not in string_types + ) or ( + inferred_right in string_types and inferred_left not in string_types + ): + raise ValueError(msg) + + # datetimelikes must match exactly + elif needs_i8_conversion(lk.dtype) and not needs_i8_conversion(rk.dtype): + raise ValueError(msg) + elif not needs_i8_conversion(lk.dtype) and needs_i8_conversion(rk.dtype): + raise ValueError(msg) + elif isinstance(lk.dtype, DatetimeTZDtype) and not isinstance( + rk.dtype, DatetimeTZDtype + ): + raise ValueError(msg) + elif not isinstance(lk.dtype, DatetimeTZDtype) and isinstance( + rk.dtype, DatetimeTZDtype + ): + raise ValueError(msg) + elif ( + isinstance(lk.dtype, DatetimeTZDtype) + and isinstance(rk.dtype, DatetimeTZDtype) + ) or (lk.dtype.kind == "M" and rk.dtype.kind == "M"): + # allows datetime with different resolutions + continue + # datetime and timedelta not allowed + elif lk.dtype.kind == "M" and rk.dtype.kind == "m": + raise ValueError(msg) + elif lk.dtype.kind == "m" and rk.dtype.kind == "M": + raise ValueError(msg) + + elif is_object_dtype(lk.dtype) and is_object_dtype(rk.dtype): + continue + + # Houston, we have a problem! + # let's coerce to object if the dtypes aren't + # categorical, otherwise coerce to the category + # dtype. If we coerced categories to object, + # then we would lose type information on some + # columns, and end up trying to merge + # incompatible dtypes. See GH 16900. + if name in self.left.columns: + typ = cast(Categorical, lk).categories.dtype if lk_is_cat else object + self.left = self.left.copy() + self.left[name] = self.left[name].astype(typ) + if name in self.right.columns: + typ = cast(Categorical, rk).categories.dtype if rk_is_cat else object + self.right = self.right.copy() + self.right[name] = self.right[name].astype(typ) + + def _validate_left_right_on(self, left_on, right_on): + left_on = com.maybe_make_list(left_on) + right_on = com.maybe_make_list(right_on) + + # Hm, any way to make this logic less complicated?? + if self.on is None and left_on is None and right_on is None: + if self.left_index and self.right_index: + left_on, right_on = (), () + elif self.left_index: + raise MergeError("Must pass right_on or right_index=True") + elif self.right_index: + raise MergeError("Must pass left_on or left_index=True") + else: + # use the common columns + left_cols = self.left.columns + right_cols = self.right.columns + common_cols = left_cols.intersection(right_cols) + if len(common_cols) == 0: + raise MergeError( + "No common columns to perform merge on. " + f"Merge options: left_on={left_on}, " + f"right_on={right_on}, " + f"left_index={self.left_index}, " + f"right_index={self.right_index}" + ) + if ( + not left_cols.join(common_cols, how="inner").is_unique + or not right_cols.join(common_cols, how="inner").is_unique + ): + raise MergeError(f"Data columns not unique: {repr(common_cols)}") + left_on = right_on = common_cols + elif self.on is not None: + if left_on is not None or right_on is not None: + raise MergeError( + 'Can only pass argument "on" OR "left_on" ' + 'and "right_on", not a combination of both.' + ) + if self.left_index or self.right_index: + raise MergeError( + 'Can only pass argument "on" OR "left_index" ' + 'and "right_index", not a combination of both.' + ) + left_on = right_on = self.on + elif left_on is not None: + if self.left_index: + raise MergeError( + 'Can only pass argument "left_on" OR "left_index" not both.' + ) + if not self.right_index and right_on is None: + raise MergeError('Must pass "right_on" OR "right_index".') + n = len(left_on) + if self.right_index: + if len(left_on) != self.right.index.nlevels: + raise ValueError( + "len(left_on) must equal the number " + 'of levels in the index of "right"' + ) + right_on = [None] * n + elif right_on is not None: + if self.right_index: + raise MergeError( + 'Can only pass argument "right_on" OR "right_index" not both.' + ) + if not self.left_index and left_on is None: + raise MergeError('Must pass "left_on" OR "left_index".') + n = len(right_on) + if self.left_index: + if len(right_on) != self.left.index.nlevels: + raise ValueError( + "len(right_on) must equal the number " + 'of levels in the index of "left"' + ) + left_on = [None] * n + if len(right_on) != len(left_on): + raise ValueError("len(right_on) must equal len(left_on)") + + return left_on, right_on + + @final + def _validate_validate_kwd(self, validate: str) -> None: + # Check uniqueness of each + if self.left_index: + left_unique = self.orig_left.index.is_unique + else: + left_unique = MultiIndex.from_arrays(self.left_join_keys).is_unique + + if self.right_index: + right_unique = self.orig_right.index.is_unique + else: + right_unique = MultiIndex.from_arrays(self.right_join_keys).is_unique + + # Check data integrity + if validate in ["one_to_one", "1:1"]: + if not left_unique and not right_unique: + raise MergeError( + "Merge keys are not unique in either left " + "or right dataset; not a one-to-one merge" + ) + if not left_unique: + raise MergeError( + "Merge keys are not unique in left dataset; not a one-to-one merge" + ) + if not right_unique: + raise MergeError( + "Merge keys are not unique in right dataset; not a one-to-one merge" + ) + + elif validate in ["one_to_many", "1:m"]: + if not left_unique: + raise MergeError( + "Merge keys are not unique in left dataset; not a one-to-many merge" + ) + + elif validate in ["many_to_one", "m:1"]: + if not right_unique: + raise MergeError( + "Merge keys are not unique in right dataset; " + "not a many-to-one merge" + ) + + elif validate in ["many_to_many", "m:m"]: + pass + + else: + raise ValueError( + f'"{validate}" is not a valid argument. ' + "Valid arguments are:\n" + '- "1:1"\n' + '- "1:m"\n' + '- "m:1"\n' + '- "m:m"\n' + '- "one_to_one"\n' + '- "one_to_many"\n' + '- "many_to_one"\n' + '- "many_to_many"' + ) + + +def get_join_indexers( + left_keys: list[ArrayLike], + right_keys: list[ArrayLike], + sort: bool = False, + how: JoinHow = "inner", +) -> tuple[npt.NDArray[np.intp] | None, npt.NDArray[np.intp] | None]: + """ + + Parameters + ---------- + left_keys : list[ndarray, ExtensionArray, Index, Series] + right_keys : list[ndarray, ExtensionArray, Index, Series] + sort : bool, default False + how : {'inner', 'outer', 'left', 'right'}, default 'inner' + + Returns + ------- + np.ndarray[np.intp] or None + Indexer into the left_keys. + np.ndarray[np.intp] or None + Indexer into the right_keys. + """ + assert len(left_keys) == len( + right_keys + ), "left_keys and right_keys must be the same length" + + # fast-path for empty left/right + left_n = len(left_keys[0]) + right_n = len(right_keys[0]) + if left_n == 0: + if how in ["left", "inner"]: + return _get_empty_indexer() + elif not sort and how in ["right", "outer"]: + return _get_no_sort_one_missing_indexer(right_n, True) + elif right_n == 0: + if how in ["right", "inner"]: + return _get_empty_indexer() + elif not sort and how in ["left", "outer"]: + return _get_no_sort_one_missing_indexer(left_n, False) + + lkey: ArrayLike + rkey: ArrayLike + if len(left_keys) > 1: + # get left & right join labels and num. of levels at each location + mapped = ( + _factorize_keys(left_keys[n], right_keys[n], sort=sort) + for n in range(len(left_keys)) + ) + zipped = zip(*mapped) + llab, rlab, shape = (list(x) for x in zipped) + + # get flat i8 keys from label lists + lkey, rkey = _get_join_keys(llab, rlab, tuple(shape), sort) + else: + lkey = left_keys[0] + rkey = right_keys[0] + + left = Index(lkey) + right = Index(rkey) + + if ( + left.is_monotonic_increasing + and right.is_monotonic_increasing + and (left.is_unique or right.is_unique) + ): + _, lidx, ridx = left.join(right, how=how, return_indexers=True, sort=sort) + else: + lidx, ridx = get_join_indexers_non_unique( + left._values, right._values, sort, how + ) + + if lidx is not None and is_range_indexer(lidx, len(left)): + lidx = None + if ridx is not None and is_range_indexer(ridx, len(right)): + ridx = None + return lidx, ridx + + +def get_join_indexers_non_unique( + left: ArrayLike, + right: ArrayLike, + sort: bool = False, + how: JoinHow = "inner", +) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]]: + """ + Get join indexers for left and right. + + Parameters + ---------- + left : ArrayLike + right : ArrayLike + sort : bool, default False + how : {'inner', 'outer', 'left', 'right'}, default 'inner' + + Returns + ------- + np.ndarray[np.intp] + Indexer into left. + np.ndarray[np.intp] + Indexer into right. + """ + lkey, rkey, count = _factorize_keys(left, right, sort=sort) + if how == "left": + lidx, ridx = libjoin.left_outer_join(lkey, rkey, count, sort=sort) + elif how == "right": + ridx, lidx = libjoin.left_outer_join(rkey, lkey, count, sort=sort) + elif how == "inner": + lidx, ridx = libjoin.inner_join(lkey, rkey, count, sort=sort) + elif how == "outer": + lidx, ridx = libjoin.full_outer_join(lkey, rkey, count) + return lidx, ridx + + +def restore_dropped_levels_multijoin( + left: MultiIndex, + right: MultiIndex, + dropped_level_names, + join_index: Index, + lindexer: npt.NDArray[np.intp], + rindexer: npt.NDArray[np.intp], +) -> tuple[FrozenList, FrozenList, FrozenList]: + """ + *this is an internal non-public method* + + Returns the levels, labels and names of a multi-index to multi-index join. + Depending on the type of join, this method restores the appropriate + dropped levels of the joined multi-index. + The method relies on lindexer, rindexer which hold the index positions of + left and right, where a join was feasible + + Parameters + ---------- + left : MultiIndex + left index + right : MultiIndex + right index + dropped_level_names : str array + list of non-common level names + join_index : Index + the index of the join between the + common levels of left and right + lindexer : np.ndarray[np.intp] + left indexer + rindexer : np.ndarray[np.intp] + right indexer + + Returns + ------- + levels : list of Index + levels of combined multiindexes + labels : np.ndarray[np.intp] + labels of combined multiindexes + names : List[Hashable] + names of combined multiindex levels + + """ + + def _convert_to_multiindex(index: Index) -> MultiIndex: + if isinstance(index, MultiIndex): + return index + else: + return MultiIndex.from_arrays([index._values], names=[index.name]) + + # For multi-multi joins with one overlapping level, + # the returned index if of type Index + # Assure that join_index is of type MultiIndex + # so that dropped levels can be appended + join_index = _convert_to_multiindex(join_index) + + join_levels = join_index.levels + join_codes = join_index.codes + join_names = join_index.names + + # Iterate through the levels that must be restored + for dropped_level_name in dropped_level_names: + if dropped_level_name in left.names: + idx = left + indexer = lindexer + else: + idx = right + indexer = rindexer + + # The index of the level name to be restored + name_idx = idx.names.index(dropped_level_name) + + restore_levels = idx.levels[name_idx] + # Inject -1 in the codes list where a join was not possible + # IOW indexer[i]=-1 + codes = idx.codes[name_idx] + if indexer is None: + restore_codes = codes + else: + restore_codes = algos.take_nd(codes, indexer, fill_value=-1) + + # error: Cannot determine type of "__add__" + join_levels = join_levels + [restore_levels] # type: ignore[has-type] + join_codes = join_codes + [restore_codes] # type: ignore[has-type] + join_names = join_names + [dropped_level_name] + + return join_levels, join_codes, join_names + + +class _OrderedMerge(_MergeOperation): + _merge_type = "ordered_merge" + + def __init__( + self, + left: DataFrame | Series, + right: DataFrame | Series, + on: IndexLabel | None = None, + left_on: IndexLabel | None = None, + right_on: IndexLabel | None = None, + left_index: bool = False, + right_index: bool = False, + suffixes: Suffixes = ("_x", "_y"), + fill_method: str | None = None, + how: JoinHow | Literal["asof"] = "outer", + ) -> None: + self.fill_method = fill_method + _MergeOperation.__init__( + self, + left, + right, + on=on, + left_on=left_on, + left_index=left_index, + right_index=right_index, + right_on=right_on, + how=how, + suffixes=suffixes, + sort=True, # factorize sorts + ) + + def get_result(self, copy: bool | None = True) -> DataFrame: + join_index, left_indexer, right_indexer = self._get_join_info() + + left_join_indexer: npt.NDArray[np.intp] | None + right_join_indexer: npt.NDArray[np.intp] | None + + if self.fill_method == "ffill": + if left_indexer is None: + left_join_indexer = None + else: + left_join_indexer = libjoin.ffill_indexer(left_indexer) + if right_indexer is None: + right_join_indexer = None + else: + right_join_indexer = libjoin.ffill_indexer(right_indexer) + elif self.fill_method is None: + left_join_indexer = left_indexer + right_join_indexer = right_indexer + else: + raise ValueError("fill_method must be 'ffill' or None") + + result = self._reindex_and_concat( + join_index, left_join_indexer, right_join_indexer, copy=copy + ) + self._maybe_add_join_keys(result, left_indexer, right_indexer) + + return result + + +def _asof_by_function(direction: str): + name = f"asof_join_{direction}_on_X_by_Y" + return getattr(libjoin, name, None) + + +class _AsOfMerge(_OrderedMerge): + _merge_type = "asof_merge" + + def __init__( + self, + left: DataFrame | Series, + right: DataFrame | Series, + on: IndexLabel | None = None, + left_on: IndexLabel | None = None, + right_on: IndexLabel | None = None, + left_index: bool = False, + right_index: bool = False, + by=None, + left_by=None, + right_by=None, + suffixes: Suffixes = ("_x", "_y"), + how: Literal["asof"] = "asof", + tolerance=None, + allow_exact_matches: bool = True, + direction: str = "backward", + ) -> None: + self.by = by + self.left_by = left_by + self.right_by = right_by + self.tolerance = tolerance + self.allow_exact_matches = allow_exact_matches + self.direction = direction + + # check 'direction' is valid + if self.direction not in ["backward", "forward", "nearest"]: + raise MergeError(f"direction invalid: {self.direction}") + + # validate allow_exact_matches + if not is_bool(self.allow_exact_matches): + msg = ( + "allow_exact_matches must be boolean, " + f"passed {self.allow_exact_matches}" + ) + raise MergeError(msg) + + _OrderedMerge.__init__( + self, + left, + right, + on=on, + left_on=left_on, + right_on=right_on, + left_index=left_index, + right_index=right_index, + how=how, + suffixes=suffixes, + fill_method=None, + ) + + def _validate_left_right_on(self, left_on, right_on): + left_on, right_on = super()._validate_left_right_on(left_on, right_on) + + # we only allow on to be a single item for on + if len(left_on) != 1 and not self.left_index: + raise MergeError("can only asof on a key for left") + + if len(right_on) != 1 and not self.right_index: + raise MergeError("can only asof on a key for right") + + if self.left_index and isinstance(self.left.index, MultiIndex): + raise MergeError("left can only have one index") + + if self.right_index and isinstance(self.right.index, MultiIndex): + raise MergeError("right can only have one index") + + # set 'by' columns + if self.by is not None: + if self.left_by is not None or self.right_by is not None: + raise MergeError("Can only pass by OR left_by and right_by") + self.left_by = self.right_by = self.by + if self.left_by is None and self.right_by is not None: + raise MergeError("missing left_by") + if self.left_by is not None and self.right_by is None: + raise MergeError("missing right_by") + + # GH#29130 Check that merge keys do not have dtype object + if not self.left_index: + left_on_0 = left_on[0] + if isinstance(left_on_0, _known): + lo_dtype = left_on_0.dtype + else: + lo_dtype = ( + self.left._get_label_or_level_values(left_on_0).dtype + if left_on_0 in self.left.columns + else self.left.index.get_level_values(left_on_0) + ) + else: + lo_dtype = self.left.index.dtype + + if not self.right_index: + right_on_0 = right_on[0] + if isinstance(right_on_0, _known): + ro_dtype = right_on_0.dtype + else: + ro_dtype = ( + self.right._get_label_or_level_values(right_on_0).dtype + if right_on_0 in self.right.columns + else self.right.index.get_level_values(right_on_0) + ) + else: + ro_dtype = self.right.index.dtype + + if ( + is_object_dtype(lo_dtype) + or is_object_dtype(ro_dtype) + or is_string_dtype(lo_dtype) + or is_string_dtype(ro_dtype) + ): + raise MergeError( + f"Incompatible merge dtype, {repr(ro_dtype)} and " + f"{repr(lo_dtype)}, both sides must have numeric dtype" + ) + + # add 'by' to our key-list so we can have it in the + # output as a key + if self.left_by is not None: + if not is_list_like(self.left_by): + self.left_by = [self.left_by] + if not is_list_like(self.right_by): + self.right_by = [self.right_by] + + if len(self.left_by) != len(self.right_by): + raise MergeError("left_by and right_by must be the same length") + + left_on = self.left_by + list(left_on) + right_on = self.right_by + list(right_on) + + return left_on, right_on + + def _maybe_require_matching_dtypes( + self, left_join_keys: list[ArrayLike], right_join_keys: list[ArrayLike] + ) -> None: + # TODO: why do we do this for AsOfMerge but not the others? + + def _check_dtype_match(left: ArrayLike, right: ArrayLike, i: int): + if left.dtype != right.dtype: + if isinstance(left.dtype, CategoricalDtype) and isinstance( + right.dtype, CategoricalDtype + ): + # The generic error message is confusing for categoricals. + # + # In this function, the join keys include both the original + # ones of the merge_asof() call, and also the keys passed + # to its by= argument. Unordered but equal categories + # are not supported for the former, but will fail + # later with a ValueError, so we don't *need* to check + # for them here. + msg = ( + f"incompatible merge keys [{i}] {repr(left.dtype)} and " + f"{repr(right.dtype)}, both sides category, but not equal ones" + ) + else: + msg = ( + f"incompatible merge keys [{i}] {repr(left.dtype)} and " + f"{repr(right.dtype)}, must be the same type" + ) + raise MergeError(msg) + + # validate index types are the same + for i, (lk, rk) in enumerate(zip(left_join_keys, right_join_keys)): + _check_dtype_match(lk, rk, i) + + if self.left_index: + lt = self.left.index._values + else: + lt = left_join_keys[-1] + + if self.right_index: + rt = self.right.index._values + else: + rt = right_join_keys[-1] + + _check_dtype_match(lt, rt, 0) + + def _validate_tolerance(self, left_join_keys: list[ArrayLike]) -> None: + # validate tolerance; datetime.timedelta or Timedelta if we have a DTI + if self.tolerance is not None: + if self.left_index: + lt = self.left.index._values + else: + lt = left_join_keys[-1] + + msg = ( + f"incompatible tolerance {self.tolerance}, must be compat " + f"with type {repr(lt.dtype)}" + ) + + if needs_i8_conversion(lt.dtype) or ( + isinstance(lt, ArrowExtensionArray) and lt.dtype.kind in "mM" + ): + if not isinstance(self.tolerance, datetime.timedelta): + raise MergeError(msg) + if self.tolerance < Timedelta(0): + raise MergeError("tolerance must be positive") + + elif is_integer_dtype(lt.dtype): + if not is_integer(self.tolerance): + raise MergeError(msg) + if self.tolerance < 0: + raise MergeError("tolerance must be positive") + + elif is_float_dtype(lt.dtype): + if not is_number(self.tolerance): + raise MergeError(msg) + # error: Unsupported operand types for > ("int" and "Number") + if self.tolerance < 0: # type: ignore[operator] + raise MergeError("tolerance must be positive") + + else: + raise MergeError("key must be integer, timestamp or float") + + def _convert_values_for_libjoin( + self, values: AnyArrayLike, side: str + ) -> np.ndarray: + # we require sortedness and non-null values in the join keys + if not Index(values).is_monotonic_increasing: + if isna(values).any(): + raise ValueError(f"Merge keys contain null values on {side} side") + raise ValueError(f"{side} keys must be sorted") + + if isinstance(values, ArrowExtensionArray): + values = values._maybe_convert_datelike_array() + + if needs_i8_conversion(values.dtype): + values = values.view("i8") + + elif isinstance(values, BaseMaskedArray): + # we've verified above that no nulls exist + values = values._data + elif isinstance(values, ExtensionArray): + values = values.to_numpy() + + # error: Incompatible return value type (got "Union[ExtensionArray, + # Any, ndarray[Any, Any], ndarray[Any, dtype[Any]], Index, Series]", + # expected "ndarray[Any, Any]") + return values # type: ignore[return-value] + + def _get_join_indexers(self) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]]: + """return the join indexers""" + + # values to compare + left_values = ( + self.left.index._values if self.left_index else self.left_join_keys[-1] + ) + right_values = ( + self.right.index._values if self.right_index else self.right_join_keys[-1] + ) + + # _maybe_require_matching_dtypes already checked for dtype matching + assert left_values.dtype == right_values.dtype + + tolerance = self.tolerance + if tolerance is not None: + # TODO: can we reuse a tolerance-conversion function from + # e.g. TimedeltaIndex? + if needs_i8_conversion(left_values.dtype) or ( + isinstance(left_values, ArrowExtensionArray) + and left_values.dtype.kind in "mM" + ): + tolerance = Timedelta(tolerance) + # TODO: we have no test cases with PeriodDtype here; probably + # need to adjust tolerance for that case. + if left_values.dtype.kind in "mM": + # Make sure the i8 representation for tolerance + # matches that for left_values/right_values. + if isinstance(left_values, ArrowExtensionArray): + unit = left_values.dtype.pyarrow_dtype.unit + else: + unit = ensure_wrapped_if_datetimelike(left_values).unit + tolerance = tolerance.as_unit(unit) + + tolerance = tolerance._value + + # initial type conversion as needed + left_values = self._convert_values_for_libjoin(left_values, "left") + right_values = self._convert_values_for_libjoin(right_values, "right") + + # a "by" parameter requires special handling + if self.left_by is not None: + # remove 'on' parameter from values if one existed + if self.left_index and self.right_index: + left_join_keys = self.left_join_keys + right_join_keys = self.right_join_keys + else: + left_join_keys = self.left_join_keys[0:-1] + right_join_keys = self.right_join_keys[0:-1] + + mapped = [ + _factorize_keys( + left_join_keys[n], + right_join_keys[n], + sort=False, + ) + for n in range(len(left_join_keys)) + ] + + if len(left_join_keys) == 1: + left_by_values = mapped[0][0] + right_by_values = mapped[0][1] + else: + arrs = [np.concatenate(m[:2]) for m in mapped] + shape = tuple(m[2] for m in mapped) + group_index = get_group_index( + arrs, shape=shape, sort=False, xnull=False + ) + left_len = len(left_join_keys[0]) + left_by_values = group_index[:left_len] + right_by_values = group_index[left_len:] + + left_by_values = ensure_int64(left_by_values) + right_by_values = ensure_int64(right_by_values) + + # choose appropriate function by type + func = _asof_by_function(self.direction) + return func( + left_values, + right_values, + left_by_values, + right_by_values, + self.allow_exact_matches, + tolerance, + ) + else: + # choose appropriate function by type + func = _asof_by_function(self.direction) + return func( + left_values, + right_values, + None, + None, + self.allow_exact_matches, + tolerance, + False, + ) + + +def _get_multiindex_indexer( + join_keys: list[ArrayLike], index: MultiIndex, sort: bool +) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]]: + # left & right join labels and num. of levels at each location + mapped = ( + _factorize_keys(index.levels[n]._values, join_keys[n], sort=sort) + for n in range(index.nlevels) + ) + zipped = zip(*mapped) + rcodes, lcodes, shape = (list(x) for x in zipped) + if sort: + rcodes = list(map(np.take, rcodes, index.codes)) + else: + i8copy = lambda a: a.astype("i8", subok=False, copy=True) + rcodes = list(map(i8copy, index.codes)) + + # fix right labels if there were any nulls + for i, join_key in enumerate(join_keys): + mask = index.codes[i] == -1 + if mask.any(): + # check if there already was any nulls at this location + # if there was, it is factorized to `shape[i] - 1` + a = join_key[lcodes[i] == shape[i] - 1] + if a.size == 0 or not a[0] != a[0]: + shape[i] += 1 + + rcodes[i][mask] = shape[i] - 1 + + # get flat i8 join keys + lkey, rkey = _get_join_keys(lcodes, rcodes, tuple(shape), sort) + return lkey, rkey + + +def _get_empty_indexer() -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]]: + """Return empty join indexers.""" + return ( + np.array([], dtype=np.intp), + np.array([], dtype=np.intp), + ) + + +def _get_no_sort_one_missing_indexer( + n: int, left_missing: bool +) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]]: + """ + Return join indexers where all of one side is selected without sorting + and none of the other side is selected. + + Parameters + ---------- + n : int + Length of indexers to create. + left_missing : bool + If True, the left indexer will contain only -1's. + If False, the right indexer will contain only -1's. + + Returns + ------- + np.ndarray[np.intp] + Left indexer + np.ndarray[np.intp] + Right indexer + """ + idx = np.arange(n, dtype=np.intp) + idx_missing = np.full(shape=n, fill_value=-1, dtype=np.intp) + if left_missing: + return idx_missing, idx + return idx, idx_missing + + +def _left_join_on_index( + left_ax: Index, right_ax: Index, join_keys: list[ArrayLike], sort: bool = False +) -> tuple[Index, npt.NDArray[np.intp] | None, npt.NDArray[np.intp]]: + if isinstance(right_ax, MultiIndex): + lkey, rkey = _get_multiindex_indexer(join_keys, right_ax, sort=sort) + else: + # error: Incompatible types in assignment (expression has type + # "Union[Union[ExtensionArray, ndarray[Any, Any]], Index, Series]", + # variable has type "ndarray[Any, dtype[signedinteger[Any]]]") + lkey = join_keys[0] # type: ignore[assignment] + # error: Incompatible types in assignment (expression has type "Index", + # variable has type "ndarray[Any, dtype[signedinteger[Any]]]") + rkey = right_ax._values # type: ignore[assignment] + + left_key, right_key, count = _factorize_keys(lkey, rkey, sort=sort) + left_indexer, right_indexer = libjoin.left_outer_join( + left_key, right_key, count, sort=sort + ) + + if sort or len(left_ax) != len(left_indexer): + # if asked to sort or there are 1-to-many matches + join_index = left_ax.take(left_indexer) + return join_index, left_indexer, right_indexer + + # left frame preserves order & length of its index + return left_ax, None, right_indexer + + +def _factorize_keys( + lk: ArrayLike, rk: ArrayLike, sort: bool = True +) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp], int]: + """ + Encode left and right keys as enumerated types. + + This is used to get the join indexers to be used when merging DataFrames. + + Parameters + ---------- + lk : ndarray, ExtensionArray + Left key. + rk : ndarray, ExtensionArray + Right key. + sort : bool, defaults to True + If True, the encoding is done such that the unique elements in the + keys are sorted. + + Returns + ------- + np.ndarray[np.intp] + Left (resp. right if called with `key='right'`) labels, as enumerated type. + np.ndarray[np.intp] + Right (resp. left if called with `key='right'`) labels, as enumerated type. + int + Number of unique elements in union of left and right labels. + + See Also + -------- + merge : Merge DataFrame or named Series objects + with a database-style join. + algorithms.factorize : Encode the object as an enumerated type + or categorical variable. + + Examples + -------- + >>> lk = np.array(["a", "c", "b"]) + >>> rk = np.array(["a", "c"]) + + Here, the unique values are `'a', 'b', 'c'`. With the default + `sort=True`, the encoding will be `{0: 'a', 1: 'b', 2: 'c'}`: + + >>> pd.core.reshape.merge._factorize_keys(lk, rk) + (array([0, 2, 1]), array([0, 2]), 3) + + With the `sort=False`, the encoding will correspond to the order + in which the unique elements first appear: `{0: 'a', 1: 'c', 2: 'b'}`: + + >>> pd.core.reshape.merge._factorize_keys(lk, rk, sort=False) + (array([0, 1, 2]), array([0, 1]), 3) + """ + # TODO: if either is a RangeIndex, we can likely factorize more efficiently? + + if ( + isinstance(lk.dtype, DatetimeTZDtype) and isinstance(rk.dtype, DatetimeTZDtype) + ) or (lib.is_np_dtype(lk.dtype, "M") and lib.is_np_dtype(rk.dtype, "M")): + # Extract the ndarray (UTC-localized) values + # Note: we dont need the dtypes to match, as these can still be compared + lk, rk = cast("DatetimeArray", lk)._ensure_matching_resos(rk) + lk = cast("DatetimeArray", lk)._ndarray + rk = cast("DatetimeArray", rk)._ndarray + + elif ( + isinstance(lk.dtype, CategoricalDtype) + and isinstance(rk.dtype, CategoricalDtype) + and lk.dtype == rk.dtype + ): + assert isinstance(lk, Categorical) + assert isinstance(rk, Categorical) + # Cast rk to encoding so we can compare codes with lk + + rk = lk._encode_with_my_categories(rk) + + lk = ensure_int64(lk.codes) + rk = ensure_int64(rk.codes) + + elif isinstance(lk, ExtensionArray) and lk.dtype == rk.dtype: + if (isinstance(lk.dtype, ArrowDtype) and is_string_dtype(lk.dtype)) or ( + isinstance(lk.dtype, StringDtype) + and lk.dtype.storage in ["pyarrow", "pyarrow_numpy"] + ): + import pyarrow as pa + import pyarrow.compute as pc + + len_lk = len(lk) + lk = lk._pa_array # type: ignore[attr-defined] + rk = rk._pa_array # type: ignore[union-attr] + dc = ( + pa.chunked_array(lk.chunks + rk.chunks) # type: ignore[union-attr] + .combine_chunks() + .dictionary_encode() + ) + + llab, rlab, count = ( + pc.fill_null(dc.indices[slice(len_lk)], -1) + .to_numpy() + .astype(np.intp, copy=False), + pc.fill_null(dc.indices[slice(len_lk, None)], -1) + .to_numpy() + .astype(np.intp, copy=False), + len(dc.dictionary), + ) + + if sort: + uniques = dc.dictionary.to_numpy(zero_copy_only=False) + llab, rlab = _sort_labels(uniques, llab, rlab) + + if dc.null_count > 0: + lmask = llab == -1 + lany = lmask.any() + rmask = rlab == -1 + rany = rmask.any() + if lany: + np.putmask(llab, lmask, count) + if rany: + np.putmask(rlab, rmask, count) + count += 1 + return llab, rlab, count + + if not isinstance(lk, BaseMaskedArray) and not ( + # exclude arrow dtypes that would get cast to object + isinstance(lk.dtype, ArrowDtype) + and ( + is_numeric_dtype(lk.dtype.numpy_dtype) + or is_string_dtype(lk.dtype) + and not sort + ) + ): + lk, _ = lk._values_for_factorize() + + # error: Item "ndarray" of "Union[Any, ndarray]" has no attribute + # "_values_for_factorize" + rk, _ = rk._values_for_factorize() # type: ignore[union-attr] + + if needs_i8_conversion(lk.dtype) and lk.dtype == rk.dtype: + # GH#23917 TODO: Needs tests for non-matching dtypes + # GH#23917 TODO: needs tests for case where lk is integer-dtype + # and rk is datetime-dtype + lk = np.asarray(lk, dtype=np.int64) + rk = np.asarray(rk, dtype=np.int64) + + klass, lk, rk = _convert_arrays_and_get_rizer_klass(lk, rk) + + rizer = klass(max(len(lk), len(rk))) + + if isinstance(lk, BaseMaskedArray): + assert isinstance(rk, BaseMaskedArray) + llab = rizer.factorize(lk._data, mask=lk._mask) + rlab = rizer.factorize(rk._data, mask=rk._mask) + elif isinstance(lk, ArrowExtensionArray): + assert isinstance(rk, ArrowExtensionArray) + # we can only get here with numeric dtypes + # TODO: Remove when we have a Factorizer for Arrow + llab = rizer.factorize( + lk.to_numpy(na_value=1, dtype=lk.dtype.numpy_dtype), mask=lk.isna() + ) + rlab = rizer.factorize( + rk.to_numpy(na_value=1, dtype=lk.dtype.numpy_dtype), mask=rk.isna() + ) + else: + # Argument 1 to "factorize" of "ObjectFactorizer" has incompatible type + # "Union[ndarray[Any, dtype[signedinteger[_64Bit]]], + # ndarray[Any, dtype[object_]]]"; expected "ndarray[Any, dtype[object_]]" + llab = rizer.factorize(lk) # type: ignore[arg-type] + rlab = rizer.factorize(rk) # type: ignore[arg-type] + assert llab.dtype == np.dtype(np.intp), llab.dtype + assert rlab.dtype == np.dtype(np.intp), rlab.dtype + + count = rizer.get_count() + + if sort: + uniques = rizer.uniques.to_array() + llab, rlab = _sort_labels(uniques, llab, rlab) + + # NA group + lmask = llab == -1 + lany = lmask.any() + rmask = rlab == -1 + rany = rmask.any() + + if lany or rany: + if lany: + np.putmask(llab, lmask, count) + if rany: + np.putmask(rlab, rmask, count) + count += 1 + + return llab, rlab, count + + +def _convert_arrays_and_get_rizer_klass( + lk: ArrayLike, rk: ArrayLike +) -> tuple[type[libhashtable.Factorizer], ArrayLike, ArrayLike]: + klass: type[libhashtable.Factorizer] + if is_numeric_dtype(lk.dtype): + if lk.dtype != rk.dtype: + dtype = find_common_type([lk.dtype, rk.dtype]) + if isinstance(dtype, ExtensionDtype): + cls = dtype.construct_array_type() + if not isinstance(lk, ExtensionArray): + lk = cls._from_sequence(lk, dtype=dtype, copy=False) + else: + lk = lk.astype(dtype, copy=False) + + if not isinstance(rk, ExtensionArray): + rk = cls._from_sequence(rk, dtype=dtype, copy=False) + else: + rk = rk.astype(dtype, copy=False) + else: + lk = lk.astype(dtype, copy=False) + rk = rk.astype(dtype, copy=False) + if isinstance(lk, BaseMaskedArray): + # Invalid index type "type" for "Dict[Type[object], Type[Factorizer]]"; + # expected type "Type[object]" + klass = _factorizers[lk.dtype.type] # type: ignore[index] + elif isinstance(lk.dtype, ArrowDtype): + klass = _factorizers[lk.dtype.numpy_dtype.type] + else: + klass = _factorizers[lk.dtype.type] + + else: + klass = libhashtable.ObjectFactorizer + lk = ensure_object(lk) + rk = ensure_object(rk) + return klass, lk, rk + + +def _sort_labels( + uniques: np.ndarray, left: npt.NDArray[np.intp], right: npt.NDArray[np.intp] +) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]]: + llength = len(left) + labels = np.concatenate([left, right]) + + _, new_labels = algos.safe_sort(uniques, labels, use_na_sentinel=True) + new_left, new_right = new_labels[:llength], new_labels[llength:] + + return new_left, new_right + + +def _get_join_keys( + llab: list[npt.NDArray[np.int64 | np.intp]], + rlab: list[npt.NDArray[np.int64 | np.intp]], + shape: Shape, + sort: bool, +) -> tuple[npt.NDArray[np.int64], npt.NDArray[np.int64]]: + # how many levels can be done without overflow + nlev = next( + lev + for lev in range(len(shape), 0, -1) + if not is_int64_overflow_possible(shape[:lev]) + ) + + # get keys for the first `nlev` levels + stride = np.prod(shape[1:nlev], dtype="i8") + lkey = stride * llab[0].astype("i8", subok=False, copy=False) + rkey = stride * rlab[0].astype("i8", subok=False, copy=False) + + for i in range(1, nlev): + with np.errstate(divide="ignore"): + stride //= shape[i] + lkey += llab[i] * stride + rkey += rlab[i] * stride + + if nlev == len(shape): # all done! + return lkey, rkey + + # densify current keys to avoid overflow + lkey, rkey, count = _factorize_keys(lkey, rkey, sort=sort) + + llab = [lkey] + llab[nlev:] + rlab = [rkey] + rlab[nlev:] + shape = (count,) + shape[nlev:] + + return _get_join_keys(llab, rlab, shape, sort) + + +def _should_fill(lname, rname) -> bool: + if not isinstance(lname, str) or not isinstance(rname, str): + return True + return lname == rname + + +def _any(x) -> bool: + return x is not None and com.any_not_none(*x) + + +def _validate_operand(obj: DataFrame | Series) -> DataFrame: + if isinstance(obj, ABCDataFrame): + return obj + elif isinstance(obj, ABCSeries): + if obj.name is None: + raise ValueError("Cannot merge a Series without a name") + return obj.to_frame() + else: + raise TypeError( + f"Can only merge Series or DataFrame objects, a {type(obj)} was passed" + ) + + +def _items_overlap_with_suffix( + left: Index, right: Index, suffixes: Suffixes +) -> tuple[Index, Index]: + """ + Suffixes type validation. + + If two indices overlap, add suffixes to overlapping entries. + + If corresponding suffix is empty, the entry is simply converted to string. + + """ + if not is_list_like(suffixes, allow_sets=False) or isinstance(suffixes, dict): + raise TypeError( + f"Passing 'suffixes' as a {type(suffixes)}, is not supported. " + "Provide 'suffixes' as a tuple instead." + ) + + to_rename = left.intersection(right) + if len(to_rename) == 0: + return left, right + + lsuffix, rsuffix = suffixes + + if not lsuffix and not rsuffix: + raise ValueError(f"columns overlap but no suffix specified: {to_rename}") + + def renamer(x, suffix: str | None): + """ + Rename the left and right indices. + + If there is overlap, and suffix is not None, add + suffix, otherwise, leave it as-is. + + Parameters + ---------- + x : original column name + suffix : str or None + + Returns + ------- + x : renamed column name + """ + if x in to_rename and suffix is not None: + return f"{x}{suffix}" + return x + + lrenamer = partial(renamer, suffix=lsuffix) + rrenamer = partial(renamer, suffix=rsuffix) + + llabels = left._transform_index(lrenamer) + rlabels = right._transform_index(rrenamer) + + dups = [] + if not llabels.is_unique: + # Only warn when duplicates are caused because of suffixes, already duplicated + # columns in origin should not warn + dups = llabels[(llabels.duplicated()) & (~left.duplicated())].tolist() + if not rlabels.is_unique: + dups.extend(rlabels[(rlabels.duplicated()) & (~right.duplicated())].tolist()) + if dups: + raise MergeError( + f"Passing 'suffixes' which cause duplicate columns {set(dups)} is " + f"not allowed.", + ) + + return llabels, rlabels diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/reshape/pivot.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/reshape/pivot.py new file mode 100644 index 0000000000000000000000000000000000000000..b2a915589cba756c16ecaa95c3d056bd7c902c68 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/reshape/pivot.py @@ -0,0 +1,899 @@ +from __future__ import annotations + +from collections.abc import ( + Hashable, + Sequence, +) +from typing import ( + TYPE_CHECKING, + Callable, + Literal, + cast, +) +import warnings + +import numpy as np + +from pandas._libs import lib +from pandas.util._decorators import ( + Appender, + Substitution, +) +from pandas.util._exceptions import find_stack_level + +from pandas.core.dtypes.cast import maybe_downcast_to_dtype +from pandas.core.dtypes.common import ( + is_list_like, + is_nested_list_like, + is_scalar, +) +from pandas.core.dtypes.dtypes import ExtensionDtype +from pandas.core.dtypes.generic import ( + ABCDataFrame, + ABCSeries, +) + +import pandas.core.common as com +from pandas.core.frame import _shared_docs +from pandas.core.groupby import Grouper +from pandas.core.indexes.api import ( + Index, + MultiIndex, + get_objs_combined_axis, +) +from pandas.core.reshape.concat import concat +from pandas.core.reshape.util import cartesian_product +from pandas.core.series import Series + +if TYPE_CHECKING: + from pandas._typing import ( + AggFuncType, + AggFuncTypeBase, + AggFuncTypeDict, + IndexLabel, + ) + + from pandas import DataFrame + + +# Note: We need to make sure `frame` is imported before `pivot`, otherwise +# _shared_docs['pivot_table'] will not yet exist. TODO: Fix this dependency +@Substitution("\ndata : DataFrame") +@Appender(_shared_docs["pivot_table"], indents=1) +def pivot_table( + data: DataFrame, + values=None, + index=None, + columns=None, + aggfunc: AggFuncType = "mean", + fill_value=None, + margins: bool = False, + dropna: bool = True, + margins_name: Hashable = "All", + observed: bool | lib.NoDefault = lib.no_default, + sort: bool = True, +) -> DataFrame: + index = _convert_by(index) + columns = _convert_by(columns) + + if isinstance(aggfunc, list): + pieces: list[DataFrame] = [] + keys = [] + for func in aggfunc: + _table = __internal_pivot_table( + data, + values=values, + index=index, + columns=columns, + fill_value=fill_value, + aggfunc=func, + margins=margins, + dropna=dropna, + margins_name=margins_name, + observed=observed, + sort=sort, + ) + pieces.append(_table) + keys.append(getattr(func, "__name__", func)) + + table = concat(pieces, keys=keys, axis=1) + return table.__finalize__(data, method="pivot_table") + + table = __internal_pivot_table( + data, + values, + index, + columns, + aggfunc, + fill_value, + margins, + dropna, + margins_name, + observed, + sort, + ) + return table.__finalize__(data, method="pivot_table") + + +def __internal_pivot_table( + data: DataFrame, + values, + index, + columns, + aggfunc: AggFuncTypeBase | AggFuncTypeDict, + fill_value, + margins: bool, + dropna: bool, + margins_name: Hashable, + observed: bool | lib.NoDefault, + sort: bool, +) -> DataFrame: + """ + Helper of :func:`pandas.pivot_table` for any non-list ``aggfunc``. + """ + keys = index + columns + + values_passed = values is not None + if values_passed: + if is_list_like(values): + values_multi = True + values = list(values) + else: + values_multi = False + values = [values] + + # GH14938 Make sure value labels are in data + for i in values: + if i not in data: + raise KeyError(i) + + to_filter = [] + for x in keys + values: + if isinstance(x, Grouper): + x = x.key + try: + if x in data: + to_filter.append(x) + except TypeError: + pass + if len(to_filter) < len(data.columns): + data = data[to_filter] + + else: + values = data.columns + for key in keys: + try: + values = values.drop(key) + except (TypeError, ValueError, KeyError): + pass + values = list(values) + + observed_bool = False if observed is lib.no_default else observed + grouped = data.groupby(keys, observed=observed_bool, sort=sort, dropna=dropna) + if observed is lib.no_default and any( + ping._passed_categorical for ping in grouped._grouper.groupings + ): + warnings.warn( + "The default value of observed=False is deprecated and will change " + "to observed=True in a future version of pandas. Specify " + "observed=False to silence this warning and retain the current behavior", + category=FutureWarning, + stacklevel=find_stack_level(), + ) + agged = grouped.agg(aggfunc) + + if dropna and isinstance(agged, ABCDataFrame) and len(agged.columns): + agged = agged.dropna(how="all") + + table = agged + + # GH17038, this check should only happen if index is defined (not None) + if table.index.nlevels > 1 and index: + # Related GH #17123 + # If index_names are integers, determine whether the integers refer + # to the level position or name. + index_names = agged.index.names[: len(index)] + to_unstack = [] + for i in range(len(index), len(keys)): + name = agged.index.names[i] + if name is None or name in index_names: + to_unstack.append(i) + else: + to_unstack.append(name) + table = agged.unstack(to_unstack, fill_value=fill_value) + + if not dropna: + if isinstance(table.index, MultiIndex): + m = MultiIndex.from_arrays( + cartesian_product(table.index.levels), names=table.index.names + ) + table = table.reindex(m, axis=0, fill_value=fill_value) + + if isinstance(table.columns, MultiIndex): + m = MultiIndex.from_arrays( + cartesian_product(table.columns.levels), names=table.columns.names + ) + table = table.reindex(m, axis=1, fill_value=fill_value) + + if sort is True and isinstance(table, ABCDataFrame): + table = table.sort_index(axis=1) + + if fill_value is not None: + table = table.fillna(fill_value) + if aggfunc is len and not observed and lib.is_integer(fill_value): + # TODO: can we avoid this? this used to be handled by + # downcast="infer" in fillna + table = table.astype(np.int64) + + if margins: + if dropna: + data = data[data.notna().all(axis=1)] + table = _add_margins( + table, + data, + values, + rows=index, + cols=columns, + aggfunc=aggfunc, + observed=dropna, + margins_name=margins_name, + fill_value=fill_value, + ) + + # discard the top level + if values_passed and not values_multi and table.columns.nlevels > 1: + table.columns = table.columns.droplevel(0) + if len(index) == 0 and len(columns) > 0: + table = table.T + + # GH 15193 Make sure empty columns are removed if dropna=True + if isinstance(table, ABCDataFrame) and dropna: + table = table.dropna(how="all", axis=1) + + return table + + +def _add_margins( + table: DataFrame | Series, + data: DataFrame, + values, + rows, + cols, + aggfunc, + observed: bool, + margins_name: Hashable = "All", + fill_value=None, +): + if not isinstance(margins_name, str): + raise ValueError("margins_name argument must be a string") + + msg = f'Conflicting name "{margins_name}" in margins' + for level in table.index.names: + if margins_name in table.index.get_level_values(level): + raise ValueError(msg) + + grand_margin = _compute_grand_margin(data, values, aggfunc, margins_name) + + if table.ndim == 2: + # i.e. DataFrame + for level in table.columns.names[1:]: + if margins_name in table.columns.get_level_values(level): + raise ValueError(msg) + + key: str | tuple[str, ...] + if len(rows) > 1: + key = (margins_name,) + ("",) * (len(rows) - 1) + else: + key = margins_name + + if not values and isinstance(table, ABCSeries): + # If there are no values and the table is a series, then there is only + # one column in the data. Compute grand margin and return it. + return table._append(table._constructor({key: grand_margin[margins_name]})) + + elif values: + marginal_result_set = _generate_marginal_results( + table, data, values, rows, cols, aggfunc, observed, margins_name + ) + if not isinstance(marginal_result_set, tuple): + return marginal_result_set + result, margin_keys, row_margin = marginal_result_set + else: + # no values, and table is a DataFrame + assert isinstance(table, ABCDataFrame) + marginal_result_set = _generate_marginal_results_without_values( + table, data, rows, cols, aggfunc, observed, margins_name + ) + if not isinstance(marginal_result_set, tuple): + return marginal_result_set + result, margin_keys, row_margin = marginal_result_set + + row_margin = row_margin.reindex(result.columns, fill_value=fill_value) + # populate grand margin + for k in margin_keys: + if isinstance(k, str): + row_margin[k] = grand_margin[k] + else: + row_margin[k] = grand_margin[k[0]] + + from pandas import DataFrame + + margin_dummy = DataFrame(row_margin, columns=Index([key])).T + + row_names = result.index.names + # check the result column and leave floats + + for dtype in set(result.dtypes): + if isinstance(dtype, ExtensionDtype): + # Can hold NA already + continue + + cols = result.select_dtypes([dtype]).columns + margin_dummy[cols] = margin_dummy[cols].apply( + maybe_downcast_to_dtype, args=(dtype,) + ) + result = result._append(margin_dummy) + result.index.names = row_names + + return result + + +def _compute_grand_margin( + data: DataFrame, values, aggfunc, margins_name: Hashable = "All" +): + if values: + grand_margin = {} + for k, v in data[values].items(): + try: + if isinstance(aggfunc, str): + grand_margin[k] = getattr(v, aggfunc)() + elif isinstance(aggfunc, dict): + if isinstance(aggfunc[k], str): + grand_margin[k] = getattr(v, aggfunc[k])() + else: + grand_margin[k] = aggfunc[k](v) + else: + grand_margin[k] = aggfunc(v) + except TypeError: + pass + return grand_margin + else: + return {margins_name: aggfunc(data.index)} + + +def _generate_marginal_results( + table, + data: DataFrame, + values, + rows, + cols, + aggfunc, + observed: bool, + margins_name: Hashable = "All", +): + margin_keys: list | Index + if len(cols) > 0: + # need to "interleave" the margins + table_pieces = [] + margin_keys = [] + + def _all_key(key): + return (key, margins_name) + ("",) * (len(cols) - 1) + + if len(rows) > 0: + margin = data[rows + values].groupby(rows, observed=observed).agg(aggfunc) + cat_axis = 1 + + for key, piece in table.T.groupby(level=0, observed=observed): + piece = piece.T + all_key = _all_key(key) + + # we are going to mutate this, so need to copy! + piece = piece.copy() + piece[all_key] = margin[key] + + table_pieces.append(piece) + margin_keys.append(all_key) + else: + from pandas import DataFrame + + cat_axis = 0 + for key, piece in table.groupby(level=0, observed=observed): + if len(cols) > 1: + all_key = _all_key(key) + else: + all_key = margins_name + table_pieces.append(piece) + # GH31016 this is to calculate margin for each group, and assign + # corresponded key as index + transformed_piece = DataFrame(piece.apply(aggfunc)).T + if isinstance(piece.index, MultiIndex): + # We are adding an empty level + transformed_piece.index = MultiIndex.from_tuples( + [all_key], names=piece.index.names + [None] + ) + else: + transformed_piece.index = Index([all_key], name=piece.index.name) + + # append piece for margin into table_piece + table_pieces.append(transformed_piece) + margin_keys.append(all_key) + + if not table_pieces: + # GH 49240 + return table + else: + result = concat(table_pieces, axis=cat_axis) + + if len(rows) == 0: + return result + else: + result = table + margin_keys = table.columns + + if len(cols) > 0: + row_margin = data[cols + values].groupby(cols, observed=observed).agg(aggfunc) + row_margin = row_margin.stack(future_stack=True) + + # GH#26568. Use names instead of indices in case of numeric names + new_order_indices = [len(cols)] + list(range(len(cols))) + new_order_names = [row_margin.index.names[i] for i in new_order_indices] + row_margin.index = row_margin.index.reorder_levels(new_order_names) + else: + row_margin = data._constructor_sliced(np.nan, index=result.columns) + + return result, margin_keys, row_margin + + +def _generate_marginal_results_without_values( + table: DataFrame, + data: DataFrame, + rows, + cols, + aggfunc, + observed: bool, + margins_name: Hashable = "All", +): + margin_keys: list | Index + if len(cols) > 0: + # need to "interleave" the margins + margin_keys = [] + + def _all_key(): + if len(cols) == 1: + return margins_name + return (margins_name,) + ("",) * (len(cols) - 1) + + if len(rows) > 0: + margin = data.groupby(rows, observed=observed)[rows].apply(aggfunc) + all_key = _all_key() + table[all_key] = margin + result = table + margin_keys.append(all_key) + + else: + margin = data.groupby(level=0, axis=0, observed=observed).apply(aggfunc) + all_key = _all_key() + table[all_key] = margin + result = table + margin_keys.append(all_key) + return result + else: + result = table + margin_keys = table.columns + + if len(cols): + row_margin = data.groupby(cols, observed=observed)[cols].apply(aggfunc) + else: + row_margin = Series(np.nan, index=result.columns) + + return result, margin_keys, row_margin + + +def _convert_by(by): + if by is None: + by = [] + elif ( + is_scalar(by) + or isinstance(by, (np.ndarray, Index, ABCSeries, Grouper)) + or callable(by) + ): + by = [by] + else: + by = list(by) + return by + + +@Substitution("\ndata : DataFrame") +@Appender(_shared_docs["pivot"], indents=1) +def pivot( + data: DataFrame, + *, + columns: IndexLabel, + index: IndexLabel | lib.NoDefault = lib.no_default, + values: IndexLabel | lib.NoDefault = lib.no_default, +) -> DataFrame: + columns_listlike = com.convert_to_list_like(columns) + + # If columns is None we will create a MultiIndex level with None as name + # which might cause duplicated names because None is the default for + # level names + data = data.copy(deep=False) + data.index = data.index.copy() + data.index.names = [ + name if name is not None else lib.no_default for name in data.index.names + ] + + indexed: DataFrame | Series + if values is lib.no_default: + if index is not lib.no_default: + cols = com.convert_to_list_like(index) + else: + cols = [] + + append = index is lib.no_default + # error: Unsupported operand types for + ("List[Any]" and "ExtensionArray") + # error: Unsupported left operand type for + ("ExtensionArray") + indexed = data.set_index( + cols + columns_listlike, append=append # type: ignore[operator] + ) + else: + index_list: list[Index] | list[Series] + if index is lib.no_default: + if isinstance(data.index, MultiIndex): + # GH 23955 + index_list = [ + data.index.get_level_values(i) for i in range(data.index.nlevels) + ] + else: + index_list = [ + data._constructor_sliced(data.index, name=data.index.name) + ] + else: + index_list = [data[idx] for idx in com.convert_to_list_like(index)] + + data_columns = [data[col] for col in columns_listlike] + index_list.extend(data_columns) + multiindex = MultiIndex.from_arrays(index_list) + + if is_list_like(values) and not isinstance(values, tuple): + # Exclude tuple because it is seen as a single column name + values = cast(Sequence[Hashable], values) + indexed = data._constructor( + data[values]._values, index=multiindex, columns=values + ) + else: + indexed = data._constructor_sliced(data[values]._values, index=multiindex) + # error: Argument 1 to "unstack" of "DataFrame" has incompatible type "Union + # [List[Any], ExtensionArray, ndarray[Any, Any], Index, Series]"; expected + # "Hashable" + result = indexed.unstack(columns_listlike) # type: ignore[arg-type] + result.index.names = [ + name if name is not lib.no_default else None for name in result.index.names + ] + + return result + + +def crosstab( + index, + columns, + values=None, + rownames=None, + colnames=None, + aggfunc=None, + margins: bool = False, + margins_name: Hashable = "All", + dropna: bool = True, + normalize: bool | Literal[0, 1, "all", "index", "columns"] = False, +) -> DataFrame: + """ + Compute a simple cross tabulation of two (or more) factors. + + By default, computes a frequency table of the factors unless an + array of values and an aggregation function are passed. + + Parameters + ---------- + index : array-like, Series, or list of arrays/Series + Values to group by in the rows. + columns : array-like, Series, or list of arrays/Series + Values to group by in the columns. + values : array-like, optional + Array of values to aggregate according to the factors. + Requires `aggfunc` be specified. + rownames : sequence, default None + If passed, must match number of row arrays passed. + colnames : sequence, default None + If passed, must match number of column arrays passed. + aggfunc : function, optional + If specified, requires `values` be specified as well. + margins : bool, default False + Add row/column margins (subtotals). + margins_name : str, default 'All' + Name of the row/column that will contain the totals + when margins is True. + dropna : bool, default True + Do not include columns whose entries are all NaN. + normalize : bool, {'all', 'index', 'columns'}, or {0,1}, default False + Normalize by dividing all values by the sum of values. + + - If passed 'all' or `True`, will normalize over all values. + - If passed 'index' will normalize over each row. + - If passed 'columns' will normalize over each column. + - If margins is `True`, will also normalize margin values. + + Returns + ------- + DataFrame + Cross tabulation of the data. + + See Also + -------- + DataFrame.pivot : Reshape data based on column values. + pivot_table : Create a pivot table as a DataFrame. + + Notes + ----- + Any Series passed will have their name attributes used unless row or column + names for the cross-tabulation are specified. + + Any input passed containing Categorical data will have **all** of its + categories included in the cross-tabulation, even if the actual data does + not contain any instances of a particular category. + + In the event that there aren't overlapping indexes an empty DataFrame will + be returned. + + Reference :ref:`the user guide ` for more examples. + + Examples + -------- + >>> a = np.array(["foo", "foo", "foo", "foo", "bar", "bar", + ... "bar", "bar", "foo", "foo", "foo"], dtype=object) + >>> b = np.array(["one", "one", "one", "two", "one", "one", + ... "one", "two", "two", "two", "one"], dtype=object) + >>> c = np.array(["dull", "dull", "shiny", "dull", "dull", "shiny", + ... "shiny", "dull", "shiny", "shiny", "shiny"], + ... dtype=object) + >>> pd.crosstab(a, [b, c], rownames=['a'], colnames=['b', 'c']) + b one two + c dull shiny dull shiny + a + bar 1 2 1 0 + foo 2 2 1 2 + + Here 'c' and 'f' are not represented in the data and will not be + shown in the output because dropna is True by default. Set + dropna=False to preserve categories with no data. + + >>> foo = pd.Categorical(['a', 'b'], categories=['a', 'b', 'c']) + >>> bar = pd.Categorical(['d', 'e'], categories=['d', 'e', 'f']) + >>> pd.crosstab(foo, bar) + col_0 d e + row_0 + a 1 0 + b 0 1 + >>> pd.crosstab(foo, bar, dropna=False) + col_0 d e f + row_0 + a 1 0 0 + b 0 1 0 + c 0 0 0 + """ + if values is None and aggfunc is not None: + raise ValueError("aggfunc cannot be used without values.") + + if values is not None and aggfunc is None: + raise ValueError("values cannot be used without an aggfunc.") + + if not is_nested_list_like(index): + index = [index] + if not is_nested_list_like(columns): + columns = [columns] + + common_idx = None + pass_objs = [x for x in index + columns if isinstance(x, (ABCSeries, ABCDataFrame))] + if pass_objs: + common_idx = get_objs_combined_axis(pass_objs, intersect=True, sort=False) + + rownames = _get_names(index, rownames, prefix="row") + colnames = _get_names(columns, colnames, prefix="col") + + # duplicate names mapped to unique names for pivot op + ( + rownames_mapper, + unique_rownames, + colnames_mapper, + unique_colnames, + ) = _build_names_mapper(rownames, colnames) + + from pandas import DataFrame + + data = { + **dict(zip(unique_rownames, index)), + **dict(zip(unique_colnames, columns)), + } + df = DataFrame(data, index=common_idx) + + if values is None: + df["__dummy__"] = 0 + kwargs = {"aggfunc": len, "fill_value": 0} + else: + df["__dummy__"] = values + kwargs = {"aggfunc": aggfunc} + + # error: Argument 7 to "pivot_table" of "DataFrame" has incompatible type + # "**Dict[str, object]"; expected "Union[...]" + table = df.pivot_table( + "__dummy__", + index=unique_rownames, + columns=unique_colnames, + margins=margins, + margins_name=margins_name, + dropna=dropna, + observed=False, + **kwargs, # type: ignore[arg-type] + ) + + # Post-process + if normalize is not False: + table = _normalize( + table, normalize=normalize, margins=margins, margins_name=margins_name + ) + + table = table.rename_axis(index=rownames_mapper, axis=0) + table = table.rename_axis(columns=colnames_mapper, axis=1) + + return table + + +def _normalize( + table: DataFrame, normalize, margins: bool, margins_name: Hashable = "All" +) -> DataFrame: + if not isinstance(normalize, (bool, str)): + axis_subs = {0: "index", 1: "columns"} + try: + normalize = axis_subs[normalize] + except KeyError as err: + raise ValueError("Not a valid normalize argument") from err + + if margins is False: + # Actual Normalizations + normalizers: dict[bool | str, Callable] = { + "all": lambda x: x / x.sum(axis=1).sum(axis=0), + "columns": lambda x: x / x.sum(), + "index": lambda x: x.div(x.sum(axis=1), axis=0), + } + + normalizers[True] = normalizers["all"] + + try: + f = normalizers[normalize] + except KeyError as err: + raise ValueError("Not a valid normalize argument") from err + + table = f(table) + table = table.fillna(0) + + elif margins is True: + # keep index and column of pivoted table + table_index = table.index + table_columns = table.columns + last_ind_or_col = table.iloc[-1, :].name + + # check if margin name is not in (for MI cases) and not equal to last + # index/column and save the column and index margin + if (margins_name not in last_ind_or_col) & (margins_name != last_ind_or_col): + raise ValueError(f"{margins_name} not in pivoted DataFrame") + column_margin = table.iloc[:-1, -1] + index_margin = table.iloc[-1, :-1] + + # keep the core table + table = table.iloc[:-1, :-1] + + # Normalize core + table = _normalize(table, normalize=normalize, margins=False) + + # Fix Margins + if normalize == "columns": + column_margin = column_margin / column_margin.sum() + table = concat([table, column_margin], axis=1) + table = table.fillna(0) + table.columns = table_columns + + elif normalize == "index": + index_margin = index_margin / index_margin.sum() + table = table._append(index_margin) + table = table.fillna(0) + table.index = table_index + + elif normalize == "all" or normalize is True: + column_margin = column_margin / column_margin.sum() + index_margin = index_margin / index_margin.sum() + index_margin.loc[margins_name] = 1 + table = concat([table, column_margin], axis=1) + table = table._append(index_margin) + + table = table.fillna(0) + table.index = table_index + table.columns = table_columns + + else: + raise ValueError("Not a valid normalize argument") + + else: + raise ValueError("Not a valid margins argument") + + return table + + +def _get_names(arrs, names, prefix: str = "row"): + if names is None: + names = [] + for i, arr in enumerate(arrs): + if isinstance(arr, ABCSeries) and arr.name is not None: + names.append(arr.name) + else: + names.append(f"{prefix}_{i}") + else: + if len(names) != len(arrs): + raise AssertionError("arrays and names must have the same length") + if not isinstance(names, list): + names = list(names) + + return names + + +def _build_names_mapper( + rownames: list[str], colnames: list[str] +) -> tuple[dict[str, str], list[str], dict[str, str], list[str]]: + """ + Given the names of a DataFrame's rows and columns, returns a set of unique row + and column names and mappers that convert to original names. + + A row or column name is replaced if it is duplicate among the rows of the inputs, + among the columns of the inputs or between the rows and the columns. + + Parameters + ---------- + rownames: list[str] + colnames: list[str] + + Returns + ------- + Tuple(Dict[str, str], List[str], Dict[str, str], List[str]) + + rownames_mapper: dict[str, str] + a dictionary with new row names as keys and original rownames as values + unique_rownames: list[str] + a list of rownames with duplicate names replaced by dummy names + colnames_mapper: dict[str, str] + a dictionary with new column names as keys and original column names as values + unique_colnames: list[str] + a list of column names with duplicate names replaced by dummy names + + """ + + def get_duplicates(names): + seen: set = set() + return {name for name in names if name not in seen} + + shared_names = set(rownames).intersection(set(colnames)) + dup_names = get_duplicates(rownames) | get_duplicates(colnames) | shared_names + + rownames_mapper = { + f"row_{i}": name for i, name in enumerate(rownames) if name in dup_names + } + unique_rownames = [ + f"row_{i}" if name in dup_names else name for i, name in enumerate(rownames) + ] + + colnames_mapper = { + f"col_{i}": name for i, name in enumerate(colnames) if name in dup_names + } + unique_colnames = [ + f"col_{i}" if name in dup_names else name for i, name in enumerate(colnames) + ] + + return rownames_mapper, unique_rownames, colnames_mapper, unique_colnames diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/reshape/reshape.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/reshape/reshape.py new file mode 100644 index 0000000000000000000000000000000000000000..7a49682d7c57c90ed26e890777758ad806bd961b --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/reshape/reshape.py @@ -0,0 +1,989 @@ +from __future__ import annotations + +import itertools +from typing import ( + TYPE_CHECKING, + cast, +) +import warnings + +import numpy as np + +import pandas._libs.reshape as libreshape +from pandas.errors import PerformanceWarning +from pandas.util._decorators import cache_readonly +from pandas.util._exceptions import find_stack_level + +from pandas.core.dtypes.cast import ( + find_common_type, + maybe_promote, +) +from pandas.core.dtypes.common import ( + ensure_platform_int, + is_1d_only_ea_dtype, + is_integer, + needs_i8_conversion, +) +from pandas.core.dtypes.dtypes import ExtensionDtype +from pandas.core.dtypes.missing import notna + +import pandas.core.algorithms as algos +from pandas.core.algorithms import ( + factorize, + unique, +) +from pandas.core.arrays.categorical import factorize_from_iterable +from pandas.core.construction import ensure_wrapped_if_datetimelike +from pandas.core.frame import DataFrame +from pandas.core.indexes.api import ( + Index, + MultiIndex, + RangeIndex, +) +from pandas.core.reshape.concat import concat +from pandas.core.series import Series +from pandas.core.sorting import ( + compress_group_index, + decons_obs_group_ids, + get_compressed_ids, + get_group_index, + get_group_index_sorter, +) + +if TYPE_CHECKING: + from pandas._typing import ( + ArrayLike, + Level, + npt, + ) + + from pandas.core.arrays import ExtensionArray + from pandas.core.indexes.frozen import FrozenList + + +class _Unstacker: + """ + Helper class to unstack data / pivot with multi-level index + + Parameters + ---------- + index : MultiIndex + level : int or str, default last level + Level to "unstack". Accepts a name for the level. + fill_value : scalar, optional + Default value to fill in missing values if subgroups do not have the + same set of labels. By default, missing values will be replaced with + the default fill value for that data type, NaN for float, NaT for + datetimelike, etc. For integer types, by default data will converted to + float and missing values will be set to NaN. + constructor : object + Pandas ``DataFrame`` or subclass used to create unstacked + response. If None, DataFrame will be used. + + Examples + -------- + >>> index = pd.MultiIndex.from_tuples([('one', 'a'), ('one', 'b'), + ... ('two', 'a'), ('two', 'b')]) + >>> s = pd.Series(np.arange(1, 5, dtype=np.int64), index=index) + >>> s + one a 1 + b 2 + two a 3 + b 4 + dtype: int64 + + >>> s.unstack(level=-1) + a b + one 1 2 + two 3 4 + + >>> s.unstack(level=0) + one two + a 1 3 + b 2 4 + + Returns + ------- + unstacked : DataFrame + """ + + def __init__( + self, index: MultiIndex, level: Level, constructor, sort: bool = True + ) -> None: + self.constructor = constructor + self.sort = sort + + self.index = index.remove_unused_levels() + + self.level = self.index._get_level_number(level) + + # when index includes `nan`, need to lift levels/strides by 1 + self.lift = 1 if -1 in self.index.codes[self.level] else 0 + + # Note: the "pop" below alters these in-place. + self.new_index_levels = list(self.index.levels) + self.new_index_names = list(self.index.names) + + self.removed_name = self.new_index_names.pop(self.level) + self.removed_level = self.new_index_levels.pop(self.level) + self.removed_level_full = index.levels[self.level] + if not self.sort: + unique_codes = unique(self.index.codes[self.level]) + self.removed_level = self.removed_level.take(unique_codes) + self.removed_level_full = self.removed_level_full.take(unique_codes) + + # Bug fix GH 20601 + # If the data frame is too big, the number of unique index combination + # will cause int32 overflow on windows environments. + # We want to check and raise an warning before this happens + num_rows = np.max([index_level.size for index_level in self.new_index_levels]) + num_columns = self.removed_level.size + + # GH20601: This forces an overflow if the number of cells is too high. + num_cells = num_rows * num_columns + + # GH 26314: Previous ValueError raised was too restrictive for many users. + if num_cells > np.iinfo(np.int32).max: + warnings.warn( + f"The following operation may generate {num_cells} cells " + f"in the resulting pandas object.", + PerformanceWarning, + stacklevel=find_stack_level(), + ) + + self._make_selectors() + + @cache_readonly + def _indexer_and_to_sort( + self, + ) -> tuple[ + npt.NDArray[np.intp], + list[np.ndarray], # each has _some_ signed integer dtype + ]: + v = self.level + + codes = list(self.index.codes) + levs = list(self.index.levels) + to_sort = codes[:v] + codes[v + 1 :] + [codes[v]] + sizes = tuple(len(x) for x in levs[:v] + levs[v + 1 :] + [levs[v]]) + + comp_index, obs_ids = get_compressed_ids(to_sort, sizes) + ngroups = len(obs_ids) + + indexer = get_group_index_sorter(comp_index, ngroups) + return indexer, to_sort + + @cache_readonly + def sorted_labels(self) -> list[np.ndarray]: + indexer, to_sort = self._indexer_and_to_sort + if self.sort: + return [line.take(indexer) for line in to_sort] + return to_sort + + def _make_sorted_values(self, values: np.ndarray) -> np.ndarray: + if self.sort: + indexer, _ = self._indexer_and_to_sort + + sorted_values = algos.take_nd(values, indexer, axis=0) + return sorted_values + return values + + def _make_selectors(self): + new_levels = self.new_index_levels + + # make the mask + remaining_labels = self.sorted_labels[:-1] + level_sizes = tuple(len(x) for x in new_levels) + + comp_index, obs_ids = get_compressed_ids(remaining_labels, level_sizes) + ngroups = len(obs_ids) + + comp_index = ensure_platform_int(comp_index) + stride = self.index.levshape[self.level] + self.lift + self.full_shape = ngroups, stride + + selector = self.sorted_labels[-1] + stride * comp_index + self.lift + mask = np.zeros(np.prod(self.full_shape), dtype=bool) + mask.put(selector, True) + + if mask.sum() < len(self.index): + raise ValueError("Index contains duplicate entries, cannot reshape") + + self.group_index = comp_index + self.mask = mask + if self.sort: + self.compressor = comp_index.searchsorted(np.arange(ngroups)) + else: + self.compressor = np.sort(np.unique(comp_index, return_index=True)[1]) + + @cache_readonly + def mask_all(self) -> bool: + return bool(self.mask.all()) + + @cache_readonly + def arange_result(self) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.bool_]]: + # We cache this for reuse in ExtensionBlock._unstack + dummy_arr = np.arange(len(self.index), dtype=np.intp) + new_values, mask = self.get_new_values(dummy_arr, fill_value=-1) + return new_values, mask.any(0) + # TODO: in all tests we have mask.any(0).all(); can we rely on that? + + def get_result(self, values, value_columns, fill_value) -> DataFrame: + if values.ndim == 1: + values = values[:, np.newaxis] + + if value_columns is None and values.shape[1] != 1: # pragma: no cover + raise ValueError("must pass column labels for multi-column data") + + values, _ = self.get_new_values(values, fill_value) + columns = self.get_new_columns(value_columns) + index = self.new_index + + return self.constructor( + values, index=index, columns=columns, dtype=values.dtype + ) + + def get_new_values(self, values, fill_value=None): + if values.ndim == 1: + values = values[:, np.newaxis] + + sorted_values = self._make_sorted_values(values) + + # place the values + length, width = self.full_shape + stride = values.shape[1] + result_width = width * stride + result_shape = (length, result_width) + mask = self.mask + mask_all = self.mask_all + + # we can simply reshape if we don't have a mask + if mask_all and len(values): + # TODO: Under what circumstances can we rely on sorted_values + # matching values? When that holds, we can slice instead + # of take (in particular for EAs) + new_values = ( + sorted_values.reshape(length, width, stride) + .swapaxes(1, 2) + .reshape(result_shape) + ) + new_mask = np.ones(result_shape, dtype=bool) + return new_values, new_mask + + dtype = values.dtype + + # if our mask is all True, then we can use our existing dtype + if mask_all: + dtype = values.dtype + new_values = np.empty(result_shape, dtype=dtype) + else: + if isinstance(dtype, ExtensionDtype): + # GH#41875 + # We are assuming that fill_value can be held by this dtype, + # unlike the non-EA case that promotes. + cls = dtype.construct_array_type() + new_values = cls._empty(result_shape, dtype=dtype) + new_values[:] = fill_value + else: + dtype, fill_value = maybe_promote(dtype, fill_value) + new_values = np.empty(result_shape, dtype=dtype) + new_values.fill(fill_value) + + name = dtype.name + new_mask = np.zeros(result_shape, dtype=bool) + + # we need to convert to a basic dtype + # and possibly coerce an input to our output dtype + # e.g. ints -> floats + if needs_i8_conversion(values.dtype): + sorted_values = sorted_values.view("i8") + new_values = new_values.view("i8") + else: + sorted_values = sorted_values.astype(name, copy=False) + + # fill in our values & mask + libreshape.unstack( + sorted_values, + mask.view("u1"), + stride, + length, + width, + new_values, + new_mask.view("u1"), + ) + + # reconstruct dtype if needed + if needs_i8_conversion(values.dtype): + # view as datetime64 so we can wrap in DatetimeArray and use + # DTA's view method + new_values = new_values.view("M8[ns]") + new_values = ensure_wrapped_if_datetimelike(new_values) + new_values = new_values.view(values.dtype) + + return new_values, new_mask + + def get_new_columns(self, value_columns: Index | None): + if value_columns is None: + if self.lift == 0: + return self.removed_level._rename(name=self.removed_name) + + lev = self.removed_level.insert(0, item=self.removed_level._na_value) + return lev.rename(self.removed_name) + + stride = len(self.removed_level) + self.lift + width = len(value_columns) + propagator = np.repeat(np.arange(width), stride) + + new_levels: FrozenList | list[Index] + + if isinstance(value_columns, MultiIndex): + # error: Cannot determine type of "__add__" [has-type] + new_levels = value_columns.levels + ( # type: ignore[has-type] + self.removed_level_full, + ) + new_names = value_columns.names + (self.removed_name,) + + new_codes = [lab.take(propagator) for lab in value_columns.codes] + else: + new_levels = [ + value_columns, + self.removed_level_full, + ] + new_names = [value_columns.name, self.removed_name] + new_codes = [propagator] + + repeater = self._repeater + + # The entire level is then just a repetition of the single chunk: + new_codes.append(np.tile(repeater, width)) + return MultiIndex( + levels=new_levels, codes=new_codes, names=new_names, verify_integrity=False + ) + + @cache_readonly + def _repeater(self) -> np.ndarray: + # The two indices differ only if the unstacked level had unused items: + if len(self.removed_level_full) != len(self.removed_level): + # In this case, we remap the new codes to the original level: + repeater = self.removed_level_full.get_indexer(self.removed_level) + if self.lift: + repeater = np.insert(repeater, 0, -1) + else: + # Otherwise, we just use each level item exactly once: + stride = len(self.removed_level) + self.lift + repeater = np.arange(stride) - self.lift + + return repeater + + @cache_readonly + def new_index(self) -> MultiIndex: + # Does not depend on values or value_columns + result_codes = [lab.take(self.compressor) for lab in self.sorted_labels[:-1]] + + # construct the new index + if len(self.new_index_levels) == 1: + level, level_codes = self.new_index_levels[0], result_codes[0] + if (level_codes == -1).any(): + level = level.insert(len(level), level._na_value) + return level.take(level_codes).rename(self.new_index_names[0]) + + return MultiIndex( + levels=self.new_index_levels, + codes=result_codes, + names=self.new_index_names, + verify_integrity=False, + ) + + +def _unstack_multiple( + data: Series | DataFrame, clocs, fill_value=None, sort: bool = True +): + if len(clocs) == 0: + return data + + # NOTE: This doesn't deal with hierarchical columns yet + + index = data.index + index = cast(MultiIndex, index) # caller is responsible for checking + + # GH 19966 Make sure if MultiIndexed index has tuple name, they will be + # recognised as a whole + if clocs in index.names: + clocs = [clocs] + clocs = [index._get_level_number(i) for i in clocs] + + rlocs = [i for i in range(index.nlevels) if i not in clocs] + + clevels = [index.levels[i] for i in clocs] + ccodes = [index.codes[i] for i in clocs] + cnames = [index.names[i] for i in clocs] + rlevels = [index.levels[i] for i in rlocs] + rcodes = [index.codes[i] for i in rlocs] + rnames = [index.names[i] for i in rlocs] + + shape = tuple(len(x) for x in clevels) + group_index = get_group_index(ccodes, shape, sort=False, xnull=False) + + comp_ids, obs_ids = compress_group_index(group_index, sort=False) + recons_codes = decons_obs_group_ids(comp_ids, obs_ids, shape, ccodes, xnull=False) + + if not rlocs: + # Everything is in clocs, so the dummy df has a regular index + dummy_index = Index(obs_ids, name="__placeholder__") + else: + dummy_index = MultiIndex( + levels=rlevels + [obs_ids], + codes=rcodes + [comp_ids], + names=rnames + ["__placeholder__"], + verify_integrity=False, + ) + + if isinstance(data, Series): + dummy = data.copy() + dummy.index = dummy_index + + unstacked = dummy.unstack("__placeholder__", fill_value=fill_value, sort=sort) + new_levels = clevels + new_names = cnames + new_codes = recons_codes + else: + if isinstance(data.columns, MultiIndex): + result = data + while clocs: + val = clocs.pop(0) + result = result.unstack(val, fill_value=fill_value, sort=sort) + clocs = [v if v < val else v - 1 for v in clocs] + + return result + + # GH#42579 deep=False to avoid consolidating + dummy_df = data.copy(deep=False) + dummy_df.index = dummy_index + + unstacked = dummy_df.unstack( + "__placeholder__", fill_value=fill_value, sort=sort + ) + if isinstance(unstacked, Series): + unstcols = unstacked.index + else: + unstcols = unstacked.columns + assert isinstance(unstcols, MultiIndex) # for mypy + new_levels = [unstcols.levels[0]] + clevels + new_names = [data.columns.name] + cnames + + new_codes = [unstcols.codes[0]] + new_codes.extend(rec.take(unstcols.codes[-1]) for rec in recons_codes) + + new_columns = MultiIndex( + levels=new_levels, codes=new_codes, names=new_names, verify_integrity=False + ) + + if isinstance(unstacked, Series): + unstacked.index = new_columns + else: + unstacked.columns = new_columns + + return unstacked + + +def unstack(obj: Series | DataFrame, level, fill_value=None, sort: bool = True): + if isinstance(level, (tuple, list)): + if len(level) != 1: + # _unstack_multiple only handles MultiIndexes, + # and isn't needed for a single level + return _unstack_multiple(obj, level, fill_value=fill_value, sort=sort) + else: + level = level[0] + + if not is_integer(level) and not level == "__placeholder__": + # check if level is valid in case of regular index + obj.index._get_level_number(level) + + if isinstance(obj, DataFrame): + if isinstance(obj.index, MultiIndex): + return _unstack_frame(obj, level, fill_value=fill_value, sort=sort) + else: + return obj.T.stack(future_stack=True) + elif not isinstance(obj.index, MultiIndex): + # GH 36113 + # Give nicer error messages when unstack a Series whose + # Index is not a MultiIndex. + raise ValueError( + f"index must be a MultiIndex to unstack, {type(obj.index)} was passed" + ) + else: + if is_1d_only_ea_dtype(obj.dtype): + return _unstack_extension_series(obj, level, fill_value, sort=sort) + unstacker = _Unstacker( + obj.index, level=level, constructor=obj._constructor_expanddim, sort=sort + ) + return unstacker.get_result( + obj._values, value_columns=None, fill_value=fill_value + ) + + +def _unstack_frame( + obj: DataFrame, level, fill_value=None, sort: bool = True +) -> DataFrame: + assert isinstance(obj.index, MultiIndex) # checked by caller + unstacker = _Unstacker( + obj.index, level=level, constructor=obj._constructor, sort=sort + ) + + if not obj._can_fast_transpose: + mgr = obj._mgr.unstack(unstacker, fill_value=fill_value) + return obj._constructor_from_mgr(mgr, axes=mgr.axes) + else: + return unstacker.get_result( + obj._values, value_columns=obj.columns, fill_value=fill_value + ) + + +def _unstack_extension_series( + series: Series, level, fill_value, sort: bool +) -> DataFrame: + """ + Unstack an ExtensionArray-backed Series. + + The ExtensionDtype is preserved. + + Parameters + ---------- + series : Series + A Series with an ExtensionArray for values + level : Any + The level name or number. + fill_value : Any + The user-level (not physical storage) fill value to use for + missing values introduced by the reshape. Passed to + ``series.values.take``. + sort : bool + Whether to sort the resulting MuliIndex levels + + Returns + ------- + DataFrame + Each column of the DataFrame will have the same dtype as + the input Series. + """ + # Defer to the logic in ExtensionBlock._unstack + df = series.to_frame() + result = df.unstack(level=level, fill_value=fill_value, sort=sort) + + # equiv: result.droplevel(level=0, axis=1) + # but this avoids an extra copy + result.columns = result.columns._drop_level_numbers([0]) + return result + + +def stack(frame: DataFrame, level=-1, dropna: bool = True, sort: bool = True): + """ + Convert DataFrame to Series with multi-level Index. Columns become the + second level of the resulting hierarchical index + + Returns + ------- + stacked : Series or DataFrame + """ + + def stack_factorize(index): + if index.is_unique: + return index, np.arange(len(index)) + codes, categories = factorize_from_iterable(index) + return categories, codes + + N, K = frame.shape + + # Will also convert negative level numbers and check if out of bounds. + level_num = frame.columns._get_level_number(level) + + if isinstance(frame.columns, MultiIndex): + return _stack_multi_columns( + frame, level_num=level_num, dropna=dropna, sort=sort + ) + elif isinstance(frame.index, MultiIndex): + new_levels = list(frame.index.levels) + new_codes = [lab.repeat(K) for lab in frame.index.codes] + + clev, clab = stack_factorize(frame.columns) + new_levels.append(clev) + new_codes.append(np.tile(clab, N).ravel()) + + new_names = list(frame.index.names) + new_names.append(frame.columns.name) + new_index = MultiIndex( + levels=new_levels, codes=new_codes, names=new_names, verify_integrity=False + ) + else: + levels, (ilab, clab) = zip(*map(stack_factorize, (frame.index, frame.columns))) + codes = ilab.repeat(K), np.tile(clab, N).ravel() + new_index = MultiIndex( + levels=levels, + codes=codes, + names=[frame.index.name, frame.columns.name], + verify_integrity=False, + ) + + new_values: ArrayLike + if not frame.empty and frame._is_homogeneous_type: + # For homogeneous EAs, frame._values will coerce to object. So + # we concatenate instead. + dtypes = list(frame.dtypes._values) + dtype = dtypes[0] + + if isinstance(dtype, ExtensionDtype): + arr = dtype.construct_array_type() + new_values = arr._concat_same_type( + [col._values for _, col in frame.items()] + ) + new_values = _reorder_for_extension_array_stack(new_values, N, K) + else: + # homogeneous, non-EA + new_values = frame._values.ravel() + + else: + # non-homogeneous + new_values = frame._values.ravel() + + if dropna: + mask = notna(new_values) + new_values = new_values[mask] + new_index = new_index[mask] + + return frame._constructor_sliced(new_values, index=new_index) + + +def stack_multiple(frame: DataFrame, level, dropna: bool = True, sort: bool = True): + # If all passed levels match up to column names, no + # ambiguity about what to do + if all(lev in frame.columns.names for lev in level): + result = frame + for lev in level: + result = stack(result, lev, dropna=dropna, sort=sort) + + # Otherwise, level numbers may change as each successive level is stacked + elif all(isinstance(lev, int) for lev in level): + # As each stack is done, the level numbers decrease, so we need + # to account for that when level is a sequence of ints + result = frame + # _get_level_number() checks level numbers are in range and converts + # negative numbers to positive + level = [frame.columns._get_level_number(lev) for lev in level] + + while level: + lev = level.pop(0) + result = stack(result, lev, dropna=dropna, sort=sort) + # Decrement all level numbers greater than current, as these + # have now shifted down by one + level = [v if v <= lev else v - 1 for v in level] + + else: + raise ValueError( + "level should contain all level names or all level " + "numbers, not a mixture of the two." + ) + + return result + + +def _stack_multi_column_index(columns: MultiIndex) -> MultiIndex: + """Creates a MultiIndex from the first N-1 levels of this MultiIndex.""" + if len(columns.levels) <= 2: + return columns.levels[0]._rename(name=columns.names[0]) + + levs = [ + [lev[c] if c >= 0 else None for c in codes] + for lev, codes in zip(columns.levels[:-1], columns.codes[:-1]) + ] + + # Remove duplicate tuples in the MultiIndex. + tuples = zip(*levs) + unique_tuples = (key for key, _ in itertools.groupby(tuples)) + new_levs = zip(*unique_tuples) + + # The dtype of each level must be explicitly set to avoid inferring the wrong type. + # See GH-36991. + return MultiIndex.from_arrays( + [ + # Not all indices can accept None values. + Index(new_lev, dtype=lev.dtype) if None not in new_lev else new_lev + for new_lev, lev in zip(new_levs, columns.levels) + ], + names=columns.names[:-1], + ) + + +def _stack_multi_columns( + frame: DataFrame, level_num: int = -1, dropna: bool = True, sort: bool = True +) -> DataFrame: + def _convert_level_number(level_num: int, columns: Index): + """ + Logic for converting the level number to something we can safely pass + to swaplevel. + + If `level_num` matches a column name return the name from + position `level_num`, otherwise return `level_num`. + """ + if level_num in columns.names: + return columns.names[level_num] + + return level_num + + this = frame.copy(deep=False) + mi_cols = this.columns # cast(MultiIndex, this.columns) + assert isinstance(mi_cols, MultiIndex) # caller is responsible + + # this makes life much simpler + if level_num != mi_cols.nlevels - 1: + # roll levels to put selected level at end + roll_columns = mi_cols + for i in range(level_num, mi_cols.nlevels - 1): + # Need to check if the ints conflict with level names + lev1 = _convert_level_number(i, roll_columns) + lev2 = _convert_level_number(i + 1, roll_columns) + roll_columns = roll_columns.swaplevel(lev1, lev2) + this.columns = mi_cols = roll_columns + + if not mi_cols._is_lexsorted() and sort: + # Workaround the edge case where 0 is one of the column names, + # which interferes with trying to sort based on the first + # level + level_to_sort = _convert_level_number(0, mi_cols) + this = this.sort_index(level=level_to_sort, axis=1) + mi_cols = this.columns + + mi_cols = cast(MultiIndex, mi_cols) + new_columns = _stack_multi_column_index(mi_cols) + + # time to ravel the values + new_data = {} + level_vals = mi_cols.levels[-1] + level_codes = unique(mi_cols.codes[-1]) + if sort: + level_codes = np.sort(level_codes) + level_vals_nan = level_vals.insert(len(level_vals), None) + + level_vals_used = np.take(level_vals_nan, level_codes) + levsize = len(level_codes) + drop_cols = [] + for key in new_columns: + try: + loc = this.columns.get_loc(key) + except KeyError: + drop_cols.append(key) + continue + + # can make more efficient? + # we almost always return a slice + # but if unsorted can get a boolean + # indexer + if not isinstance(loc, slice): + slice_len = len(loc) + else: + slice_len = loc.stop - loc.start + + if slice_len != levsize: + chunk = this.loc[:, this.columns[loc]] + chunk.columns = level_vals_nan.take(chunk.columns.codes[-1]) + value_slice = chunk.reindex(columns=level_vals_used).values + else: + subset = this.iloc[:, loc] + dtype = find_common_type(subset.dtypes.tolist()) + if isinstance(dtype, ExtensionDtype): + # TODO(EA2D): won't need special case, can go through .values + # paths below (might change to ._values) + value_slice = dtype.construct_array_type()._concat_same_type( + [x._values.astype(dtype, copy=False) for _, x in subset.items()] + ) + N, K = subset.shape + idx = np.arange(N * K).reshape(K, N).T.ravel() + value_slice = value_slice.take(idx) + else: + value_slice = subset.values + + if value_slice.ndim > 1: + # i.e. not extension + value_slice = value_slice.ravel() + + new_data[key] = value_slice + + if len(drop_cols) > 0: + new_columns = new_columns.difference(drop_cols) + + N = len(this) + + if isinstance(this.index, MultiIndex): + new_levels = list(this.index.levels) + new_names = list(this.index.names) + new_codes = [lab.repeat(levsize) for lab in this.index.codes] + else: + old_codes, old_levels = factorize_from_iterable(this.index) + new_levels = [old_levels] + new_codes = [old_codes.repeat(levsize)] + new_names = [this.index.name] # something better? + + new_levels.append(level_vals) + new_codes.append(np.tile(level_codes, N)) + new_names.append(frame.columns.names[level_num]) + + new_index = MultiIndex( + levels=new_levels, codes=new_codes, names=new_names, verify_integrity=False + ) + + result = frame._constructor(new_data, index=new_index, columns=new_columns) + + if frame.columns.nlevels > 1: + desired_columns = frame.columns._drop_level_numbers([level_num]).unique() + if not result.columns.equals(desired_columns): + result = result[desired_columns] + + # more efficient way to go about this? can do the whole masking biz but + # will only save a small amount of time... + if dropna: + result = result.dropna(axis=0, how="all") + + return result + + +def _reorder_for_extension_array_stack( + arr: ExtensionArray, n_rows: int, n_columns: int +) -> ExtensionArray: + """ + Re-orders the values when stacking multiple extension-arrays. + + The indirect stacking method used for EAs requires a followup + take to get the order correct. + + Parameters + ---------- + arr : ExtensionArray + n_rows, n_columns : int + The number of rows and columns in the original DataFrame. + + Returns + ------- + taken : ExtensionArray + The original `arr` with elements re-ordered appropriately + + Examples + -------- + >>> arr = np.array(['a', 'b', 'c', 'd', 'e', 'f']) + >>> _reorder_for_extension_array_stack(arr, 2, 3) + array(['a', 'c', 'e', 'b', 'd', 'f'], dtype='>> _reorder_for_extension_array_stack(arr, 3, 2) + array(['a', 'd', 'b', 'e', 'c', 'f'], dtype=' Series | DataFrame: + if frame.columns.nunique() != len(frame.columns): + raise ValueError("Columns with duplicate values are not supported in stack") + + # If we need to drop `level` from columns, it needs to be in descending order + drop_levnums = sorted(level, reverse=True) + stack_cols = frame.columns._drop_level_numbers( + [k for k in range(frame.columns.nlevels) if k not in level][::-1] + ) + if len(level) > 1: + # Arrange columns in the order we want to take them, e.g. level=[2, 0, 1] + sorter = np.argsort(level) + ordered_stack_cols = stack_cols._reorder_ilevels(sorter) + else: + ordered_stack_cols = stack_cols + + stack_cols_unique = stack_cols.unique() + ordered_stack_cols_unique = ordered_stack_cols.unique() + + # Grab data for each unique index to be stacked + buf = [] + for idx in stack_cols_unique: + if len(frame.columns) == 1: + data = frame.copy() + else: + # Take the data from frame corresponding to this idx value + if len(level) == 1: + idx = (idx,) + gen = iter(idx) + column_indexer = tuple( + next(gen) if k in level else slice(None) + for k in range(frame.columns.nlevels) + ) + data = frame.loc[:, column_indexer] + + if len(level) < frame.columns.nlevels: + data.columns = data.columns._drop_level_numbers(drop_levnums) + elif stack_cols.nlevels == 1: + if data.ndim == 1: + data.name = 0 + else: + data.columns = RangeIndex(len(data.columns)) + buf.append(data) + + result: Series | DataFrame + if len(buf) > 0 and not frame.empty: + result = concat(buf) + ratio = len(result) // len(frame) + else: + # input is empty + if len(level) < frame.columns.nlevels: + # concat column order may be different from dropping the levels + new_columns = frame.columns._drop_level_numbers(drop_levnums).unique() + else: + new_columns = [0] + result = DataFrame(columns=new_columns, dtype=frame._values.dtype) + ratio = 0 + + if len(level) < frame.columns.nlevels: + # concat column order may be different from dropping the levels + desired_columns = frame.columns._drop_level_numbers(drop_levnums).unique() + if not result.columns.equals(desired_columns): + result = result[desired_columns] + + # Construct the correct MultiIndex by combining the frame's index and + # stacked columns. + index_levels: list | FrozenList + if isinstance(frame.index, MultiIndex): + index_levels = frame.index.levels + index_codes = list(np.tile(frame.index.codes, (1, ratio))) + else: + codes, uniques = factorize(frame.index, use_na_sentinel=False) + index_levels = [uniques] + index_codes = list(np.tile(codes, (1, ratio))) + if isinstance(stack_cols, MultiIndex): + column_levels = ordered_stack_cols.levels + column_codes = ordered_stack_cols.drop_duplicates().codes + else: + column_levels = [ordered_stack_cols.unique()] + column_codes = [factorize(ordered_stack_cols_unique, use_na_sentinel=False)[0]] + column_codes = [np.repeat(codes, len(frame)) for codes in column_codes] + result.index = MultiIndex( + levels=index_levels + column_levels, + codes=index_codes + column_codes, + names=frame.index.names + list(ordered_stack_cols.names), + verify_integrity=False, + ) + + # sort result, but faster than calling sort_index since we know the order we need + len_df = len(frame) + n_uniques = len(ordered_stack_cols_unique) + indexer = np.arange(n_uniques) + idxs = np.tile(len_df * indexer, len_df) + np.repeat(np.arange(len_df), n_uniques) + result = result.take(idxs) + + # Reshape/rename if needed and dropna + if result.ndim == 2 and frame.columns.nlevels == len(level): + if len(result.columns) == 0: + result = Series(index=result.index) + else: + result = result.iloc[:, 0] + if result.ndim == 1: + result.name = None + + return result diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/reshape/tile.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/reshape/tile.py new file mode 100644 index 0000000000000000000000000000000000000000..2b0c6fbb8e3bf69fcb60ce3257450260af2a9f6b --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/reshape/tile.py @@ -0,0 +1,638 @@ +""" +Quantilization functions and related stuff +""" +from __future__ import annotations + +from typing import ( + TYPE_CHECKING, + Any, + Callable, + Literal, +) + +import numpy as np + +from pandas._libs import ( + Timedelta, + Timestamp, + lib, +) + +from pandas.core.dtypes.common import ( + ensure_platform_int, + is_bool_dtype, + is_integer, + is_list_like, + is_numeric_dtype, + is_scalar, +) +from pandas.core.dtypes.dtypes import ( + CategoricalDtype, + DatetimeTZDtype, + ExtensionDtype, +) +from pandas.core.dtypes.generic import ABCSeries +from pandas.core.dtypes.missing import isna + +from pandas import ( + Categorical, + Index, + IntervalIndex, +) +import pandas.core.algorithms as algos +from pandas.core.arrays.datetimelike import dtype_to_unit + +if TYPE_CHECKING: + from pandas._typing import ( + DtypeObj, + IntervalLeftRight, + ) + + +def cut( + x, + bins, + right: bool = True, + labels=None, + retbins: bool = False, + precision: int = 3, + include_lowest: bool = False, + duplicates: str = "raise", + ordered: bool = True, +): + """ + Bin values into discrete intervals. + + Use `cut` when you need to segment and sort data values into bins. This + function is also useful for going from a continuous variable to a + categorical variable. For example, `cut` could convert ages to groups of + age ranges. Supports binning into an equal number of bins, or a + pre-specified array of bins. + + Parameters + ---------- + x : array-like + The input array to be binned. Must be 1-dimensional. + bins : int, sequence of scalars, or IntervalIndex + The criteria to bin by. + + * int : Defines the number of equal-width bins in the range of `x`. The + range of `x` is extended by .1% on each side to include the minimum + and maximum values of `x`. + * sequence of scalars : Defines the bin edges allowing for non-uniform + width. No extension of the range of `x` is done. + * IntervalIndex : Defines the exact bins to be used. Note that + IntervalIndex for `bins` must be non-overlapping. + + right : bool, default True + Indicates whether `bins` includes the rightmost edge or not. If + ``right == True`` (the default), then the `bins` ``[1, 2, 3, 4]`` + indicate (1,2], (2,3], (3,4]. This argument is ignored when + `bins` is an IntervalIndex. + labels : array or False, default None + Specifies the labels for the returned bins. Must be the same length as + the resulting bins. If False, returns only integer indicators of the + bins. This affects the type of the output container (see below). + This argument is ignored when `bins` is an IntervalIndex. If True, + raises an error. When `ordered=False`, labels must be provided. + retbins : bool, default False + Whether to return the bins or not. Useful when bins is provided + as a scalar. + precision : int, default 3 + The precision at which to store and display the bins labels. + include_lowest : bool, default False + Whether the first interval should be left-inclusive or not. + duplicates : {default 'raise', 'drop'}, optional + If bin edges are not unique, raise ValueError or drop non-uniques. + ordered : bool, default True + Whether the labels are ordered or not. Applies to returned types + Categorical and Series (with Categorical dtype). If True, + the resulting categorical will be ordered. If False, the resulting + categorical will be unordered (labels must be provided). + + Returns + ------- + out : Categorical, Series, or ndarray + An array-like object representing the respective bin for each value + of `x`. The type depends on the value of `labels`. + + * None (default) : returns a Series for Series `x` or a + Categorical for all other inputs. The values stored within + are Interval dtype. + + * sequence of scalars : returns a Series for Series `x` or a + Categorical for all other inputs. The values stored within + are whatever the type in the sequence is. + + * False : returns an ndarray of integers. + + bins : numpy.ndarray or IntervalIndex. + The computed or specified bins. Only returned when `retbins=True`. + For scalar or sequence `bins`, this is an ndarray with the computed + bins. If set `duplicates=drop`, `bins` will drop non-unique bin. For + an IntervalIndex `bins`, this is equal to `bins`. + + See Also + -------- + qcut : Discretize variable into equal-sized buckets based on rank + or based on sample quantiles. + Categorical : Array type for storing data that come from a + fixed set of values. + Series : One-dimensional array with axis labels (including time series). + IntervalIndex : Immutable Index implementing an ordered, sliceable set. + + Notes + ----- + Any NA values will be NA in the result. Out of bounds values will be NA in + the resulting Series or Categorical object. + + Reference :ref:`the user guide ` for more examples. + + Examples + -------- + Discretize into three equal-sized bins. + + >>> pd.cut(np.array([1, 7, 5, 4, 6, 3]), 3) + ... # doctest: +ELLIPSIS + [(0.994, 3.0], (5.0, 7.0], (3.0, 5.0], (3.0, 5.0], (5.0, 7.0], ... + Categories (3, interval[float64, right]): [(0.994, 3.0] < (3.0, 5.0] ... + + >>> pd.cut(np.array([1, 7, 5, 4, 6, 3]), 3, retbins=True) + ... # doctest: +ELLIPSIS + ([(0.994, 3.0], (5.0, 7.0], (3.0, 5.0], (3.0, 5.0], (5.0, 7.0], ... + Categories (3, interval[float64, right]): [(0.994, 3.0] < (3.0, 5.0] ... + array([0.994, 3. , 5. , 7. ])) + + Discovers the same bins, but assign them specific labels. Notice that + the returned Categorical's categories are `labels` and is ordered. + + >>> pd.cut(np.array([1, 7, 5, 4, 6, 3]), + ... 3, labels=["bad", "medium", "good"]) + ['bad', 'good', 'medium', 'medium', 'good', 'bad'] + Categories (3, object): ['bad' < 'medium' < 'good'] + + ``ordered=False`` will result in unordered categories when labels are passed. + This parameter can be used to allow non-unique labels: + + >>> pd.cut(np.array([1, 7, 5, 4, 6, 3]), 3, + ... labels=["B", "A", "B"], ordered=False) + ['B', 'B', 'A', 'A', 'B', 'B'] + Categories (2, object): ['A', 'B'] + + ``labels=False`` implies you just want the bins back. + + >>> pd.cut([0, 1, 1, 2], bins=4, labels=False) + array([0, 1, 1, 3]) + + Passing a Series as an input returns a Series with categorical dtype: + + >>> s = pd.Series(np.array([2, 4, 6, 8, 10]), + ... index=['a', 'b', 'c', 'd', 'e']) + >>> pd.cut(s, 3) + ... # doctest: +ELLIPSIS + a (1.992, 4.667] + b (1.992, 4.667] + c (4.667, 7.333] + d (7.333, 10.0] + e (7.333, 10.0] + dtype: category + Categories (3, interval[float64, right]): [(1.992, 4.667] < (4.667, ... + + Passing a Series as an input returns a Series with mapping value. + It is used to map numerically to intervals based on bins. + + >>> s = pd.Series(np.array([2, 4, 6, 8, 10]), + ... index=['a', 'b', 'c', 'd', 'e']) + >>> pd.cut(s, [0, 2, 4, 6, 8, 10], labels=False, retbins=True, right=False) + ... # doctest: +ELLIPSIS + (a 1.0 + b 2.0 + c 3.0 + d 4.0 + e NaN + dtype: float64, + array([ 0, 2, 4, 6, 8, 10])) + + Use `drop` optional when bins is not unique + + >>> pd.cut(s, [0, 2, 4, 6, 10, 10], labels=False, retbins=True, + ... right=False, duplicates='drop') + ... # doctest: +ELLIPSIS + (a 1.0 + b 2.0 + c 3.0 + d 3.0 + e NaN + dtype: float64, + array([ 0, 2, 4, 6, 10])) + + Passing an IntervalIndex for `bins` results in those categories exactly. + Notice that values not covered by the IntervalIndex are set to NaN. 0 + is to the left of the first bin (which is closed on the right), and 1.5 + falls between two bins. + + >>> bins = pd.IntervalIndex.from_tuples([(0, 1), (2, 3), (4, 5)]) + >>> pd.cut([0, 0.5, 1.5, 2.5, 4.5], bins) + [NaN, (0.0, 1.0], NaN, (2.0, 3.0], (4.0, 5.0]] + Categories (3, interval[int64, right]): [(0, 1] < (2, 3] < (4, 5]] + """ + # NOTE: this binning code is changed a bit from histogram for var(x) == 0 + + original = x + x_idx = _preprocess_for_cut(x) + x_idx, _ = _coerce_to_type(x_idx) + + if not np.iterable(bins): + bins = _nbins_to_bins(x_idx, bins, right) + + elif isinstance(bins, IntervalIndex): + if bins.is_overlapping: + raise ValueError("Overlapping IntervalIndex is not accepted.") + + else: + bins = Index(bins) + if not bins.is_monotonic_increasing: + raise ValueError("bins must increase monotonically.") + + fac, bins = _bins_to_cuts( + x_idx, + bins, + right=right, + labels=labels, + precision=precision, + include_lowest=include_lowest, + duplicates=duplicates, + ordered=ordered, + ) + + return _postprocess_for_cut(fac, bins, retbins, original) + + +def qcut( + x, + q, + labels=None, + retbins: bool = False, + precision: int = 3, + duplicates: str = "raise", +): + """ + Quantile-based discretization function. + + Discretize variable into equal-sized buckets based on rank or based + on sample quantiles. For example 1000 values for 10 quantiles would + produce a Categorical object indicating quantile membership for each data point. + + Parameters + ---------- + x : 1d ndarray or Series + q : int or list-like of float + Number of quantiles. 10 for deciles, 4 for quartiles, etc. Alternately + array of quantiles, e.g. [0, .25, .5, .75, 1.] for quartiles. + labels : array or False, default None + Used as labels for the resulting bins. Must be of the same length as + the resulting bins. If False, return only integer indicators of the + bins. If True, raises an error. + retbins : bool, optional + Whether to return the (bins, labels) or not. Can be useful if bins + is given as a scalar. + precision : int, optional + The precision at which to store and display the bins labels. + duplicates : {default 'raise', 'drop'}, optional + If bin edges are not unique, raise ValueError or drop non-uniques. + + Returns + ------- + out : Categorical or Series or array of integers if labels is False + The return type (Categorical or Series) depends on the input: a Series + of type category if input is a Series else Categorical. Bins are + represented as categories when categorical data is returned. + bins : ndarray of floats + Returned only if `retbins` is True. + + Notes + ----- + Out of bounds values will be NA in the resulting Categorical object + + Examples + -------- + >>> pd.qcut(range(5), 4) + ... # doctest: +ELLIPSIS + [(-0.001, 1.0], (-0.001, 1.0], (1.0, 2.0], (2.0, 3.0], (3.0, 4.0]] + Categories (4, interval[float64, right]): [(-0.001, 1.0] < (1.0, 2.0] ... + + >>> pd.qcut(range(5), 3, labels=["good", "medium", "bad"]) + ... # doctest: +SKIP + [good, good, medium, bad, bad] + Categories (3, object): [good < medium < bad] + + >>> pd.qcut(range(5), 4, labels=False) + array([0, 0, 1, 2, 3]) + """ + original = x + x_idx = _preprocess_for_cut(x) + x_idx, _ = _coerce_to_type(x_idx) + + quantiles = np.linspace(0, 1, q + 1) if is_integer(q) else q + + bins = x_idx.to_series().dropna().quantile(quantiles) + + fac, bins = _bins_to_cuts( + x_idx, + Index(bins), + labels=labels, + precision=precision, + include_lowest=True, + duplicates=duplicates, + ) + + return _postprocess_for_cut(fac, bins, retbins, original) + + +def _nbins_to_bins(x_idx: Index, nbins: int, right: bool) -> Index: + """ + If a user passed an integer N for bins, convert this to a sequence of N + equal(ish)-sized bins. + """ + if is_scalar(nbins) and nbins < 1: + raise ValueError("`bins` should be a positive integer.") + + if x_idx.size == 0: + raise ValueError("Cannot cut empty array") + + rng = (x_idx.min(), x_idx.max()) + mn, mx = rng + + if is_numeric_dtype(x_idx.dtype) and (np.isinf(mn) or np.isinf(mx)): + # GH#24314 + raise ValueError( + "cannot specify integer `bins` when input data contains infinity" + ) + + if mn == mx: # adjust end points before binning + if _is_dt_or_td(x_idx.dtype): + # using seconds=1 is pretty arbitrary here + # error: Argument 1 to "dtype_to_unit" has incompatible type + # "dtype[Any] | ExtensionDtype"; expected "DatetimeTZDtype | dtype[Any]" + unit = dtype_to_unit(x_idx.dtype) # type: ignore[arg-type] + td = Timedelta(seconds=1).as_unit(unit) + # Use DatetimeArray/TimedeltaArray method instead of linspace + # error: Item "ExtensionArray" of "ExtensionArray | ndarray[Any, Any]" + # has no attribute "_generate_range" + bins = x_idx._values._generate_range( # type: ignore[union-attr] + start=mn - td, end=mx + td, periods=nbins + 1, freq=None, unit=unit + ) + else: + mn -= 0.001 * abs(mn) if mn != 0 else 0.001 + mx += 0.001 * abs(mx) if mx != 0 else 0.001 + + bins = np.linspace(mn, mx, nbins + 1, endpoint=True) + else: # adjust end points after binning + if _is_dt_or_td(x_idx.dtype): + # Use DatetimeArray/TimedeltaArray method instead of linspace + + # error: Argument 1 to "dtype_to_unit" has incompatible type + # "dtype[Any] | ExtensionDtype"; expected "DatetimeTZDtype | dtype[Any]" + unit = dtype_to_unit(x_idx.dtype) # type: ignore[arg-type] + # error: Item "ExtensionArray" of "ExtensionArray | ndarray[Any, Any]" + # has no attribute "_generate_range" + bins = x_idx._values._generate_range( # type: ignore[union-attr] + start=mn, end=mx, periods=nbins + 1, freq=None, unit=unit + ) + else: + bins = np.linspace(mn, mx, nbins + 1, endpoint=True) + adj = (mx - mn) * 0.001 # 0.1% of the range + if right: + bins[0] -= adj + else: + bins[-1] += adj + + return Index(bins) + + +def _bins_to_cuts( + x_idx: Index, + bins: Index, + right: bool = True, + labels=None, + precision: int = 3, + include_lowest: bool = False, + duplicates: str = "raise", + ordered: bool = True, +): + if not ordered and labels is None: + raise ValueError("'labels' must be provided if 'ordered = False'") + + if duplicates not in ["raise", "drop"]: + raise ValueError( + "invalid value for 'duplicates' parameter, valid options are: raise, drop" + ) + + result: Categorical | np.ndarray + + if isinstance(bins, IntervalIndex): + # we have a fast-path here + ids = bins.get_indexer(x_idx) + cat_dtype = CategoricalDtype(bins, ordered=True) + result = Categorical.from_codes(ids, dtype=cat_dtype, validate=False) + return result, bins + + unique_bins = algos.unique(bins) + if len(unique_bins) < len(bins) and len(bins) != 2: + if duplicates == "raise": + raise ValueError( + f"Bin edges must be unique: {repr(bins)}.\n" + f"You can drop duplicate edges by setting the 'duplicates' kwarg" + ) + bins = unique_bins + + side: Literal["left", "right"] = "left" if right else "right" + + try: + ids = bins.searchsorted(x_idx, side=side) + except TypeError as err: + # e.g. test_datetime_nan_error if bins are DatetimeArray and x_idx + # is integers + if x_idx.dtype.kind == "m": + raise ValueError("bins must be of timedelta64 dtype") from err + elif x_idx.dtype.kind == bins.dtype.kind == "M": + raise ValueError( + "Cannot use timezone-naive bins with timezone-aware values, " + "or vice-versa" + ) from err + elif x_idx.dtype.kind == "M": + raise ValueError("bins must be of datetime64 dtype") from err + else: + raise + ids = ensure_platform_int(ids) + + if include_lowest: + ids[x_idx == bins[0]] = 1 + + na_mask = isna(x_idx) | (ids == len(bins)) | (ids == 0) + has_nas = na_mask.any() + + if labels is not False: + if not (labels is None or is_list_like(labels)): + raise ValueError( + "Bin labels must either be False, None or passed in as a " + "list-like argument" + ) + + if labels is None: + labels = _format_labels( + bins, precision, right=right, include_lowest=include_lowest + ) + elif ordered and len(set(labels)) != len(labels): + raise ValueError( + "labels must be unique if ordered=True; pass ordered=False " + "for duplicate labels" + ) + else: + if len(labels) != len(bins) - 1: + raise ValueError( + "Bin labels must be one fewer than the number of bin edges" + ) + + if not isinstance(getattr(labels, "dtype", None), CategoricalDtype): + labels = Categorical( + labels, + categories=labels if len(set(labels)) == len(labels) else None, + ordered=ordered, + ) + # TODO: handle mismatch between categorical label order and pandas.cut order. + np.putmask(ids, na_mask, 0) + result = algos.take_nd(labels, ids - 1) + + else: + result = ids - 1 + if has_nas: + result = result.astype(np.float64) + np.putmask(result, na_mask, np.nan) + + return result, bins + + +def _coerce_to_type(x: Index) -> tuple[Index, DtypeObj | None]: + """ + if the passed data is of datetime/timedelta, bool or nullable int type, + this method converts it to numeric so that cut or qcut method can + handle it + """ + dtype: DtypeObj | None = None + + if _is_dt_or_td(x.dtype): + dtype = x.dtype + elif is_bool_dtype(x.dtype): + # GH 20303 + x = x.astype(np.int64) + # To support cut and qcut for IntegerArray we convert to float dtype. + # Will properly support in the future. + # https://github.com/pandas-dev/pandas/pull/31290 + # https://github.com/pandas-dev/pandas/issues/31389 + elif isinstance(x.dtype, ExtensionDtype) and is_numeric_dtype(x.dtype): + x_arr = x.to_numpy(dtype=np.float64, na_value=np.nan) + x = Index(x_arr) + + return Index(x), dtype + + +def _is_dt_or_td(dtype: DtypeObj) -> bool: + # Note: the dtype here comes from an Index.dtype, so we know that that any + # dt64/td64 dtype is of a supported unit. + return isinstance(dtype, DatetimeTZDtype) or lib.is_np_dtype(dtype, "mM") + + +def _format_labels( + bins: Index, + precision: int, + right: bool = True, + include_lowest: bool = False, +): + """based on the dtype, return our labels""" + closed: IntervalLeftRight = "right" if right else "left" + + formatter: Callable[[Any], Timestamp] | Callable[[Any], Timedelta] + + if _is_dt_or_td(bins.dtype): + # error: Argument 1 to "dtype_to_unit" has incompatible type + # "dtype[Any] | ExtensionDtype"; expected "DatetimeTZDtype | dtype[Any]" + unit = dtype_to_unit(bins.dtype) # type: ignore[arg-type] + formatter = lambda x: x + adjust = lambda x: x - Timedelta(1, unit=unit).as_unit(unit) + else: + precision = _infer_precision(precision, bins) + formatter = lambda x: _round_frac(x, precision) + adjust = lambda x: x - 10 ** (-precision) + + breaks = [formatter(b) for b in bins] + if right and include_lowest: + # adjust lhs of first interval by precision to account for being right closed + breaks[0] = adjust(breaks[0]) + + if _is_dt_or_td(bins.dtype): + # error: "Index" has no attribute "as_unit" + breaks = type(bins)(breaks).as_unit(unit) # type: ignore[attr-defined] + + return IntervalIndex.from_breaks(breaks, closed=closed) + + +def _preprocess_for_cut(x) -> Index: + """ + handles preprocessing for cut where we convert passed + input to array, strip the index information and store it + separately + """ + # Check that the passed array is a Pandas or Numpy object + # We don't want to strip away a Pandas data-type here (e.g. datetimetz) + ndim = getattr(x, "ndim", None) + if ndim is None: + x = np.asarray(x) + if x.ndim != 1: + raise ValueError("Input array must be 1 dimensional") + + return Index(x) + + +def _postprocess_for_cut(fac, bins, retbins: bool, original): + """ + handles post processing for the cut method where + we combine the index information if the originally passed + datatype was a series + """ + if isinstance(original, ABCSeries): + fac = original._constructor(fac, index=original.index, name=original.name) + + if not retbins: + return fac + + if isinstance(bins, Index) and is_numeric_dtype(bins.dtype): + bins = bins._values + + return fac, bins + + +def _round_frac(x, precision: int): + """ + Round the fractional part of the given number + """ + if not np.isfinite(x) or x == 0: + return x + else: + frac, whole = np.modf(x) + if whole == 0: + digits = -int(np.floor(np.log10(abs(frac)))) - 1 + precision + else: + digits = precision + return np.around(x, digits) + + +def _infer_precision(base_precision: int, bins: Index) -> int: + """ + Infer an appropriate precision for _round_frac + """ + for precision in range(base_precision, 20): + levels = np.asarray([_round_frac(b, precision) for b in bins]) + if algos.unique(levels).size == bins.size: + return precision + return base_precision # default diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/reshape/util.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/reshape/util.py new file mode 100644 index 0000000000000000000000000000000000000000..476e3922b6989e4267aeeafd5943a80c1599b1d5 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/reshape/util.py @@ -0,0 +1,85 @@ +from __future__ import annotations + +from typing import TYPE_CHECKING + +import numpy as np + +from pandas.core.dtypes.common import is_list_like + +if TYPE_CHECKING: + from pandas._typing import NumpyIndexT + + +def cartesian_product(X) -> list[np.ndarray]: + """ + Numpy version of itertools.product. + Sometimes faster (for large inputs)... + + Parameters + ---------- + X : list-like of list-likes + + Returns + ------- + product : list of ndarrays + + Examples + -------- + >>> cartesian_product([list('ABC'), [1, 2]]) + [array(['A', 'A', 'B', 'B', 'C', 'C'], dtype=' NumpyIndexT: + """ + Index compat for np.tile. + + Notes + ----- + Does not support multi-dimensional `num`. + """ + if isinstance(arr, np.ndarray): + return np.tile(arr, num) + + # Otherwise we have an Index + taker = np.tile(np.arange(len(arr)), num) + return arr.take(taker) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/sparse/__init__.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/sparse/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/sparse/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/sparse/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..cbe125b066702ff7e87127c200f81aeaf7972557 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/sparse/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/sparse/__pycache__/api.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/sparse/__pycache__/api.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..43377b1d2fe2d3e196e1dba594d17af6a9adf05f Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/sparse/__pycache__/api.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/sparse/api.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/sparse/api.py new file mode 100644 index 0000000000000000000000000000000000000000..6650a5c4e90a0f73a43e6e35cdd26c1189daf256 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/sparse/api.py @@ -0,0 +1,5 @@ +from pandas.core.dtypes.dtypes import SparseDtype + +from pandas.core.arrays.sparse import SparseArray + +__all__ = ["SparseArray", "SparseDtype"] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/strings/__init__.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/strings/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..d4ce75f768c5d1dcd8586264fe1faf756d5d5e94 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/strings/__init__.py @@ -0,0 +1,28 @@ +""" +Implementation of pandas.Series.str and its interface. + +* strings.accessor.StringMethods : Accessor for Series.str +* strings.base.BaseStringArrayMethods: Mixin ABC for EAs to implement str methods + +Most methods on the StringMethods accessor follow the pattern: + + 1. extract the array from the series (or index) + 2. Call that array's implementation of the string method + 3. Wrap the result (in a Series, index, or DataFrame) + +Pandas extension arrays implementing string methods should inherit from +pandas.core.strings.base.BaseStringArrayMethods. This is an ABC defining +the various string methods. To avoid namespace clashes and pollution, +these are prefixed with `_str_`. So ``Series.str.upper()`` calls +``Series.array._str_upper()``. The interface isn't currently public +to other string extension arrays. +""" +# Pandas current implementation is in ObjectStringArrayMixin. This is designed +# to work on object-dtype ndarrays. +# +# BaseStringArrayMethods +# - ObjectStringArrayMixin +# - StringArray +# - NumpyExtensionArray +# - Categorical +# - ArrowStringArray diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/strings/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/strings/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..821cf85f0ffa7536a0d83c2e8f58ef460b5c9072 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/strings/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/strings/__pycache__/base.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/strings/__pycache__/base.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e1147f9b4d6ba327b28c730185529b496f27f44b Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/strings/__pycache__/base.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/strings/__pycache__/object_array.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/strings/__pycache__/object_array.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e70305ed79cd98f675f7120e1d8c2b4889ec2ef8 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/strings/__pycache__/object_array.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/strings/accessor.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/strings/accessor.py new file mode 100644 index 0000000000000000000000000000000000000000..da10a12d02ae40a147812d1158d256543cc2492e --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/strings/accessor.py @@ -0,0 +1,3543 @@ +from __future__ import annotations + +import codecs +from functools import wraps +import re +from typing import ( + TYPE_CHECKING, + Callable, + Literal, + cast, +) +import warnings + +import numpy as np + +from pandas._libs import lib +from pandas._typing import ( + AlignJoin, + DtypeObj, + F, + Scalar, + npt, +) +from pandas.util._decorators import Appender +from pandas.util._exceptions import find_stack_level + +from pandas.core.dtypes.common import ( + ensure_object, + is_bool_dtype, + is_integer, + is_list_like, + is_object_dtype, + is_re, +) +from pandas.core.dtypes.dtypes import ( + ArrowDtype, + CategoricalDtype, +) +from pandas.core.dtypes.generic import ( + ABCDataFrame, + ABCIndex, + ABCMultiIndex, + ABCSeries, +) +from pandas.core.dtypes.missing import isna + +from pandas.core.arrays import ExtensionArray +from pandas.core.base import NoNewAttributesMixin +from pandas.core.construction import extract_array + +if TYPE_CHECKING: + from collections.abc import ( + Hashable, + Iterator, + ) + + from pandas import ( + DataFrame, + Index, + Series, + ) + +_shared_docs: dict[str, str] = {} +_cpython_optimized_encoders = ( + "utf-8", + "utf8", + "latin-1", + "latin1", + "iso-8859-1", + "mbcs", + "ascii", +) +_cpython_optimized_decoders = _cpython_optimized_encoders + ("utf-16", "utf-32") + + +def forbid_nonstring_types( + forbidden: list[str] | None, name: str | None = None +) -> Callable[[F], F]: + """ + Decorator to forbid specific types for a method of StringMethods. + + For calling `.str.{method}` on a Series or Index, it is necessary to first + initialize the :class:`StringMethods` object, and then call the method. + However, different methods allow different input types, and so this can not + be checked during :meth:`StringMethods.__init__`, but must be done on a + per-method basis. This decorator exists to facilitate this process, and + make it explicit which (inferred) types are disallowed by the method. + + :meth:`StringMethods.__init__` allows the *union* of types its different + methods allow (after skipping NaNs; see :meth:`StringMethods._validate`), + namely: ['string', 'empty', 'bytes', 'mixed', 'mixed-integer']. + + The default string types ['string', 'empty'] are allowed for all methods. + For the additional types ['bytes', 'mixed', 'mixed-integer'], each method + then needs to forbid the types it is not intended for. + + Parameters + ---------- + forbidden : list-of-str or None + List of forbidden non-string types, may be one or more of + `['bytes', 'mixed', 'mixed-integer']`. + name : str, default None + Name of the method to use in the error message. By default, this is + None, in which case the name from the method being wrapped will be + copied. However, for working with further wrappers (like _pat_wrapper + and _noarg_wrapper), it is necessary to specify the name. + + Returns + ------- + func : wrapper + The method to which the decorator is applied, with an added check that + enforces the inferred type to not be in the list of forbidden types. + + Raises + ------ + TypeError + If the inferred type of the underlying data is in `forbidden`. + """ + # deal with None + forbidden = [] if forbidden is None else forbidden + + allowed_types = {"string", "empty", "bytes", "mixed", "mixed-integer"} - set( + forbidden + ) + + def _forbid_nonstring_types(func: F) -> F: + func_name = func.__name__ if name is None else name + + @wraps(func) + def wrapper(self, *args, **kwargs): + if self._inferred_dtype not in allowed_types: + msg = ( + f"Cannot use .str.{func_name} with values of " + f"inferred dtype '{self._inferred_dtype}'." + ) + raise TypeError(msg) + return func(self, *args, **kwargs) + + wrapper.__name__ = func_name + return cast(F, wrapper) + + return _forbid_nonstring_types + + +def _map_and_wrap(name: str | None, docstring: str | None): + @forbid_nonstring_types(["bytes"], name=name) + def wrapper(self): + result = getattr(self._data.array, f"_str_{name}")() + return self._wrap_result( + result, returns_string=name not in ("isnumeric", "isdecimal") + ) + + wrapper.__doc__ = docstring + return wrapper + + +class StringMethods(NoNewAttributesMixin): + """ + Vectorized string functions for Series and Index. + + NAs stay NA unless handled otherwise by a particular method. + Patterned after Python's string methods, with some inspiration from + R's stringr package. + + Examples + -------- + >>> s = pd.Series(["A_Str_Series"]) + >>> s + 0 A_Str_Series + dtype: object + + >>> s.str.split("_") + 0 [A, Str, Series] + dtype: object + + >>> s.str.replace("_", "") + 0 AStrSeries + dtype: object + """ + + # Note: see the docstring in pandas.core.strings.__init__ + # for an explanation of the implementation. + # TODO: Dispatch all the methods + # Currently the following are not dispatched to the array + # * cat + # * extractall + + def __init__(self, data) -> None: + from pandas.core.arrays.string_ import StringDtype + + self._inferred_dtype = self._validate(data) + self._is_categorical = isinstance(data.dtype, CategoricalDtype) + self._is_string = isinstance(data.dtype, StringDtype) + self._data = data + + self._index = self._name = None + if isinstance(data, ABCSeries): + self._index = data.index + self._name = data.name + + # ._values.categories works for both Series/Index + self._parent = data._values.categories if self._is_categorical else data + # save orig to blow up categoricals to the right type + self._orig = data + self._freeze() + + @staticmethod + def _validate(data): + """ + Auxiliary function for StringMethods, infers and checks dtype of data. + + This is a "first line of defence" at the creation of the StringMethods- + object, and just checks that the dtype is in the + *union* of the allowed types over all string methods below; this + restriction is then refined on a per-method basis using the decorator + @forbid_nonstring_types (more info in the corresponding docstring). + + This really should exclude all series/index with any non-string values, + but that isn't practical for performance reasons until we have a str + dtype (GH 9343 / 13877) + + Parameters + ---------- + data : The content of the Series + + Returns + ------- + dtype : inferred dtype of data + """ + if isinstance(data, ABCMultiIndex): + raise AttributeError( + "Can only use .str accessor with Index, not MultiIndex" + ) + + # see _libs/lib.pyx for list of inferred types + allowed_types = ["string", "empty", "bytes", "mixed", "mixed-integer"] + + data = extract_array(data) + + values = getattr(data, "categories", data) # categorical / normal + + inferred_dtype = lib.infer_dtype(values, skipna=True) + + if inferred_dtype not in allowed_types: + raise AttributeError("Can only use .str accessor with string values!") + return inferred_dtype + + def __getitem__(self, key): + result = self._data.array._str_getitem(key) + return self._wrap_result(result) + + def __iter__(self) -> Iterator: + raise TypeError(f"'{type(self).__name__}' object is not iterable") + + def _wrap_result( + self, + result, + name=None, + expand: bool | None = None, + fill_value=np.nan, + returns_string: bool = True, + returns_bool: bool = False, + dtype=None, + ): + from pandas import ( + Index, + MultiIndex, + ) + + if not hasattr(result, "ndim") or not hasattr(result, "dtype"): + if isinstance(result, ABCDataFrame): + result = result.__finalize__(self._orig, name="str") + return result + assert result.ndim < 3 + + # We can be wrapping a string / object / categorical result, in which + # case we'll want to return the same dtype as the input. + # Or we can be wrapping a numeric output, in which case we don't want + # to return a StringArray. + # Ideally the array method returns the right array type. + if expand is None: + # infer from ndim if expand is not specified + expand = result.ndim != 1 + elif expand is True and not isinstance(self._orig, ABCIndex): + # required when expand=True is explicitly specified + # not needed when inferred + if isinstance(result.dtype, ArrowDtype): + import pyarrow as pa + + from pandas.compat import pa_version_under11p0 + + from pandas.core.arrays.arrow.array import ArrowExtensionArray + + value_lengths = pa.compute.list_value_length(result._pa_array) + max_len = pa.compute.max(value_lengths).as_py() + min_len = pa.compute.min(value_lengths).as_py() + if result._hasna: + # ArrowExtensionArray.fillna doesn't work for list scalars + result = ArrowExtensionArray( + result._pa_array.fill_null([None] * max_len) + ) + if min_len < max_len: + # append nulls to each scalar list element up to max_len + if not pa_version_under11p0: + result = ArrowExtensionArray( + pa.compute.list_slice( + result._pa_array, + start=0, + stop=max_len, + return_fixed_size_list=True, + ) + ) + else: + all_null = np.full(max_len, fill_value=None, dtype=object) + values = result.to_numpy() + new_values = [] + for row in values: + if len(row) < max_len: + nulls = all_null[: max_len - len(row)] + row = np.append(row, nulls) + new_values.append(row) + pa_type = result._pa_array.type + result = ArrowExtensionArray(pa.array(new_values, type=pa_type)) + if name is not None: + labels = name + else: + labels = range(max_len) + result = ( + pa.compute.list_flatten(result._pa_array) + .to_numpy() + .reshape(len(result), max_len) + ) + result = { + label: ArrowExtensionArray(pa.array(res)) + for label, res in zip(labels, result.T) + } + elif is_object_dtype(result): + + def cons_row(x): + if is_list_like(x): + return x + else: + return [x] + + result = [cons_row(x) for x in result] + if result and not self._is_string: + # propagate nan values to match longest sequence (GH 18450) + max_len = max(len(x) for x in result) + result = [ + x * max_len if len(x) == 0 or x[0] is np.nan else x + for x in result + ] + + if not isinstance(expand, bool): + raise ValueError("expand must be True or False") + + if expand is False: + # if expand is False, result should have the same name + # as the original otherwise specified + if name is None: + name = getattr(result, "name", None) + if name is None: + # do not use logical or, _orig may be a DataFrame + # which has "name" column + name = self._orig.name + + # Wait until we are sure result is a Series or Index before + # checking attributes (GH 12180) + if isinstance(self._orig, ABCIndex): + # if result is a boolean np.array, return the np.array + # instead of wrapping it into a boolean Index (GH 8875) + if is_bool_dtype(result): + return result + + if expand: + result = list(result) + out: Index = MultiIndex.from_tuples(result, names=name) + if out.nlevels == 1: + # We had all tuples of length-one, which are + # better represented as a regular Index. + out = out.get_level_values(0) + return out + else: + return Index(result, name=name, dtype=dtype) + else: + index = self._orig.index + # This is a mess. + _dtype: DtypeObj | str | None = dtype + vdtype = getattr(result, "dtype", None) + if self._is_string: + if is_bool_dtype(vdtype): + _dtype = result.dtype + elif returns_string: + _dtype = self._orig.dtype + else: + _dtype = vdtype + elif vdtype is not None: + _dtype = vdtype + + if expand: + cons = self._orig._constructor_expanddim + result = cons(result, columns=name, index=index, dtype=_dtype) + else: + # Must be a Series + cons = self._orig._constructor + result = cons(result, name=name, index=index, dtype=_dtype) + result = result.__finalize__(self._orig, method="str") + if name is not None and result.ndim == 1: + # __finalize__ might copy over the original name, but we may + # want the new name (e.g. str.extract). + result.name = name + return result + + def _get_series_list(self, others): + """ + Auxiliary function for :meth:`str.cat`. Turn potentially mixed input + into a list of Series (elements without an index must match the length + of the calling Series/Index). + + Parameters + ---------- + others : Series, DataFrame, np.ndarray, list-like or list-like of + Objects that are either Series, Index or np.ndarray (1-dim). + + Returns + ------- + list of Series + Others transformed into list of Series. + """ + from pandas import ( + DataFrame, + Series, + ) + + # self._orig is either Series or Index + idx = self._orig if isinstance(self._orig, ABCIndex) else self._orig.index + + # Generally speaking, all objects without an index inherit the index + # `idx` of the calling Series/Index - i.e. must have matching length. + # Objects with an index (i.e. Series/Index/DataFrame) keep their own. + if isinstance(others, ABCSeries): + return [others] + elif isinstance(others, ABCIndex): + return [Series(others, index=idx, dtype=others.dtype)] + elif isinstance(others, ABCDataFrame): + return [others[x] for x in others] + elif isinstance(others, np.ndarray) and others.ndim == 2: + others = DataFrame(others, index=idx) + return [others[x] for x in others] + elif is_list_like(others, allow_sets=False): + try: + others = list(others) # ensure iterators do not get read twice etc + except TypeError: + # e.g. ser.str, raise below + pass + else: + # in case of list-like `others`, all elements must be + # either Series/Index/np.ndarray (1-dim)... + if all( + isinstance(x, (ABCSeries, ABCIndex, ExtensionArray)) + or (isinstance(x, np.ndarray) and x.ndim == 1) + for x in others + ): + los: list[Series] = [] + while others: # iterate through list and append each element + los = los + self._get_series_list(others.pop(0)) + return los + # ... or just strings + elif all(not is_list_like(x) for x in others): + return [Series(others, index=idx)] + raise TypeError( + "others must be Series, Index, DataFrame, np.ndarray " + "or list-like (either containing only strings or " + "containing only objects of type Series/Index/" + "np.ndarray[1-dim])" + ) + + @forbid_nonstring_types(["bytes", "mixed", "mixed-integer"]) + def cat( + self, + others=None, + sep: str | None = None, + na_rep=None, + join: AlignJoin = "left", + ) -> str | Series | Index: + """ + Concatenate strings in the Series/Index with given separator. + + If `others` is specified, this function concatenates the Series/Index + and elements of `others` element-wise. + If `others` is not passed, then all values in the Series/Index are + concatenated into a single string with a given `sep`. + + Parameters + ---------- + others : Series, Index, DataFrame, np.ndarray or list-like + Series, Index, DataFrame, np.ndarray (one- or two-dimensional) and + other list-likes of strings must have the same length as the + calling Series/Index, with the exception of indexed objects (i.e. + Series/Index/DataFrame) if `join` is not None. + + If others is a list-like that contains a combination of Series, + Index or np.ndarray (1-dim), then all elements will be unpacked and + must satisfy the above criteria individually. + + If others is None, the method returns the concatenation of all + strings in the calling Series/Index. + sep : str, default '' + The separator between the different elements/columns. By default + the empty string `''` is used. + na_rep : str or None, default None + Representation that is inserted for all missing values: + + - If `na_rep` is None, and `others` is None, missing values in the + Series/Index are omitted from the result. + - If `na_rep` is None, and `others` is not None, a row containing a + missing value in any of the columns (before concatenation) will + have a missing value in the result. + join : {'left', 'right', 'outer', 'inner'}, default 'left' + Determines the join-style between the calling Series/Index and any + Series/Index/DataFrame in `others` (objects without an index need + to match the length of the calling Series/Index). To disable + alignment, use `.values` on any Series/Index/DataFrame in `others`. + + Returns + ------- + str, Series or Index + If `others` is None, `str` is returned, otherwise a `Series/Index` + (same type as caller) of objects is returned. + + See Also + -------- + split : Split each string in the Series/Index. + join : Join lists contained as elements in the Series/Index. + + Examples + -------- + When not passing `others`, all values are concatenated into a single + string: + + >>> s = pd.Series(['a', 'b', np.nan, 'd']) + >>> s.str.cat(sep=' ') + 'a b d' + + By default, NA values in the Series are ignored. Using `na_rep`, they + can be given a representation: + + >>> s.str.cat(sep=' ', na_rep='?') + 'a b ? d' + + If `others` is specified, corresponding values are concatenated with + the separator. Result will be a Series of strings. + + >>> s.str.cat(['A', 'B', 'C', 'D'], sep=',') + 0 a,A + 1 b,B + 2 NaN + 3 d,D + dtype: object + + Missing values will remain missing in the result, but can again be + represented using `na_rep` + + >>> s.str.cat(['A', 'B', 'C', 'D'], sep=',', na_rep='-') + 0 a,A + 1 b,B + 2 -,C + 3 d,D + dtype: object + + If `sep` is not specified, the values are concatenated without + separation. + + >>> s.str.cat(['A', 'B', 'C', 'D'], na_rep='-') + 0 aA + 1 bB + 2 -C + 3 dD + dtype: object + + Series with different indexes can be aligned before concatenation. The + `join`-keyword works as in other methods. + + >>> t = pd.Series(['d', 'a', 'e', 'c'], index=[3, 0, 4, 2]) + >>> s.str.cat(t, join='left', na_rep='-') + 0 aa + 1 b- + 2 -c + 3 dd + dtype: object + >>> + >>> s.str.cat(t, join='outer', na_rep='-') + 0 aa + 1 b- + 2 -c + 3 dd + 4 -e + dtype: object + >>> + >>> s.str.cat(t, join='inner', na_rep='-') + 0 aa + 2 -c + 3 dd + dtype: object + >>> + >>> s.str.cat(t, join='right', na_rep='-') + 3 dd + 0 aa + 4 -e + 2 -c + dtype: object + + For more examples, see :ref:`here `. + """ + # TODO: dispatch + from pandas import ( + Index, + Series, + concat, + ) + + if isinstance(others, str): + raise ValueError("Did you mean to supply a `sep` keyword?") + if sep is None: + sep = "" + + if isinstance(self._orig, ABCIndex): + data = Series(self._orig, index=self._orig, dtype=self._orig.dtype) + else: # Series + data = self._orig + + # concatenate Series/Index with itself if no "others" + if others is None: + # error: Incompatible types in assignment (expression has type + # "ndarray", variable has type "Series") + data = ensure_object(data) # type: ignore[assignment] + na_mask = isna(data) + if na_rep is None and na_mask.any(): + return sep.join(data[~na_mask]) + elif na_rep is not None and na_mask.any(): + return sep.join(np.where(na_mask, na_rep, data)) + else: + return sep.join(data) + + try: + # turn anything in "others" into lists of Series + others = self._get_series_list(others) + except ValueError as err: # do not catch TypeError raised by _get_series_list + raise ValueError( + "If `others` contains arrays or lists (or other " + "list-likes without an index), these must all be " + "of the same length as the calling Series/Index." + ) from err + + # align if required + if any(not data.index.equals(x.index) for x in others): + # Need to add keys for uniqueness in case of duplicate columns + others = concat( + others, + axis=1, + join=(join if join == "inner" else "outer"), + keys=range(len(others)), + sort=False, + copy=False, + ) + data, others = data.align(others, join=join) + others = [others[x] for x in others] # again list of Series + + all_cols = [ensure_object(x) for x in [data] + others] + na_masks = np.array([isna(x) for x in all_cols]) + union_mask = np.logical_or.reduce(na_masks, axis=0) + + if na_rep is None and union_mask.any(): + # no na_rep means NaNs for all rows where any column has a NaN + # only necessary if there are actually any NaNs + result = np.empty(len(data), dtype=object) + np.putmask(result, union_mask, np.nan) + + not_masked = ~union_mask + result[not_masked] = cat_safe([x[not_masked] for x in all_cols], sep) + elif na_rep is not None and union_mask.any(): + # fill NaNs with na_rep in case there are actually any NaNs + all_cols = [ + np.where(nm, na_rep, col) for nm, col in zip(na_masks, all_cols) + ] + result = cat_safe(all_cols, sep) + else: + # no NaNs - can just concatenate + result = cat_safe(all_cols, sep) + + out: Index | Series + if isinstance(self._orig.dtype, CategoricalDtype): + # We need to infer the new categories. + dtype = self._orig.dtype.categories.dtype + else: + dtype = self._orig.dtype + if isinstance(self._orig, ABCIndex): + # add dtype for case that result is all-NA + if isna(result).all(): + dtype = object # type: ignore[assignment] + + out = Index(result, dtype=dtype, name=self._orig.name) + else: # Series + res_ser = Series( + result, dtype=dtype, index=data.index, name=self._orig.name, copy=False + ) + out = res_ser.__finalize__(self._orig, method="str_cat") + return out + + _shared_docs[ + "str_split" + ] = r""" + Split strings around given separator/delimiter. + + Splits the string in the Series/Index from the %(side)s, + at the specified delimiter string. + + Parameters + ---------- + pat : str%(pat_regex)s, optional + %(pat_description)s. + If not specified, split on whitespace. + n : int, default -1 (all) + Limit number of splits in output. + ``None``, 0 and -1 will be interpreted as return all splits. + expand : bool, default False + Expand the split strings into separate columns. + + - If ``True``, return DataFrame/MultiIndex expanding dimensionality. + - If ``False``, return Series/Index, containing lists of strings. + %(regex_argument)s + Returns + ------- + Series, Index, DataFrame or MultiIndex + Type matches caller unless ``expand=True`` (see Notes). + %(raises_split)s + See Also + -------- + Series.str.split : Split strings around given separator/delimiter. + Series.str.rsplit : Splits string around given separator/delimiter, + starting from the right. + Series.str.join : Join lists contained as elements in the Series/Index + with passed delimiter. + str.split : Standard library version for split. + str.rsplit : Standard library version for rsplit. + + Notes + ----- + The handling of the `n` keyword depends on the number of found splits: + + - If found splits > `n`, make first `n` splits only + - If found splits <= `n`, make all splits + - If for a certain row the number of found splits < `n`, + append `None` for padding up to `n` if ``expand=True`` + + If using ``expand=True``, Series and Index callers return DataFrame and + MultiIndex objects, respectively. + %(regex_pat_note)s + Examples + -------- + >>> s = pd.Series( + ... [ + ... "this is a regular sentence", + ... "https://docs.python.org/3/tutorial/index.html", + ... np.nan + ... ] + ... ) + >>> s + 0 this is a regular sentence + 1 https://docs.python.org/3/tutorial/index.html + 2 NaN + dtype: object + + In the default setting, the string is split by whitespace. + + >>> s.str.split() + 0 [this, is, a, regular, sentence] + 1 [https://docs.python.org/3/tutorial/index.html] + 2 NaN + dtype: object + + Without the `n` parameter, the outputs of `rsplit` and `split` + are identical. + + >>> s.str.rsplit() + 0 [this, is, a, regular, sentence] + 1 [https://docs.python.org/3/tutorial/index.html] + 2 NaN + dtype: object + + The `n` parameter can be used to limit the number of splits on the + delimiter. The outputs of `split` and `rsplit` are different. + + >>> s.str.split(n=2) + 0 [this, is, a regular sentence] + 1 [https://docs.python.org/3/tutorial/index.html] + 2 NaN + dtype: object + + >>> s.str.rsplit(n=2) + 0 [this is a, regular, sentence] + 1 [https://docs.python.org/3/tutorial/index.html] + 2 NaN + dtype: object + + The `pat` parameter can be used to split by other characters. + + >>> s.str.split(pat="/") + 0 [this is a regular sentence] + 1 [https:, , docs.python.org, 3, tutorial, index... + 2 NaN + dtype: object + + When using ``expand=True``, the split elements will expand out into + separate columns. If NaN is present, it is propagated throughout + the columns during the split. + + >>> s.str.split(expand=True) + 0 1 2 3 4 + 0 this is a regular sentence + 1 https://docs.python.org/3/tutorial/index.html None None None None + 2 NaN NaN NaN NaN NaN + + For slightly more complex use cases like splitting the html document name + from a url, a combination of parameter settings can be used. + + >>> s.str.rsplit("/", n=1, expand=True) + 0 1 + 0 this is a regular sentence None + 1 https://docs.python.org/3/tutorial index.html + 2 NaN NaN + %(regex_examples)s""" + + @Appender( + _shared_docs["str_split"] + % { + "side": "beginning", + "pat_regex": " or compiled regex", + "pat_description": "String or regular expression to split on", + "regex_argument": """ + regex : bool, default None + Determines if the passed-in pattern is a regular expression: + + - If ``True``, assumes the passed-in pattern is a regular expression + - If ``False``, treats the pattern as a literal string. + - If ``None`` and `pat` length is 1, treats `pat` as a literal string. + - If ``None`` and `pat` length is not 1, treats `pat` as a regular expression. + - Cannot be set to False if `pat` is a compiled regex + + .. versionadded:: 1.4.0 + """, + "raises_split": """ + Raises + ------ + ValueError + * if `regex` is False and `pat` is a compiled regex + """, + "regex_pat_note": """ + Use of `regex =False` with a `pat` as a compiled regex will raise an error. + """, + "method": "split", + "regex_examples": r""" + Remember to escape special characters when explicitly using regular expressions. + + >>> s = pd.Series(["foo and bar plus baz"]) + >>> s.str.split(r"and|plus", expand=True) + 0 1 2 + 0 foo bar baz + + Regular expressions can be used to handle urls or file names. + When `pat` is a string and ``regex=None`` (the default), the given `pat` is compiled + as a regex only if ``len(pat) != 1``. + + >>> s = pd.Series(['foojpgbar.jpg']) + >>> s.str.split(r".", expand=True) + 0 1 + 0 foojpgbar jpg + + >>> s.str.split(r"\.jpg", expand=True) + 0 1 + 0 foojpgbar + + When ``regex=True``, `pat` is interpreted as a regex + + >>> s.str.split(r"\.jpg", regex=True, expand=True) + 0 1 + 0 foojpgbar + + A compiled regex can be passed as `pat` + + >>> import re + >>> s.str.split(re.compile(r"\.jpg"), expand=True) + 0 1 + 0 foojpgbar + + When ``regex=False``, `pat` is interpreted as the string itself + + >>> s.str.split(r"\.jpg", regex=False, expand=True) + 0 + 0 foojpgbar.jpg + """, + } + ) + @forbid_nonstring_types(["bytes"]) + def split( + self, + pat: str | re.Pattern | None = None, + *, + n=-1, + expand: bool = False, + regex: bool | None = None, + ): + if regex is False and is_re(pat): + raise ValueError( + "Cannot use a compiled regex as replacement pattern with regex=False" + ) + if is_re(pat): + regex = True + result = self._data.array._str_split(pat, n, expand, regex) + if self._data.dtype == "category": + dtype = self._data.dtype.categories.dtype + else: + dtype = object if self._data.dtype == object else None + return self._wrap_result( + result, expand=expand, returns_string=expand, dtype=dtype + ) + + @Appender( + _shared_docs["str_split"] + % { + "side": "end", + "pat_regex": "", + "pat_description": "String to split on", + "regex_argument": "", + "raises_split": "", + "regex_pat_note": "", + "method": "rsplit", + "regex_examples": "", + } + ) + @forbid_nonstring_types(["bytes"]) + def rsplit(self, pat=None, *, n=-1, expand: bool = False): + result = self._data.array._str_rsplit(pat, n=n) + dtype = object if self._data.dtype == object else None + return self._wrap_result( + result, expand=expand, returns_string=expand, dtype=dtype + ) + + _shared_docs[ + "str_partition" + ] = """ + Split the string at the %(side)s occurrence of `sep`. + + This method splits the string at the %(side)s occurrence of `sep`, + and returns 3 elements containing the part before the separator, + the separator itself, and the part after the separator. + If the separator is not found, return %(return)s. + + Parameters + ---------- + sep : str, default whitespace + String to split on. + expand : bool, default True + If True, return DataFrame/MultiIndex expanding dimensionality. + If False, return Series/Index. + + Returns + ------- + DataFrame/MultiIndex or Series/Index of objects + + See Also + -------- + %(also)s + Series.str.split : Split strings around given separators. + str.partition : Standard library version. + + Examples + -------- + + >>> s = pd.Series(['Linda van der Berg', 'George Pitt-Rivers']) + >>> s + 0 Linda van der Berg + 1 George Pitt-Rivers + dtype: object + + >>> s.str.partition() + 0 1 2 + 0 Linda van der Berg + 1 George Pitt-Rivers + + To partition by the last space instead of the first one: + + >>> s.str.rpartition() + 0 1 2 + 0 Linda van der Berg + 1 George Pitt-Rivers + + To partition by something different than a space: + + >>> s.str.partition('-') + 0 1 2 + 0 Linda van der Berg + 1 George Pitt - Rivers + + To return a Series containing tuples instead of a DataFrame: + + >>> s.str.partition('-', expand=False) + 0 (Linda van der Berg, , ) + 1 (George Pitt, -, Rivers) + dtype: object + + Also available on indices: + + >>> idx = pd.Index(['X 123', 'Y 999']) + >>> idx + Index(['X 123', 'Y 999'], dtype='object') + + Which will create a MultiIndex: + + >>> idx.str.partition() + MultiIndex([('X', ' ', '123'), + ('Y', ' ', '999')], + ) + + Or an index with tuples with ``expand=False``: + + >>> idx.str.partition(expand=False) + Index([('X', ' ', '123'), ('Y', ' ', '999')], dtype='object') + """ + + @Appender( + _shared_docs["str_partition"] + % { + "side": "first", + "return": "3 elements containing the string itself, followed by two " + "empty strings", + "also": "rpartition : Split the string at the last occurrence of `sep`.", + } + ) + @forbid_nonstring_types(["bytes"]) + def partition(self, sep: str = " ", expand: bool = True): + result = self._data.array._str_partition(sep, expand) + if self._data.dtype == "category": + dtype = self._data.dtype.categories.dtype + else: + dtype = object if self._data.dtype == object else None + return self._wrap_result( + result, expand=expand, returns_string=expand, dtype=dtype + ) + + @Appender( + _shared_docs["str_partition"] + % { + "side": "last", + "return": "3 elements containing two empty strings, followed by the " + "string itself", + "also": "partition : Split the string at the first occurrence of `sep`.", + } + ) + @forbid_nonstring_types(["bytes"]) + def rpartition(self, sep: str = " ", expand: bool = True): + result = self._data.array._str_rpartition(sep, expand) + if self._data.dtype == "category": + dtype = self._data.dtype.categories.dtype + else: + dtype = object if self._data.dtype == object else None + return self._wrap_result( + result, expand=expand, returns_string=expand, dtype=dtype + ) + + def get(self, i): + """ + Extract element from each component at specified position or with specified key. + + Extract element from lists, tuples, dict, or strings in each element in the + Series/Index. + + Parameters + ---------- + i : int or hashable dict label + Position or key of element to extract. + + Returns + ------- + Series or Index + + Examples + -------- + >>> s = pd.Series(["String", + ... (1, 2, 3), + ... ["a", "b", "c"], + ... 123, + ... -456, + ... {1: "Hello", "2": "World"}]) + >>> s + 0 String + 1 (1, 2, 3) + 2 [a, b, c] + 3 123 + 4 -456 + 5 {1: 'Hello', '2': 'World'} + dtype: object + + >>> s.str.get(1) + 0 t + 1 2 + 2 b + 3 NaN + 4 NaN + 5 Hello + dtype: object + + >>> s.str.get(-1) + 0 g + 1 3 + 2 c + 3 NaN + 4 NaN + 5 None + dtype: object + + Return element with given key + + >>> s = pd.Series([{"name": "Hello", "value": "World"}, + ... {"name": "Goodbye", "value": "Planet"}]) + >>> s.str.get('name') + 0 Hello + 1 Goodbye + dtype: object + """ + result = self._data.array._str_get(i) + return self._wrap_result(result) + + @forbid_nonstring_types(["bytes"]) + def join(self, sep: str): + """ + Join lists contained as elements in the Series/Index with passed delimiter. + + If the elements of a Series are lists themselves, join the content of these + lists using the delimiter passed to the function. + This function is an equivalent to :meth:`str.join`. + + Parameters + ---------- + sep : str + Delimiter to use between list entries. + + Returns + ------- + Series/Index: object + The list entries concatenated by intervening occurrences of the + delimiter. + + Raises + ------ + AttributeError + If the supplied Series contains neither strings nor lists. + + See Also + -------- + str.join : Standard library version of this method. + Series.str.split : Split strings around given separator/delimiter. + + Notes + ----- + If any of the list items is not a string object, the result of the join + will be `NaN`. + + Examples + -------- + Example with a list that contains non-string elements. + + >>> s = pd.Series([['lion', 'elephant', 'zebra'], + ... [1.1, 2.2, 3.3], + ... ['cat', np.nan, 'dog'], + ... ['cow', 4.5, 'goat'], + ... ['duck', ['swan', 'fish'], 'guppy']]) + >>> s + 0 [lion, elephant, zebra] + 1 [1.1, 2.2, 3.3] + 2 [cat, nan, dog] + 3 [cow, 4.5, goat] + 4 [duck, [swan, fish], guppy] + dtype: object + + Join all lists using a '-'. The lists containing object(s) of types other + than str will produce a NaN. + + >>> s.str.join('-') + 0 lion-elephant-zebra + 1 NaN + 2 NaN + 3 NaN + 4 NaN + dtype: object + """ + result = self._data.array._str_join(sep) + return self._wrap_result(result) + + @forbid_nonstring_types(["bytes"]) + def contains( + self, pat, case: bool = True, flags: int = 0, na=None, regex: bool = True + ): + r""" + Test if pattern or regex is contained within a string of a Series or Index. + + Return boolean Series or Index based on whether a given pattern or regex is + contained within a string of a Series or Index. + + Parameters + ---------- + pat : str + Character sequence or regular expression. + case : bool, default True + If True, case sensitive. + flags : int, default 0 (no flags) + Flags to pass through to the re module, e.g. re.IGNORECASE. + na : scalar, optional + Fill value for missing values. The default depends on dtype of the + array. For object-dtype, ``numpy.nan`` is used. For ``StringDtype``, + ``pandas.NA`` is used. + regex : bool, default True + If True, assumes the pat is a regular expression. + + If False, treats the pat as a literal string. + + Returns + ------- + Series or Index of boolean values + A Series or Index of boolean values indicating whether the + given pattern is contained within the string of each element + of the Series or Index. + + See Also + -------- + match : Analogous, but stricter, relying on re.match instead of re.search. + Series.str.startswith : Test if the start of each string element matches a + pattern. + Series.str.endswith : Same as startswith, but tests the end of string. + + Examples + -------- + Returning a Series of booleans using only a literal pattern. + + >>> s1 = pd.Series(['Mouse', 'dog', 'house and parrot', '23', np.nan]) + >>> s1.str.contains('og', regex=False) + 0 False + 1 True + 2 False + 3 False + 4 NaN + dtype: object + + Returning an Index of booleans using only a literal pattern. + + >>> ind = pd.Index(['Mouse', 'dog', 'house and parrot', '23.0', np.nan]) + >>> ind.str.contains('23', regex=False) + Index([False, False, False, True, nan], dtype='object') + + Specifying case sensitivity using `case`. + + >>> s1.str.contains('oG', case=True, regex=True) + 0 False + 1 False + 2 False + 3 False + 4 NaN + dtype: object + + Specifying `na` to be `False` instead of `NaN` replaces NaN values + with `False`. If Series or Index does not contain NaN values + the resultant dtype will be `bool`, otherwise, an `object` dtype. + + >>> s1.str.contains('og', na=False, regex=True) + 0 False + 1 True + 2 False + 3 False + 4 False + dtype: bool + + Returning 'house' or 'dog' when either expression occurs in a string. + + >>> s1.str.contains('house|dog', regex=True) + 0 False + 1 True + 2 True + 3 False + 4 NaN + dtype: object + + Ignoring case sensitivity using `flags` with regex. + + >>> import re + >>> s1.str.contains('PARROT', flags=re.IGNORECASE, regex=True) + 0 False + 1 False + 2 True + 3 False + 4 NaN + dtype: object + + Returning any digit using regular expression. + + >>> s1.str.contains('\\d', regex=True) + 0 False + 1 False + 2 False + 3 True + 4 NaN + dtype: object + + Ensure `pat` is a not a literal pattern when `regex` is set to True. + Note in the following example one might expect only `s2[1]` and `s2[3]` to + return `True`. However, '.0' as a regex matches any character + followed by a 0. + + >>> s2 = pd.Series(['40', '40.0', '41', '41.0', '35']) + >>> s2.str.contains('.0', regex=True) + 0 True + 1 True + 2 False + 3 True + 4 False + dtype: bool + """ + if regex and re.compile(pat).groups: + warnings.warn( + "This pattern is interpreted as a regular expression, and has " + "match groups. To actually get the groups, use str.extract.", + UserWarning, + stacklevel=find_stack_level(), + ) + + result = self._data.array._str_contains(pat, case, flags, na, regex) + return self._wrap_result(result, fill_value=na, returns_string=False) + + @forbid_nonstring_types(["bytes"]) + def match(self, pat: str, case: bool = True, flags: int = 0, na=None): + """ + Determine if each string starts with a match of a regular expression. + + Parameters + ---------- + pat : str + Character sequence. + case : bool, default True + If True, case sensitive. + flags : int, default 0 (no flags) + Regex module flags, e.g. re.IGNORECASE. + na : scalar, optional + Fill value for missing values. The default depends on dtype of the + array. For object-dtype, ``numpy.nan`` is used. For ``StringDtype``, + ``pandas.NA`` is used. + + Returns + ------- + Series/Index/array of boolean values + + See Also + -------- + fullmatch : Stricter matching that requires the entire string to match. + contains : Analogous, but less strict, relying on re.search instead of + re.match. + extract : Extract matched groups. + + Examples + -------- + >>> ser = pd.Series(["horse", "eagle", "donkey"]) + >>> ser.str.match("e") + 0 False + 1 True + 2 False + dtype: bool + """ + result = self._data.array._str_match(pat, case=case, flags=flags, na=na) + return self._wrap_result(result, fill_value=na, returns_string=False) + + @forbid_nonstring_types(["bytes"]) + def fullmatch(self, pat, case: bool = True, flags: int = 0, na=None): + """ + Determine if each string entirely matches a regular expression. + + Parameters + ---------- + pat : str + Character sequence or regular expression. + case : bool, default True + If True, case sensitive. + flags : int, default 0 (no flags) + Regex module flags, e.g. re.IGNORECASE. + na : scalar, optional + Fill value for missing values. The default depends on dtype of the + array. For object-dtype, ``numpy.nan`` is used. For ``StringDtype``, + ``pandas.NA`` is used. + + Returns + ------- + Series/Index/array of boolean values + + See Also + -------- + match : Similar, but also returns `True` when only a *prefix* of the string + matches the regular expression. + extract : Extract matched groups. + + Examples + -------- + >>> ser = pd.Series(["cat", "duck", "dove"]) + >>> ser.str.fullmatch(r'd.+') + 0 False + 1 True + 2 True + dtype: bool + """ + result = self._data.array._str_fullmatch(pat, case=case, flags=flags, na=na) + return self._wrap_result(result, fill_value=na, returns_string=False) + + @forbid_nonstring_types(["bytes"]) + def replace( + self, + pat: str | re.Pattern, + repl: str | Callable, + n: int = -1, + case: bool | None = None, + flags: int = 0, + regex: bool = False, + ): + r""" + Replace each occurrence of pattern/regex in the Series/Index. + + Equivalent to :meth:`str.replace` or :func:`re.sub`, depending on + the regex value. + + Parameters + ---------- + pat : str or compiled regex + String can be a character sequence or regular expression. + repl : str or callable + Replacement string or a callable. The callable is passed the regex + match object and must return a replacement string to be used. + See :func:`re.sub`. + n : int, default -1 (all) + Number of replacements to make from start. + case : bool, default None + Determines if replace is case sensitive: + + - If True, case sensitive (the default if `pat` is a string) + - Set to False for case insensitive + - Cannot be set if `pat` is a compiled regex. + + flags : int, default 0 (no flags) + Regex module flags, e.g. re.IGNORECASE. Cannot be set if `pat` is a compiled + regex. + regex : bool, default False + Determines if the passed-in pattern is a regular expression: + + - If True, assumes the passed-in pattern is a regular expression. + - If False, treats the pattern as a literal string + - Cannot be set to False if `pat` is a compiled regex or `repl` is + a callable. + + Returns + ------- + Series or Index of object + A copy of the object with all matching occurrences of `pat` replaced by + `repl`. + + Raises + ------ + ValueError + * if `regex` is False and `repl` is a callable or `pat` is a compiled + regex + * if `pat` is a compiled regex and `case` or `flags` is set + + Notes + ----- + When `pat` is a compiled regex, all flags should be included in the + compiled regex. Use of `case`, `flags`, or `regex=False` with a compiled + regex will raise an error. + + Examples + -------- + When `pat` is a string and `regex` is True, the given `pat` + is compiled as a regex. When `repl` is a string, it replaces matching + regex patterns as with :meth:`re.sub`. NaN value(s) in the Series are + left as is: + + >>> pd.Series(['foo', 'fuz', np.nan]).str.replace('f.', 'ba', regex=True) + 0 bao + 1 baz + 2 NaN + dtype: object + + When `pat` is a string and `regex` is False, every `pat` is replaced with + `repl` as with :meth:`str.replace`: + + >>> pd.Series(['f.o', 'fuz', np.nan]).str.replace('f.', 'ba', regex=False) + 0 bao + 1 fuz + 2 NaN + dtype: object + + When `repl` is a callable, it is called on every `pat` using + :func:`re.sub`. The callable should expect one positional argument + (a regex object) and return a string. + + To get the idea: + + >>> pd.Series(['foo', 'fuz', np.nan]).str.replace('f', repr, regex=True) + 0 oo + 1 uz + 2 NaN + dtype: object + + Reverse every lowercase alphabetic word: + + >>> repl = lambda m: m.group(0)[::-1] + >>> ser = pd.Series(['foo 123', 'bar baz', np.nan]) + >>> ser.str.replace(r'[a-z]+', repl, regex=True) + 0 oof 123 + 1 rab zab + 2 NaN + dtype: object + + Using regex groups (extract second group and swap case): + + >>> pat = r"(?P\w+) (?P\w+) (?P\w+)" + >>> repl = lambda m: m.group('two').swapcase() + >>> ser = pd.Series(['One Two Three', 'Foo Bar Baz']) + >>> ser.str.replace(pat, repl, regex=True) + 0 tWO + 1 bAR + dtype: object + + Using a compiled regex with flags + + >>> import re + >>> regex_pat = re.compile(r'FUZ', flags=re.IGNORECASE) + >>> pd.Series(['foo', 'fuz', np.nan]).str.replace(regex_pat, 'bar', regex=True) + 0 foo + 1 bar + 2 NaN + dtype: object + """ + # Check whether repl is valid (GH 13438, GH 15055) + if not (isinstance(repl, str) or callable(repl)): + raise TypeError("repl must be a string or callable") + + is_compiled_re = is_re(pat) + if regex or regex is None: + if is_compiled_re and (case is not None or flags != 0): + raise ValueError( + "case and flags cannot be set when pat is a compiled regex" + ) + + elif is_compiled_re: + raise ValueError( + "Cannot use a compiled regex as replacement pattern with regex=False" + ) + elif callable(repl): + raise ValueError("Cannot use a callable replacement when regex=False") + + if case is None: + case = True + + result = self._data.array._str_replace( + pat, repl, n=n, case=case, flags=flags, regex=regex + ) + return self._wrap_result(result) + + @forbid_nonstring_types(["bytes"]) + def repeat(self, repeats): + """ + Duplicate each string in the Series or Index. + + Parameters + ---------- + repeats : int or sequence of int + Same value for all (int) or different value per (sequence). + + Returns + ------- + Series or pandas.Index + Series or Index of repeated string objects specified by + input parameter repeats. + + Examples + -------- + >>> s = pd.Series(['a', 'b', 'c']) + >>> s + 0 a + 1 b + 2 c + dtype: object + + Single int repeats string in Series + + >>> s.str.repeat(repeats=2) + 0 aa + 1 bb + 2 cc + dtype: object + + Sequence of int repeats corresponding string in Series + + >>> s.str.repeat(repeats=[1, 2, 3]) + 0 a + 1 bb + 2 ccc + dtype: object + """ + result = self._data.array._str_repeat(repeats) + return self._wrap_result(result) + + @forbid_nonstring_types(["bytes"]) + def pad( + self, + width: int, + side: Literal["left", "right", "both"] = "left", + fillchar: str = " ", + ): + """ + Pad strings in the Series/Index up to width. + + Parameters + ---------- + width : int + Minimum width of resulting string; additional characters will be filled + with character defined in `fillchar`. + side : {'left', 'right', 'both'}, default 'left' + Side from which to fill resulting string. + fillchar : str, default ' ' + Additional character for filling, default is whitespace. + + Returns + ------- + Series or Index of object + Returns Series or Index with minimum number of char in object. + + See Also + -------- + Series.str.rjust : Fills the left side of strings with an arbitrary + character. Equivalent to ``Series.str.pad(side='left')``. + Series.str.ljust : Fills the right side of strings with an arbitrary + character. Equivalent to ``Series.str.pad(side='right')``. + Series.str.center : Fills both sides of strings with an arbitrary + character. Equivalent to ``Series.str.pad(side='both')``. + Series.str.zfill : Pad strings in the Series/Index by prepending '0' + character. Equivalent to ``Series.str.pad(side='left', fillchar='0')``. + + Examples + -------- + >>> s = pd.Series(["caribou", "tiger"]) + >>> s + 0 caribou + 1 tiger + dtype: object + + >>> s.str.pad(width=10) + 0 caribou + 1 tiger + dtype: object + + >>> s.str.pad(width=10, side='right', fillchar='-') + 0 caribou--- + 1 tiger----- + dtype: object + + >>> s.str.pad(width=10, side='both', fillchar='-') + 0 -caribou-- + 1 --tiger--- + dtype: object + """ + if not isinstance(fillchar, str): + msg = f"fillchar must be a character, not {type(fillchar).__name__}" + raise TypeError(msg) + + if len(fillchar) != 1: + raise TypeError("fillchar must be a character, not str") + + if not is_integer(width): + msg = f"width must be of integer type, not {type(width).__name__}" + raise TypeError(msg) + + result = self._data.array._str_pad(width, side=side, fillchar=fillchar) + return self._wrap_result(result) + + _shared_docs[ + "str_pad" + ] = """ + Pad %(side)s side of strings in the Series/Index. + + Equivalent to :meth:`str.%(method)s`. + + Parameters + ---------- + width : int + Minimum width of resulting string; additional characters will be filled + with ``fillchar``. + fillchar : str + Additional character for filling, default is whitespace. + + Returns + ------- + Series/Index of objects. + + Examples + -------- + For Series.str.center: + + >>> ser = pd.Series(['dog', 'bird', 'mouse']) + >>> ser.str.center(8, fillchar='.') + 0 ..dog... + 1 ..bird.. + 2 .mouse.. + dtype: object + + For Series.str.ljust: + + >>> ser = pd.Series(['dog', 'bird', 'mouse']) + >>> ser.str.ljust(8, fillchar='.') + 0 dog..... + 1 bird.... + 2 mouse... + dtype: object + + For Series.str.rjust: + + >>> ser = pd.Series(['dog', 'bird', 'mouse']) + >>> ser.str.rjust(8, fillchar='.') + 0 .....dog + 1 ....bird + 2 ...mouse + dtype: object + """ + + @Appender(_shared_docs["str_pad"] % {"side": "left and right", "method": "center"}) + @forbid_nonstring_types(["bytes"]) + def center(self, width: int, fillchar: str = " "): + return self.pad(width, side="both", fillchar=fillchar) + + @Appender(_shared_docs["str_pad"] % {"side": "right", "method": "ljust"}) + @forbid_nonstring_types(["bytes"]) + def ljust(self, width: int, fillchar: str = " "): + return self.pad(width, side="right", fillchar=fillchar) + + @Appender(_shared_docs["str_pad"] % {"side": "left", "method": "rjust"}) + @forbid_nonstring_types(["bytes"]) + def rjust(self, width: int, fillchar: str = " "): + return self.pad(width, side="left", fillchar=fillchar) + + @forbid_nonstring_types(["bytes"]) + def zfill(self, width: int): + """ + Pad strings in the Series/Index by prepending '0' characters. + + Strings in the Series/Index are padded with '0' characters on the + left of the string to reach a total string length `width`. Strings + in the Series/Index with length greater or equal to `width` are + unchanged. + + Parameters + ---------- + width : int + Minimum length of resulting string; strings with length less + than `width` be prepended with '0' characters. + + Returns + ------- + Series/Index of objects. + + See Also + -------- + Series.str.rjust : Fills the left side of strings with an arbitrary + character. + Series.str.ljust : Fills the right side of strings with an arbitrary + character. + Series.str.pad : Fills the specified sides of strings with an arbitrary + character. + Series.str.center : Fills both sides of strings with an arbitrary + character. + + Notes + ----- + Differs from :meth:`str.zfill` which has special handling + for '+'/'-' in the string. + + Examples + -------- + >>> s = pd.Series(['-1', '1', '1000', 10, np.nan]) + >>> s + 0 -1 + 1 1 + 2 1000 + 3 10 + 4 NaN + dtype: object + + Note that ``10`` and ``NaN`` are not strings, therefore they are + converted to ``NaN``. The minus sign in ``'-1'`` is treated as a + special character and the zero is added to the right of it + (:meth:`str.zfill` would have moved it to the left). ``1000`` + remains unchanged as it is longer than `width`. + + >>> s.str.zfill(3) + 0 -01 + 1 001 + 2 1000 + 3 NaN + 4 NaN + dtype: object + """ + if not is_integer(width): + msg = f"width must be of integer type, not {type(width).__name__}" + raise TypeError(msg) + f = lambda x: x.zfill(width) + result = self._data.array._str_map(f) + return self._wrap_result(result) + + def slice(self, start=None, stop=None, step=None): + """ + Slice substrings from each element in the Series or Index. + + Parameters + ---------- + start : int, optional + Start position for slice operation. + stop : int, optional + Stop position for slice operation. + step : int, optional + Step size for slice operation. + + Returns + ------- + Series or Index of object + Series or Index from sliced substring from original string object. + + See Also + -------- + Series.str.slice_replace : Replace a slice with a string. + Series.str.get : Return element at position. + Equivalent to `Series.str.slice(start=i, stop=i+1)` with `i` + being the position. + + Examples + -------- + >>> s = pd.Series(["koala", "dog", "chameleon"]) + >>> s + 0 koala + 1 dog + 2 chameleon + dtype: object + + >>> s.str.slice(start=1) + 0 oala + 1 og + 2 hameleon + dtype: object + + >>> s.str.slice(start=-1) + 0 a + 1 g + 2 n + dtype: object + + >>> s.str.slice(stop=2) + 0 ko + 1 do + 2 ch + dtype: object + + >>> s.str.slice(step=2) + 0 kaa + 1 dg + 2 caeen + dtype: object + + >>> s.str.slice(start=0, stop=5, step=3) + 0 kl + 1 d + 2 cm + dtype: object + + Equivalent behaviour to: + + >>> s.str[0:5:3] + 0 kl + 1 d + 2 cm + dtype: object + """ + result = self._data.array._str_slice(start, stop, step) + return self._wrap_result(result) + + @forbid_nonstring_types(["bytes"]) + def slice_replace(self, start=None, stop=None, repl=None): + """ + Replace a positional slice of a string with another value. + + Parameters + ---------- + start : int, optional + Left index position to use for the slice. If not specified (None), + the slice is unbounded on the left, i.e. slice from the start + of the string. + stop : int, optional + Right index position to use for the slice. If not specified (None), + the slice is unbounded on the right, i.e. slice until the + end of the string. + repl : str, optional + String for replacement. If not specified (None), the sliced region + is replaced with an empty string. + + Returns + ------- + Series or Index + Same type as the original object. + + See Also + -------- + Series.str.slice : Just slicing without replacement. + + Examples + -------- + >>> s = pd.Series(['a', 'ab', 'abc', 'abdc', 'abcde']) + >>> s + 0 a + 1 ab + 2 abc + 3 abdc + 4 abcde + dtype: object + + Specify just `start`, meaning replace `start` until the end of the + string with `repl`. + + >>> s.str.slice_replace(1, repl='X') + 0 aX + 1 aX + 2 aX + 3 aX + 4 aX + dtype: object + + Specify just `stop`, meaning the start of the string to `stop` is replaced + with `repl`, and the rest of the string is included. + + >>> s.str.slice_replace(stop=2, repl='X') + 0 X + 1 X + 2 Xc + 3 Xdc + 4 Xcde + dtype: object + + Specify `start` and `stop`, meaning the slice from `start` to `stop` is + replaced with `repl`. Everything before or after `start` and `stop` is + included as is. + + >>> s.str.slice_replace(start=1, stop=3, repl='X') + 0 aX + 1 aX + 2 aX + 3 aXc + 4 aXde + dtype: object + """ + result = self._data.array._str_slice_replace(start, stop, repl) + return self._wrap_result(result) + + def decode(self, encoding, errors: str = "strict"): + """ + Decode character string in the Series/Index using indicated encoding. + + Equivalent to :meth:`str.decode` in python2 and :meth:`bytes.decode` in + python3. + + Parameters + ---------- + encoding : str + errors : str, optional + + Returns + ------- + Series or Index + + Examples + -------- + For Series: + + >>> ser = pd.Series([b'cow', b'123', b'()']) + >>> ser.str.decode('ascii') + 0 cow + 1 123 + 2 () + dtype: object + """ + # TODO: Add a similar _bytes interface. + if encoding in _cpython_optimized_decoders: + # CPython optimized implementation + f = lambda x: x.decode(encoding, errors) + else: + decoder = codecs.getdecoder(encoding) + f = lambda x: decoder(x, errors)[0] + arr = self._data.array + # assert isinstance(arr, (StringArray,)) + result = arr._str_map(f) + return self._wrap_result(result) + + @forbid_nonstring_types(["bytes"]) + def encode(self, encoding, errors: str = "strict"): + """ + Encode character string in the Series/Index using indicated encoding. + + Equivalent to :meth:`str.encode`. + + Parameters + ---------- + encoding : str + errors : str, optional + + Returns + ------- + Series/Index of objects + + Examples + -------- + >>> ser = pd.Series(['cow', '123', '()']) + >>> ser.str.encode(encoding='ascii') + 0 b'cow' + 1 b'123' + 2 b'()' + dtype: object + """ + result = self._data.array._str_encode(encoding, errors) + return self._wrap_result(result, returns_string=False) + + _shared_docs[ + "str_strip" + ] = r""" + Remove %(position)s characters. + + Strip whitespaces (including newlines) or a set of specified characters + from each string in the Series/Index from %(side)s. + Replaces any non-strings in Series with NaNs. + Equivalent to :meth:`str.%(method)s`. + + Parameters + ---------- + to_strip : str or None, default None + Specifying the set of characters to be removed. + All combinations of this set of characters will be stripped. + If None then whitespaces are removed. + + Returns + ------- + Series or Index of object + + See Also + -------- + Series.str.strip : Remove leading and trailing characters in Series/Index. + Series.str.lstrip : Remove leading characters in Series/Index. + Series.str.rstrip : Remove trailing characters in Series/Index. + + Examples + -------- + >>> s = pd.Series(['1. Ant. ', '2. Bee!\n', '3. Cat?\t', np.nan, 10, True]) + >>> s + 0 1. Ant. + 1 2. Bee!\n + 2 3. Cat?\t + 3 NaN + 4 10 + 5 True + dtype: object + + >>> s.str.strip() + 0 1. Ant. + 1 2. Bee! + 2 3. Cat? + 3 NaN + 4 NaN + 5 NaN + dtype: object + + >>> s.str.lstrip('123.') + 0 Ant. + 1 Bee!\n + 2 Cat?\t + 3 NaN + 4 NaN + 5 NaN + dtype: object + + >>> s.str.rstrip('.!? \n\t') + 0 1. Ant + 1 2. Bee + 2 3. Cat + 3 NaN + 4 NaN + 5 NaN + dtype: object + + >>> s.str.strip('123.!? \n\t') + 0 Ant + 1 Bee + 2 Cat + 3 NaN + 4 NaN + 5 NaN + dtype: object + """ + + @Appender( + _shared_docs["str_strip"] + % { + "side": "left and right sides", + "method": "strip", + "position": "leading and trailing", + } + ) + @forbid_nonstring_types(["bytes"]) + def strip(self, to_strip=None): + result = self._data.array._str_strip(to_strip) + return self._wrap_result(result) + + @Appender( + _shared_docs["str_strip"] + % {"side": "left side", "method": "lstrip", "position": "leading"} + ) + @forbid_nonstring_types(["bytes"]) + def lstrip(self, to_strip=None): + result = self._data.array._str_lstrip(to_strip) + return self._wrap_result(result) + + @Appender( + _shared_docs["str_strip"] + % {"side": "right side", "method": "rstrip", "position": "trailing"} + ) + @forbid_nonstring_types(["bytes"]) + def rstrip(self, to_strip=None): + result = self._data.array._str_rstrip(to_strip) + return self._wrap_result(result) + + _shared_docs[ + "str_removefix" + ] = r""" + Remove a %(side)s from an object series. + + If the %(side)s is not present, the original string will be returned. + + Parameters + ---------- + %(side)s : str + Remove the %(side)s of the string. + + Returns + ------- + Series/Index: object + The Series or Index with given %(side)s removed. + + See Also + -------- + Series.str.remove%(other_side)s : Remove a %(other_side)s from an object series. + + Examples + -------- + >>> s = pd.Series(["str_foo", "str_bar", "no_prefix"]) + >>> s + 0 str_foo + 1 str_bar + 2 no_prefix + dtype: object + >>> s.str.removeprefix("str_") + 0 foo + 1 bar + 2 no_prefix + dtype: object + + >>> s = pd.Series(["foo_str", "bar_str", "no_suffix"]) + >>> s + 0 foo_str + 1 bar_str + 2 no_suffix + dtype: object + >>> s.str.removesuffix("_str") + 0 foo + 1 bar + 2 no_suffix + dtype: object + """ + + @Appender( + _shared_docs["str_removefix"] % {"side": "prefix", "other_side": "suffix"} + ) + @forbid_nonstring_types(["bytes"]) + def removeprefix(self, prefix: str): + result = self._data.array._str_removeprefix(prefix) + return self._wrap_result(result) + + @Appender( + _shared_docs["str_removefix"] % {"side": "suffix", "other_side": "prefix"} + ) + @forbid_nonstring_types(["bytes"]) + def removesuffix(self, suffix: str): + result = self._data.array._str_removesuffix(suffix) + return self._wrap_result(result) + + @forbid_nonstring_types(["bytes"]) + def wrap(self, width: int, **kwargs): + r""" + Wrap strings in Series/Index at specified line width. + + This method has the same keyword parameters and defaults as + :class:`textwrap.TextWrapper`. + + Parameters + ---------- + width : int + Maximum line width. + expand_tabs : bool, optional + If True, tab characters will be expanded to spaces (default: True). + replace_whitespace : bool, optional + If True, each whitespace character (as defined by string.whitespace) + remaining after tab expansion will be replaced by a single space + (default: True). + drop_whitespace : bool, optional + If True, whitespace that, after wrapping, happens to end up at the + beginning or end of a line is dropped (default: True). + break_long_words : bool, optional + If True, then words longer than width will be broken in order to ensure + that no lines are longer than width. If it is false, long words will + not be broken, and some lines may be longer than width (default: True). + break_on_hyphens : bool, optional + If True, wrapping will occur preferably on whitespace and right after + hyphens in compound words, as it is customary in English. If false, + only whitespaces will be considered as potentially good places for line + breaks, but you need to set break_long_words to false if you want truly + insecable words (default: True). + + Returns + ------- + Series or Index + + Notes + ----- + Internally, this method uses a :class:`textwrap.TextWrapper` instance with + default settings. To achieve behavior matching R's stringr library str_wrap + function, use the arguments: + + - expand_tabs = False + - replace_whitespace = True + - drop_whitespace = True + - break_long_words = False + - break_on_hyphens = False + + Examples + -------- + >>> s = pd.Series(['line to be wrapped', 'another line to be wrapped']) + >>> s.str.wrap(12) + 0 line to be\nwrapped + 1 another line\nto be\nwrapped + dtype: object + """ + result = self._data.array._str_wrap(width, **kwargs) + return self._wrap_result(result) + + @forbid_nonstring_types(["bytes"]) + def get_dummies(self, sep: str = "|"): + """ + Return DataFrame of dummy/indicator variables for Series. + + Each string in Series is split by sep and returned as a DataFrame + of dummy/indicator variables. + + Parameters + ---------- + sep : str, default "|" + String to split on. + + Returns + ------- + DataFrame + Dummy variables corresponding to values of the Series. + + See Also + -------- + get_dummies : Convert categorical variable into dummy/indicator + variables. + + Examples + -------- + >>> pd.Series(['a|b', 'a', 'a|c']).str.get_dummies() + a b c + 0 1 1 0 + 1 1 0 0 + 2 1 0 1 + + >>> pd.Series(['a|b', np.nan, 'a|c']).str.get_dummies() + a b c + 0 1 1 0 + 1 0 0 0 + 2 1 0 1 + """ + # we need to cast to Series of strings as only that has all + # methods available for making the dummies... + result, name = self._data.array._str_get_dummies(sep) + return self._wrap_result( + result, + name=name, + expand=True, + returns_string=False, + ) + + @forbid_nonstring_types(["bytes"]) + def translate(self, table): + """ + Map all characters in the string through the given mapping table. + + Equivalent to standard :meth:`str.translate`. + + Parameters + ---------- + table : dict + Table is a mapping of Unicode ordinals to Unicode ordinals, strings, or + None. Unmapped characters are left untouched. + Characters mapped to None are deleted. :meth:`str.maketrans` is a + helper function for making translation tables. + + Returns + ------- + Series or Index + + Examples + -------- + >>> ser = pd.Series(["El niño", "Françoise"]) + >>> mytable = str.maketrans({'ñ': 'n', 'ç': 'c'}) + >>> ser.str.translate(mytable) + 0 El nino + 1 Francoise + dtype: object + """ + result = self._data.array._str_translate(table) + dtype = object if self._data.dtype == "object" else None + return self._wrap_result(result, dtype=dtype) + + @forbid_nonstring_types(["bytes"]) + def count(self, pat, flags: int = 0): + r""" + Count occurrences of pattern in each string of the Series/Index. + + This function is used to count the number of times a particular regex + pattern is repeated in each of the string elements of the + :class:`~pandas.Series`. + + Parameters + ---------- + pat : str + Valid regular expression. + flags : int, default 0, meaning no flags + Flags for the `re` module. For a complete list, `see here + `_. + **kwargs + For compatibility with other string methods. Not used. + + Returns + ------- + Series or Index + Same type as the calling object containing the integer counts. + + See Also + -------- + re : Standard library module for regular expressions. + str.count : Standard library version, without regular expression support. + + Notes + ----- + Some characters need to be escaped when passing in `pat`. + eg. ``'$'`` has a special meaning in regex and must be escaped when + finding this literal character. + + Examples + -------- + >>> s = pd.Series(['A', 'B', 'Aaba', 'Baca', np.nan, 'CABA', 'cat']) + >>> s.str.count('a') + 0 0.0 + 1 0.0 + 2 2.0 + 3 2.0 + 4 NaN + 5 0.0 + 6 1.0 + dtype: float64 + + Escape ``'$'`` to find the literal dollar sign. + + >>> s = pd.Series(['$', 'B', 'Aab$', '$$ca', 'C$B$', 'cat']) + >>> s.str.count('\\$') + 0 1 + 1 0 + 2 1 + 3 2 + 4 2 + 5 0 + dtype: int64 + + This is also available on Index + + >>> pd.Index(['A', 'A', 'Aaba', 'cat']).str.count('a') + Index([0, 0, 2, 1], dtype='int64') + """ + result = self._data.array._str_count(pat, flags) + return self._wrap_result(result, returns_string=False) + + @forbid_nonstring_types(["bytes"]) + def startswith( + self, pat: str | tuple[str, ...], na: Scalar | None = None + ) -> Series | Index: + """ + Test if the start of each string element matches a pattern. + + Equivalent to :meth:`str.startswith`. + + Parameters + ---------- + pat : str or tuple[str, ...] + Character sequence or tuple of strings. Regular expressions are not + accepted. + na : object, default NaN + Object shown if element tested is not a string. The default depends + on dtype of the array. For object-dtype, ``numpy.nan`` is used. + For ``StringDtype``, ``pandas.NA`` is used. + + Returns + ------- + Series or Index of bool + A Series of booleans indicating whether the given pattern matches + the start of each string element. + + See Also + -------- + str.startswith : Python standard library string method. + Series.str.endswith : Same as startswith, but tests the end of string. + Series.str.contains : Tests if string element contains a pattern. + + Examples + -------- + >>> s = pd.Series(['bat', 'Bear', 'cat', np.nan]) + >>> s + 0 bat + 1 Bear + 2 cat + 3 NaN + dtype: object + + >>> s.str.startswith('b') + 0 True + 1 False + 2 False + 3 NaN + dtype: object + + >>> s.str.startswith(('b', 'B')) + 0 True + 1 True + 2 False + 3 NaN + dtype: object + + Specifying `na` to be `False` instead of `NaN`. + + >>> s.str.startswith('b', na=False) + 0 True + 1 False + 2 False + 3 False + dtype: bool + """ + if not isinstance(pat, (str, tuple)): + msg = f"expected a string or tuple, not {type(pat).__name__}" + raise TypeError(msg) + result = self._data.array._str_startswith(pat, na=na) + return self._wrap_result(result, returns_string=False) + + @forbid_nonstring_types(["bytes"]) + def endswith( + self, pat: str | tuple[str, ...], na: Scalar | None = None + ) -> Series | Index: + """ + Test if the end of each string element matches a pattern. + + Equivalent to :meth:`str.endswith`. + + Parameters + ---------- + pat : str or tuple[str, ...] + Character sequence or tuple of strings. Regular expressions are not + accepted. + na : object, default NaN + Object shown if element tested is not a string. The default depends + on dtype of the array. For object-dtype, ``numpy.nan`` is used. + For ``StringDtype``, ``pandas.NA`` is used. + + Returns + ------- + Series or Index of bool + A Series of booleans indicating whether the given pattern matches + the end of each string element. + + See Also + -------- + str.endswith : Python standard library string method. + Series.str.startswith : Same as endswith, but tests the start of string. + Series.str.contains : Tests if string element contains a pattern. + + Examples + -------- + >>> s = pd.Series(['bat', 'bear', 'caT', np.nan]) + >>> s + 0 bat + 1 bear + 2 caT + 3 NaN + dtype: object + + >>> s.str.endswith('t') + 0 True + 1 False + 2 False + 3 NaN + dtype: object + + >>> s.str.endswith(('t', 'T')) + 0 True + 1 False + 2 True + 3 NaN + dtype: object + + Specifying `na` to be `False` instead of `NaN`. + + >>> s.str.endswith('t', na=False) + 0 True + 1 False + 2 False + 3 False + dtype: bool + """ + if not isinstance(pat, (str, tuple)): + msg = f"expected a string or tuple, not {type(pat).__name__}" + raise TypeError(msg) + result = self._data.array._str_endswith(pat, na=na) + return self._wrap_result(result, returns_string=False) + + @forbid_nonstring_types(["bytes"]) + def findall(self, pat, flags: int = 0): + """ + Find all occurrences of pattern or regular expression in the Series/Index. + + Equivalent to applying :func:`re.findall` to all the elements in the + Series/Index. + + Parameters + ---------- + pat : str + Pattern or regular expression. + flags : int, default 0 + Flags from ``re`` module, e.g. `re.IGNORECASE` (default is 0, which + means no flags). + + Returns + ------- + Series/Index of lists of strings + All non-overlapping matches of pattern or regular expression in each + string of this Series/Index. + + See Also + -------- + count : Count occurrences of pattern or regular expression in each string + of the Series/Index. + extractall : For each string in the Series, extract groups from all matches + of regular expression and return a DataFrame with one row for each + match and one column for each group. + re.findall : The equivalent ``re`` function to all non-overlapping matches + of pattern or regular expression in string, as a list of strings. + + Examples + -------- + >>> s = pd.Series(['Lion', 'Monkey', 'Rabbit']) + + The search for the pattern 'Monkey' returns one match: + + >>> s.str.findall('Monkey') + 0 [] + 1 [Monkey] + 2 [] + dtype: object + + On the other hand, the search for the pattern 'MONKEY' doesn't return any + match: + + >>> s.str.findall('MONKEY') + 0 [] + 1 [] + 2 [] + dtype: object + + Flags can be added to the pattern or regular expression. For instance, + to find the pattern 'MONKEY' ignoring the case: + + >>> import re + >>> s.str.findall('MONKEY', flags=re.IGNORECASE) + 0 [] + 1 [Monkey] + 2 [] + dtype: object + + When the pattern matches more than one string in the Series, all matches + are returned: + + >>> s.str.findall('on') + 0 [on] + 1 [on] + 2 [] + dtype: object + + Regular expressions are supported too. For instance, the search for all the + strings ending with the word 'on' is shown next: + + >>> s.str.findall('on$') + 0 [on] + 1 [] + 2 [] + dtype: object + + If the pattern is found more than once in the same string, then a list of + multiple strings is returned: + + >>> s.str.findall('b') + 0 [] + 1 [] + 2 [b, b] + dtype: object + """ + result = self._data.array._str_findall(pat, flags) + return self._wrap_result(result, returns_string=False) + + @forbid_nonstring_types(["bytes"]) + def extract( + self, pat: str, flags: int = 0, expand: bool = True + ) -> DataFrame | Series | Index: + r""" + Extract capture groups in the regex `pat` as columns in a DataFrame. + + For each subject string in the Series, extract groups from the + first match of regular expression `pat`. + + Parameters + ---------- + pat : str + Regular expression pattern with capturing groups. + flags : int, default 0 (no flags) + Flags from the ``re`` module, e.g. ``re.IGNORECASE``, that + modify regular expression matching for things like case, + spaces, etc. For more details, see :mod:`re`. + expand : bool, default True + If True, return DataFrame with one column per capture group. + If False, return a Series/Index if there is one capture group + or DataFrame if there are multiple capture groups. + + Returns + ------- + DataFrame or Series or Index + A DataFrame with one row for each subject string, and one + column for each group. Any capture group names in regular + expression pat will be used for column names; otherwise + capture group numbers will be used. The dtype of each result + column is always object, even when no match is found. If + ``expand=False`` and pat has only one capture group, then + return a Series (if subject is a Series) or Index (if subject + is an Index). + + See Also + -------- + extractall : Returns all matches (not just the first match). + + Examples + -------- + A pattern with two groups will return a DataFrame with two columns. + Non-matches will be NaN. + + >>> s = pd.Series(['a1', 'b2', 'c3']) + >>> s.str.extract(r'([ab])(\d)') + 0 1 + 0 a 1 + 1 b 2 + 2 NaN NaN + + A pattern may contain optional groups. + + >>> s.str.extract(r'([ab])?(\d)') + 0 1 + 0 a 1 + 1 b 2 + 2 NaN 3 + + Named groups will become column names in the result. + + >>> s.str.extract(r'(?P[ab])(?P\d)') + letter digit + 0 a 1 + 1 b 2 + 2 NaN NaN + + A pattern with one group will return a DataFrame with one column + if expand=True. + + >>> s.str.extract(r'[ab](\d)', expand=True) + 0 + 0 1 + 1 2 + 2 NaN + + A pattern with one group will return a Series if expand=False. + + >>> s.str.extract(r'[ab](\d)', expand=False) + 0 1 + 1 2 + 2 NaN + dtype: object + """ + from pandas import DataFrame + + if not isinstance(expand, bool): + raise ValueError("expand must be True or False") + + regex = re.compile(pat, flags=flags) + if regex.groups == 0: + raise ValueError("pattern contains no capture groups") + + if not expand and regex.groups > 1 and isinstance(self._data, ABCIndex): + raise ValueError("only one regex group is supported with Index") + + obj = self._data + result_dtype = _result_dtype(obj) + + returns_df = regex.groups > 1 or expand + + if returns_df: + name = None + columns = _get_group_names(regex) + + if obj.array.size == 0: + result = DataFrame(columns=columns, dtype=result_dtype) + + else: + result_list = self._data.array._str_extract( + pat, flags=flags, expand=returns_df + ) + + result_index: Index | None + if isinstance(obj, ABCSeries): + result_index = obj.index + else: + result_index = None + + result = DataFrame( + result_list, columns=columns, index=result_index, dtype=result_dtype + ) + + else: + name = _get_single_group_name(regex) + result = self._data.array._str_extract(pat, flags=flags, expand=returns_df) + return self._wrap_result(result, name=name, dtype=result_dtype) + + @forbid_nonstring_types(["bytes"]) + def extractall(self, pat, flags: int = 0) -> DataFrame: + r""" + Extract capture groups in the regex `pat` as columns in DataFrame. + + For each subject string in the Series, extract groups from all + matches of regular expression pat. When each subject string in the + Series has exactly one match, extractall(pat).xs(0, level='match') + is the same as extract(pat). + + Parameters + ---------- + pat : str + Regular expression pattern with capturing groups. + flags : int, default 0 (no flags) + A ``re`` module flag, for example ``re.IGNORECASE``. These allow + to modify regular expression matching for things like case, spaces, + etc. Multiple flags can be combined with the bitwise OR operator, + for example ``re.IGNORECASE | re.MULTILINE``. + + Returns + ------- + DataFrame + A ``DataFrame`` with one row for each match, and one column for each + group. Its rows have a ``MultiIndex`` with first levels that come from + the subject ``Series``. The last level is named 'match' and indexes the + matches in each item of the ``Series``. Any capture group names in + regular expression pat will be used for column names; otherwise capture + group numbers will be used. + + See Also + -------- + extract : Returns first match only (not all matches). + + Examples + -------- + A pattern with one group will return a DataFrame with one column. + Indices with no matches will not appear in the result. + + >>> s = pd.Series(["a1a2", "b1", "c1"], index=["A", "B", "C"]) + >>> s.str.extractall(r"[ab](\d)") + 0 + match + A 0 1 + 1 2 + B 0 1 + + Capture group names are used for column names of the result. + + >>> s.str.extractall(r"[ab](?P\d)") + digit + match + A 0 1 + 1 2 + B 0 1 + + A pattern with two groups will return a DataFrame with two columns. + + >>> s.str.extractall(r"(?P[ab])(?P\d)") + letter digit + match + A 0 a 1 + 1 a 2 + B 0 b 1 + + Optional groups that do not match are NaN in the result. + + >>> s.str.extractall(r"(?P[ab])?(?P\d)") + letter digit + match + A 0 a 1 + 1 a 2 + B 0 b 1 + C 0 NaN 1 + """ + # TODO: dispatch + return str_extractall(self._orig, pat, flags) + + _shared_docs[ + "find" + ] = """ + Return %(side)s indexes in each strings in the Series/Index. + + Each of returned indexes corresponds to the position where the + substring is fully contained between [start:end]. Return -1 on + failure. Equivalent to standard :meth:`str.%(method)s`. + + Parameters + ---------- + sub : str + Substring being searched. + start : int + Left edge index. + end : int + Right edge index. + + Returns + ------- + Series or Index of int. + + See Also + -------- + %(also)s + + Examples + -------- + For Series.str.find: + + >>> ser = pd.Series(["cow_", "duck_", "do_ve"]) + >>> ser.str.find("_") + 0 3 + 1 4 + 2 2 + dtype: int64 + + For Series.str.rfind: + + >>> ser = pd.Series(["_cow_", "duck_", "do_v_e"]) + >>> ser.str.rfind("_") + 0 4 + 1 4 + 2 4 + dtype: int64 + """ + + @Appender( + _shared_docs["find"] + % { + "side": "lowest", + "method": "find", + "also": "rfind : Return highest indexes in each strings.", + } + ) + @forbid_nonstring_types(["bytes"]) + def find(self, sub, start: int = 0, end=None): + if not isinstance(sub, str): + msg = f"expected a string object, not {type(sub).__name__}" + raise TypeError(msg) + + result = self._data.array._str_find(sub, start, end) + return self._wrap_result(result, returns_string=False) + + @Appender( + _shared_docs["find"] + % { + "side": "highest", + "method": "rfind", + "also": "find : Return lowest indexes in each strings.", + } + ) + @forbid_nonstring_types(["bytes"]) + def rfind(self, sub, start: int = 0, end=None): + if not isinstance(sub, str): + msg = f"expected a string object, not {type(sub).__name__}" + raise TypeError(msg) + + result = self._data.array._str_rfind(sub, start=start, end=end) + return self._wrap_result(result, returns_string=False) + + @forbid_nonstring_types(["bytes"]) + def normalize(self, form): + """ + Return the Unicode normal form for the strings in the Series/Index. + + For more information on the forms, see the + :func:`unicodedata.normalize`. + + Parameters + ---------- + form : {'NFC', 'NFKC', 'NFD', 'NFKD'} + Unicode form. + + Returns + ------- + Series/Index of objects + + Examples + -------- + >>> ser = pd.Series(['ñ']) + >>> ser.str.normalize('NFC') == ser.str.normalize('NFD') + 0 False + dtype: bool + """ + result = self._data.array._str_normalize(form) + return self._wrap_result(result) + + _shared_docs[ + "index" + ] = """ + Return %(side)s indexes in each string in Series/Index. + + Each of the returned indexes corresponds to the position where the + substring is fully contained between [start:end]. This is the same + as ``str.%(similar)s`` except instead of returning -1, it raises a + ValueError when the substring is not found. Equivalent to standard + ``str.%(method)s``. + + Parameters + ---------- + sub : str + Substring being searched. + start : int + Left edge index. + end : int + Right edge index. + + Returns + ------- + Series or Index of object + + See Also + -------- + %(also)s + + Examples + -------- + For Series.str.index: + + >>> ser = pd.Series(["horse", "eagle", "donkey"]) + >>> ser.str.index("e") + 0 4 + 1 0 + 2 4 + dtype: int64 + + For Series.str.rindex: + + >>> ser = pd.Series(["Deer", "eagle", "Sheep"]) + >>> ser.str.rindex("e") + 0 2 + 1 4 + 2 3 + dtype: int64 + """ + + @Appender( + _shared_docs["index"] + % { + "side": "lowest", + "similar": "find", + "method": "index", + "also": "rindex : Return highest indexes in each strings.", + } + ) + @forbid_nonstring_types(["bytes"]) + def index(self, sub, start: int = 0, end=None): + if not isinstance(sub, str): + msg = f"expected a string object, not {type(sub).__name__}" + raise TypeError(msg) + + result = self._data.array._str_index(sub, start=start, end=end) + return self._wrap_result(result, returns_string=False) + + @Appender( + _shared_docs["index"] + % { + "side": "highest", + "similar": "rfind", + "method": "rindex", + "also": "index : Return lowest indexes in each strings.", + } + ) + @forbid_nonstring_types(["bytes"]) + def rindex(self, sub, start: int = 0, end=None): + if not isinstance(sub, str): + msg = f"expected a string object, not {type(sub).__name__}" + raise TypeError(msg) + + result = self._data.array._str_rindex(sub, start=start, end=end) + return self._wrap_result(result, returns_string=False) + + def len(self): + """ + Compute the length of each element in the Series/Index. + + The element may be a sequence (such as a string, tuple or list) or a collection + (such as a dictionary). + + Returns + ------- + Series or Index of int + A Series or Index of integer values indicating the length of each + element in the Series or Index. + + See Also + -------- + str.len : Python built-in function returning the length of an object. + Series.size : Returns the length of the Series. + + Examples + -------- + Returns the length (number of characters) in a string. Returns the + number of entries for dictionaries, lists or tuples. + + >>> s = pd.Series(['dog', + ... '', + ... 5, + ... {'foo' : 'bar'}, + ... [2, 3, 5, 7], + ... ('one', 'two', 'three')]) + >>> s + 0 dog + 1 + 2 5 + 3 {'foo': 'bar'} + 4 [2, 3, 5, 7] + 5 (one, two, three) + dtype: object + >>> s.str.len() + 0 3.0 + 1 0.0 + 2 NaN + 3 1.0 + 4 4.0 + 5 3.0 + dtype: float64 + """ + result = self._data.array._str_len() + return self._wrap_result(result, returns_string=False) + + _shared_docs[ + "casemethods" + ] = """ + Convert strings in the Series/Index to %(type)s. + %(version)s + Equivalent to :meth:`str.%(method)s`. + + Returns + ------- + Series or Index of object + + See Also + -------- + Series.str.lower : Converts all characters to lowercase. + Series.str.upper : Converts all characters to uppercase. + Series.str.title : Converts first character of each word to uppercase and + remaining to lowercase. + Series.str.capitalize : Converts first character to uppercase and + remaining to lowercase. + Series.str.swapcase : Converts uppercase to lowercase and lowercase to + uppercase. + Series.str.casefold: Removes all case distinctions in the string. + + Examples + -------- + >>> s = pd.Series(['lower', 'CAPITALS', 'this is a sentence', 'SwApCaSe']) + >>> s + 0 lower + 1 CAPITALS + 2 this is a sentence + 3 SwApCaSe + dtype: object + + >>> s.str.lower() + 0 lower + 1 capitals + 2 this is a sentence + 3 swapcase + dtype: object + + >>> s.str.upper() + 0 LOWER + 1 CAPITALS + 2 THIS IS A SENTENCE + 3 SWAPCASE + dtype: object + + >>> s.str.title() + 0 Lower + 1 Capitals + 2 This Is A Sentence + 3 Swapcase + dtype: object + + >>> s.str.capitalize() + 0 Lower + 1 Capitals + 2 This is a sentence + 3 Swapcase + dtype: object + + >>> s.str.swapcase() + 0 LOWER + 1 capitals + 2 THIS IS A SENTENCE + 3 sWaPcAsE + dtype: object + """ + # Types: + # cases: + # upper, lower, title, capitalize, swapcase, casefold + # boolean: + # isalpha, isnumeric isalnum isdigit isdecimal isspace islower isupper istitle + # _doc_args holds dict of strings to use in substituting casemethod docs + _doc_args: dict[str, dict[str, str]] = {} + _doc_args["lower"] = {"type": "lowercase", "method": "lower", "version": ""} + _doc_args["upper"] = {"type": "uppercase", "method": "upper", "version": ""} + _doc_args["title"] = {"type": "titlecase", "method": "title", "version": ""} + _doc_args["capitalize"] = { + "type": "be capitalized", + "method": "capitalize", + "version": "", + } + _doc_args["swapcase"] = { + "type": "be swapcased", + "method": "swapcase", + "version": "", + } + _doc_args["casefold"] = { + "type": "be casefolded", + "method": "casefold", + "version": "", + } + + @Appender(_shared_docs["casemethods"] % _doc_args["lower"]) + @forbid_nonstring_types(["bytes"]) + def lower(self): + result = self._data.array._str_lower() + return self._wrap_result(result) + + @Appender(_shared_docs["casemethods"] % _doc_args["upper"]) + @forbid_nonstring_types(["bytes"]) + def upper(self): + result = self._data.array._str_upper() + return self._wrap_result(result) + + @Appender(_shared_docs["casemethods"] % _doc_args["title"]) + @forbid_nonstring_types(["bytes"]) + def title(self): + result = self._data.array._str_title() + return self._wrap_result(result) + + @Appender(_shared_docs["casemethods"] % _doc_args["capitalize"]) + @forbid_nonstring_types(["bytes"]) + def capitalize(self): + result = self._data.array._str_capitalize() + return self._wrap_result(result) + + @Appender(_shared_docs["casemethods"] % _doc_args["swapcase"]) + @forbid_nonstring_types(["bytes"]) + def swapcase(self): + result = self._data.array._str_swapcase() + return self._wrap_result(result) + + @Appender(_shared_docs["casemethods"] % _doc_args["casefold"]) + @forbid_nonstring_types(["bytes"]) + def casefold(self): + result = self._data.array._str_casefold() + return self._wrap_result(result) + + _shared_docs[ + "ismethods" + ] = """ + Check whether all characters in each string are %(type)s. + + This is equivalent to running the Python string method + :meth:`str.%(method)s` for each element of the Series/Index. If a string + has zero characters, ``False`` is returned for that check. + + Returns + ------- + Series or Index of bool + Series or Index of boolean values with the same length as the original + Series/Index. + + See Also + -------- + Series.str.isalpha : Check whether all characters are alphabetic. + Series.str.isnumeric : Check whether all characters are numeric. + Series.str.isalnum : Check whether all characters are alphanumeric. + Series.str.isdigit : Check whether all characters are digits. + Series.str.isdecimal : Check whether all characters are decimal. + Series.str.isspace : Check whether all characters are whitespace. + Series.str.islower : Check whether all characters are lowercase. + Series.str.isupper : Check whether all characters are uppercase. + Series.str.istitle : Check whether all characters are titlecase. + + Examples + -------- + **Checks for Alphabetic and Numeric Characters** + + >>> s1 = pd.Series(['one', 'one1', '1', '']) + + >>> s1.str.isalpha() + 0 True + 1 False + 2 False + 3 False + dtype: bool + + >>> s1.str.isnumeric() + 0 False + 1 False + 2 True + 3 False + dtype: bool + + >>> s1.str.isalnum() + 0 True + 1 True + 2 True + 3 False + dtype: bool + + Note that checks against characters mixed with any additional punctuation + or whitespace will evaluate to false for an alphanumeric check. + + >>> s2 = pd.Series(['A B', '1.5', '3,000']) + >>> s2.str.isalnum() + 0 False + 1 False + 2 False + dtype: bool + + **More Detailed Checks for Numeric Characters** + + There are several different but overlapping sets of numeric characters that + can be checked for. + + >>> s3 = pd.Series(['23', '³', '⅕', '']) + + The ``s3.str.isdecimal`` method checks for characters used to form numbers + in base 10. + + >>> s3.str.isdecimal() + 0 True + 1 False + 2 False + 3 False + dtype: bool + + The ``s.str.isdigit`` method is the same as ``s3.str.isdecimal`` but also + includes special digits, like superscripted and subscripted digits in + unicode. + + >>> s3.str.isdigit() + 0 True + 1 True + 2 False + 3 False + dtype: bool + + The ``s.str.isnumeric`` method is the same as ``s3.str.isdigit`` but also + includes other characters that can represent quantities such as unicode + fractions. + + >>> s3.str.isnumeric() + 0 True + 1 True + 2 True + 3 False + dtype: bool + + **Checks for Whitespace** + + >>> s4 = pd.Series([' ', '\\t\\r\\n ', '']) + >>> s4.str.isspace() + 0 True + 1 True + 2 False + dtype: bool + + **Checks for Character Case** + + >>> s5 = pd.Series(['leopard', 'Golden Eagle', 'SNAKE', '']) + + >>> s5.str.islower() + 0 True + 1 False + 2 False + 3 False + dtype: bool + + >>> s5.str.isupper() + 0 False + 1 False + 2 True + 3 False + dtype: bool + + The ``s5.str.istitle`` method checks for whether all words are in title + case (whether only the first letter of each word is capitalized). Words are + assumed to be as any sequence of non-numeric characters separated by + whitespace characters. + + >>> s5.str.istitle() + 0 False + 1 True + 2 False + 3 False + dtype: bool + """ + _doc_args["isalnum"] = {"type": "alphanumeric", "method": "isalnum"} + _doc_args["isalpha"] = {"type": "alphabetic", "method": "isalpha"} + _doc_args["isdigit"] = {"type": "digits", "method": "isdigit"} + _doc_args["isspace"] = {"type": "whitespace", "method": "isspace"} + _doc_args["islower"] = {"type": "lowercase", "method": "islower"} + _doc_args["isupper"] = {"type": "uppercase", "method": "isupper"} + _doc_args["istitle"] = {"type": "titlecase", "method": "istitle"} + _doc_args["isnumeric"] = {"type": "numeric", "method": "isnumeric"} + _doc_args["isdecimal"] = {"type": "decimal", "method": "isdecimal"} + # force _noarg_wrapper return type with dtype=np.dtype(bool) (GH 29624) + + isalnum = _map_and_wrap( + "isalnum", docstring=_shared_docs["ismethods"] % _doc_args["isalnum"] + ) + isalpha = _map_and_wrap( + "isalpha", docstring=_shared_docs["ismethods"] % _doc_args["isalpha"] + ) + isdigit = _map_and_wrap( + "isdigit", docstring=_shared_docs["ismethods"] % _doc_args["isdigit"] + ) + isspace = _map_and_wrap( + "isspace", docstring=_shared_docs["ismethods"] % _doc_args["isspace"] + ) + islower = _map_and_wrap( + "islower", docstring=_shared_docs["ismethods"] % _doc_args["islower"] + ) + isupper = _map_and_wrap( + "isupper", docstring=_shared_docs["ismethods"] % _doc_args["isupper"] + ) + istitle = _map_and_wrap( + "istitle", docstring=_shared_docs["ismethods"] % _doc_args["istitle"] + ) + isnumeric = _map_and_wrap( + "isnumeric", docstring=_shared_docs["ismethods"] % _doc_args["isnumeric"] + ) + isdecimal = _map_and_wrap( + "isdecimal", docstring=_shared_docs["ismethods"] % _doc_args["isdecimal"] + ) + + +def cat_safe(list_of_columns: list[npt.NDArray[np.object_]], sep: str): + """ + Auxiliary function for :meth:`str.cat`. + + Same signature as cat_core, but handles TypeErrors in concatenation, which + happen if the arrays in list_of columns have the wrong dtypes or content. + + Parameters + ---------- + list_of_columns : list of numpy arrays + List of arrays to be concatenated with sep; + these arrays may not contain NaNs! + sep : string + The separator string for concatenating the columns. + + Returns + ------- + nd.array + The concatenation of list_of_columns with sep. + """ + try: + result = cat_core(list_of_columns, sep) + except TypeError: + # if there are any non-string values (wrong dtype or hidden behind + # object dtype), np.sum will fail; catch and return with better message + for column in list_of_columns: + dtype = lib.infer_dtype(column, skipna=True) + if dtype not in ["string", "empty"]: + raise TypeError( + "Concatenation requires list-likes containing only " + "strings (or missing values). Offending values found in " + f"column {dtype}" + ) from None + return result + + +def cat_core(list_of_columns: list, sep: str): + """ + Auxiliary function for :meth:`str.cat` + + Parameters + ---------- + list_of_columns : list of numpy arrays + List of arrays to be concatenated with sep; + these arrays may not contain NaNs! + sep : string + The separator string for concatenating the columns. + + Returns + ------- + nd.array + The concatenation of list_of_columns with sep. + """ + if sep == "": + # no need to interleave sep if it is empty + arr_of_cols = np.asarray(list_of_columns, dtype=object) + return np.sum(arr_of_cols, axis=0) + list_with_sep = [sep] * (2 * len(list_of_columns) - 1) + list_with_sep[::2] = list_of_columns + arr_with_sep = np.asarray(list_with_sep, dtype=object) + return np.sum(arr_with_sep, axis=0) + + +def _result_dtype(arr): + # workaround #27953 + # ideally we just pass `dtype=arr.dtype` unconditionally, but this fails + # when the list of values is empty. + from pandas.core.arrays.string_ import StringDtype + + if isinstance(arr.dtype, (ArrowDtype, StringDtype)): + return arr.dtype + return object + + +def _get_single_group_name(regex: re.Pattern) -> Hashable: + if regex.groupindex: + return next(iter(regex.groupindex)) + else: + return None + + +def _get_group_names(regex: re.Pattern) -> list[Hashable]: + """ + Get named groups from compiled regex. + + Unnamed groups are numbered. + + Parameters + ---------- + regex : compiled regex + + Returns + ------- + list of column labels + """ + names = {v: k for k, v in regex.groupindex.items()} + return [names.get(1 + i, i) for i in range(regex.groups)] + + +def str_extractall(arr, pat, flags: int = 0) -> DataFrame: + regex = re.compile(pat, flags=flags) + # the regex must contain capture groups. + if regex.groups == 0: + raise ValueError("pattern contains no capture groups") + + if isinstance(arr, ABCIndex): + arr = arr.to_series().reset_index(drop=True).astype(arr.dtype) + + columns = _get_group_names(regex) + match_list = [] + index_list = [] + is_mi = arr.index.nlevels > 1 + + for subject_key, subject in arr.items(): + if isinstance(subject, str): + if not is_mi: + subject_key = (subject_key,) + + for match_i, match_tuple in enumerate(regex.findall(subject)): + if isinstance(match_tuple, str): + match_tuple = (match_tuple,) + na_tuple = [np.nan if group == "" else group for group in match_tuple] + match_list.append(na_tuple) + result_key = tuple(subject_key + (match_i,)) + index_list.append(result_key) + + from pandas import MultiIndex + + index = MultiIndex.from_tuples(index_list, names=arr.index.names + ["match"]) + dtype = _result_dtype(arr) + + result = arr._constructor_expanddim( + match_list, index=index, columns=columns, dtype=dtype + ) + return result diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/strings/base.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/strings/base.py new file mode 100644 index 0000000000000000000000000000000000000000..96b0352666b412cf36a7c9aecfc9ab42628e29df --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/strings/base.py @@ -0,0 +1,262 @@ +from __future__ import annotations + +import abc +from typing import ( + TYPE_CHECKING, + Callable, + Literal, +) + +import numpy as np + +if TYPE_CHECKING: + from collections.abc import Sequence + import re + + from pandas._typing import Scalar + + from pandas import Series + + +class BaseStringArrayMethods(abc.ABC): + """ + Base class for extension arrays implementing string methods. + + This is where our ExtensionArrays can override the implementation of + Series.str.. We don't expect this to work with + 3rd-party extension arrays. + + * User calls Series.str. + * pandas extracts the extension array from the Series + * pandas calls ``extension_array._str_(*args, **kwargs)`` + * pandas wraps the result, to return to the user. + + See :ref:`Series.str` for the docstring of each method. + """ + + def _str_getitem(self, key): + if isinstance(key, slice): + return self._str_slice(start=key.start, stop=key.stop, step=key.step) + else: + return self._str_get(key) + + @abc.abstractmethod + def _str_count(self, pat, flags: int = 0): + pass + + @abc.abstractmethod + def _str_pad( + self, + width: int, + side: Literal["left", "right", "both"] = "left", + fillchar: str = " ", + ): + pass + + @abc.abstractmethod + def _str_contains( + self, pat, case: bool = True, flags: int = 0, na=None, regex: bool = True + ): + pass + + @abc.abstractmethod + def _str_startswith(self, pat, na=None): + pass + + @abc.abstractmethod + def _str_endswith(self, pat, na=None): + pass + + @abc.abstractmethod + def _str_replace( + self, + pat: str | re.Pattern, + repl: str | Callable, + n: int = -1, + case: bool = True, + flags: int = 0, + regex: bool = True, + ): + pass + + @abc.abstractmethod + def _str_repeat(self, repeats: int | Sequence[int]): + pass + + @abc.abstractmethod + def _str_match( + self, pat: str, case: bool = True, flags: int = 0, na: Scalar = np.nan + ): + pass + + @abc.abstractmethod + def _str_fullmatch( + self, + pat: str | re.Pattern, + case: bool = True, + flags: int = 0, + na: Scalar = np.nan, + ): + pass + + @abc.abstractmethod + def _str_encode(self, encoding, errors: str = "strict"): + pass + + @abc.abstractmethod + def _str_find(self, sub, start: int = 0, end=None): + pass + + @abc.abstractmethod + def _str_rfind(self, sub, start: int = 0, end=None): + pass + + @abc.abstractmethod + def _str_findall(self, pat, flags: int = 0): + pass + + @abc.abstractmethod + def _str_get(self, i): + pass + + @abc.abstractmethod + def _str_index(self, sub, start: int = 0, end=None): + pass + + @abc.abstractmethod + def _str_rindex(self, sub, start: int = 0, end=None): + pass + + @abc.abstractmethod + def _str_join(self, sep: str): + pass + + @abc.abstractmethod + def _str_partition(self, sep: str, expand): + pass + + @abc.abstractmethod + def _str_rpartition(self, sep: str, expand): + pass + + @abc.abstractmethod + def _str_len(self): + pass + + @abc.abstractmethod + def _str_slice(self, start=None, stop=None, step=None): + pass + + @abc.abstractmethod + def _str_slice_replace(self, start=None, stop=None, repl=None): + pass + + @abc.abstractmethod + def _str_translate(self, table): + pass + + @abc.abstractmethod + def _str_wrap(self, width: int, **kwargs): + pass + + @abc.abstractmethod + def _str_get_dummies(self, sep: str = "|"): + pass + + @abc.abstractmethod + def _str_isalnum(self): + pass + + @abc.abstractmethod + def _str_isalpha(self): + pass + + @abc.abstractmethod + def _str_isdecimal(self): + pass + + @abc.abstractmethod + def _str_isdigit(self): + pass + + @abc.abstractmethod + def _str_islower(self): + pass + + @abc.abstractmethod + def _str_isnumeric(self): + pass + + @abc.abstractmethod + def _str_isspace(self): + pass + + @abc.abstractmethod + def _str_istitle(self): + pass + + @abc.abstractmethod + def _str_isupper(self): + pass + + @abc.abstractmethod + def _str_capitalize(self): + pass + + @abc.abstractmethod + def _str_casefold(self): + pass + + @abc.abstractmethod + def _str_title(self): + pass + + @abc.abstractmethod + def _str_swapcase(self): + pass + + @abc.abstractmethod + def _str_lower(self): + pass + + @abc.abstractmethod + def _str_upper(self): + pass + + @abc.abstractmethod + def _str_normalize(self, form): + pass + + @abc.abstractmethod + def _str_strip(self, to_strip=None): + pass + + @abc.abstractmethod + def _str_lstrip(self, to_strip=None): + pass + + @abc.abstractmethod + def _str_rstrip(self, to_strip=None): + pass + + @abc.abstractmethod + def _str_removeprefix(self, prefix: str) -> Series: + pass + + @abc.abstractmethod + def _str_removesuffix(self, suffix: str) -> Series: + pass + + @abc.abstractmethod + def _str_split( + self, pat=None, n=-1, expand: bool = False, regex: bool | None = None + ): + pass + + @abc.abstractmethod + def _str_rsplit(self, pat=None, n=-1): + pass + + @abc.abstractmethod + def _str_extract(self, pat: str, flags: int = 0, expand: bool = True): + pass diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/strings/object_array.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/strings/object_array.py new file mode 100644 index 0000000000000000000000000000000000000000..0029beccc40a8d3a671802c7d68fab21612704e9 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/strings/object_array.py @@ -0,0 +1,497 @@ +from __future__ import annotations + +import functools +import re +import textwrap +from typing import ( + TYPE_CHECKING, + Callable, + Literal, + cast, +) +import unicodedata + +import numpy as np + +from pandas._libs import lib +import pandas._libs.missing as libmissing +import pandas._libs.ops as libops + +from pandas.core.dtypes.missing import isna + +from pandas.core.strings.base import BaseStringArrayMethods + +if TYPE_CHECKING: + from collections.abc import Sequence + + from pandas._typing import ( + NpDtype, + Scalar, + ) + + from pandas import Series + + +class ObjectStringArrayMixin(BaseStringArrayMethods): + """ + String Methods operating on object-dtype ndarrays. + """ + + _str_na_value = np.nan + + def __len__(self) -> int: + # For typing, _str_map relies on the object being sized. + raise NotImplementedError + + def _str_map( + self, f, na_value=None, dtype: NpDtype | None = None, convert: bool = True + ): + """ + Map a callable over valid elements of the array. + + Parameters + ---------- + f : Callable + A function to call on each non-NA element. + na_value : Scalar, optional + The value to set for NA values. Might also be used for the + fill value if the callable `f` raises an exception. + This defaults to ``self._str_na_value`` which is ``np.nan`` + for object-dtype and Categorical and ``pd.NA`` for StringArray. + dtype : Dtype, optional + The dtype of the result array. + convert : bool, default True + Whether to call `maybe_convert_objects` on the resulting ndarray + """ + if dtype is None: + dtype = np.dtype("object") + if na_value is None: + na_value = self._str_na_value + + if not len(self): + return np.array([], dtype=dtype) + + arr = np.asarray(self, dtype=object) + mask = isna(arr) + map_convert = convert and not np.all(mask) + try: + result = lib.map_infer_mask(arr, f, mask.view(np.uint8), map_convert) + except (TypeError, AttributeError) as err: + # Reraise the exception if callable `f` got wrong number of args. + # The user may want to be warned by this, instead of getting NaN + p_err = ( + r"((takes)|(missing)) (?(2)from \d+ to )?\d+ " + r"(?(3)required )positional arguments?" + ) + + if len(err.args) >= 1 and re.search(p_err, err.args[0]): + # FIXME: this should be totally avoidable + raise err + + def g(x): + # This type of fallback behavior can be removed once + # we remove object-dtype .str accessor. + try: + return f(x) + except (TypeError, AttributeError): + return na_value + + return self._str_map(g, na_value=na_value, dtype=dtype) + if not isinstance(result, np.ndarray): + return result + if na_value is not np.nan: + np.putmask(result, mask, na_value) + if convert and result.dtype == object: + result = lib.maybe_convert_objects(result) + return result + + def _str_count(self, pat, flags: int = 0): + regex = re.compile(pat, flags=flags) + f = lambda x: len(regex.findall(x)) + return self._str_map(f, dtype="int64") + + def _str_pad( + self, + width: int, + side: Literal["left", "right", "both"] = "left", + fillchar: str = " ", + ): + if side == "left": + f = lambda x: x.rjust(width, fillchar) + elif side == "right": + f = lambda x: x.ljust(width, fillchar) + elif side == "both": + f = lambda x: x.center(width, fillchar) + else: # pragma: no cover + raise ValueError("Invalid side") + return self._str_map(f) + + def _str_contains( + self, pat, case: bool = True, flags: int = 0, na=np.nan, regex: bool = True + ): + if regex: + if not case: + flags |= re.IGNORECASE + + pat = re.compile(pat, flags=flags) + + f = lambda x: pat.search(x) is not None + else: + if case: + f = lambda x: pat in x + else: + upper_pat = pat.upper() + f = lambda x: upper_pat in x.upper() + return self._str_map(f, na, dtype=np.dtype("bool")) + + def _str_startswith(self, pat, na=None): + f = lambda x: x.startswith(pat) + return self._str_map(f, na_value=na, dtype=np.dtype(bool)) + + def _str_endswith(self, pat, na=None): + f = lambda x: x.endswith(pat) + return self._str_map(f, na_value=na, dtype=np.dtype(bool)) + + def _str_replace( + self, + pat: str | re.Pattern, + repl: str | Callable, + n: int = -1, + case: bool = True, + flags: int = 0, + regex: bool = True, + ): + if case is False: + # add case flag, if provided + flags |= re.IGNORECASE + + if regex or flags or callable(repl): + if not isinstance(pat, re.Pattern): + if regex is False: + pat = re.escape(pat) + pat = re.compile(pat, flags=flags) + + n = n if n >= 0 else 0 + f = lambda x: pat.sub(repl=repl, string=x, count=n) + else: + f = lambda x: x.replace(pat, repl, n) + + return self._str_map(f, dtype=str) + + def _str_repeat(self, repeats: int | Sequence[int]): + if lib.is_integer(repeats): + rint = cast(int, repeats) + + def scalar_rep(x): + try: + return bytes.__mul__(x, rint) + except TypeError: + return str.__mul__(x, rint) + + return self._str_map(scalar_rep, dtype=str) + else: + from pandas.core.arrays.string_ import BaseStringArray + + def rep(x, r): + if x is libmissing.NA: + return x + try: + return bytes.__mul__(x, r) + except TypeError: + return str.__mul__(x, r) + + result = libops.vec_binop( + np.asarray(self), + np.asarray(repeats, dtype=object), + rep, + ) + if isinstance(self, BaseStringArray): + # Not going through map, so we have to do this here. + result = type(self)._from_sequence(result, dtype=self.dtype) + return result + + def _str_match( + self, pat: str, case: bool = True, flags: int = 0, na: Scalar | None = None + ): + if not case: + flags |= re.IGNORECASE + + regex = re.compile(pat, flags=flags) + + f = lambda x: regex.match(x) is not None + return self._str_map(f, na_value=na, dtype=np.dtype(bool)) + + def _str_fullmatch( + self, + pat: str | re.Pattern, + case: bool = True, + flags: int = 0, + na: Scalar | None = None, + ): + if not case: + flags |= re.IGNORECASE + + regex = re.compile(pat, flags=flags) + + f = lambda x: regex.fullmatch(x) is not None + return self._str_map(f, na_value=na, dtype=np.dtype(bool)) + + def _str_encode(self, encoding, errors: str = "strict"): + f = lambda x: x.encode(encoding, errors=errors) + return self._str_map(f, dtype=object) + + def _str_find(self, sub, start: int = 0, end=None): + return self._str_find_(sub, start, end, side="left") + + def _str_rfind(self, sub, start: int = 0, end=None): + return self._str_find_(sub, start, end, side="right") + + def _str_find_(self, sub, start, end, side): + if side == "left": + method = "find" + elif side == "right": + method = "rfind" + else: # pragma: no cover + raise ValueError("Invalid side") + + if end is None: + f = lambda x: getattr(x, method)(sub, start) + else: + f = lambda x: getattr(x, method)(sub, start, end) + return self._str_map(f, dtype="int64") + + def _str_findall(self, pat, flags: int = 0): + regex = re.compile(pat, flags=flags) + return self._str_map(regex.findall, dtype="object") + + def _str_get(self, i): + def f(x): + if isinstance(x, dict): + return x.get(i) + elif len(x) > i >= -len(x): + return x[i] + return self._str_na_value + + return self._str_map(f) + + def _str_index(self, sub, start: int = 0, end=None): + if end: + f = lambda x: x.index(sub, start, end) + else: + f = lambda x: x.index(sub, start, end) + return self._str_map(f, dtype="int64") + + def _str_rindex(self, sub, start: int = 0, end=None): + if end: + f = lambda x: x.rindex(sub, start, end) + else: + f = lambda x: x.rindex(sub, start, end) + return self._str_map(f, dtype="int64") + + def _str_join(self, sep: str): + return self._str_map(sep.join) + + def _str_partition(self, sep: str, expand): + result = self._str_map(lambda x: x.partition(sep), dtype="object") + return result + + def _str_rpartition(self, sep: str, expand): + return self._str_map(lambda x: x.rpartition(sep), dtype="object") + + def _str_len(self): + return self._str_map(len, dtype="int64") + + def _str_slice(self, start=None, stop=None, step=None): + obj = slice(start, stop, step) + return self._str_map(lambda x: x[obj]) + + def _str_slice_replace(self, start=None, stop=None, repl=None): + if repl is None: + repl = "" + + def f(x): + if x[start:stop] == "": + local_stop = start + else: + local_stop = stop + y = "" + if start is not None: + y += x[:start] + y += repl + if stop is not None: + y += x[local_stop:] + return y + + return self._str_map(f) + + def _str_split( + self, + pat: str | re.Pattern | None = None, + n=-1, + expand: bool = False, + regex: bool | None = None, + ): + if pat is None: + if n is None or n == 0: + n = -1 + f = lambda x: x.split(pat, n) + else: + new_pat: str | re.Pattern + if regex is True or isinstance(pat, re.Pattern): + new_pat = re.compile(pat) + elif regex is False: + new_pat = pat + # regex is None so link to old behavior #43563 + else: + if len(pat) == 1: + new_pat = pat + else: + new_pat = re.compile(pat) + + if isinstance(new_pat, re.Pattern): + if n is None or n == -1: + n = 0 + f = lambda x: new_pat.split(x, maxsplit=n) + else: + if n is None or n == 0: + n = -1 + f = lambda x: x.split(pat, n) + return self._str_map(f, dtype=object) + + def _str_rsplit(self, pat=None, n=-1): + if n is None or n == 0: + n = -1 + f = lambda x: x.rsplit(pat, n) + return self._str_map(f, dtype="object") + + def _str_translate(self, table): + return self._str_map(lambda x: x.translate(table)) + + def _str_wrap(self, width: int, **kwargs): + kwargs["width"] = width + tw = textwrap.TextWrapper(**kwargs) + return self._str_map(lambda s: "\n".join(tw.wrap(s))) + + def _str_get_dummies(self, sep: str = "|"): + from pandas import Series + + arr = Series(self).fillna("") + try: + arr = sep + arr + sep + except (TypeError, NotImplementedError): + arr = sep + arr.astype(str) + sep + + tags: set[str] = set() + for ts in Series(arr, copy=False).str.split(sep): + tags.update(ts) + tags2 = sorted(tags - {""}) + + dummies = np.empty((len(arr), len(tags2)), dtype=np.int64) + + def _isin(test_elements: str, element: str) -> bool: + return element in test_elements + + for i, t in enumerate(tags2): + pat = sep + t + sep + dummies[:, i] = lib.map_infer( + arr.to_numpy(), functools.partial(_isin, element=pat) + ) + return dummies, tags2 + + def _str_upper(self): + return self._str_map(lambda x: x.upper()) + + def _str_isalnum(self): + return self._str_map(str.isalnum, dtype="bool") + + def _str_isalpha(self): + return self._str_map(str.isalpha, dtype="bool") + + def _str_isdecimal(self): + return self._str_map(str.isdecimal, dtype="bool") + + def _str_isdigit(self): + return self._str_map(str.isdigit, dtype="bool") + + def _str_islower(self): + return self._str_map(str.islower, dtype="bool") + + def _str_isnumeric(self): + return self._str_map(str.isnumeric, dtype="bool") + + def _str_isspace(self): + return self._str_map(str.isspace, dtype="bool") + + def _str_istitle(self): + return self._str_map(str.istitle, dtype="bool") + + def _str_isupper(self): + return self._str_map(str.isupper, dtype="bool") + + def _str_capitalize(self): + return self._str_map(str.capitalize) + + def _str_casefold(self): + return self._str_map(str.casefold) + + def _str_title(self): + return self._str_map(str.title) + + def _str_swapcase(self): + return self._str_map(str.swapcase) + + def _str_lower(self): + return self._str_map(str.lower) + + def _str_normalize(self, form): + f = lambda x: unicodedata.normalize(form, x) + return self._str_map(f) + + def _str_strip(self, to_strip=None): + return self._str_map(lambda x: x.strip(to_strip)) + + def _str_lstrip(self, to_strip=None): + return self._str_map(lambda x: x.lstrip(to_strip)) + + def _str_rstrip(self, to_strip=None): + return self._str_map(lambda x: x.rstrip(to_strip)) + + def _str_removeprefix(self, prefix: str) -> Series: + # outstanding question on whether to use native methods for users on Python 3.9+ + # https://github.com/pandas-dev/pandas/pull/39226#issuecomment-836719770, + # in which case we could do return self._str_map(str.removeprefix) + + def removeprefix(text: str) -> str: + if text.startswith(prefix): + return text[len(prefix) :] + return text + + return self._str_map(removeprefix) + + def _str_removesuffix(self, suffix: str) -> Series: + return self._str_map(lambda x: x.removesuffix(suffix)) + + def _str_extract(self, pat: str, flags: int = 0, expand: bool = True): + regex = re.compile(pat, flags=flags) + na_value = self._str_na_value + + if not expand: + + def g(x): + m = regex.search(x) + return m.groups()[0] if m else na_value + + return self._str_map(g, convert=False) + + empty_row = [na_value] * regex.groups + + def f(x): + if not isinstance(x, str): + return empty_row + m = regex.search(x) + if m: + return [na_value if item is None else item for item in m.groups()] + else: + return empty_row + + return [f(val) for val in np.asarray(self)] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/tools/__init__.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/tools/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/tools/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/tools/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e6ecdd9023c1f7c0c6da54f6094a4dce6b6157c9 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/tools/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/tools/__pycache__/times.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/tools/__pycache__/times.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a68f3c7d887b43414202d979bfa8f22e3b6423e6 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/tools/__pycache__/times.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/tools/datetimes.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/tools/datetimes.py new file mode 100644 index 0000000000000000000000000000000000000000..05262c235568dc017c2d973bf28452f007bdc5cd --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/tools/datetimes.py @@ -0,0 +1,1235 @@ +from __future__ import annotations + +from collections import abc +from datetime import date +from functools import partial +from itertools import islice +from typing import ( + TYPE_CHECKING, + Callable, + TypedDict, + Union, + cast, + overload, +) +import warnings + +import numpy as np + +from pandas._libs import ( + lib, + tslib, +) +from pandas._libs.tslibs import ( + OutOfBoundsDatetime, + Timedelta, + Timestamp, + astype_overflowsafe, + is_supported_dtype, + timezones as libtimezones, +) +from pandas._libs.tslibs.conversion import cast_from_unit_vectorized +from pandas._libs.tslibs.parsing import ( + DateParseError, + guess_datetime_format, +) +from pandas._libs.tslibs.strptime import array_strptime +from pandas._typing import ( + AnyArrayLike, + ArrayLike, + DateTimeErrorChoices, +) +from pandas.util._exceptions import find_stack_level + +from pandas.core.dtypes.common import ( + ensure_object, + is_float, + is_integer, + is_integer_dtype, + is_list_like, + is_numeric_dtype, +) +from pandas.core.dtypes.dtypes import ( + ArrowDtype, + DatetimeTZDtype, +) +from pandas.core.dtypes.generic import ( + ABCDataFrame, + ABCSeries, +) + +from pandas.arrays import ( + DatetimeArray, + IntegerArray, + NumpyExtensionArray, +) +from pandas.core.algorithms import unique +from pandas.core.arrays import ArrowExtensionArray +from pandas.core.arrays.base import ExtensionArray +from pandas.core.arrays.datetimes import ( + maybe_convert_dtype, + objects_to_datetime64, + tz_to_dtype, +) +from pandas.core.construction import extract_array +from pandas.core.indexes.base import Index +from pandas.core.indexes.datetimes import DatetimeIndex + +if TYPE_CHECKING: + from collections.abc import Hashable + + from pandas._libs.tslibs.nattype import NaTType + from pandas._libs.tslibs.timedeltas import UnitChoices + + from pandas import ( + DataFrame, + Series, + ) + +# --------------------------------------------------------------------- +# types used in annotations + +ArrayConvertible = Union[list, tuple, AnyArrayLike] +Scalar = Union[float, str] +DatetimeScalar = Union[Scalar, date, np.datetime64] + +DatetimeScalarOrArrayConvertible = Union[DatetimeScalar, ArrayConvertible] + +DatetimeDictArg = Union[list[Scalar], tuple[Scalar, ...], AnyArrayLike] + + +class YearMonthDayDict(TypedDict, total=True): + year: DatetimeDictArg + month: DatetimeDictArg + day: DatetimeDictArg + + +class FulldatetimeDict(YearMonthDayDict, total=False): + hour: DatetimeDictArg + hours: DatetimeDictArg + minute: DatetimeDictArg + minutes: DatetimeDictArg + second: DatetimeDictArg + seconds: DatetimeDictArg + ms: DatetimeDictArg + us: DatetimeDictArg + ns: DatetimeDictArg + + +DictConvertible = Union[FulldatetimeDict, "DataFrame"] +start_caching_at = 50 + + +# --------------------------------------------------------------------- + + +def _guess_datetime_format_for_array(arr, dayfirst: bool | None = False) -> str | None: + # Try to guess the format based on the first non-NaN element, return None if can't + if (first_non_null := tslib.first_non_null(arr)) != -1: + if type(first_non_nan_element := arr[first_non_null]) is str: # noqa: E721 + # GH#32264 np.str_ object + guessed_format = guess_datetime_format( + first_non_nan_element, dayfirst=dayfirst + ) + if guessed_format is not None: + return guessed_format + # If there are multiple non-null elements, warn about + # how parsing might not be consistent + if tslib.first_non_null(arr[first_non_null + 1 :]) != -1: + warnings.warn( + "Could not infer format, so each element will be parsed " + "individually, falling back to `dateutil`. To ensure parsing is " + "consistent and as-expected, please specify a format.", + UserWarning, + stacklevel=find_stack_level(), + ) + return None + + +def should_cache( + arg: ArrayConvertible, unique_share: float = 0.7, check_count: int | None = None +) -> bool: + """ + Decides whether to do caching. + + If the percent of unique elements among `check_count` elements less + than `unique_share * 100` then we can do caching. + + Parameters + ---------- + arg: listlike, tuple, 1-d array, Series + unique_share: float, default=0.7, optional + 0 < unique_share < 1 + check_count: int, optional + 0 <= check_count <= len(arg) + + Returns + ------- + do_caching: bool + + Notes + ----- + By default for a sequence of less than 50 items in size, we don't do + caching; for the number of elements less than 5000, we take ten percent of + all elements to check for a uniqueness share; if the sequence size is more + than 5000, then we check only the first 500 elements. + All constants were chosen empirically by. + """ + do_caching = True + + # default realization + if check_count is None: + # in this case, the gain from caching is negligible + if len(arg) <= start_caching_at: + return False + + if len(arg) <= 5000: + check_count = len(arg) // 10 + else: + check_count = 500 + else: + assert ( + 0 <= check_count <= len(arg) + ), "check_count must be in next bounds: [0; len(arg)]" + if check_count == 0: + return False + + assert 0 < unique_share < 1, "unique_share must be in next bounds: (0; 1)" + + try: + # We can't cache if the items are not hashable. + unique_elements = set(islice(arg, check_count)) + except TypeError: + return False + if len(unique_elements) > check_count * unique_share: + do_caching = False + return do_caching + + +def _maybe_cache( + arg: ArrayConvertible, + format: str | None, + cache: bool, + convert_listlike: Callable, +) -> Series: + """ + Create a cache of unique dates from an array of dates + + Parameters + ---------- + arg : listlike, tuple, 1-d array, Series + format : string + Strftime format to parse time + cache : bool + True attempts to create a cache of converted values + convert_listlike : function + Conversion function to apply on dates + + Returns + ------- + cache_array : Series + Cache of converted, unique dates. Can be empty + """ + from pandas import Series + + cache_array = Series(dtype=object) + + if cache: + # Perform a quicker unique check + if not should_cache(arg): + return cache_array + + if not isinstance(arg, (np.ndarray, ExtensionArray, Index, ABCSeries)): + arg = np.array(arg) + + unique_dates = unique(arg) + if len(unique_dates) < len(arg): + cache_dates = convert_listlike(unique_dates, format) + # GH#45319 + try: + cache_array = Series(cache_dates, index=unique_dates, copy=False) + except OutOfBoundsDatetime: + return cache_array + # GH#39882 and GH#35888 in case of None and NaT we get duplicates + if not cache_array.index.is_unique: + cache_array = cache_array[~cache_array.index.duplicated()] + return cache_array + + +def _box_as_indexlike( + dt_array: ArrayLike, utc: bool = False, name: Hashable | None = None +) -> Index: + """ + Properly boxes the ndarray of datetimes to DatetimeIndex + if it is possible or to generic Index instead + + Parameters + ---------- + dt_array: 1-d array + Array of datetimes to be wrapped in an Index. + utc : bool + Whether to convert/localize timestamps to UTC. + name : string, default None + Name for a resulting index + + Returns + ------- + result : datetime of converted dates + - DatetimeIndex if convertible to sole datetime64 type + - general Index otherwise + """ + + if lib.is_np_dtype(dt_array.dtype, "M"): + tz = "utc" if utc else None + return DatetimeIndex(dt_array, tz=tz, name=name) + return Index(dt_array, name=name, dtype=dt_array.dtype) + + +def _convert_and_box_cache( + arg: DatetimeScalarOrArrayConvertible, + cache_array: Series, + name: Hashable | None = None, +) -> Index: + """ + Convert array of dates with a cache and wrap the result in an Index. + + Parameters + ---------- + arg : integer, float, string, datetime, list, tuple, 1-d array, Series + cache_array : Series + Cache of converted, unique dates + name : string, default None + Name for a DatetimeIndex + + Returns + ------- + result : Index-like of converted dates + """ + from pandas import Series + + result = Series(arg, dtype=cache_array.index.dtype).map(cache_array) + return _box_as_indexlike(result._values, utc=False, name=name) + + +def _convert_listlike_datetimes( + arg, + format: str | None, + name: Hashable | None = None, + utc: bool = False, + unit: str | None = None, + errors: DateTimeErrorChoices = "raise", + dayfirst: bool | None = None, + yearfirst: bool | None = None, + exact: bool = True, +): + """ + Helper function for to_datetime. Performs the conversions of 1D listlike + of dates + + Parameters + ---------- + arg : list, tuple, ndarray, Series, Index + date to be parsed + name : object + None or string for the Index name + utc : bool + Whether to convert/localize timestamps to UTC. + unit : str + None or string of the frequency of the passed data + errors : str + error handing behaviors from to_datetime, 'raise', 'coerce', 'ignore' + dayfirst : bool + dayfirst parsing behavior from to_datetime + yearfirst : bool + yearfirst parsing behavior from to_datetime + exact : bool, default True + exact format matching behavior from to_datetime + + Returns + ------- + Index-like of parsed dates + """ + if isinstance(arg, (list, tuple)): + arg = np.array(arg, dtype="O") + elif isinstance(arg, NumpyExtensionArray): + arg = np.array(arg) + + arg_dtype = getattr(arg, "dtype", None) + # these are shortcutable + tz = "utc" if utc else None + if isinstance(arg_dtype, DatetimeTZDtype): + if not isinstance(arg, (DatetimeArray, DatetimeIndex)): + return DatetimeIndex(arg, tz=tz, name=name) + if utc: + arg = arg.tz_convert(None).tz_localize("utc") + return arg + + elif isinstance(arg_dtype, ArrowDtype) and arg_dtype.type is Timestamp: + # TODO: Combine with above if DTI/DTA supports Arrow timestamps + if utc: + # pyarrow uses UTC, not lowercase utc + if isinstance(arg, Index): + arg_array = cast(ArrowExtensionArray, arg.array) + if arg_dtype.pyarrow_dtype.tz is not None: + arg_array = arg_array._dt_tz_convert("UTC") + else: + arg_array = arg_array._dt_tz_localize("UTC") + arg = Index(arg_array) + else: + # ArrowExtensionArray + if arg_dtype.pyarrow_dtype.tz is not None: + arg = arg._dt_tz_convert("UTC") + else: + arg = arg._dt_tz_localize("UTC") + return arg + + elif lib.is_np_dtype(arg_dtype, "M"): + if not is_supported_dtype(arg_dtype): + # We go to closest supported reso, i.e. "s" + arg = astype_overflowsafe( + # TODO: looks like we incorrectly raise with errors=="ignore" + np.asarray(arg), + np.dtype("M8[s]"), + is_coerce=errors == "coerce", + ) + + if not isinstance(arg, (DatetimeArray, DatetimeIndex)): + return DatetimeIndex(arg, tz=tz, name=name) + elif utc: + # DatetimeArray, DatetimeIndex + return arg.tz_localize("utc") + + return arg + + elif unit is not None: + if format is not None: + raise ValueError("cannot specify both format and unit") + return _to_datetime_with_unit(arg, unit, name, utc, errors) + elif getattr(arg, "ndim", 1) > 1: + raise TypeError( + "arg must be a string, datetime, list, tuple, 1-d array, or Series" + ) + + # warn if passing timedelta64, raise for PeriodDtype + # NB: this must come after unit transformation + try: + arg, _ = maybe_convert_dtype(arg, copy=False, tz=libtimezones.maybe_get_tz(tz)) + except TypeError: + if errors == "coerce": + npvalues = np.array(["NaT"], dtype="datetime64[ns]").repeat(len(arg)) + return DatetimeIndex(npvalues, name=name) + elif errors == "ignore": + idx = Index(arg, name=name) + return idx + raise + + arg = ensure_object(arg) + + if format is None: + format = _guess_datetime_format_for_array(arg, dayfirst=dayfirst) + + # `format` could be inferred, or user didn't ask for mixed-format parsing. + if format is not None and format != "mixed": + return _array_strptime_with_fallback(arg, name, utc, format, exact, errors) + + result, tz_parsed = objects_to_datetime64( + arg, + dayfirst=dayfirst, + yearfirst=yearfirst, + utc=utc, + errors=errors, + allow_object=True, + ) + + if tz_parsed is not None: + # We can take a shortcut since the datetime64 numpy array + # is in UTC + out_unit = np.datetime_data(result.dtype)[0] + dtype = cast(DatetimeTZDtype, tz_to_dtype(tz_parsed, out_unit)) + dt64_values = result.view(f"M8[{dtype.unit}]") + dta = DatetimeArray._simple_new(dt64_values, dtype=dtype) + return DatetimeIndex._simple_new(dta, name=name) + + return _box_as_indexlike(result, utc=utc, name=name) + + +def _array_strptime_with_fallback( + arg, + name, + utc: bool, + fmt: str, + exact: bool, + errors: str, +) -> Index: + """ + Call array_strptime, with fallback behavior depending on 'errors'. + """ + result, tz_out = array_strptime(arg, fmt, exact=exact, errors=errors, utc=utc) + if tz_out is not None: + unit = np.datetime_data(result.dtype)[0] + dtype = DatetimeTZDtype(tz=tz_out, unit=unit) + dta = DatetimeArray._simple_new(result, dtype=dtype) + if utc: + dta = dta.tz_convert("UTC") + return Index(dta, name=name) + elif result.dtype != object and utc: + unit = np.datetime_data(result.dtype)[0] + res = Index(result, dtype=f"M8[{unit}, UTC]", name=name) + return res + return Index(result, dtype=result.dtype, name=name) + + +def _to_datetime_with_unit(arg, unit, name, utc: bool, errors: str) -> Index: + """ + to_datetime specalized to the case where a 'unit' is passed. + """ + arg = extract_array(arg, extract_numpy=True) + + # GH#30050 pass an ndarray to tslib.array_with_unit_to_datetime + # because it expects an ndarray argument + if isinstance(arg, IntegerArray): + arr = arg.astype(f"datetime64[{unit}]") + tz_parsed = None + else: + arg = np.asarray(arg) + + if arg.dtype.kind in "iu": + # Note we can't do "f" here because that could induce unwanted + # rounding GH#14156, GH#20445 + arr = arg.astype(f"datetime64[{unit}]", copy=False) + try: + arr = astype_overflowsafe(arr, np.dtype("M8[ns]"), copy=False) + except OutOfBoundsDatetime: + if errors == "raise": + raise + arg = arg.astype(object) + return _to_datetime_with_unit(arg, unit, name, utc, errors) + tz_parsed = None + + elif arg.dtype.kind == "f": + with np.errstate(over="raise"): + try: + arr = cast_from_unit_vectorized(arg, unit=unit) + except OutOfBoundsDatetime: + if errors != "raise": + return _to_datetime_with_unit( + arg.astype(object), unit, name, utc, errors + ) + raise OutOfBoundsDatetime( + f"cannot convert input with unit '{unit}'" + ) + + arr = arr.view("M8[ns]") + tz_parsed = None + else: + arg = arg.astype(object, copy=False) + arr, tz_parsed = tslib.array_with_unit_to_datetime(arg, unit, errors=errors) + + if errors == "ignore": + # Index constructor _may_ infer to DatetimeIndex + result = Index._with_infer(arr, name=name) + else: + result = DatetimeIndex(arr, name=name) + + if not isinstance(result, DatetimeIndex): + return result + + # GH#23758: We may still need to localize the result with tz + # GH#25546: Apply tz_parsed first (from arg), then tz (from caller) + # result will be naive but in UTC + result = result.tz_localize("UTC").tz_convert(tz_parsed) + + if utc: + if result.tz is None: + result = result.tz_localize("utc") + else: + result = result.tz_convert("utc") + return result + + +def _adjust_to_origin(arg, origin, unit): + """ + Helper function for to_datetime. + Adjust input argument to the specified origin + + Parameters + ---------- + arg : list, tuple, ndarray, Series, Index + date to be adjusted + origin : 'julian' or Timestamp + origin offset for the arg + unit : str + passed unit from to_datetime, must be 'D' + + Returns + ------- + ndarray or scalar of adjusted date(s) + """ + if origin == "julian": + original = arg + j0 = Timestamp(0).to_julian_date() + if unit != "D": + raise ValueError("unit must be 'D' for origin='julian'") + try: + arg = arg - j0 + except TypeError as err: + raise ValueError( + "incompatible 'arg' type for given 'origin'='julian'" + ) from err + + # preemptively check this for a nice range + j_max = Timestamp.max.to_julian_date() - j0 + j_min = Timestamp.min.to_julian_date() - j0 + if np.any(arg > j_max) or np.any(arg < j_min): + raise OutOfBoundsDatetime( + f"{original} is Out of Bounds for origin='julian'" + ) + else: + # arg must be numeric + if not ( + (is_integer(arg) or is_float(arg)) or is_numeric_dtype(np.asarray(arg)) + ): + raise ValueError( + f"'{arg}' is not compatible with origin='{origin}'; " + "it must be numeric with a unit specified" + ) + + # we are going to offset back to unix / epoch time + try: + offset = Timestamp(origin, unit=unit) + except OutOfBoundsDatetime as err: + raise OutOfBoundsDatetime(f"origin {origin} is Out of Bounds") from err + except ValueError as err: + raise ValueError( + f"origin {origin} cannot be converted to a Timestamp" + ) from err + + if offset.tz is not None: + raise ValueError(f"origin offset {offset} must be tz-naive") + td_offset = offset - Timestamp(0) + + # convert the offset to the unit of the arg + # this should be lossless in terms of precision + ioffset = td_offset // Timedelta(1, unit=unit) + + # scalars & ndarray-like can handle the addition + if is_list_like(arg) and not isinstance(arg, (ABCSeries, Index, np.ndarray)): + arg = np.asarray(arg) + arg = arg + ioffset + return arg + + +@overload +def to_datetime( + arg: DatetimeScalar, + errors: DateTimeErrorChoices = ..., + dayfirst: bool = ..., + yearfirst: bool = ..., + utc: bool = ..., + format: str | None = ..., + exact: bool = ..., + unit: str | None = ..., + infer_datetime_format: bool = ..., + origin=..., + cache: bool = ..., +) -> Timestamp: + ... + + +@overload +def to_datetime( + arg: Series | DictConvertible, + errors: DateTimeErrorChoices = ..., + dayfirst: bool = ..., + yearfirst: bool = ..., + utc: bool = ..., + format: str | None = ..., + exact: bool = ..., + unit: str | None = ..., + infer_datetime_format: bool = ..., + origin=..., + cache: bool = ..., +) -> Series: + ... + + +@overload +def to_datetime( + arg: list | tuple | Index | ArrayLike, + errors: DateTimeErrorChoices = ..., + dayfirst: bool = ..., + yearfirst: bool = ..., + utc: bool = ..., + format: str | None = ..., + exact: bool = ..., + unit: str | None = ..., + infer_datetime_format: bool = ..., + origin=..., + cache: bool = ..., +) -> DatetimeIndex: + ... + + +def to_datetime( + arg: DatetimeScalarOrArrayConvertible | DictConvertible, + errors: DateTimeErrorChoices = "raise", + dayfirst: bool = False, + yearfirst: bool = False, + utc: bool = False, + format: str | None = None, + exact: bool | lib.NoDefault = lib.no_default, + unit: str | None = None, + infer_datetime_format: lib.NoDefault | bool = lib.no_default, + origin: str = "unix", + cache: bool = True, +) -> DatetimeIndex | Series | DatetimeScalar | NaTType | None: + """ + Convert argument to datetime. + + This function converts a scalar, array-like, :class:`Series` or + :class:`DataFrame`/dict-like to a pandas datetime object. + + Parameters + ---------- + arg : int, float, str, datetime, list, tuple, 1-d array, Series, DataFrame/dict-like + The object to convert to a datetime. If a :class:`DataFrame` is provided, the + method expects minimally the following columns: :const:`"year"`, + :const:`"month"`, :const:`"day"`. The column "year" + must be specified in 4-digit format. + errors : {'ignore', 'raise', 'coerce'}, default 'raise' + - If :const:`'raise'`, then invalid parsing will raise an exception. + - If :const:`'coerce'`, then invalid parsing will be set as :const:`NaT`. + - If :const:`'ignore'`, then invalid parsing will return the input. + dayfirst : bool, default False + Specify a date parse order if `arg` is str or is list-like. + If :const:`True`, parses dates with the day first, e.g. :const:`"10/11/12"` + is parsed as :const:`2012-11-10`. + + .. warning:: + + ``dayfirst=True`` is not strict, but will prefer to parse + with day first. + + yearfirst : bool, default False + Specify a date parse order if `arg` is str or is list-like. + + - If :const:`True` parses dates with the year first, e.g. + :const:`"10/11/12"` is parsed as :const:`2010-11-12`. + - If both `dayfirst` and `yearfirst` are :const:`True`, `yearfirst` is + preceded (same as :mod:`dateutil`). + + .. warning:: + + ``yearfirst=True`` is not strict, but will prefer to parse + with year first. + + utc : bool, default False + Control timezone-related parsing, localization and conversion. + + - If :const:`True`, the function *always* returns a timezone-aware + UTC-localized :class:`Timestamp`, :class:`Series` or + :class:`DatetimeIndex`. To do this, timezone-naive inputs are + *localized* as UTC, while timezone-aware inputs are *converted* to UTC. + + - If :const:`False` (default), inputs will not be coerced to UTC. + Timezone-naive inputs will remain naive, while timezone-aware ones + will keep their time offsets. Limitations exist for mixed + offsets (typically, daylight savings), see :ref:`Examples + ` section for details. + + .. warning:: + + In a future version of pandas, parsing datetimes with mixed time + zones will raise an error unless `utc=True`. + Please specify `utc=True` to opt in to the new behaviour + and silence this warning. To create a `Series` with mixed offsets and + `object` dtype, please use `apply` and `datetime.datetime.strptime`. + + See also: pandas general documentation about `timezone conversion and + localization + `_. + + format : str, default None + The strftime to parse time, e.g. :const:`"%d/%m/%Y"`. See + `strftime documentation + `_ for more information on choices, though + note that :const:`"%f"` will parse all the way up to nanoseconds. + You can also pass: + + - "ISO8601", to parse any `ISO8601 `_ + time string (not necessarily in exactly the same format); + - "mixed", to infer the format for each element individually. This is risky, + and you should probably use it along with `dayfirst`. + + .. note:: + + If a :class:`DataFrame` is passed, then `format` has no effect. + + exact : bool, default True + Control how `format` is used: + + - If :const:`True`, require an exact `format` match. + - If :const:`False`, allow the `format` to match anywhere in the target + string. + + Cannot be used alongside ``format='ISO8601'`` or ``format='mixed'``. + unit : str, default 'ns' + The unit of the arg (D,s,ms,us,ns) denote the unit, which is an + integer or float number. This will be based off the origin. + Example, with ``unit='ms'`` and ``origin='unix'``, this would calculate + the number of milliseconds to the unix epoch start. + infer_datetime_format : bool, default False + If :const:`True` and no `format` is given, attempt to infer the format + of the datetime strings based on the first non-NaN element, + and if it can be inferred, switch to a faster method of parsing them. + In some cases this can increase the parsing speed by ~5-10x. + + .. deprecated:: 2.0.0 + A strict version of this argument is now the default, passing it has + no effect. + + origin : scalar, default 'unix' + Define the reference date. The numeric values would be parsed as number + of units (defined by `unit`) since this reference date. + + - If :const:`'unix'` (or POSIX) time; origin is set to 1970-01-01. + - If :const:`'julian'`, unit must be :const:`'D'`, and origin is set to + beginning of Julian Calendar. Julian day number :const:`0` is assigned + to the day starting at noon on January 1, 4713 BC. + - If Timestamp convertible (Timestamp, dt.datetime, np.datetimt64 or date + string), origin is set to Timestamp identified by origin. + - If a float or integer, origin is the difference + (in units determined by the ``unit`` argument) relative to 1970-01-01. + cache : bool, default True + If :const:`True`, use a cache of unique, converted dates to apply the + datetime conversion. May produce significant speed-up when parsing + duplicate date strings, especially ones with timezone offsets. The cache + is only used when there are at least 50 values. The presence of + out-of-bounds values will render the cache unusable and may slow down + parsing. + + Returns + ------- + datetime + If parsing succeeded. + Return type depends on input (types in parenthesis correspond to + fallback in case of unsuccessful timezone or out-of-range timestamp + parsing): + + - scalar: :class:`Timestamp` (or :class:`datetime.datetime`) + - array-like: :class:`DatetimeIndex` (or :class:`Series` with + :class:`object` dtype containing :class:`datetime.datetime`) + - Series: :class:`Series` of :class:`datetime64` dtype (or + :class:`Series` of :class:`object` dtype containing + :class:`datetime.datetime`) + - DataFrame: :class:`Series` of :class:`datetime64` dtype (or + :class:`Series` of :class:`object` dtype containing + :class:`datetime.datetime`) + + Raises + ------ + ParserError + When parsing a date from string fails. + ValueError + When another datetime conversion error happens. For example when one + of 'year', 'month', day' columns is missing in a :class:`DataFrame`, or + when a Timezone-aware :class:`datetime.datetime` is found in an array-like + of mixed time offsets, and ``utc=False``. + + See Also + -------- + DataFrame.astype : Cast argument to a specified dtype. + to_timedelta : Convert argument to timedelta. + convert_dtypes : Convert dtypes. + + Notes + ----- + + Many input types are supported, and lead to different output types: + + - **scalars** can be int, float, str, datetime object (from stdlib :mod:`datetime` + module or :mod:`numpy`). They are converted to :class:`Timestamp` when + possible, otherwise they are converted to :class:`datetime.datetime`. + None/NaN/null scalars are converted to :const:`NaT`. + + - **array-like** can contain int, float, str, datetime objects. They are + converted to :class:`DatetimeIndex` when possible, otherwise they are + converted to :class:`Index` with :class:`object` dtype, containing + :class:`datetime.datetime`. None/NaN/null entries are converted to + :const:`NaT` in both cases. + + - **Series** are converted to :class:`Series` with :class:`datetime64` + dtype when possible, otherwise they are converted to :class:`Series` with + :class:`object` dtype, containing :class:`datetime.datetime`. None/NaN/null + entries are converted to :const:`NaT` in both cases. + + - **DataFrame/dict-like** are converted to :class:`Series` with + :class:`datetime64` dtype. For each row a datetime is created from assembling + the various dataframe columns. Column keys can be common abbreviations + like ['year', 'month', 'day', 'minute', 'second', 'ms', 'us', 'ns']) or + plurals of the same. + + The following causes are responsible for :class:`datetime.datetime` objects + being returned (possibly inside an :class:`Index` or a :class:`Series` with + :class:`object` dtype) instead of a proper pandas designated type + (:class:`Timestamp`, :class:`DatetimeIndex` or :class:`Series` + with :class:`datetime64` dtype): + + - when any input element is before :const:`Timestamp.min` or after + :const:`Timestamp.max`, see `timestamp limitations + `_. + + - when ``utc=False`` (default) and the input is an array-like or + :class:`Series` containing mixed naive/aware datetime, or aware with mixed + time offsets. Note that this happens in the (quite frequent) situation when + the timezone has a daylight savings policy. In that case you may wish to + use ``utc=True``. + + Examples + -------- + + **Handling various input formats** + + Assembling a datetime from multiple columns of a :class:`DataFrame`. The keys + can be common abbreviations like ['year', 'month', 'day', 'minute', 'second', + 'ms', 'us', 'ns']) or plurals of the same + + >>> df = pd.DataFrame({'year': [2015, 2016], + ... 'month': [2, 3], + ... 'day': [4, 5]}) + >>> pd.to_datetime(df) + 0 2015-02-04 + 1 2016-03-05 + dtype: datetime64[ns] + + Using a unix epoch time + + >>> pd.to_datetime(1490195805, unit='s') + Timestamp('2017-03-22 15:16:45') + >>> pd.to_datetime(1490195805433502912, unit='ns') + Timestamp('2017-03-22 15:16:45.433502912') + + .. warning:: For float arg, precision rounding might happen. To prevent + unexpected behavior use a fixed-width exact type. + + Using a non-unix epoch origin + + >>> pd.to_datetime([1, 2, 3], unit='D', + ... origin=pd.Timestamp('1960-01-01')) + DatetimeIndex(['1960-01-02', '1960-01-03', '1960-01-04'], + dtype='datetime64[ns]', freq=None) + + **Differences with strptime behavior** + + :const:`"%f"` will parse all the way up to nanoseconds. + + >>> pd.to_datetime('2018-10-26 12:00:00.0000000011', + ... format='%Y-%m-%d %H:%M:%S.%f') + Timestamp('2018-10-26 12:00:00.000000001') + + **Non-convertible date/times** + + Passing ``errors='coerce'`` will force an out-of-bounds date to :const:`NaT`, + in addition to forcing non-dates (or non-parseable dates) to :const:`NaT`. + + >>> pd.to_datetime('13000101', format='%Y%m%d', errors='coerce') + NaT + + .. _to_datetime_tz_examples: + + **Timezones and time offsets** + + The default behaviour (``utc=False``) is as follows: + + - Timezone-naive inputs are converted to timezone-naive :class:`DatetimeIndex`: + + >>> pd.to_datetime(['2018-10-26 12:00:00', '2018-10-26 13:00:15']) + DatetimeIndex(['2018-10-26 12:00:00', '2018-10-26 13:00:15'], + dtype='datetime64[ns]', freq=None) + + - Timezone-aware inputs *with constant time offset* are converted to + timezone-aware :class:`DatetimeIndex`: + + >>> pd.to_datetime(['2018-10-26 12:00 -0500', '2018-10-26 13:00 -0500']) + DatetimeIndex(['2018-10-26 12:00:00-05:00', '2018-10-26 13:00:00-05:00'], + dtype='datetime64[ns, UTC-05:00]', freq=None) + + - However, timezone-aware inputs *with mixed time offsets* (for example + issued from a timezone with daylight savings, such as Europe/Paris) + are **not successfully converted** to a :class:`DatetimeIndex`. + Parsing datetimes with mixed time zones will show a warning unless + `utc=True`. If you specify `utc=False` the warning below will be shown + and a simple :class:`Index` containing :class:`datetime.datetime` + objects will be returned: + + >>> pd.to_datetime(['2020-10-25 02:00 +0200', + ... '2020-10-25 04:00 +0100']) # doctest: +SKIP + FutureWarning: In a future version of pandas, parsing datetimes with mixed + time zones will raise an error unless `utc=True`. Please specify `utc=True` + to opt in to the new behaviour and silence this warning. To create a `Series` + with mixed offsets and `object` dtype, please use `apply` and + `datetime.datetime.strptime`. + Index([2020-10-25 02:00:00+02:00, 2020-10-25 04:00:00+01:00], + dtype='object') + + - A mix of timezone-aware and timezone-naive inputs is also converted to + a simple :class:`Index` containing :class:`datetime.datetime` objects: + + >>> from datetime import datetime + >>> pd.to_datetime(["2020-01-01 01:00:00-01:00", + ... datetime(2020, 1, 1, 3, 0)]) # doctest: +SKIP + FutureWarning: In a future version of pandas, parsing datetimes with mixed + time zones will raise an error unless `utc=True`. Please specify `utc=True` + to opt in to the new behaviour and silence this warning. To create a `Series` + with mixed offsets and `object` dtype, please use `apply` and + `datetime.datetime.strptime`. + Index([2020-01-01 01:00:00-01:00, 2020-01-01 03:00:00], dtype='object') + + | + + Setting ``utc=True`` solves most of the above issues: + + - Timezone-naive inputs are *localized* as UTC + + >>> pd.to_datetime(['2018-10-26 12:00', '2018-10-26 13:00'], utc=True) + DatetimeIndex(['2018-10-26 12:00:00+00:00', '2018-10-26 13:00:00+00:00'], + dtype='datetime64[ns, UTC]', freq=None) + + - Timezone-aware inputs are *converted* to UTC (the output represents the + exact same datetime, but viewed from the UTC time offset `+00:00`). + + >>> pd.to_datetime(['2018-10-26 12:00 -0530', '2018-10-26 12:00 -0500'], + ... utc=True) + DatetimeIndex(['2018-10-26 17:30:00+00:00', '2018-10-26 17:00:00+00:00'], + dtype='datetime64[ns, UTC]', freq=None) + + - Inputs can contain both string or datetime, the above + rules still apply + + >>> pd.to_datetime(['2018-10-26 12:00', datetime(2020, 1, 1, 18)], utc=True) + DatetimeIndex(['2018-10-26 12:00:00+00:00', '2020-01-01 18:00:00+00:00'], + dtype='datetime64[ns, UTC]', freq=None) + """ + if exact is not lib.no_default and format in {"mixed", "ISO8601"}: + raise ValueError("Cannot use 'exact' when 'format' is 'mixed' or 'ISO8601'") + if infer_datetime_format is not lib.no_default: + warnings.warn( + "The argument 'infer_datetime_format' is deprecated and will " + "be removed in a future version. " + "A strict version of it is now the default, see " + "https://pandas.pydata.org/pdeps/0004-consistent-to-datetime-parsing.html. " + "You can safely remove this argument.", + stacklevel=find_stack_level(), + ) + if errors == "ignore": + # GH#54467 + warnings.warn( + "errors='ignore' is deprecated and will raise in a future version. " + "Use to_datetime without passing `errors` and catch exceptions " + "explicitly instead", + FutureWarning, + stacklevel=find_stack_level(), + ) + + if arg is None: + return None + + if origin != "unix": + arg = _adjust_to_origin(arg, origin, unit) + + convert_listlike = partial( + _convert_listlike_datetimes, + utc=utc, + unit=unit, + dayfirst=dayfirst, + yearfirst=yearfirst, + errors=errors, + exact=exact, + ) + # pylint: disable-next=used-before-assignment + result: Timestamp | NaTType | Series | Index + + if isinstance(arg, Timestamp): + result = arg + if utc: + if arg.tz is not None: + result = arg.tz_convert("utc") + else: + result = arg.tz_localize("utc") + elif isinstance(arg, ABCSeries): + cache_array = _maybe_cache(arg, format, cache, convert_listlike) + if not cache_array.empty: + result = arg.map(cache_array) + else: + values = convert_listlike(arg._values, format) + result = arg._constructor(values, index=arg.index, name=arg.name) + elif isinstance(arg, (ABCDataFrame, abc.MutableMapping)): + result = _assemble_from_unit_mappings(arg, errors, utc) + elif isinstance(arg, Index): + cache_array = _maybe_cache(arg, format, cache, convert_listlike) + if not cache_array.empty: + result = _convert_and_box_cache(arg, cache_array, name=arg.name) + else: + result = convert_listlike(arg, format, name=arg.name) + elif is_list_like(arg): + try: + # error: Argument 1 to "_maybe_cache" has incompatible type + # "Union[float, str, datetime, List[Any], Tuple[Any, ...], ExtensionArray, + # ndarray[Any, Any], Series]"; expected "Union[List[Any], Tuple[Any, ...], + # Union[Union[ExtensionArray, ndarray[Any, Any]], Index, Series], Series]" + argc = cast( + Union[list, tuple, ExtensionArray, np.ndarray, "Series", Index], arg + ) + cache_array = _maybe_cache(argc, format, cache, convert_listlike) + except OutOfBoundsDatetime: + # caching attempts to create a DatetimeIndex, which may raise + # an OOB. If that's the desired behavior, then just reraise... + if errors == "raise": + raise + # ... otherwise, continue without the cache. + from pandas import Series + + cache_array = Series([], dtype=object) # just an empty array + if not cache_array.empty: + result = _convert_and_box_cache(argc, cache_array) + else: + result = convert_listlike(argc, format) + else: + result = convert_listlike(np.array([arg]), format)[0] + if isinstance(arg, bool) and isinstance(result, np.bool_): + result = bool(result) # TODO: avoid this kludge. + + # error: Incompatible return value type (got "Union[Timestamp, NaTType, + # Series, Index]", expected "Union[DatetimeIndex, Series, float, str, + # NaTType, None]") + return result # type: ignore[return-value] + + +# mappings for assembling units +_unit_map = { + "year": "year", + "years": "year", + "month": "month", + "months": "month", + "day": "day", + "days": "day", + "hour": "h", + "hours": "h", + "minute": "m", + "minutes": "m", + "second": "s", + "seconds": "s", + "ms": "ms", + "millisecond": "ms", + "milliseconds": "ms", + "us": "us", + "microsecond": "us", + "microseconds": "us", + "ns": "ns", + "nanosecond": "ns", + "nanoseconds": "ns", +} + + +def _assemble_from_unit_mappings(arg, errors: DateTimeErrorChoices, utc: bool): + """ + assemble the unit specified fields from the arg (DataFrame) + Return a Series for actual parsing + + Parameters + ---------- + arg : DataFrame + errors : {'ignore', 'raise', 'coerce'}, default 'raise' + + - If :const:`'raise'`, then invalid parsing will raise an exception + - If :const:`'coerce'`, then invalid parsing will be set as :const:`NaT` + - If :const:`'ignore'`, then invalid parsing will return the input + utc : bool + Whether to convert/localize timestamps to UTC. + + Returns + ------- + Series + """ + from pandas import ( + DataFrame, + to_numeric, + to_timedelta, + ) + + arg = DataFrame(arg) + if not arg.columns.is_unique: + raise ValueError("cannot assemble with duplicate keys") + + # replace passed unit with _unit_map + def f(value): + if value in _unit_map: + return _unit_map[value] + + # m is case significant + if value.lower() in _unit_map: + return _unit_map[value.lower()] + + return value + + unit = {k: f(k) for k in arg.keys()} + unit_rev = {v: k for k, v in unit.items()} + + # we require at least Ymd + required = ["year", "month", "day"] + req = sorted(set(required) - set(unit_rev.keys())) + if len(req): + _required = ",".join(req) + raise ValueError( + "to assemble mappings requires at least that " + f"[year, month, day] be specified: [{_required}] is missing" + ) + + # keys we don't recognize + excess = sorted(set(unit_rev.keys()) - set(_unit_map.values())) + if len(excess): + _excess = ",".join(excess) + raise ValueError( + f"extra keys have been passed to the datetime assemblage: [{_excess}]" + ) + + def coerce(values): + # we allow coercion to if errors allows + values = to_numeric(values, errors=errors) + + # prevent overflow in case of int8 or int16 + if is_integer_dtype(values.dtype): + values = values.astype("int64", copy=False) + return values + + values = ( + coerce(arg[unit_rev["year"]]) * 10000 + + coerce(arg[unit_rev["month"]]) * 100 + + coerce(arg[unit_rev["day"]]) + ) + try: + values = to_datetime(values, format="%Y%m%d", errors=errors, utc=utc) + except (TypeError, ValueError) as err: + raise ValueError(f"cannot assemble the datetimes: {err}") from err + + units: list[UnitChoices] = ["h", "m", "s", "ms", "us", "ns"] + for u in units: + value = unit_rev.get(u) + if value is not None and value in arg: + try: + values += to_timedelta(coerce(arg[value]), unit=u, errors=errors) + except (TypeError, ValueError) as err: + raise ValueError( + f"cannot assemble the datetimes [{value}]: {err}" + ) from err + return values + + +__all__ = [ + "DateParseError", + "should_cache", + "to_datetime", +] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/tools/numeric.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/tools/numeric.py new file mode 100644 index 0000000000000000000000000000000000000000..09652a7d8bc92b7274f3064fa1df03f99d4a269e --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/tools/numeric.py @@ -0,0 +1,329 @@ +from __future__ import annotations + +from typing import ( + TYPE_CHECKING, + Literal, +) +import warnings + +import numpy as np + +from pandas._libs import lib +from pandas.util._exceptions import find_stack_level +from pandas.util._validators import check_dtype_backend + +from pandas.core.dtypes.cast import maybe_downcast_numeric +from pandas.core.dtypes.common import ( + ensure_object, + is_bool_dtype, + is_decimal, + is_integer_dtype, + is_number, + is_numeric_dtype, + is_scalar, + is_string_dtype, + needs_i8_conversion, +) +from pandas.core.dtypes.dtypes import ArrowDtype +from pandas.core.dtypes.generic import ( + ABCIndex, + ABCSeries, +) + +from pandas.core.arrays import BaseMaskedArray +from pandas.core.arrays.string_ import StringDtype + +if TYPE_CHECKING: + from pandas._typing import ( + DateTimeErrorChoices, + DtypeBackend, + npt, + ) + + +def to_numeric( + arg, + errors: DateTimeErrorChoices = "raise", + downcast: Literal["integer", "signed", "unsigned", "float"] | None = None, + dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default, +): + """ + Convert argument to a numeric type. + + The default return dtype is `float64` or `int64` + depending on the data supplied. Use the `downcast` parameter + to obtain other dtypes. + + Please note that precision loss may occur if really large numbers + are passed in. Due to the internal limitations of `ndarray`, if + numbers smaller than `-9223372036854775808` (np.iinfo(np.int64).min) + or larger than `18446744073709551615` (np.iinfo(np.uint64).max) are + passed in, it is very likely they will be converted to float so that + they can be stored in an `ndarray`. These warnings apply similarly to + `Series` since it internally leverages `ndarray`. + + Parameters + ---------- + arg : scalar, list, tuple, 1-d array, or Series + Argument to be converted. + errors : {'ignore', 'raise', 'coerce'}, default 'raise' + - If 'raise', then invalid parsing will raise an exception. + - If 'coerce', then invalid parsing will be set as NaN. + - If 'ignore', then invalid parsing will return the input. + + .. versionchanged:: 2.2 + + "ignore" is deprecated. Catch exceptions explicitly instead. + + downcast : str, default None + Can be 'integer', 'signed', 'unsigned', or 'float'. + If not None, and if the data has been successfully cast to a + numerical dtype (or if the data was numeric to begin with), + downcast that resulting data to the smallest numerical dtype + possible according to the following rules: + + - 'integer' or 'signed': smallest signed int dtype (min.: np.int8) + - 'unsigned': smallest unsigned int dtype (min.: np.uint8) + - 'float': smallest float dtype (min.: np.float32) + + As this behaviour is separate from the core conversion to + numeric values, any errors raised during the downcasting + will be surfaced regardless of the value of the 'errors' input. + + In addition, downcasting will only occur if the size + of the resulting data's dtype is strictly larger than + the dtype it is to be cast to, so if none of the dtypes + checked satisfy that specification, no downcasting will be + performed on the data. + dtype_backend : {'numpy_nullable', 'pyarrow'}, default 'numpy_nullable' + Back-end data type applied to the resultant :class:`DataFrame` + (still experimental). Behaviour is as follows: + + * ``"numpy_nullable"``: returns nullable-dtype-backed :class:`DataFrame` + (default). + * ``"pyarrow"``: returns pyarrow-backed nullable :class:`ArrowDtype` + DataFrame. + + .. versionadded:: 2.0 + + Returns + ------- + ret + Numeric if parsing succeeded. + Return type depends on input. Series if Series, otherwise ndarray. + + See Also + -------- + DataFrame.astype : Cast argument to a specified dtype. + to_datetime : Convert argument to datetime. + to_timedelta : Convert argument to timedelta. + numpy.ndarray.astype : Cast a numpy array to a specified type. + DataFrame.convert_dtypes : Convert dtypes. + + Examples + -------- + Take separate series and convert to numeric, coercing when told to + + >>> s = pd.Series(['1.0', '2', -3]) + >>> pd.to_numeric(s) + 0 1.0 + 1 2.0 + 2 -3.0 + dtype: float64 + >>> pd.to_numeric(s, downcast='float') + 0 1.0 + 1 2.0 + 2 -3.0 + dtype: float32 + >>> pd.to_numeric(s, downcast='signed') + 0 1 + 1 2 + 2 -3 + dtype: int8 + >>> s = pd.Series(['apple', '1.0', '2', -3]) + >>> pd.to_numeric(s, errors='coerce') + 0 NaN + 1 1.0 + 2 2.0 + 3 -3.0 + dtype: float64 + + Downcasting of nullable integer and floating dtypes is supported: + + >>> s = pd.Series([1, 2, 3], dtype="Int64") + >>> pd.to_numeric(s, downcast="integer") + 0 1 + 1 2 + 2 3 + dtype: Int8 + >>> s = pd.Series([1.0, 2.1, 3.0], dtype="Float64") + >>> pd.to_numeric(s, downcast="float") + 0 1.0 + 1 2.1 + 2 3.0 + dtype: Float32 + """ + if downcast not in (None, "integer", "signed", "unsigned", "float"): + raise ValueError("invalid downcasting method provided") + + if errors not in ("ignore", "raise", "coerce"): + raise ValueError("invalid error value specified") + if errors == "ignore": + # GH#54467 + warnings.warn( + "errors='ignore' is deprecated and will raise in a future version. " + "Use to_numeric without passing `errors` and catch exceptions " + "explicitly instead", + FutureWarning, + stacklevel=find_stack_level(), + ) + + check_dtype_backend(dtype_backend) + + is_series = False + is_index = False + is_scalars = False + + if isinstance(arg, ABCSeries): + is_series = True + values = arg.values + elif isinstance(arg, ABCIndex): + is_index = True + if needs_i8_conversion(arg.dtype): + values = arg.view("i8") + else: + values = arg.values + elif isinstance(arg, (list, tuple)): + values = np.array(arg, dtype="O") + elif is_scalar(arg): + if is_decimal(arg): + return float(arg) + if is_number(arg): + return arg + is_scalars = True + values = np.array([arg], dtype="O") + elif getattr(arg, "ndim", 1) > 1: + raise TypeError("arg must be a list, tuple, 1-d array, or Series") + else: + values = arg + + orig_values = values + + # GH33013: for IntegerArray & FloatingArray extract non-null values for casting + # save mask to reconstruct the full array after casting + mask: npt.NDArray[np.bool_] | None = None + if isinstance(values, BaseMaskedArray): + mask = values._mask + values = values._data[~mask] + + values_dtype = getattr(values, "dtype", None) + if isinstance(values_dtype, ArrowDtype): + mask = values.isna() + values = values.dropna().to_numpy() + new_mask: np.ndarray | None = None + if is_numeric_dtype(values_dtype): + pass + elif lib.is_np_dtype(values_dtype, "mM"): + values = values.view(np.int64) + else: + values = ensure_object(values) + coerce_numeric = errors not in ("ignore", "raise") + try: + values, new_mask = lib.maybe_convert_numeric( # type: ignore[call-overload] + values, + set(), + coerce_numeric=coerce_numeric, + convert_to_masked_nullable=dtype_backend is not lib.no_default + or isinstance(values_dtype, StringDtype) + and not values_dtype.storage == "pyarrow_numpy", + ) + except (ValueError, TypeError): + if errors == "raise": + raise + values = orig_values + + if new_mask is not None: + # Remove unnecessary values, is expected later anyway and enables + # downcasting + values = values[~new_mask] + elif ( + dtype_backend is not lib.no_default + and new_mask is None + or isinstance(values_dtype, StringDtype) + and not values_dtype.storage == "pyarrow_numpy" + ): + new_mask = np.zeros(values.shape, dtype=np.bool_) + + # attempt downcast only if the data has been successfully converted + # to a numerical dtype and if a downcast method has been specified + if downcast is not None and is_numeric_dtype(values.dtype): + typecodes: str | None = None + + if downcast in ("integer", "signed"): + typecodes = np.typecodes["Integer"] + elif downcast == "unsigned" and (not len(values) or np.min(values) >= 0): + typecodes = np.typecodes["UnsignedInteger"] + elif downcast == "float": + typecodes = np.typecodes["Float"] + + # pandas support goes only to np.float32, + # as float dtypes smaller than that are + # extremely rare and not well supported + float_32_char = np.dtype(np.float32).char + float_32_ind = typecodes.index(float_32_char) + typecodes = typecodes[float_32_ind:] + + if typecodes is not None: + # from smallest to largest + for typecode in typecodes: + dtype = np.dtype(typecode) + if dtype.itemsize <= values.dtype.itemsize: + values = maybe_downcast_numeric(values, dtype) + + # successful conversion + if values.dtype == dtype: + break + + # GH33013: for IntegerArray, BooleanArray & FloatingArray need to reconstruct + # masked array + if (mask is not None or new_mask is not None) and not is_string_dtype(values.dtype): + if mask is None or (new_mask is not None and new_mask.shape == mask.shape): + # GH 52588 + mask = new_mask + else: + mask = mask.copy() + assert isinstance(mask, np.ndarray) + data = np.zeros(mask.shape, dtype=values.dtype) + data[~mask] = values + + from pandas.core.arrays import ( + ArrowExtensionArray, + BooleanArray, + FloatingArray, + IntegerArray, + ) + + klass: type[IntegerArray | BooleanArray | FloatingArray] + if is_integer_dtype(data.dtype): + klass = IntegerArray + elif is_bool_dtype(data.dtype): + klass = BooleanArray + else: + klass = FloatingArray + values = klass(data, mask) + + if dtype_backend == "pyarrow" or isinstance(values_dtype, ArrowDtype): + values = ArrowExtensionArray(values.__arrow_array__()) + + if is_series: + return arg._constructor(values, index=arg.index, name=arg.name) + elif is_index: + # because we want to coerce to numeric if possible, + # do not use _shallow_copy + from pandas import Index + + return Index(values, name=arg.name) + elif is_scalars: + return values[0] + else: + return values diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/tools/timedeltas.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/tools/timedeltas.py new file mode 100644 index 0000000000000000000000000000000000000000..d772c908c473109fcf7e37e06014b43226328e31 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/tools/timedeltas.py @@ -0,0 +1,283 @@ +""" +timedelta support tools +""" +from __future__ import annotations + +from typing import ( + TYPE_CHECKING, + overload, +) +import warnings + +import numpy as np + +from pandas._libs import lib +from pandas._libs.tslibs import ( + NaT, + NaTType, +) +from pandas._libs.tslibs.timedeltas import ( + Timedelta, + disallow_ambiguous_unit, + parse_timedelta_unit, +) +from pandas.util._exceptions import find_stack_level + +from pandas.core.dtypes.common import is_list_like +from pandas.core.dtypes.dtypes import ArrowDtype +from pandas.core.dtypes.generic import ( + ABCIndex, + ABCSeries, +) + +from pandas.core.arrays.timedeltas import sequence_to_td64ns + +if TYPE_CHECKING: + from collections.abc import Hashable + from datetime import timedelta + + from pandas._libs.tslibs.timedeltas import UnitChoices + from pandas._typing import ( + ArrayLike, + DateTimeErrorChoices, + ) + + from pandas import ( + Index, + Series, + TimedeltaIndex, + ) + + +@overload +def to_timedelta( + arg: str | float | timedelta, + unit: UnitChoices | None = ..., + errors: DateTimeErrorChoices = ..., +) -> Timedelta: + ... + + +@overload +def to_timedelta( + arg: Series, + unit: UnitChoices | None = ..., + errors: DateTimeErrorChoices = ..., +) -> Series: + ... + + +@overload +def to_timedelta( + arg: list | tuple | range | ArrayLike | Index, + unit: UnitChoices | None = ..., + errors: DateTimeErrorChoices = ..., +) -> TimedeltaIndex: + ... + + +def to_timedelta( + arg: str + | int + | float + | timedelta + | list + | tuple + | range + | ArrayLike + | Index + | Series, + unit: UnitChoices | None = None, + errors: DateTimeErrorChoices = "raise", +) -> Timedelta | TimedeltaIndex | Series: + """ + Convert argument to timedelta. + + Timedeltas are absolute differences in times, expressed in difference + units (e.g. days, hours, minutes, seconds). This method converts + an argument from a recognized timedelta format / value into + a Timedelta type. + + Parameters + ---------- + arg : str, timedelta, list-like or Series + The data to be converted to timedelta. + + .. versionchanged:: 2.0 + Strings with units 'M', 'Y' and 'y' do not represent + unambiguous timedelta values and will raise an exception. + + unit : str, optional + Denotes the unit of the arg for numeric `arg`. Defaults to ``"ns"``. + + Possible values: + + * 'W' + * 'D' / 'days' / 'day' + * 'hours' / 'hour' / 'hr' / 'h' / 'H' + * 'm' / 'minute' / 'min' / 'minutes' / 'T' + * 's' / 'seconds' / 'sec' / 'second' / 'S' + * 'ms' / 'milliseconds' / 'millisecond' / 'milli' / 'millis' / 'L' + * 'us' / 'microseconds' / 'microsecond' / 'micro' / 'micros' / 'U' + * 'ns' / 'nanoseconds' / 'nano' / 'nanos' / 'nanosecond' / 'N' + + Must not be specified when `arg` contains strings and ``errors="raise"``. + + .. deprecated:: 2.2.0 + Units 'H', 'T', 'S', 'L', 'U' and 'N' are deprecated and will be removed + in a future version. Please use 'h', 'min', 's', 'ms', 'us', and 'ns' + instead of 'H', 'T', 'S', 'L', 'U' and 'N'. + + errors : {'ignore', 'raise', 'coerce'}, default 'raise' + - If 'raise', then invalid parsing will raise an exception. + - If 'coerce', then invalid parsing will be set as NaT. + - If 'ignore', then invalid parsing will return the input. + + Returns + ------- + timedelta + If parsing succeeded. + Return type depends on input: + + - list-like: TimedeltaIndex of timedelta64 dtype + - Series: Series of timedelta64 dtype + - scalar: Timedelta + + See Also + -------- + DataFrame.astype : Cast argument to a specified dtype. + to_datetime : Convert argument to datetime. + convert_dtypes : Convert dtypes. + + Notes + ----- + If the precision is higher than nanoseconds, the precision of the duration is + truncated to nanoseconds for string inputs. + + Examples + -------- + Parsing a single string to a Timedelta: + + >>> pd.to_timedelta('1 days 06:05:01.00003') + Timedelta('1 days 06:05:01.000030') + >>> pd.to_timedelta('15.5us') + Timedelta('0 days 00:00:00.000015500') + + Parsing a list or array of strings: + + >>> pd.to_timedelta(['1 days 06:05:01.00003', '15.5us', 'nan']) + TimedeltaIndex(['1 days 06:05:01.000030', '0 days 00:00:00.000015500', NaT], + dtype='timedelta64[ns]', freq=None) + + Converting numbers by specifying the `unit` keyword argument: + + >>> pd.to_timedelta(np.arange(5), unit='s') + TimedeltaIndex(['0 days 00:00:00', '0 days 00:00:01', '0 days 00:00:02', + '0 days 00:00:03', '0 days 00:00:04'], + dtype='timedelta64[ns]', freq=None) + >>> pd.to_timedelta(np.arange(5), unit='d') + TimedeltaIndex(['0 days', '1 days', '2 days', '3 days', '4 days'], + dtype='timedelta64[ns]', freq=None) + """ + if unit is not None: + unit = parse_timedelta_unit(unit) + disallow_ambiguous_unit(unit) + + if errors not in ("ignore", "raise", "coerce"): + raise ValueError("errors must be one of 'ignore', 'raise', or 'coerce'.") + if errors == "ignore": + # GH#54467 + warnings.warn( + "errors='ignore' is deprecated and will raise in a future version. " + "Use to_timedelta without passing `errors` and catch exceptions " + "explicitly instead", + FutureWarning, + stacklevel=find_stack_level(), + ) + + if arg is None: + return arg + elif isinstance(arg, ABCSeries): + values = _convert_listlike(arg._values, unit=unit, errors=errors) + return arg._constructor(values, index=arg.index, name=arg.name) + elif isinstance(arg, ABCIndex): + return _convert_listlike(arg, unit=unit, errors=errors, name=arg.name) + elif isinstance(arg, np.ndarray) and arg.ndim == 0: + # extract array scalar and process below + # error: Incompatible types in assignment (expression has type "object", + # variable has type "Union[str, int, float, timedelta, List[Any], + # Tuple[Any, ...], Union[Union[ExtensionArray, ndarray[Any, Any]], Index, + # Series]]") [assignment] + arg = lib.item_from_zerodim(arg) # type: ignore[assignment] + elif is_list_like(arg) and getattr(arg, "ndim", 1) == 1: + return _convert_listlike(arg, unit=unit, errors=errors) + elif getattr(arg, "ndim", 1) > 1: + raise TypeError( + "arg must be a string, timedelta, list, tuple, 1-d array, or Series" + ) + + if isinstance(arg, str) and unit is not None: + raise ValueError("unit must not be specified if the input is/contains a str") + + # ...so it must be a scalar value. Return scalar. + return _coerce_scalar_to_timedelta_type(arg, unit=unit, errors=errors) + + +def _coerce_scalar_to_timedelta_type( + r, unit: UnitChoices | None = "ns", errors: DateTimeErrorChoices = "raise" +): + """Convert string 'r' to a timedelta object.""" + result: Timedelta | NaTType + + try: + result = Timedelta(r, unit) + except ValueError: + if errors == "raise": + raise + if errors == "ignore": + return r + + # coerce + result = NaT + + return result + + +def _convert_listlike( + arg, + unit: UnitChoices | None = None, + errors: DateTimeErrorChoices = "raise", + name: Hashable | None = None, +): + """Convert a list of objects to a timedelta index object.""" + arg_dtype = getattr(arg, "dtype", None) + if isinstance(arg, (list, tuple)) or arg_dtype is None: + # This is needed only to ensure that in the case where we end up + # returning arg (errors == "ignore"), and where the input is a + # generator, we return a useful list-like instead of a + # used-up generator + if not hasattr(arg, "__array__"): + arg = list(arg) + arg = np.array(arg, dtype=object) + elif isinstance(arg_dtype, ArrowDtype) and arg_dtype.kind == "m": + return arg + + try: + td64arr = sequence_to_td64ns(arg, unit=unit, errors=errors, copy=False)[0] + except ValueError: + if errors == "ignore": + return arg + else: + # This else-block accounts for the cases when errors='raise' + # and errors='coerce'. If errors == 'raise', these errors + # should be raised. If errors == 'coerce', we shouldn't + # expect any errors to be raised, since all parsing errors + # cause coercion to pd.NaT. However, if an error / bug is + # introduced that causes an Exception to be raised, we would + # like to surface it. + raise + + from pandas import TimedeltaIndex + + value = TimedeltaIndex(td64arr, name=name) + return value diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/tools/times.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/tools/times.py new file mode 100644 index 0000000000000000000000000000000000000000..d77bcc91df7096bfad4b3eddf6355435a4b9e7f6 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/tools/times.py @@ -0,0 +1,168 @@ +from __future__ import annotations + +from datetime import ( + datetime, + time, +) +from typing import TYPE_CHECKING +import warnings + +import numpy as np + +from pandas._libs.lib import is_list_like +from pandas.util._exceptions import find_stack_level + +from pandas.core.dtypes.generic import ( + ABCIndex, + ABCSeries, +) +from pandas.core.dtypes.missing import notna + +if TYPE_CHECKING: + from pandas._typing import DateTimeErrorChoices + + +def to_time( + arg, + format: str | None = None, + infer_time_format: bool = False, + errors: DateTimeErrorChoices = "raise", +): + """ + Parse time strings to time objects using fixed strptime formats ("%H:%M", + "%H%M", "%I:%M%p", "%I%M%p", "%H:%M:%S", "%H%M%S", "%I:%M:%S%p", + "%I%M%S%p") + + Use infer_time_format if all the strings are in the same format to speed + up conversion. + + Parameters + ---------- + arg : string in time format, datetime.time, list, tuple, 1-d array, Series + format : str, default None + Format used to convert arg into a time object. If None, fixed formats + are used. + infer_time_format: bool, default False + Infer the time format based on the first non-NaN element. If all + strings are in the same format, this will speed up conversion. + errors : {'ignore', 'raise', 'coerce'}, default 'raise' + - If 'raise', then invalid parsing will raise an exception + - If 'coerce', then invalid parsing will be set as None + - If 'ignore', then invalid parsing will return the input + + Returns + ------- + datetime.time + """ + if errors == "ignore": + # GH#54467 + warnings.warn( + "errors='ignore' is deprecated and will raise in a future version. " + "Use to_time without passing `errors` and catch exceptions " + "explicitly instead", + FutureWarning, + stacklevel=find_stack_level(), + ) + + def _convert_listlike(arg, format): + if isinstance(arg, (list, tuple)): + arg = np.array(arg, dtype="O") + + elif getattr(arg, "ndim", 1) > 1: + raise TypeError( + "arg must be a string, datetime, list, tuple, 1-d array, or Series" + ) + + arg = np.asarray(arg, dtype="O") + + if infer_time_format and format is None: + format = _guess_time_format_for_array(arg) + + times: list[time | None] = [] + if format is not None: + for element in arg: + try: + times.append(datetime.strptime(element, format).time()) + except (ValueError, TypeError) as err: + if errors == "raise": + msg = ( + f"Cannot convert {element} to a time with given " + f"format {format}" + ) + raise ValueError(msg) from err + if errors == "ignore": + return arg + else: + times.append(None) + else: + formats = _time_formats[:] + format_found = False + for element in arg: + time_object = None + try: + time_object = time.fromisoformat(element) + except (ValueError, TypeError): + for time_format in formats: + try: + time_object = datetime.strptime(element, time_format).time() + if not format_found: + # Put the found format in front + fmt = formats.pop(formats.index(time_format)) + formats.insert(0, fmt) + format_found = True + break + except (ValueError, TypeError): + continue + + if time_object is not None: + times.append(time_object) + elif errors == "raise": + raise ValueError(f"Cannot convert arg {arg} to a time") + elif errors == "ignore": + return arg + else: + times.append(None) + + return times + + if arg is None: + return arg + elif isinstance(arg, time): + return arg + elif isinstance(arg, ABCSeries): + values = _convert_listlike(arg._values, format) + return arg._constructor(values, index=arg.index, name=arg.name) + elif isinstance(arg, ABCIndex): + return _convert_listlike(arg, format) + elif is_list_like(arg): + return _convert_listlike(arg, format) + + return _convert_listlike(np.array([arg]), format)[0] + + +# Fixed time formats for time parsing +_time_formats = [ + "%H:%M", + "%H%M", + "%I:%M%p", + "%I%M%p", + "%H:%M:%S", + "%H%M%S", + "%I:%M:%S%p", + "%I%M%S%p", +] + + +def _guess_time_format_for_array(arr): + # Try to guess the format based on the first non-NaN element + non_nan_elements = notna(arr).nonzero()[0] + if len(non_nan_elements): + element = arr[non_nan_elements[0]] + for time_format in _time_formats: + try: + datetime.strptime(element, time_format) + return time_format + except ValueError: + pass + + return None diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/util/__init__.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/util/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/util/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/util/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..97899175b75397f8a11de0ad96bd9313d0c8628c Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/util/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/util/__pycache__/hashing.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/util/__pycache__/hashing.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..080ec5b704587f096c9f5d910533d4be986a4e0f Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/util/__pycache__/hashing.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/util/__pycache__/numba_.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/util/__pycache__/numba_.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1fef50398c27ac1983d82039754fb1acfb4966fc Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/util/__pycache__/numba_.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/util/hashing.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/util/hashing.py new file mode 100644 index 0000000000000000000000000000000000000000..4933de32125814baa6cc96926721c0c839540b2a --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/util/hashing.py @@ -0,0 +1,339 @@ +""" +data hash pandas / numpy objects +""" +from __future__ import annotations + +import itertools +from typing import TYPE_CHECKING + +import numpy as np + +from pandas._libs.hashing import hash_object_array + +from pandas.core.dtypes.common import is_list_like +from pandas.core.dtypes.dtypes import CategoricalDtype +from pandas.core.dtypes.generic import ( + ABCDataFrame, + ABCExtensionArray, + ABCIndex, + ABCMultiIndex, + ABCSeries, +) + +if TYPE_CHECKING: + from collections.abc import ( + Hashable, + Iterable, + Iterator, + ) + + from pandas._typing import ( + ArrayLike, + npt, + ) + + from pandas import ( + DataFrame, + Index, + MultiIndex, + Series, + ) + + +# 16 byte long hashing key +_default_hash_key = "0123456789123456" + + +def combine_hash_arrays( + arrays: Iterator[np.ndarray], num_items: int +) -> npt.NDArray[np.uint64]: + """ + Parameters + ---------- + arrays : Iterator[np.ndarray] + num_items : int + + Returns + ------- + np.ndarray[uint64] + + Should be the same as CPython's tupleobject.c + """ + try: + first = next(arrays) + except StopIteration: + return np.array([], dtype=np.uint64) + + arrays = itertools.chain([first], arrays) + + mult = np.uint64(1000003) + out = np.zeros_like(first) + np.uint64(0x345678) + last_i = 0 + for i, a in enumerate(arrays): + inverse_i = num_items - i + out ^= a + out *= mult + mult += np.uint64(82520 + inverse_i + inverse_i) + last_i = i + assert last_i + 1 == num_items, "Fed in wrong num_items" + out += np.uint64(97531) + return out + + +def hash_pandas_object( + obj: Index | DataFrame | Series, + index: bool = True, + encoding: str = "utf8", + hash_key: str | None = _default_hash_key, + categorize: bool = True, +) -> Series: + """ + Return a data hash of the Index/Series/DataFrame. + + Parameters + ---------- + obj : Index, Series, or DataFrame + index : bool, default True + Include the index in the hash (if Series/DataFrame). + encoding : str, default 'utf8' + Encoding for data & key when strings. + hash_key : str, default _default_hash_key + Hash_key for string key to encode. + categorize : bool, default True + Whether to first categorize object arrays before hashing. This is more + efficient when the array contains duplicate values. + + Returns + ------- + Series of uint64, same length as the object + + Examples + -------- + >>> pd.util.hash_pandas_object(pd.Series([1, 2, 3])) + 0 14639053686158035780 + 1 3869563279212530728 + 2 393322362522515241 + dtype: uint64 + """ + from pandas import Series + + if hash_key is None: + hash_key = _default_hash_key + + if isinstance(obj, ABCMultiIndex): + return Series(hash_tuples(obj, encoding, hash_key), dtype="uint64", copy=False) + + elif isinstance(obj, ABCIndex): + h = hash_array(obj._values, encoding, hash_key, categorize).astype( + "uint64", copy=False + ) + ser = Series(h, index=obj, dtype="uint64", copy=False) + + elif isinstance(obj, ABCSeries): + h = hash_array(obj._values, encoding, hash_key, categorize).astype( + "uint64", copy=False + ) + if index: + index_iter = ( + hash_pandas_object( + obj.index, + index=False, + encoding=encoding, + hash_key=hash_key, + categorize=categorize, + )._values + for _ in [None] + ) + arrays = itertools.chain([h], index_iter) + h = combine_hash_arrays(arrays, 2) + + ser = Series(h, index=obj.index, dtype="uint64", copy=False) + + elif isinstance(obj, ABCDataFrame): + hashes = ( + hash_array(series._values, encoding, hash_key, categorize) + for _, series in obj.items() + ) + num_items = len(obj.columns) + if index: + index_hash_generator = ( + hash_pandas_object( + obj.index, + index=False, + encoding=encoding, + hash_key=hash_key, + categorize=categorize, + )._values + for _ in [None] + ) + num_items += 1 + + # keep `hashes` specifically a generator to keep mypy happy + _hashes = itertools.chain(hashes, index_hash_generator) + hashes = (x for x in _hashes) + h = combine_hash_arrays(hashes, num_items) + + ser = Series(h, index=obj.index, dtype="uint64", copy=False) + else: + raise TypeError(f"Unexpected type for hashing {type(obj)}") + + return ser + + +def hash_tuples( + vals: MultiIndex | Iterable[tuple[Hashable, ...]], + encoding: str = "utf8", + hash_key: str = _default_hash_key, +) -> npt.NDArray[np.uint64]: + """ + Hash an MultiIndex / listlike-of-tuples efficiently. + + Parameters + ---------- + vals : MultiIndex or listlike-of-tuples + encoding : str, default 'utf8' + hash_key : str, default _default_hash_key + + Returns + ------- + ndarray[np.uint64] of hashed values + """ + if not is_list_like(vals): + raise TypeError("must be convertible to a list-of-tuples") + + from pandas import ( + Categorical, + MultiIndex, + ) + + if not isinstance(vals, ABCMultiIndex): + mi = MultiIndex.from_tuples(vals) + else: + mi = vals + + # create a list-of-Categoricals + cat_vals = [ + Categorical._simple_new( + mi.codes[level], + CategoricalDtype(categories=mi.levels[level], ordered=False), + ) + for level in range(mi.nlevels) + ] + + # hash the list-of-ndarrays + hashes = ( + cat._hash_pandas_object(encoding=encoding, hash_key=hash_key, categorize=False) + for cat in cat_vals + ) + h = combine_hash_arrays(hashes, len(cat_vals)) + + return h + + +def hash_array( + vals: ArrayLike, + encoding: str = "utf8", + hash_key: str = _default_hash_key, + categorize: bool = True, +) -> npt.NDArray[np.uint64]: + """ + Given a 1d array, return an array of deterministic integers. + + Parameters + ---------- + vals : ndarray or ExtensionArray + encoding : str, default 'utf8' + Encoding for data & key when strings. + hash_key : str, default _default_hash_key + Hash_key for string key to encode. + categorize : bool, default True + Whether to first categorize object arrays before hashing. This is more + efficient when the array contains duplicate values. + + Returns + ------- + ndarray[np.uint64, ndim=1] + Hashed values, same length as the vals. + + Examples + -------- + >>> pd.util.hash_array(np.array([1, 2, 3])) + array([ 6238072747940578789, 15839785061582574730, 2185194620014831856], + dtype=uint64) + """ + if not hasattr(vals, "dtype"): + raise TypeError("must pass a ndarray-like") + + if isinstance(vals, ABCExtensionArray): + return vals._hash_pandas_object( + encoding=encoding, hash_key=hash_key, categorize=categorize + ) + + if not isinstance(vals, np.ndarray): + # GH#42003 + raise TypeError( + "hash_array requires np.ndarray or ExtensionArray, not " + f"{type(vals).__name__}. Use hash_pandas_object instead." + ) + + return _hash_ndarray(vals, encoding, hash_key, categorize) + + +def _hash_ndarray( + vals: np.ndarray, + encoding: str = "utf8", + hash_key: str = _default_hash_key, + categorize: bool = True, +) -> npt.NDArray[np.uint64]: + """ + See hash_array.__doc__. + """ + dtype = vals.dtype + + # _hash_ndarray only takes 64-bit values, so handle 128-bit by parts + if np.issubdtype(dtype, np.complex128): + hash_real = _hash_ndarray(vals.real, encoding, hash_key, categorize) + hash_imag = _hash_ndarray(vals.imag, encoding, hash_key, categorize) + return hash_real + 23 * hash_imag + + # First, turn whatever array this is into unsigned 64-bit ints, if we can + # manage it. + if dtype == bool: + vals = vals.astype("u8") + elif issubclass(dtype.type, (np.datetime64, np.timedelta64)): + vals = vals.view("i8").astype("u8", copy=False) + elif issubclass(dtype.type, np.number) and dtype.itemsize <= 8: + vals = vals.view(f"u{vals.dtype.itemsize}").astype("u8") + else: + # With repeated values, its MUCH faster to categorize object dtypes, + # then hash and rename categories. We allow skipping the categorization + # when the values are known/likely to be unique. + if categorize: + from pandas import ( + Categorical, + Index, + factorize, + ) + + codes, categories = factorize(vals, sort=False) + dtype = CategoricalDtype(categories=Index(categories), ordered=False) + cat = Categorical._simple_new(codes, dtype) + return cat._hash_pandas_object( + encoding=encoding, hash_key=hash_key, categorize=False + ) + + try: + vals = hash_object_array(vals, hash_key, encoding) + except TypeError: + # we have mixed types + vals = hash_object_array( + vals.astype(str).astype(object), hash_key, encoding + ) + + # Then, redistribute these 64-bit ints within the space of 64-bit ints + vals ^= vals >> 30 + vals *= np.uint64(0xBF58476D1CE4E5B9) + vals ^= vals >> 27 + vals *= np.uint64(0x94D049BB133111EB) + vals ^= vals >> 31 + return vals diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/util/numba_.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/util/numba_.py new file mode 100644 index 0000000000000000000000000000000000000000..4825c9fee24b1b1272c5f245a9c7ae49f53003fc --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/util/numba_.py @@ -0,0 +1,98 @@ +"""Common utilities for Numba operations""" +from __future__ import annotations + +import types +from typing import ( + TYPE_CHECKING, + Callable, +) + +import numpy as np + +from pandas.compat._optional import import_optional_dependency +from pandas.errors import NumbaUtilError + +GLOBAL_USE_NUMBA: bool = False + + +def maybe_use_numba(engine: str | None) -> bool: + """Signal whether to use numba routines.""" + return engine == "numba" or (engine is None and GLOBAL_USE_NUMBA) + + +def set_use_numba(enable: bool = False) -> None: + global GLOBAL_USE_NUMBA + if enable: + import_optional_dependency("numba") + GLOBAL_USE_NUMBA = enable + + +def get_jit_arguments( + engine_kwargs: dict[str, bool] | None = None, kwargs: dict | None = None +) -> dict[str, bool]: + """ + Return arguments to pass to numba.JIT, falling back on pandas default JIT settings. + + Parameters + ---------- + engine_kwargs : dict, default None + user passed keyword arguments for numba.JIT + kwargs : dict, default None + user passed keyword arguments to pass into the JITed function + + Returns + ------- + dict[str, bool] + nopython, nogil, parallel + + Raises + ------ + NumbaUtilError + """ + if engine_kwargs is None: + engine_kwargs = {} + + nopython = engine_kwargs.get("nopython", True) + if kwargs and nopython: + raise NumbaUtilError( + "numba does not support kwargs with nopython=True: " + "https://github.com/numba/numba/issues/2916" + ) + nogil = engine_kwargs.get("nogil", False) + parallel = engine_kwargs.get("parallel", False) + return {"nopython": nopython, "nogil": nogil, "parallel": parallel} + + +def jit_user_function(func: Callable) -> Callable: + """ + If user function is not jitted already, mark the user's function + as jitable. + + Parameters + ---------- + func : function + user defined function + + Returns + ------- + function + Numba JITed function, or function marked as JITable by numba + """ + if TYPE_CHECKING: + import numba + else: + numba = import_optional_dependency("numba") + + if numba.extending.is_jitted(func): + # Don't jit a user passed jitted function + numba_func = func + elif getattr(np, func.__name__, False) is func or isinstance( + func, types.BuiltinFunctionType + ): + # Not necessary to jit builtins or np functions + # This will mess up register_jitable + numba_func = func + else: + numba_func = numba.extending.register_jitable(func) + + return numba_func diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/window/__init__.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/window/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..857e12e5467a6a7d2263d9add33e65b9499778fa --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/window/__init__.py @@ -0,0 +1,23 @@ +from pandas.core.window.ewm import ( + ExponentialMovingWindow, + ExponentialMovingWindowGroupby, +) +from pandas.core.window.expanding import ( + Expanding, + ExpandingGroupby, +) +from pandas.core.window.rolling import ( + Rolling, + RollingGroupby, + Window, +) + +__all__ = [ + "Expanding", + "ExpandingGroupby", + "ExponentialMovingWindow", + "ExponentialMovingWindowGroupby", + "Rolling", + "RollingGroupby", + "Window", +] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/window/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/window/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..74f239f9a08bff72d0e7c1307094c17a7afad678 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/window/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/window/__pycache__/common.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/window/__pycache__/common.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e139ab1f6d84ab6aff1c5ed441921d9e1fd27374 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/window/__pycache__/common.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/window/__pycache__/doc.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/window/__pycache__/doc.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..2a945f6d86fd756080d230bc7b53526729fa808a Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/window/__pycache__/doc.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/window/__pycache__/ewm.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/window/__pycache__/ewm.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..872fd6b76a43816598681e8c6a89faea3ba01ce0 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/window/__pycache__/ewm.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/window/__pycache__/expanding.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/window/__pycache__/expanding.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ddf3ccc26d9a26c28be76358c86f06182a5141d8 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/window/__pycache__/expanding.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/window/__pycache__/numba_.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/window/__pycache__/numba_.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d01b6f7ab0eb6c54622a12653caa642e186569c4 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/window/__pycache__/numba_.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/window/__pycache__/rolling.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/window/__pycache__/rolling.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..946a7b027bcde7afb8bf02208661b0d3fa3ae4cf Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/window/__pycache__/rolling.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/window/common.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/window/common.py new file mode 100644 index 0000000000000000000000000000000000000000..fc8eddca09c84029b9ac92b658a47b45e4764414 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/window/common.py @@ -0,0 +1,169 @@ +"""Common utility functions for rolling operations""" +from __future__ import annotations + +from collections import defaultdict +from typing import cast + +import numpy as np + +from pandas.core.dtypes.generic import ( + ABCDataFrame, + ABCSeries, +) + +from pandas.core.indexes.api import MultiIndex + + +def flex_binary_moment(arg1, arg2, f, pairwise: bool = False): + if isinstance(arg1, ABCSeries) and isinstance(arg2, ABCSeries): + X, Y = prep_binary(arg1, arg2) + return f(X, Y) + + elif isinstance(arg1, ABCDataFrame): + from pandas import DataFrame + + def dataframe_from_int_dict(data, frame_template) -> DataFrame: + result = DataFrame(data, index=frame_template.index) + if len(result.columns) > 0: + result.columns = frame_template.columns[result.columns] + else: + result.columns = frame_template.columns.copy() + return result + + results = {} + if isinstance(arg2, ABCDataFrame): + if pairwise is False: + if arg1 is arg2: + # special case in order to handle duplicate column names + for i in range(len(arg1.columns)): + results[i] = f(arg1.iloc[:, i], arg2.iloc[:, i]) + return dataframe_from_int_dict(results, arg1) + else: + if not arg1.columns.is_unique: + raise ValueError("'arg1' columns are not unique") + if not arg2.columns.is_unique: + raise ValueError("'arg2' columns are not unique") + X, Y = arg1.align(arg2, join="outer") + X, Y = prep_binary(X, Y) + res_columns = arg1.columns.union(arg2.columns) + for col in res_columns: + if col in X and col in Y: + results[col] = f(X[col], Y[col]) + return DataFrame(results, index=X.index, columns=res_columns) + elif pairwise is True: + results = defaultdict(dict) + for i in range(len(arg1.columns)): + for j in range(len(arg2.columns)): + if j < i and arg2 is arg1: + # Symmetric case + results[i][j] = results[j][i] + else: + results[i][j] = f( + *prep_binary(arg1.iloc[:, i], arg2.iloc[:, j]) + ) + + from pandas import concat + + result_index = arg1.index.union(arg2.index) + if len(result_index): + # construct result frame + result = concat( + [ + concat( + [results[i][j] for j in range(len(arg2.columns))], + ignore_index=True, + ) + for i in range(len(arg1.columns)) + ], + ignore_index=True, + axis=1, + ) + result.columns = arg1.columns + + # set the index and reorder + if arg2.columns.nlevels > 1: + # mypy needs to know columns is a MultiIndex, Index doesn't + # have levels attribute + arg2.columns = cast(MultiIndex, arg2.columns) + # GH 21157: Equivalent to MultiIndex.from_product( + # [result_index], , + # ) + # A normal MultiIndex.from_product will produce too many + # combinations. + result_level = np.tile( + result_index, len(result) // len(result_index) + ) + arg2_levels = ( + np.repeat( + arg2.columns.get_level_values(i), + len(result) // len(arg2.columns), + ) + for i in range(arg2.columns.nlevels) + ) + result_names = list(arg2.columns.names) + [result_index.name] + result.index = MultiIndex.from_arrays( + [*arg2_levels, result_level], names=result_names + ) + # GH 34440 + num_levels = len(result.index.levels) + new_order = [num_levels - 1] + list(range(num_levels - 1)) + result = result.reorder_levels(new_order).sort_index() + else: + result.index = MultiIndex.from_product( + [range(len(arg2.columns)), range(len(result_index))] + ) + result = result.swaplevel(1, 0).sort_index() + result.index = MultiIndex.from_product( + [result_index] + [arg2.columns] + ) + else: + # empty result + result = DataFrame( + index=MultiIndex( + levels=[arg1.index, arg2.columns], codes=[[], []] + ), + columns=arg2.columns, + dtype="float64", + ) + + # reset our index names to arg1 names + # reset our column names to arg2 names + # careful not to mutate the original names + result.columns = result.columns.set_names(arg1.columns.names) + result.index = result.index.set_names( + result_index.names + arg2.columns.names + ) + + return result + else: + results = { + i: f(*prep_binary(arg1.iloc[:, i], arg2)) + for i in range(len(arg1.columns)) + } + return dataframe_from_int_dict(results, arg1) + + else: + return flex_binary_moment(arg2, arg1, f) + + +def zsqrt(x): + with np.errstate(all="ignore"): + result = np.sqrt(x) + mask = x < 0 + + if isinstance(x, ABCDataFrame): + if mask._values.any(): + result[mask] = 0 + else: + if mask.any(): + result[mask] = 0 + + return result + + +def prep_binary(arg1, arg2): + # mask out values, this also makes a common index... + X = arg1 + 0 * arg2 + Y = arg2 + 0 * arg1 + + return X, Y diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/window/doc.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/window/doc.py new file mode 100644 index 0000000000000000000000000000000000000000..2a5cbc04921fadacf18a89608f2c0665bd8177e2 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/window/doc.py @@ -0,0 +1,116 @@ +"""Any shareable docstring components for rolling/expanding/ewm""" +from __future__ import annotations + +from textwrap import dedent + +from pandas.core.shared_docs import _shared_docs + +_shared_docs = dict(**_shared_docs) + + +def create_section_header(header: str) -> str: + """Create numpydoc section header""" + return f"{header}\n{'-' * len(header)}\n" + + +template_header = "\nCalculate the {window_method} {aggregation_description}.\n\n" + +template_returns = dedent( + """ + Series or DataFrame + Return type is the same as the original object with ``np.float64`` dtype.\n + """ +).replace("\n", "", 1) + +template_see_also = dedent( + """ + pandas.Series.{window_method} : Calling {window_method} with Series data. + pandas.DataFrame.{window_method} : Calling {window_method} with DataFrames. + pandas.Series.{agg_method} : Aggregating {agg_method} for Series. + pandas.DataFrame.{agg_method} : Aggregating {agg_method} for DataFrame.\n + """ +).replace("\n", "", 1) + +kwargs_numeric_only = dedent( + """ + numeric_only : bool, default False + Include only float, int, boolean columns. + + .. versionadded:: 1.5.0\n + """ +).replace("\n", "", 1) + +kwargs_scipy = dedent( + """ + **kwargs + Keyword arguments to configure the ``SciPy`` weighted window type.\n + """ +).replace("\n", "", 1) + +window_apply_parameters = dedent( + """ + func : function + Must produce a single value from an ndarray input if ``raw=True`` + or a single value from a Series if ``raw=False``. Can also accept a + Numba JIT function with ``engine='numba'`` specified. + + raw : bool, default False + * ``False`` : passes each row or column as a Series to the + function. + * ``True`` : the passed function will receive ndarray + objects instead. + If you are just applying a NumPy reduction function this will + achieve much better performance. + + engine : str, default None + * ``'cython'`` : Runs rolling apply through C-extensions from cython. + * ``'numba'`` : Runs rolling apply through JIT compiled code from numba. + Only available when ``raw`` is set to ``True``. + * ``None`` : Defaults to ``'cython'`` or globally setting ``compute.use_numba`` + + engine_kwargs : dict, default None + * For ``'cython'`` engine, there are no accepted ``engine_kwargs`` + * For ``'numba'`` engine, the engine can accept ``nopython``, ``nogil`` + and ``parallel`` dictionary keys. The values must either be ``True`` or + ``False``. The default ``engine_kwargs`` for the ``'numba'`` engine is + ``{{'nopython': True, 'nogil': False, 'parallel': False}}`` and will be + applied to both the ``func`` and the ``apply`` rolling aggregation. + + args : tuple, default None + Positional arguments to be passed into func. + + kwargs : dict, default None + Keyword arguments to be passed into func.\n + """ +).replace("\n", "", 1) + +numba_notes = ( + "See :ref:`window.numba_engine` and :ref:`enhancingperf.numba` for " + "extended documentation and performance considerations for the Numba engine.\n\n" +) + + +def window_agg_numba_parameters(version: str = "1.3") -> str: + return ( + dedent( + """ + engine : str, default None + * ``'cython'`` : Runs the operation through C-extensions from cython. + * ``'numba'`` : Runs the operation through JIT compiled code from numba. + * ``None`` : Defaults to ``'cython'`` or globally setting ``compute.use_numba`` + + .. versionadded:: {version}.0 + + engine_kwargs : dict, default None + * For ``'cython'`` engine, there are no accepted ``engine_kwargs`` + * For ``'numba'`` engine, the engine can accept ``nopython``, ``nogil`` + and ``parallel`` dictionary keys. The values must either be ``True`` or + ``False``. The default ``engine_kwargs`` for the ``'numba'`` engine is + ``{{'nopython': True, 'nogil': False, 'parallel': False}}`` + + .. versionadded:: {version}.0\n + """ + ) + .replace("\n", "", 1) + .replace("{version}", version) + ) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/window/ewm.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/window/ewm.py new file mode 100644 index 0000000000000000000000000000000000000000..9ebf32d3e536eee7233b911acb880f3d40521793 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/window/ewm.py @@ -0,0 +1,1095 @@ +from __future__ import annotations + +import datetime +from functools import partial +from textwrap import dedent +from typing import TYPE_CHECKING + +import numpy as np + +from pandas._libs.tslibs import Timedelta +import pandas._libs.window.aggregations as window_aggregations +from pandas.util._decorators import doc + +from pandas.core.dtypes.common import ( + is_datetime64_dtype, + is_numeric_dtype, +) +from pandas.core.dtypes.dtypes import DatetimeTZDtype +from pandas.core.dtypes.generic import ABCSeries +from pandas.core.dtypes.missing import isna + +from pandas.core import common +from pandas.core.arrays.datetimelike import dtype_to_unit +from pandas.core.indexers.objects import ( + BaseIndexer, + ExponentialMovingWindowIndexer, + GroupbyIndexer, +) +from pandas.core.util.numba_ import ( + get_jit_arguments, + maybe_use_numba, +) +from pandas.core.window.common import zsqrt +from pandas.core.window.doc import ( + _shared_docs, + create_section_header, + kwargs_numeric_only, + numba_notes, + template_header, + template_returns, + template_see_also, + window_agg_numba_parameters, +) +from pandas.core.window.numba_ import ( + generate_numba_ewm_func, + generate_numba_ewm_table_func, +) +from pandas.core.window.online import ( + EWMMeanState, + generate_online_numba_ewma_func, +) +from pandas.core.window.rolling import ( + BaseWindow, + BaseWindowGroupby, +) + +if TYPE_CHECKING: + from pandas._typing import ( + Axis, + TimedeltaConvertibleTypes, + npt, + ) + + from pandas import ( + DataFrame, + Series, + ) + from pandas.core.generic import NDFrame + + +def get_center_of_mass( + comass: float | None, + span: float | None, + halflife: float | None, + alpha: float | None, +) -> float: + valid_count = common.count_not_none(comass, span, halflife, alpha) + if valid_count > 1: + raise ValueError("comass, span, halflife, and alpha are mutually exclusive") + + # Convert to center of mass; domain checks ensure 0 < alpha <= 1 + if comass is not None: + if comass < 0: + raise ValueError("comass must satisfy: comass >= 0") + elif span is not None: + if span < 1: + raise ValueError("span must satisfy: span >= 1") + comass = (span - 1) / 2 + elif halflife is not None: + if halflife <= 0: + raise ValueError("halflife must satisfy: halflife > 0") + decay = 1 - np.exp(np.log(0.5) / halflife) + comass = 1 / decay - 1 + elif alpha is not None: + if alpha <= 0 or alpha > 1: + raise ValueError("alpha must satisfy: 0 < alpha <= 1") + comass = (1 - alpha) / alpha + else: + raise ValueError("Must pass one of comass, span, halflife, or alpha") + + return float(comass) + + +def _calculate_deltas( + times: np.ndarray | NDFrame, + halflife: float | TimedeltaConvertibleTypes | None, +) -> npt.NDArray[np.float64]: + """ + Return the diff of the times divided by the half-life. These values are used in + the calculation of the ewm mean. + + Parameters + ---------- + times : np.ndarray, Series + Times corresponding to the observations. Must be monotonically increasing + and ``datetime64[ns]`` dtype. + halflife : float, str, timedelta, optional + Half-life specifying the decay + + Returns + ------- + np.ndarray + Diff of the times divided by the half-life + """ + unit = dtype_to_unit(times.dtype) + if isinstance(times, ABCSeries): + times = times._values + _times = np.asarray(times.view(np.int64), dtype=np.float64) + _halflife = float(Timedelta(halflife).as_unit(unit)._value) + return np.diff(_times) / _halflife + + +class ExponentialMovingWindow(BaseWindow): + r""" + Provide exponentially weighted (EW) calculations. + + Exactly one of ``com``, ``span``, ``halflife``, or ``alpha`` must be + provided if ``times`` is not provided. If ``times`` is provided, + ``halflife`` and one of ``com``, ``span`` or ``alpha`` may be provided. + + Parameters + ---------- + com : float, optional + Specify decay in terms of center of mass + + :math:`\alpha = 1 / (1 + com)`, for :math:`com \geq 0`. + + span : float, optional + Specify decay in terms of span + + :math:`\alpha = 2 / (span + 1)`, for :math:`span \geq 1`. + + halflife : float, str, timedelta, optional + Specify decay in terms of half-life + + :math:`\alpha = 1 - \exp\left(-\ln(2) / halflife\right)`, for + :math:`halflife > 0`. + + If ``times`` is specified, a timedelta convertible unit over which an + observation decays to half its value. Only applicable to ``mean()``, + and halflife value will not apply to the other functions. + + alpha : float, optional + Specify smoothing factor :math:`\alpha` directly + + :math:`0 < \alpha \leq 1`. + + min_periods : int, default 0 + Minimum number of observations in window required to have a value; + otherwise, result is ``np.nan``. + + adjust : bool, default True + Divide by decaying adjustment factor in beginning periods to account + for imbalance in relative weightings (viewing EWMA as a moving average). + + - When ``adjust=True`` (default), the EW function is calculated using weights + :math:`w_i = (1 - \alpha)^i`. For example, the EW moving average of the series + [:math:`x_0, x_1, ..., x_t`] would be: + + .. math:: + y_t = \frac{x_t + (1 - \alpha)x_{t-1} + (1 - \alpha)^2 x_{t-2} + ... + (1 - + \alpha)^t x_0}{1 + (1 - \alpha) + (1 - \alpha)^2 + ... + (1 - \alpha)^t} + + - When ``adjust=False``, the exponentially weighted function is calculated + recursively: + + .. math:: + \begin{split} + y_0 &= x_0\\ + y_t &= (1 - \alpha) y_{t-1} + \alpha x_t, + \end{split} + ignore_na : bool, default False + Ignore missing values when calculating weights. + + - When ``ignore_na=False`` (default), weights are based on absolute positions. + For example, the weights of :math:`x_0` and :math:`x_2` used in calculating + the final weighted average of [:math:`x_0`, None, :math:`x_2`] are + :math:`(1-\alpha)^2` and :math:`1` if ``adjust=True``, and + :math:`(1-\alpha)^2` and :math:`\alpha` if ``adjust=False``. + + - When ``ignore_na=True``, weights are based + on relative positions. For example, the weights of :math:`x_0` and :math:`x_2` + used in calculating the final weighted average of + [:math:`x_0`, None, :math:`x_2`] are :math:`1-\alpha` and :math:`1` if + ``adjust=True``, and :math:`1-\alpha` and :math:`\alpha` if ``adjust=False``. + + axis : {0, 1}, default 0 + If ``0`` or ``'index'``, calculate across the rows. + + If ``1`` or ``'columns'``, calculate across the columns. + + For `Series` this parameter is unused and defaults to 0. + + times : np.ndarray, Series, default None + + Only applicable to ``mean()``. + + Times corresponding to the observations. Must be monotonically increasing and + ``datetime64[ns]`` dtype. + + If 1-D array like, a sequence with the same shape as the observations. + + method : str {'single', 'table'}, default 'single' + .. versionadded:: 1.4.0 + + Execute the rolling operation per single column or row (``'single'``) + or over the entire object (``'table'``). + + This argument is only implemented when specifying ``engine='numba'`` + in the method call. + + Only applicable to ``mean()`` + + Returns + ------- + pandas.api.typing.ExponentialMovingWindow + + See Also + -------- + rolling : Provides rolling window calculations. + expanding : Provides expanding transformations. + + Notes + ----- + See :ref:`Windowing Operations ` + for further usage details and examples. + + Examples + -------- + >>> df = pd.DataFrame({'B': [0, 1, 2, np.nan, 4]}) + >>> df + B + 0 0.0 + 1 1.0 + 2 2.0 + 3 NaN + 4 4.0 + + >>> df.ewm(com=0.5).mean() + B + 0 0.000000 + 1 0.750000 + 2 1.615385 + 3 1.615385 + 4 3.670213 + >>> df.ewm(alpha=2 / 3).mean() + B + 0 0.000000 + 1 0.750000 + 2 1.615385 + 3 1.615385 + 4 3.670213 + + **adjust** + + >>> df.ewm(com=0.5, adjust=True).mean() + B + 0 0.000000 + 1 0.750000 + 2 1.615385 + 3 1.615385 + 4 3.670213 + >>> df.ewm(com=0.5, adjust=False).mean() + B + 0 0.000000 + 1 0.666667 + 2 1.555556 + 3 1.555556 + 4 3.650794 + + **ignore_na** + + >>> df.ewm(com=0.5, ignore_na=True).mean() + B + 0 0.000000 + 1 0.750000 + 2 1.615385 + 3 1.615385 + 4 3.225000 + >>> df.ewm(com=0.5, ignore_na=False).mean() + B + 0 0.000000 + 1 0.750000 + 2 1.615385 + 3 1.615385 + 4 3.670213 + + **times** + + Exponentially weighted mean with weights calculated with a timedelta ``halflife`` + relative to ``times``. + + >>> times = ['2020-01-01', '2020-01-03', '2020-01-10', '2020-01-15', '2020-01-17'] + >>> df.ewm(halflife='4 days', times=pd.DatetimeIndex(times)).mean() + B + 0 0.000000 + 1 0.585786 + 2 1.523889 + 3 1.523889 + 4 3.233686 + """ + + _attributes = [ + "com", + "span", + "halflife", + "alpha", + "min_periods", + "adjust", + "ignore_na", + "axis", + "times", + "method", + ] + + def __init__( + self, + obj: NDFrame, + com: float | None = None, + span: float | None = None, + halflife: float | TimedeltaConvertibleTypes | None = None, + alpha: float | None = None, + min_periods: int | None = 0, + adjust: bool = True, + ignore_na: bool = False, + axis: Axis = 0, + times: np.ndarray | NDFrame | None = None, + method: str = "single", + *, + selection=None, + ) -> None: + super().__init__( + obj=obj, + min_periods=1 if min_periods is None else max(int(min_periods), 1), + on=None, + center=False, + closed=None, + method=method, + axis=axis, + selection=selection, + ) + self.com = com + self.span = span + self.halflife = halflife + self.alpha = alpha + self.adjust = adjust + self.ignore_na = ignore_na + self.times = times + if self.times is not None: + if not self.adjust: + raise NotImplementedError("times is not supported with adjust=False.") + times_dtype = getattr(self.times, "dtype", None) + if not ( + is_datetime64_dtype(times_dtype) + or isinstance(times_dtype, DatetimeTZDtype) + ): + raise ValueError("times must be datetime64 dtype.") + if len(self.times) != len(obj): + raise ValueError("times must be the same length as the object.") + if not isinstance(self.halflife, (str, datetime.timedelta, np.timedelta64)): + raise ValueError("halflife must be a timedelta convertible object") + if isna(self.times).any(): + raise ValueError("Cannot convert NaT values to integer") + self._deltas = _calculate_deltas(self.times, self.halflife) + # Halflife is no longer applicable when calculating COM + # But allow COM to still be calculated if the user passes other decay args + if common.count_not_none(self.com, self.span, self.alpha) > 0: + self._com = get_center_of_mass(self.com, self.span, None, self.alpha) + else: + self._com = 1.0 + else: + if self.halflife is not None and isinstance( + self.halflife, (str, datetime.timedelta, np.timedelta64) + ): + raise ValueError( + "halflife can only be a timedelta convertible argument if " + "times is not None." + ) + # Without times, points are equally spaced + self._deltas = np.ones( + max(self.obj.shape[self.axis] - 1, 0), dtype=np.float64 + ) + self._com = get_center_of_mass( + # error: Argument 3 to "get_center_of_mass" has incompatible type + # "Union[float, Any, None, timedelta64, signedinteger[_64Bit]]"; + # expected "Optional[float]" + self.com, + self.span, + self.halflife, # type: ignore[arg-type] + self.alpha, + ) + + def _check_window_bounds( + self, start: np.ndarray, end: np.ndarray, num_vals: int + ) -> None: + # emw algorithms are iterative with each point + # ExponentialMovingWindowIndexer "bounds" are the entire window + pass + + def _get_window_indexer(self) -> BaseIndexer: + """ + Return an indexer class that will compute the window start and end bounds + """ + return ExponentialMovingWindowIndexer() + + def online( + self, engine: str = "numba", engine_kwargs=None + ) -> OnlineExponentialMovingWindow: + """ + Return an ``OnlineExponentialMovingWindow`` object to calculate + exponentially moving window aggregations in an online method. + + .. versionadded:: 1.3.0 + + Parameters + ---------- + engine: str, default ``'numba'`` + Execution engine to calculate online aggregations. + Applies to all supported aggregation methods. + + engine_kwargs : dict, default None + Applies to all supported aggregation methods. + + * For ``'numba'`` engine, the engine can accept ``nopython``, ``nogil`` + and ``parallel`` dictionary keys. The values must either be ``True`` or + ``False``. The default ``engine_kwargs`` for the ``'numba'`` engine is + ``{{'nopython': True, 'nogil': False, 'parallel': False}}`` and will be + applied to the function + + Returns + ------- + OnlineExponentialMovingWindow + """ + return OnlineExponentialMovingWindow( + obj=self.obj, + com=self.com, + span=self.span, + halflife=self.halflife, + alpha=self.alpha, + min_periods=self.min_periods, + adjust=self.adjust, + ignore_na=self.ignore_na, + axis=self.axis, + times=self.times, + engine=engine, + engine_kwargs=engine_kwargs, + selection=self._selection, + ) + + @doc( + _shared_docs["aggregate"], + see_also=dedent( + """ + See Also + -------- + pandas.DataFrame.rolling.aggregate + """ + ), + examples=dedent( + """ + Examples + -------- + >>> df = pd.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6], "C": [7, 8, 9]}) + >>> df + A B C + 0 1 4 7 + 1 2 5 8 + 2 3 6 9 + + >>> df.ewm(alpha=0.5).mean() + A B C + 0 1.000000 4.000000 7.000000 + 1 1.666667 4.666667 7.666667 + 2 2.428571 5.428571 8.428571 + """ + ), + klass="Series/Dataframe", + axis="", + ) + def aggregate(self, func, *args, **kwargs): + return super().aggregate(func, *args, **kwargs) + + agg = aggregate + + @doc( + template_header, + create_section_header("Parameters"), + kwargs_numeric_only, + window_agg_numba_parameters(), + create_section_header("Returns"), + template_returns, + create_section_header("See Also"), + template_see_also, + create_section_header("Notes"), + numba_notes, + create_section_header("Examples"), + dedent( + """\ + >>> ser = pd.Series([1, 2, 3, 4]) + >>> ser.ewm(alpha=.2).mean() + 0 1.000000 + 1 1.555556 + 2 2.147541 + 3 2.775068 + dtype: float64 + """ + ), + window_method="ewm", + aggregation_description="(exponential weighted moment) mean", + agg_method="mean", + ) + def mean( + self, + numeric_only: bool = False, + engine=None, + engine_kwargs=None, + ): + if maybe_use_numba(engine): + if self.method == "single": + func = generate_numba_ewm_func + else: + func = generate_numba_ewm_table_func + ewm_func = func( + **get_jit_arguments(engine_kwargs), + com=self._com, + adjust=self.adjust, + ignore_na=self.ignore_na, + deltas=tuple(self._deltas), + normalize=True, + ) + return self._apply(ewm_func, name="mean") + elif engine in ("cython", None): + if engine_kwargs is not None: + raise ValueError("cython engine does not accept engine_kwargs") + + deltas = None if self.times is None else self._deltas + window_func = partial( + window_aggregations.ewm, + com=self._com, + adjust=self.adjust, + ignore_na=self.ignore_na, + deltas=deltas, + normalize=True, + ) + return self._apply(window_func, name="mean", numeric_only=numeric_only) + else: + raise ValueError("engine must be either 'numba' or 'cython'") + + @doc( + template_header, + create_section_header("Parameters"), + kwargs_numeric_only, + window_agg_numba_parameters(), + create_section_header("Returns"), + template_returns, + create_section_header("See Also"), + template_see_also, + create_section_header("Notes"), + numba_notes, + create_section_header("Examples"), + dedent( + """\ + >>> ser = pd.Series([1, 2, 3, 4]) + >>> ser.ewm(alpha=.2).sum() + 0 1.000 + 1 2.800 + 2 5.240 + 3 8.192 + dtype: float64 + """ + ), + window_method="ewm", + aggregation_description="(exponential weighted moment) sum", + agg_method="sum", + ) + def sum( + self, + numeric_only: bool = False, + engine=None, + engine_kwargs=None, + ): + if not self.adjust: + raise NotImplementedError("sum is not implemented with adjust=False") + if maybe_use_numba(engine): + if self.method == "single": + func = generate_numba_ewm_func + else: + func = generate_numba_ewm_table_func + ewm_func = func( + **get_jit_arguments(engine_kwargs), + com=self._com, + adjust=self.adjust, + ignore_na=self.ignore_na, + deltas=tuple(self._deltas), + normalize=False, + ) + return self._apply(ewm_func, name="sum") + elif engine in ("cython", None): + if engine_kwargs is not None: + raise ValueError("cython engine does not accept engine_kwargs") + + deltas = None if self.times is None else self._deltas + window_func = partial( + window_aggregations.ewm, + com=self._com, + adjust=self.adjust, + ignore_na=self.ignore_na, + deltas=deltas, + normalize=False, + ) + return self._apply(window_func, name="sum", numeric_only=numeric_only) + else: + raise ValueError("engine must be either 'numba' or 'cython'") + + @doc( + template_header, + create_section_header("Parameters"), + dedent( + """\ + bias : bool, default False + Use a standard estimation bias correction. + """ + ), + kwargs_numeric_only, + create_section_header("Returns"), + template_returns, + create_section_header("See Also"), + template_see_also, + create_section_header("Examples"), + dedent( + """\ + >>> ser = pd.Series([1, 2, 3, 4]) + >>> ser.ewm(alpha=.2).std() + 0 NaN + 1 0.707107 + 2 0.995893 + 3 1.277320 + dtype: float64 + """ + ), + window_method="ewm", + aggregation_description="(exponential weighted moment) standard deviation", + agg_method="std", + ) + def std(self, bias: bool = False, numeric_only: bool = False): + if ( + numeric_only + and self._selected_obj.ndim == 1 + and not is_numeric_dtype(self._selected_obj.dtype) + ): + # Raise directly so error message says std instead of var + raise NotImplementedError( + f"{type(self).__name__}.std does not implement numeric_only" + ) + return zsqrt(self.var(bias=bias, numeric_only=numeric_only)) + + @doc( + template_header, + create_section_header("Parameters"), + dedent( + """\ + bias : bool, default False + Use a standard estimation bias correction. + """ + ), + kwargs_numeric_only, + create_section_header("Returns"), + template_returns, + create_section_header("See Also"), + template_see_also, + create_section_header("Examples"), + dedent( + """\ + >>> ser = pd.Series([1, 2, 3, 4]) + >>> ser.ewm(alpha=.2).var() + 0 NaN + 1 0.500000 + 2 0.991803 + 3 1.631547 + dtype: float64 + """ + ), + window_method="ewm", + aggregation_description="(exponential weighted moment) variance", + agg_method="var", + ) + def var(self, bias: bool = False, numeric_only: bool = False): + window_func = window_aggregations.ewmcov + wfunc = partial( + window_func, + com=self._com, + adjust=self.adjust, + ignore_na=self.ignore_na, + bias=bias, + ) + + def var_func(values, begin, end, min_periods): + return wfunc(values, begin, end, min_periods, values) + + return self._apply(var_func, name="var", numeric_only=numeric_only) + + @doc( + template_header, + create_section_header("Parameters"), + dedent( + """\ + other : Series or DataFrame , optional + If not supplied then will default to self and produce pairwise + output. + pairwise : bool, default None + If False then only matching columns between self and other will be + used and the output will be a DataFrame. + If True then all pairwise combinations will be calculated and the + output will be a MultiIndex DataFrame in the case of DataFrame + inputs. In the case of missing elements, only complete pairwise + observations will be used. + bias : bool, default False + Use a standard estimation bias correction. + """ + ), + kwargs_numeric_only, + create_section_header("Returns"), + template_returns, + create_section_header("See Also"), + template_see_also, + create_section_header("Examples"), + dedent( + """\ + >>> ser1 = pd.Series([1, 2, 3, 4]) + >>> ser2 = pd.Series([10, 11, 13, 16]) + >>> ser1.ewm(alpha=.2).cov(ser2) + 0 NaN + 1 0.500000 + 2 1.524590 + 3 3.408836 + dtype: float64 + """ + ), + window_method="ewm", + aggregation_description="(exponential weighted moment) sample covariance", + agg_method="cov", + ) + def cov( + self, + other: DataFrame | Series | None = None, + pairwise: bool | None = None, + bias: bool = False, + numeric_only: bool = False, + ): + from pandas import Series + + self._validate_numeric_only("cov", numeric_only) + + def cov_func(x, y): + x_array = self._prep_values(x) + y_array = self._prep_values(y) + window_indexer = self._get_window_indexer() + min_periods = ( + self.min_periods + if self.min_periods is not None + else window_indexer.window_size + ) + start, end = window_indexer.get_window_bounds( + num_values=len(x_array), + min_periods=min_periods, + center=self.center, + closed=self.closed, + step=self.step, + ) + result = window_aggregations.ewmcov( + x_array, + start, + end, + # error: Argument 4 to "ewmcov" has incompatible type + # "Optional[int]"; expected "int" + self.min_periods, # type: ignore[arg-type] + y_array, + self._com, + self.adjust, + self.ignore_na, + bias, + ) + return Series(result, index=x.index, name=x.name, copy=False) + + return self._apply_pairwise( + self._selected_obj, other, pairwise, cov_func, numeric_only + ) + + @doc( + template_header, + create_section_header("Parameters"), + dedent( + """\ + other : Series or DataFrame, optional + If not supplied then will default to self and produce pairwise + output. + pairwise : bool, default None + If False then only matching columns between self and other will be + used and the output will be a DataFrame. + If True then all pairwise combinations will be calculated and the + output will be a MultiIndex DataFrame in the case of DataFrame + inputs. In the case of missing elements, only complete pairwise + observations will be used. + """ + ), + kwargs_numeric_only, + create_section_header("Returns"), + template_returns, + create_section_header("See Also"), + template_see_also, + create_section_header("Examples"), + dedent( + """\ + >>> ser1 = pd.Series([1, 2, 3, 4]) + >>> ser2 = pd.Series([10, 11, 13, 16]) + >>> ser1.ewm(alpha=.2).corr(ser2) + 0 NaN + 1 1.000000 + 2 0.982821 + 3 0.977802 + dtype: float64 + """ + ), + window_method="ewm", + aggregation_description="(exponential weighted moment) sample correlation", + agg_method="corr", + ) + def corr( + self, + other: DataFrame | Series | None = None, + pairwise: bool | None = None, + numeric_only: bool = False, + ): + from pandas import Series + + self._validate_numeric_only("corr", numeric_only) + + def cov_func(x, y): + x_array = self._prep_values(x) + y_array = self._prep_values(y) + window_indexer = self._get_window_indexer() + min_periods = ( + self.min_periods + if self.min_periods is not None + else window_indexer.window_size + ) + start, end = window_indexer.get_window_bounds( + num_values=len(x_array), + min_periods=min_periods, + center=self.center, + closed=self.closed, + step=self.step, + ) + + def _cov(X, Y): + return window_aggregations.ewmcov( + X, + start, + end, + min_periods, + Y, + self._com, + self.adjust, + self.ignore_na, + True, + ) + + with np.errstate(all="ignore"): + cov = _cov(x_array, y_array) + x_var = _cov(x_array, x_array) + y_var = _cov(y_array, y_array) + result = cov / zsqrt(x_var * y_var) + return Series(result, index=x.index, name=x.name, copy=False) + + return self._apply_pairwise( + self._selected_obj, other, pairwise, cov_func, numeric_only + ) + + +class ExponentialMovingWindowGroupby(BaseWindowGroupby, ExponentialMovingWindow): + """ + Provide an exponential moving window groupby implementation. + """ + + _attributes = ExponentialMovingWindow._attributes + BaseWindowGroupby._attributes + + def __init__(self, obj, *args, _grouper=None, **kwargs) -> None: + super().__init__(obj, *args, _grouper=_grouper, **kwargs) + + if not obj.empty and self.times is not None: + # sort the times and recalculate the deltas according to the groups + groupby_order = np.concatenate(list(self._grouper.indices.values())) + self._deltas = _calculate_deltas( + self.times.take(groupby_order), + self.halflife, + ) + + def _get_window_indexer(self) -> GroupbyIndexer: + """ + Return an indexer class that will compute the window start and end bounds + + Returns + ------- + GroupbyIndexer + """ + window_indexer = GroupbyIndexer( + groupby_indices=self._grouper.indices, + window_indexer=ExponentialMovingWindowIndexer, + ) + return window_indexer + + +class OnlineExponentialMovingWindow(ExponentialMovingWindow): + def __init__( + self, + obj: NDFrame, + com: float | None = None, + span: float | None = None, + halflife: float | TimedeltaConvertibleTypes | None = None, + alpha: float | None = None, + min_periods: int | None = 0, + adjust: bool = True, + ignore_na: bool = False, + axis: Axis = 0, + times: np.ndarray | NDFrame | None = None, + engine: str = "numba", + engine_kwargs: dict[str, bool] | None = None, + *, + selection=None, + ) -> None: + if times is not None: + raise NotImplementedError( + "times is not implemented with online operations." + ) + super().__init__( + obj=obj, + com=com, + span=span, + halflife=halflife, + alpha=alpha, + min_periods=min_periods, + adjust=adjust, + ignore_na=ignore_na, + axis=axis, + times=times, + selection=selection, + ) + self._mean = EWMMeanState( + self._com, self.adjust, self.ignore_na, self.axis, obj.shape + ) + if maybe_use_numba(engine): + self.engine = engine + self.engine_kwargs = engine_kwargs + else: + raise ValueError("'numba' is the only supported engine") + + def reset(self) -> None: + """ + Reset the state captured by `update` calls. + """ + self._mean.reset() + + def aggregate(self, func, *args, **kwargs): + raise NotImplementedError("aggregate is not implemented.") + + def std(self, bias: bool = False, *args, **kwargs): + raise NotImplementedError("std is not implemented.") + + def corr( + self, + other: DataFrame | Series | None = None, + pairwise: bool | None = None, + numeric_only: bool = False, + ): + raise NotImplementedError("corr is not implemented.") + + def cov( + self, + other: DataFrame | Series | None = None, + pairwise: bool | None = None, + bias: bool = False, + numeric_only: bool = False, + ): + raise NotImplementedError("cov is not implemented.") + + def var(self, bias: bool = False, numeric_only: bool = False): + raise NotImplementedError("var is not implemented.") + + def mean(self, *args, update=None, update_times=None, **kwargs): + """ + Calculate an online exponentially weighted mean. + + Parameters + ---------- + update: DataFrame or Series, default None + New values to continue calculating the + exponentially weighted mean from the last values and weights. + Values should be float64 dtype. + + ``update`` needs to be ``None`` the first time the + exponentially weighted mean is calculated. + + update_times: Series or 1-D np.ndarray, default None + New times to continue calculating the + exponentially weighted mean from the last values and weights. + If ``None``, values are assumed to be evenly spaced + in time. + This feature is currently unsupported. + + Returns + ------- + DataFrame or Series + + Examples + -------- + >>> df = pd.DataFrame({"a": range(5), "b": range(5, 10)}) + >>> online_ewm = df.head(2).ewm(0.5).online() + >>> online_ewm.mean() + a b + 0 0.00 5.00 + 1 0.75 5.75 + >>> online_ewm.mean(update=df.tail(3)) + a b + 2 1.615385 6.615385 + 3 2.550000 7.550000 + 4 3.520661 8.520661 + >>> online_ewm.reset() + >>> online_ewm.mean() + a b + 0 0.00 5.00 + 1 0.75 5.75 + """ + result_kwargs = {} + is_frame = self._selected_obj.ndim == 2 + if update_times is not None: + raise NotImplementedError("update_times is not implemented.") + update_deltas = np.ones( + max(self._selected_obj.shape[self.axis - 1] - 1, 0), dtype=np.float64 + ) + if update is not None: + if self._mean.last_ewm is None: + raise ValueError( + "Must call mean with update=None first before passing update" + ) + result_from = 1 + result_kwargs["index"] = update.index + if is_frame: + last_value = self._mean.last_ewm[np.newaxis, :] + result_kwargs["columns"] = update.columns + else: + last_value = self._mean.last_ewm + result_kwargs["name"] = update.name + np_array = np.concatenate((last_value, update.to_numpy())) + else: + result_from = 0 + result_kwargs["index"] = self._selected_obj.index + if is_frame: + result_kwargs["columns"] = self._selected_obj.columns + else: + result_kwargs["name"] = self._selected_obj.name + np_array = self._selected_obj.astype(np.float64, copy=False).to_numpy() + ewma_func = generate_online_numba_ewma_func( + **get_jit_arguments(self.engine_kwargs) + ) + result = self._mean.run_ewm( + np_array if is_frame else np_array[:, np.newaxis], + update_deltas, + self.min_periods, + ewma_func, + ) + if not is_frame: + result = result.squeeze() + result = result[result_from:] + result = self._selected_obj._constructor(result, **result_kwargs) + return result diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/window/expanding.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/window/expanding.py new file mode 100644 index 0000000000000000000000000000000000000000..aac10596ffc699c2b229f959b9c1b26393384b03 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/window/expanding.py @@ -0,0 +1,964 @@ +from __future__ import annotations + +from textwrap import dedent +from typing import ( + TYPE_CHECKING, + Any, + Callable, + Literal, +) + +from pandas.util._decorators import ( + deprecate_kwarg, + doc, +) + +from pandas.core.indexers.objects import ( + BaseIndexer, + ExpandingIndexer, + GroupbyIndexer, +) +from pandas.core.window.doc import ( + _shared_docs, + create_section_header, + kwargs_numeric_only, + numba_notes, + template_header, + template_returns, + template_see_also, + window_agg_numba_parameters, + window_apply_parameters, +) +from pandas.core.window.rolling import ( + BaseWindowGroupby, + RollingAndExpandingMixin, +) + +if TYPE_CHECKING: + from pandas._typing import ( + Axis, + QuantileInterpolation, + WindowingRankType, + ) + + from pandas import ( + DataFrame, + Series, + ) + from pandas.core.generic import NDFrame + + +class Expanding(RollingAndExpandingMixin): + """ + Provide expanding window calculations. + + Parameters + ---------- + min_periods : int, default 1 + Minimum number of observations in window required to have a value; + otherwise, result is ``np.nan``. + + axis : int or str, default 0 + If ``0`` or ``'index'``, roll across the rows. + + If ``1`` or ``'columns'``, roll across the columns. + + For `Series` this parameter is unused and defaults to 0. + + method : str {'single', 'table'}, default 'single' + Execute the rolling operation per single column or row (``'single'``) + or over the entire object (``'table'``). + + This argument is only implemented when specifying ``engine='numba'`` + in the method call. + + .. versionadded:: 1.3.0 + + Returns + ------- + pandas.api.typing.Expanding + + See Also + -------- + rolling : Provides rolling window calculations. + ewm : Provides exponential weighted functions. + + Notes + ----- + See :ref:`Windowing Operations ` for further usage details + and examples. + + Examples + -------- + >>> df = pd.DataFrame({"B": [0, 1, 2, np.nan, 4]}) + >>> df + B + 0 0.0 + 1 1.0 + 2 2.0 + 3 NaN + 4 4.0 + + **min_periods** + + Expanding sum with 1 vs 3 observations needed to calculate a value. + + >>> df.expanding(1).sum() + B + 0 0.0 + 1 1.0 + 2 3.0 + 3 3.0 + 4 7.0 + >>> df.expanding(3).sum() + B + 0 NaN + 1 NaN + 2 3.0 + 3 3.0 + 4 7.0 + """ + + _attributes: list[str] = ["min_periods", "axis", "method"] + + def __init__( + self, + obj: NDFrame, + min_periods: int = 1, + axis: Axis = 0, + method: str = "single", + selection=None, + ) -> None: + super().__init__( + obj=obj, + min_periods=min_periods, + axis=axis, + method=method, + selection=selection, + ) + + def _get_window_indexer(self) -> BaseIndexer: + """ + Return an indexer class that will compute the window start and end bounds + """ + return ExpandingIndexer() + + @doc( + _shared_docs["aggregate"], + see_also=dedent( + """ + See Also + -------- + pandas.DataFrame.aggregate : Similar DataFrame method. + pandas.Series.aggregate : Similar Series method. + """ + ), + examples=dedent( + """ + Examples + -------- + >>> df = pd.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6], "C": [7, 8, 9]}) + >>> df + A B C + 0 1 4 7 + 1 2 5 8 + 2 3 6 9 + + >>> df.ewm(alpha=0.5).mean() + A B C + 0 1.000000 4.000000 7.000000 + 1 1.666667 4.666667 7.666667 + 2 2.428571 5.428571 8.428571 + """ + ), + klass="Series/Dataframe", + axis="", + ) + def aggregate(self, func, *args, **kwargs): + return super().aggregate(func, *args, **kwargs) + + agg = aggregate + + @doc( + template_header, + create_section_header("Returns"), + template_returns, + create_section_header("See Also"), + template_see_also, + create_section_header("Examples"), + dedent( + """\ + >>> ser = pd.Series([1, 2, 3, 4], index=['a', 'b', 'c', 'd']) + >>> ser.expanding().count() + a 1.0 + b 2.0 + c 3.0 + d 4.0 + dtype: float64 + """ + ), + window_method="expanding", + aggregation_description="count of non NaN observations", + agg_method="count", + ) + def count(self, numeric_only: bool = False): + return super().count(numeric_only=numeric_only) + + @doc( + template_header, + create_section_header("Parameters"), + window_apply_parameters, + create_section_header("Returns"), + template_returns, + create_section_header("See Also"), + template_see_also, + create_section_header("Examples"), + dedent( + """\ + >>> ser = pd.Series([1, 2, 3, 4], index=['a', 'b', 'c', 'd']) + >>> ser.expanding().apply(lambda s: s.max() - 2 * s.min()) + a -1.0 + b 0.0 + c 1.0 + d 2.0 + dtype: float64 + """ + ), + window_method="expanding", + aggregation_description="custom aggregation function", + agg_method="apply", + ) + def apply( + self, + func: Callable[..., Any], + raw: bool = False, + engine: Literal["cython", "numba"] | None = None, + engine_kwargs: dict[str, bool] | None = None, + args: tuple[Any, ...] | None = None, + kwargs: dict[str, Any] | None = None, + ): + return super().apply( + func, + raw=raw, + engine=engine, + engine_kwargs=engine_kwargs, + args=args, + kwargs=kwargs, + ) + + @doc( + template_header, + create_section_header("Parameters"), + kwargs_numeric_only, + window_agg_numba_parameters(), + create_section_header("Returns"), + template_returns, + create_section_header("See Also"), + template_see_also, + create_section_header("Notes"), + numba_notes, + create_section_header("Examples"), + dedent( + """\ + >>> ser = pd.Series([1, 2, 3, 4], index=['a', 'b', 'c', 'd']) + >>> ser.expanding().sum() + a 1.0 + b 3.0 + c 6.0 + d 10.0 + dtype: float64 + """ + ), + window_method="expanding", + aggregation_description="sum", + agg_method="sum", + ) + def sum( + self, + numeric_only: bool = False, + engine: Literal["cython", "numba"] | None = None, + engine_kwargs: dict[str, bool] | None = None, + ): + return super().sum( + numeric_only=numeric_only, + engine=engine, + engine_kwargs=engine_kwargs, + ) + + @doc( + template_header, + create_section_header("Parameters"), + kwargs_numeric_only, + window_agg_numba_parameters(), + create_section_header("Returns"), + template_returns, + create_section_header("See Also"), + template_see_also, + create_section_header("Notes"), + numba_notes, + create_section_header("Examples"), + dedent( + """\ + >>> ser = pd.Series([3, 2, 1, 4], index=['a', 'b', 'c', 'd']) + >>> ser.expanding().max() + a 3.0 + b 3.0 + c 3.0 + d 4.0 + dtype: float64 + """ + ), + window_method="expanding", + aggregation_description="maximum", + agg_method="max", + ) + def max( + self, + numeric_only: bool = False, + engine: Literal["cython", "numba"] | None = None, + engine_kwargs: dict[str, bool] | None = None, + ): + return super().max( + numeric_only=numeric_only, + engine=engine, + engine_kwargs=engine_kwargs, + ) + + @doc( + template_header, + create_section_header("Parameters"), + kwargs_numeric_only, + window_agg_numba_parameters(), + create_section_header("Returns"), + template_returns, + create_section_header("See Also"), + template_see_also, + create_section_header("Notes"), + numba_notes, + create_section_header("Examples"), + dedent( + """\ + >>> ser = pd.Series([2, 3, 4, 1], index=['a', 'b', 'c', 'd']) + >>> ser.expanding().min() + a 2.0 + b 2.0 + c 2.0 + d 1.0 + dtype: float64 + """ + ), + window_method="expanding", + aggregation_description="minimum", + agg_method="min", + ) + def min( + self, + numeric_only: bool = False, + engine: Literal["cython", "numba"] | None = None, + engine_kwargs: dict[str, bool] | None = None, + ): + return super().min( + numeric_only=numeric_only, + engine=engine, + engine_kwargs=engine_kwargs, + ) + + @doc( + template_header, + create_section_header("Parameters"), + kwargs_numeric_only, + window_agg_numba_parameters(), + create_section_header("Returns"), + template_returns, + create_section_header("See Also"), + template_see_also, + create_section_header("Notes"), + numba_notes, + create_section_header("Examples"), + dedent( + """\ + >>> ser = pd.Series([1, 2, 3, 4], index=['a', 'b', 'c', 'd']) + >>> ser.expanding().mean() + a 1.0 + b 1.5 + c 2.0 + d 2.5 + dtype: float64 + """ + ), + window_method="expanding", + aggregation_description="mean", + agg_method="mean", + ) + def mean( + self, + numeric_only: bool = False, + engine: Literal["cython", "numba"] | None = None, + engine_kwargs: dict[str, bool] | None = None, + ): + return super().mean( + numeric_only=numeric_only, + engine=engine, + engine_kwargs=engine_kwargs, + ) + + @doc( + template_header, + create_section_header("Parameters"), + kwargs_numeric_only, + window_agg_numba_parameters(), + create_section_header("Returns"), + template_returns, + create_section_header("See Also"), + template_see_also, + create_section_header("Notes"), + numba_notes, + create_section_header("Examples"), + dedent( + """\ + >>> ser = pd.Series([1, 2, 3, 4], index=['a', 'b', 'c', 'd']) + >>> ser.expanding().median() + a 1.0 + b 1.5 + c 2.0 + d 2.5 + dtype: float64 + """ + ), + window_method="expanding", + aggregation_description="median", + agg_method="median", + ) + def median( + self, + numeric_only: bool = False, + engine: Literal["cython", "numba"] | None = None, + engine_kwargs: dict[str, bool] | None = None, + ): + return super().median( + numeric_only=numeric_only, + engine=engine, + engine_kwargs=engine_kwargs, + ) + + @doc( + template_header, + create_section_header("Parameters"), + dedent( + """ + ddof : int, default 1 + Delta Degrees of Freedom. The divisor used in calculations + is ``N - ddof``, where ``N`` represents the number of elements.\n + """ + ).replace("\n", "", 1), + kwargs_numeric_only, + window_agg_numba_parameters("1.4"), + create_section_header("Returns"), + template_returns, + create_section_header("See Also"), + "numpy.std : Equivalent method for NumPy array.\n", + template_see_also, + create_section_header("Notes"), + dedent( + """ + The default ``ddof`` of 1 used in :meth:`Series.std` is different + than the default ``ddof`` of 0 in :func:`numpy.std`. + + A minimum of one period is required for the rolling calculation.\n + """ + ).replace("\n", "", 1), + create_section_header("Examples"), + dedent( + """ + >>> s = pd.Series([5, 5, 6, 7, 5, 5, 5]) + + >>> s.expanding(3).std() + 0 NaN + 1 NaN + 2 0.577350 + 3 0.957427 + 4 0.894427 + 5 0.836660 + 6 0.786796 + dtype: float64 + """ + ).replace("\n", "", 1), + window_method="expanding", + aggregation_description="standard deviation", + agg_method="std", + ) + def std( + self, + ddof: int = 1, + numeric_only: bool = False, + engine: Literal["cython", "numba"] | None = None, + engine_kwargs: dict[str, bool] | None = None, + ): + return super().std( + ddof=ddof, + numeric_only=numeric_only, + engine=engine, + engine_kwargs=engine_kwargs, + ) + + @doc( + template_header, + create_section_header("Parameters"), + dedent( + """ + ddof : int, default 1 + Delta Degrees of Freedom. The divisor used in calculations + is ``N - ddof``, where ``N`` represents the number of elements.\n + """ + ).replace("\n", "", 1), + kwargs_numeric_only, + window_agg_numba_parameters("1.4"), + create_section_header("Returns"), + template_returns, + create_section_header("See Also"), + "numpy.var : Equivalent method for NumPy array.\n", + template_see_also, + create_section_header("Notes"), + dedent( + """ + The default ``ddof`` of 1 used in :meth:`Series.var` is different + than the default ``ddof`` of 0 in :func:`numpy.var`. + + A minimum of one period is required for the rolling calculation.\n + """ + ).replace("\n", "", 1), + create_section_header("Examples"), + dedent( + """ + >>> s = pd.Series([5, 5, 6, 7, 5, 5, 5]) + + >>> s.expanding(3).var() + 0 NaN + 1 NaN + 2 0.333333 + 3 0.916667 + 4 0.800000 + 5 0.700000 + 6 0.619048 + dtype: float64 + """ + ).replace("\n", "", 1), + window_method="expanding", + aggregation_description="variance", + agg_method="var", + ) + def var( + self, + ddof: int = 1, + numeric_only: bool = False, + engine: Literal["cython", "numba"] | None = None, + engine_kwargs: dict[str, bool] | None = None, + ): + return super().var( + ddof=ddof, + numeric_only=numeric_only, + engine=engine, + engine_kwargs=engine_kwargs, + ) + + @doc( + template_header, + create_section_header("Parameters"), + dedent( + """ + ddof : int, default 1 + Delta Degrees of Freedom. The divisor used in calculations + is ``N - ddof``, where ``N`` represents the number of elements.\n + """ + ).replace("\n", "", 1), + kwargs_numeric_only, + create_section_header("Returns"), + template_returns, + create_section_header("See Also"), + template_see_also, + create_section_header("Notes"), + "A minimum of one period is required for the calculation.\n\n", + create_section_header("Examples"), + dedent( + """ + >>> s = pd.Series([0, 1, 2, 3]) + + >>> s.expanding().sem() + 0 NaN + 1 0.707107 + 2 0.707107 + 3 0.745356 + dtype: float64 + """ + ).replace("\n", "", 1), + window_method="expanding", + aggregation_description="standard error of mean", + agg_method="sem", + ) + def sem(self, ddof: int = 1, numeric_only: bool = False): + return super().sem(ddof=ddof, numeric_only=numeric_only) + + @doc( + template_header, + create_section_header("Parameters"), + kwargs_numeric_only, + create_section_header("Returns"), + template_returns, + create_section_header("See Also"), + "scipy.stats.skew : Third moment of a probability density.\n", + template_see_also, + create_section_header("Notes"), + "A minimum of three periods is required for the rolling calculation.\n\n", + create_section_header("Examples"), + dedent( + """\ + >>> ser = pd.Series([-1, 0, 2, -1, 2], index=['a', 'b', 'c', 'd', 'e']) + >>> ser.expanding().skew() + a NaN + b NaN + c 0.935220 + d 1.414214 + e 0.315356 + dtype: float64 + """ + ), + window_method="expanding", + aggregation_description="unbiased skewness", + agg_method="skew", + ) + def skew(self, numeric_only: bool = False): + return super().skew(numeric_only=numeric_only) + + @doc( + template_header, + create_section_header("Parameters"), + kwargs_numeric_only, + create_section_header("Returns"), + template_returns, + create_section_header("See Also"), + "scipy.stats.kurtosis : Reference SciPy method.\n", + template_see_also, + create_section_header("Notes"), + "A minimum of four periods is required for the calculation.\n\n", + create_section_header("Examples"), + dedent( + """ + The example below will show a rolling calculation with a window size of + four matching the equivalent function call using `scipy.stats`. + + >>> arr = [1, 2, 3, 4, 999] + >>> import scipy.stats + >>> print(f"{{scipy.stats.kurtosis(arr[:-1], bias=False):.6f}}") + -1.200000 + >>> print(f"{{scipy.stats.kurtosis(arr, bias=False):.6f}}") + 4.999874 + >>> s = pd.Series(arr) + >>> s.expanding(4).kurt() + 0 NaN + 1 NaN + 2 NaN + 3 -1.200000 + 4 4.999874 + dtype: float64 + """ + ).replace("\n", "", 1), + window_method="expanding", + aggregation_description="Fisher's definition of kurtosis without bias", + agg_method="kurt", + ) + def kurt(self, numeric_only: bool = False): + return super().kurt(numeric_only=numeric_only) + + @doc( + template_header, + create_section_header("Parameters"), + dedent( + """ + quantile : float + Quantile to compute. 0 <= quantile <= 1. + + .. deprecated:: 2.1.0 + This will be renamed to 'q' in a future version. + interpolation : {{'linear', 'lower', 'higher', 'midpoint', 'nearest'}} + This optional parameter specifies the interpolation method to use, + when the desired quantile lies between two data points `i` and `j`: + + * linear: `i + (j - i) * fraction`, where `fraction` is the + fractional part of the index surrounded by `i` and `j`. + * lower: `i`. + * higher: `j`. + * nearest: `i` or `j` whichever is nearest. + * midpoint: (`i` + `j`) / 2. + """ + ).replace("\n", "", 1), + kwargs_numeric_only, + create_section_header("Returns"), + template_returns, + create_section_header("See Also"), + template_see_also, + create_section_header("Examples"), + dedent( + """\ + >>> ser = pd.Series([1, 2, 3, 4, 5, 6], index=['a', 'b', 'c', 'd', 'e', 'f']) + >>> ser.expanding(min_periods=4).quantile(.25) + a NaN + b NaN + c NaN + d 1.75 + e 2.00 + f 2.25 + dtype: float64 + """ + ), + window_method="expanding", + aggregation_description="quantile", + agg_method="quantile", + ) + @deprecate_kwarg(old_arg_name="quantile", new_arg_name="q") + def quantile( + self, + q: float, + interpolation: QuantileInterpolation = "linear", + numeric_only: bool = False, + ): + return super().quantile( + q=q, + interpolation=interpolation, + numeric_only=numeric_only, + ) + + @doc( + template_header, + ".. versionadded:: 1.4.0 \n\n", + create_section_header("Parameters"), + dedent( + """ + method : {{'average', 'min', 'max'}}, default 'average' + How to rank the group of records that have the same value (i.e. ties): + + * average: average rank of the group + * min: lowest rank in the group + * max: highest rank in the group + + ascending : bool, default True + Whether or not the elements should be ranked in ascending order. + pct : bool, default False + Whether or not to display the returned rankings in percentile + form. + """ + ).replace("\n", "", 1), + kwargs_numeric_only, + create_section_header("Returns"), + template_returns, + create_section_header("See Also"), + template_see_also, + create_section_header("Examples"), + dedent( + """ + >>> s = pd.Series([1, 4, 2, 3, 5, 3]) + >>> s.expanding().rank() + 0 1.0 + 1 2.0 + 2 2.0 + 3 3.0 + 4 5.0 + 5 3.5 + dtype: float64 + + >>> s.expanding().rank(method="max") + 0 1.0 + 1 2.0 + 2 2.0 + 3 3.0 + 4 5.0 + 5 4.0 + dtype: float64 + + >>> s.expanding().rank(method="min") + 0 1.0 + 1 2.0 + 2 2.0 + 3 3.0 + 4 5.0 + 5 3.0 + dtype: float64 + """ + ).replace("\n", "", 1), + window_method="expanding", + aggregation_description="rank", + agg_method="rank", + ) + def rank( + self, + method: WindowingRankType = "average", + ascending: bool = True, + pct: bool = False, + numeric_only: bool = False, + ): + return super().rank( + method=method, + ascending=ascending, + pct=pct, + numeric_only=numeric_only, + ) + + @doc( + template_header, + create_section_header("Parameters"), + dedent( + """ + other : Series or DataFrame, optional + If not supplied then will default to self and produce pairwise + output. + pairwise : bool, default None + If False then only matching columns between self and other will be + used and the output will be a DataFrame. + If True then all pairwise combinations will be calculated and the + output will be a MultiIndexed DataFrame in the case of DataFrame + inputs. In the case of missing elements, only complete pairwise + observations will be used. + ddof : int, default 1 + Delta Degrees of Freedom. The divisor used in calculations + is ``N - ddof``, where ``N`` represents the number of elements. + """ + ).replace("\n", "", 1), + kwargs_numeric_only, + create_section_header("Returns"), + template_returns, + create_section_header("See Also"), + template_see_also, + create_section_header("Examples"), + dedent( + """\ + >>> ser1 = pd.Series([1, 2, 3, 4], index=['a', 'b', 'c', 'd']) + >>> ser2 = pd.Series([10, 11, 13, 16], index=['a', 'b', 'c', 'd']) + >>> ser1.expanding().cov(ser2) + a NaN + b 0.500000 + c 1.500000 + d 3.333333 + dtype: float64 + """ + ), + window_method="expanding", + aggregation_description="sample covariance", + agg_method="cov", + ) + def cov( + self, + other: DataFrame | Series | None = None, + pairwise: bool | None = None, + ddof: int = 1, + numeric_only: bool = False, + ): + return super().cov( + other=other, + pairwise=pairwise, + ddof=ddof, + numeric_only=numeric_only, + ) + + @doc( + template_header, + create_section_header("Parameters"), + dedent( + """ + other : Series or DataFrame, optional + If not supplied then will default to self and produce pairwise + output. + pairwise : bool, default None + If False then only matching columns between self and other will be + used and the output will be a DataFrame. + If True then all pairwise combinations will be calculated and the + output will be a MultiIndexed DataFrame in the case of DataFrame + inputs. In the case of missing elements, only complete pairwise + observations will be used. + """ + ).replace("\n", "", 1), + kwargs_numeric_only, + create_section_header("Returns"), + template_returns, + create_section_header("See Also"), + dedent( + """ + cov : Similar method to calculate covariance. + numpy.corrcoef : NumPy Pearson's correlation calculation. + """ + ).replace("\n", "", 1), + template_see_also, + create_section_header("Notes"), + dedent( + """ + This function uses Pearson's definition of correlation + (https://en.wikipedia.org/wiki/Pearson_correlation_coefficient). + + When `other` is not specified, the output will be self correlation (e.g. + all 1's), except for :class:`~pandas.DataFrame` inputs with `pairwise` + set to `True`. + + Function will return ``NaN`` for correlations of equal valued sequences; + this is the result of a 0/0 division error. + + When `pairwise` is set to `False`, only matching columns between `self` and + `other` will be used. + + When `pairwise` is set to `True`, the output will be a MultiIndex DataFrame + with the original index on the first level, and the `other` DataFrame + columns on the second level. + + In the case of missing elements, only complete pairwise observations + will be used.\n + """ + ), + create_section_header("Examples"), + dedent( + """\ + >>> ser1 = pd.Series([1, 2, 3, 4], index=['a', 'b', 'c', 'd']) + >>> ser2 = pd.Series([10, 11, 13, 16], index=['a', 'b', 'c', 'd']) + >>> ser1.expanding().corr(ser2) + a NaN + b 1.000000 + c 0.981981 + d 0.975900 + dtype: float64 + """ + ), + window_method="expanding", + aggregation_description="correlation", + agg_method="corr", + ) + def corr( + self, + other: DataFrame | Series | None = None, + pairwise: bool | None = None, + ddof: int = 1, + numeric_only: bool = False, + ): + return super().corr( + other=other, + pairwise=pairwise, + ddof=ddof, + numeric_only=numeric_only, + ) + + +class ExpandingGroupby(BaseWindowGroupby, Expanding): + """ + Provide a expanding groupby implementation. + """ + + _attributes = Expanding._attributes + BaseWindowGroupby._attributes + + def _get_window_indexer(self) -> GroupbyIndexer: + """ + Return an indexer class that will compute the window start and end bounds + + Returns + ------- + GroupbyIndexer + """ + window_indexer = GroupbyIndexer( + groupby_indices=self._grouper.indices, + window_indexer=ExpandingIndexer, + ) + return window_indexer diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/window/numba_.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/window/numba_.py new file mode 100644 index 0000000000000000000000000000000000000000..9357945e78c631a3fada24ec3015ca0cf183b99c --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/window/numba_.py @@ -0,0 +1,351 @@ +from __future__ import annotations + +import functools +from typing import ( + TYPE_CHECKING, + Any, + Callable, +) + +import numpy as np + +from pandas.compat._optional import import_optional_dependency + +from pandas.core.util.numba_ import jit_user_function + +if TYPE_CHECKING: + from pandas._typing import Scalar + + +@functools.cache +def generate_numba_apply_func( + func: Callable[..., Scalar], + nopython: bool, + nogil: bool, + parallel: bool, +): + """ + Generate a numba jitted apply function specified by values from engine_kwargs. + + 1. jit the user's function + 2. Return a rolling apply function with the jitted function inline + + Configurations specified in engine_kwargs apply to both the user's + function _AND_ the rolling apply function. + + Parameters + ---------- + func : function + function to be applied to each window and will be JITed + nopython : bool + nopython to be passed into numba.jit + nogil : bool + nogil to be passed into numba.jit + parallel : bool + parallel to be passed into numba.jit + + Returns + ------- + Numba function + """ + numba_func = jit_user_function(func) + if TYPE_CHECKING: + import numba + else: + numba = import_optional_dependency("numba") + + @numba.jit(nopython=nopython, nogil=nogil, parallel=parallel) + def roll_apply( + values: np.ndarray, + begin: np.ndarray, + end: np.ndarray, + minimum_periods: int, + *args: Any, + ) -> np.ndarray: + result = np.empty(len(begin)) + for i in numba.prange(len(result)): + start = begin[i] + stop = end[i] + window = values[start:stop] + count_nan = np.sum(np.isnan(window)) + if len(window) - count_nan >= minimum_periods: + result[i] = numba_func(window, *args) + else: + result[i] = np.nan + return result + + return roll_apply + + +@functools.cache +def generate_numba_ewm_func( + nopython: bool, + nogil: bool, + parallel: bool, + com: float, + adjust: bool, + ignore_na: bool, + deltas: tuple, + normalize: bool, +): + """ + Generate a numba jitted ewm mean or sum function specified by values + from engine_kwargs. + + Parameters + ---------- + nopython : bool + nopython to be passed into numba.jit + nogil : bool + nogil to be passed into numba.jit + parallel : bool + parallel to be passed into numba.jit + com : float + adjust : bool + ignore_na : bool + deltas : tuple + normalize : bool + + Returns + ------- + Numba function + """ + if TYPE_CHECKING: + import numba + else: + numba = import_optional_dependency("numba") + + @numba.jit(nopython=nopython, nogil=nogil, parallel=parallel) + def ewm( + values: np.ndarray, + begin: np.ndarray, + end: np.ndarray, + minimum_periods: int, + ) -> np.ndarray: + result = np.empty(len(values)) + alpha = 1.0 / (1.0 + com) + old_wt_factor = 1.0 - alpha + new_wt = 1.0 if adjust else alpha + + for i in numba.prange(len(begin)): + start = begin[i] + stop = end[i] + window = values[start:stop] + sub_result = np.empty(len(window)) + + weighted = window[0] + nobs = int(not np.isnan(weighted)) + sub_result[0] = weighted if nobs >= minimum_periods else np.nan + old_wt = 1.0 + + for j in range(1, len(window)): + cur = window[j] + is_observation = not np.isnan(cur) + nobs += is_observation + if not np.isnan(weighted): + if is_observation or not ignore_na: + if normalize: + # note that len(deltas) = len(vals) - 1 and deltas[i] + # is to be used in conjunction with vals[i+1] + old_wt *= old_wt_factor ** deltas[start + j - 1] + else: + weighted = old_wt_factor * weighted + if is_observation: + if normalize: + # avoid numerical errors on constant series + if weighted != cur: + weighted = old_wt * weighted + new_wt * cur + if normalize: + weighted = weighted / (old_wt + new_wt) + if adjust: + old_wt += new_wt + else: + old_wt = 1.0 + else: + weighted += cur + elif is_observation: + weighted = cur + + sub_result[j] = weighted if nobs >= minimum_periods else np.nan + + result[start:stop] = sub_result + + return result + + return ewm + + +@functools.cache +def generate_numba_table_func( + func: Callable[..., np.ndarray], + nopython: bool, + nogil: bool, + parallel: bool, +): + """ + Generate a numba jitted function to apply window calculations table-wise. + + Func will be passed a M window size x N number of columns array, and + must return a 1 x N number of columns array. Func is intended to operate + row-wise, but the result will be transposed for axis=1. + + 1. jit the user's function + 2. Return a rolling apply function with the jitted function inline + + Parameters + ---------- + func : function + function to be applied to each window and will be JITed + nopython : bool + nopython to be passed into numba.jit + nogil : bool + nogil to be passed into numba.jit + parallel : bool + parallel to be passed into numba.jit + + Returns + ------- + Numba function + """ + numba_func = jit_user_function(func) + if TYPE_CHECKING: + import numba + else: + numba = import_optional_dependency("numba") + + @numba.jit(nopython=nopython, nogil=nogil, parallel=parallel) + def roll_table( + values: np.ndarray, + begin: np.ndarray, + end: np.ndarray, + minimum_periods: int, + *args: Any, + ): + result = np.empty((len(begin), values.shape[1])) + min_periods_mask = np.empty(result.shape) + for i in numba.prange(len(result)): + start = begin[i] + stop = end[i] + window = values[start:stop] + count_nan = np.sum(np.isnan(window), axis=0) + sub_result = numba_func(window, *args) + nan_mask = len(window) - count_nan >= minimum_periods + min_periods_mask[i, :] = nan_mask + result[i, :] = sub_result + result = np.where(min_periods_mask, result, np.nan) + return result + + return roll_table + + +# This function will no longer be needed once numba supports +# axis for all np.nan* agg functions +# https://github.com/numba/numba/issues/1269 +@functools.cache +def generate_manual_numpy_nan_agg_with_axis(nan_func): + if TYPE_CHECKING: + import numba + else: + numba = import_optional_dependency("numba") + + @numba.jit(nopython=True, nogil=True, parallel=True) + def nan_agg_with_axis(table): + result = np.empty(table.shape[1]) + for i in numba.prange(table.shape[1]): + partition = table[:, i] + result[i] = nan_func(partition) + return result + + return nan_agg_with_axis + + +@functools.cache +def generate_numba_ewm_table_func( + nopython: bool, + nogil: bool, + parallel: bool, + com: float, + adjust: bool, + ignore_na: bool, + deltas: tuple, + normalize: bool, +): + """ + Generate a numba jitted ewm mean or sum function applied table wise specified + by values from engine_kwargs. + + Parameters + ---------- + nopython : bool + nopython to be passed into numba.jit + nogil : bool + nogil to be passed into numba.jit + parallel : bool + parallel to be passed into numba.jit + com : float + adjust : bool + ignore_na : bool + deltas : tuple + normalize: bool + + Returns + ------- + Numba function + """ + if TYPE_CHECKING: + import numba + else: + numba = import_optional_dependency("numba") + + @numba.jit(nopython=nopython, nogil=nogil, parallel=parallel) + def ewm_table( + values: np.ndarray, + begin: np.ndarray, + end: np.ndarray, + minimum_periods: int, + ) -> np.ndarray: + alpha = 1.0 / (1.0 + com) + old_wt_factor = 1.0 - alpha + new_wt = 1.0 if adjust else alpha + old_wt = np.ones(values.shape[1]) + + result = np.empty(values.shape) + weighted = values[0].copy() + nobs = (~np.isnan(weighted)).astype(np.int64) + result[0] = np.where(nobs >= minimum_periods, weighted, np.nan) + for i in range(1, len(values)): + cur = values[i] + is_observations = ~np.isnan(cur) + nobs += is_observations.astype(np.int64) + for j in numba.prange(len(cur)): + if not np.isnan(weighted[j]): + if is_observations[j] or not ignore_na: + if normalize: + # note that len(deltas) = len(vals) - 1 and deltas[i] + # is to be used in conjunction with vals[i+1] + old_wt[j] *= old_wt_factor ** deltas[i - 1] + else: + weighted[j] = old_wt_factor * weighted[j] + if is_observations[j]: + if normalize: + # avoid numerical errors on constant series + if weighted[j] != cur[j]: + weighted[j] = ( + old_wt[j] * weighted[j] + new_wt * cur[j] + ) + if normalize: + weighted[j] = weighted[j] / (old_wt[j] + new_wt) + if adjust: + old_wt[j] += new_wt + else: + old_wt[j] = 1.0 + else: + weighted[j] += cur[j] + elif is_observations[j]: + weighted[j] = cur[j] + + result[i] = np.where(nobs >= minimum_periods, weighted, np.nan) + + return result + + return ewm_table diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/window/online.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/window/online.py new file mode 100644 index 0000000000000000000000000000000000000000..29d1f740e021fd30d3bfc7d72a5e6baefe62ac4c --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/window/online.py @@ -0,0 +1,118 @@ +from __future__ import annotations + +from typing import TYPE_CHECKING + +import numpy as np + +from pandas.compat._optional import import_optional_dependency + + +def generate_online_numba_ewma_func( + nopython: bool, + nogil: bool, + parallel: bool, +): + """ + Generate a numba jitted groupby ewma function specified by values + from engine_kwargs. + + Parameters + ---------- + nopython : bool + nopython to be passed into numba.jit + nogil : bool + nogil to be passed into numba.jit + parallel : bool + parallel to be passed into numba.jit + + Returns + ------- + Numba function + """ + if TYPE_CHECKING: + import numba + else: + numba = import_optional_dependency("numba") + + @numba.jit(nopython=nopython, nogil=nogil, parallel=parallel) + def online_ewma( + values: np.ndarray, + deltas: np.ndarray, + minimum_periods: int, + old_wt_factor: float, + new_wt: float, + old_wt: np.ndarray, + adjust: bool, + ignore_na: bool, + ): + """ + Compute online exponentially weighted mean per column over 2D values. + + Takes the first observation as is, then computes the subsequent + exponentially weighted mean accounting minimum periods. + """ + result = np.empty(values.shape) + weighted_avg = values[0].copy() + nobs = (~np.isnan(weighted_avg)).astype(np.int64) + result[0] = np.where(nobs >= minimum_periods, weighted_avg, np.nan) + + for i in range(1, len(values)): + cur = values[i] + is_observations = ~np.isnan(cur) + nobs += is_observations.astype(np.int64) + for j in numba.prange(len(cur)): + if not np.isnan(weighted_avg[j]): + if is_observations[j] or not ignore_na: + # note that len(deltas) = len(vals) - 1 and deltas[i] is to be + # used in conjunction with vals[i+1] + old_wt[j] *= old_wt_factor ** deltas[j - 1] + if is_observations[j]: + # avoid numerical errors on constant series + if weighted_avg[j] != cur[j]: + weighted_avg[j] = ( + (old_wt[j] * weighted_avg[j]) + (new_wt * cur[j]) + ) / (old_wt[j] + new_wt) + if adjust: + old_wt[j] += new_wt + else: + old_wt[j] = 1.0 + elif is_observations[j]: + weighted_avg[j] = cur[j] + + result[i] = np.where(nobs >= minimum_periods, weighted_avg, np.nan) + + return result, old_wt + + return online_ewma + + +class EWMMeanState: + def __init__(self, com, adjust, ignore_na, axis, shape) -> None: + alpha = 1.0 / (1.0 + com) + self.axis = axis + self.shape = shape + self.adjust = adjust + self.ignore_na = ignore_na + self.new_wt = 1.0 if adjust else alpha + self.old_wt_factor = 1.0 - alpha + self.old_wt = np.ones(self.shape[self.axis - 1]) + self.last_ewm = None + + def run_ewm(self, weighted_avg, deltas, min_periods, ewm_func): + result, old_wt = ewm_func( + weighted_avg, + deltas, + min_periods, + self.old_wt_factor, + self.new_wt, + self.old_wt, + self.adjust, + self.ignore_na, + ) + self.old_wt = old_wt + self.last_ewm = result[-1] + return result + + def reset(self) -> None: + self.old_wt = np.ones(self.shape[self.axis - 1]) + self.last_ewm = None diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/window/rolling.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/window/rolling.py new file mode 100644 index 0000000000000000000000000000000000000000..68cec16ec9eca8b34fadd9e67221219ebd7fe736 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/core/window/rolling.py @@ -0,0 +1,2930 @@ +""" +Provide a generic structure to support window functions, +similar to how we have a Groupby object. +""" +from __future__ import annotations + +import copy +from datetime import timedelta +from functools import partial +import inspect +from textwrap import dedent +from typing import ( + TYPE_CHECKING, + Any, + Callable, + Literal, +) + +import numpy as np + +from pandas._libs.tslibs import ( + BaseOffset, + Timedelta, + to_offset, +) +import pandas._libs.window.aggregations as window_aggregations +from pandas.compat._optional import import_optional_dependency +from pandas.errors import DataError +from pandas.util._decorators import ( + deprecate_kwarg, + doc, +) + +from pandas.core.dtypes.common import ( + ensure_float64, + is_bool, + is_integer, + is_numeric_dtype, + needs_i8_conversion, +) +from pandas.core.dtypes.dtypes import ArrowDtype +from pandas.core.dtypes.generic import ( + ABCDataFrame, + ABCSeries, +) +from pandas.core.dtypes.missing import notna + +from pandas.core._numba import executor +from pandas.core.algorithms import factorize +from pandas.core.apply import ResamplerWindowApply +from pandas.core.arrays import ExtensionArray +from pandas.core.base import SelectionMixin +import pandas.core.common as com +from pandas.core.indexers.objects import ( + BaseIndexer, + FixedWindowIndexer, + GroupbyIndexer, + VariableWindowIndexer, +) +from pandas.core.indexes.api import ( + DatetimeIndex, + Index, + MultiIndex, + PeriodIndex, + TimedeltaIndex, +) +from pandas.core.reshape.concat import concat +from pandas.core.util.numba_ import ( + get_jit_arguments, + maybe_use_numba, +) +from pandas.core.window.common import ( + flex_binary_moment, + zsqrt, +) +from pandas.core.window.doc import ( + _shared_docs, + create_section_header, + kwargs_numeric_only, + kwargs_scipy, + numba_notes, + template_header, + template_returns, + template_see_also, + window_agg_numba_parameters, + window_apply_parameters, +) +from pandas.core.window.numba_ import ( + generate_manual_numpy_nan_agg_with_axis, + generate_numba_apply_func, + generate_numba_table_func, +) + +if TYPE_CHECKING: + from collections.abc import ( + Hashable, + Iterator, + Sized, + ) + + from pandas._typing import ( + ArrayLike, + Axis, + NDFrameT, + QuantileInterpolation, + WindowingRankType, + npt, + ) + + from pandas import ( + DataFrame, + Series, + ) + from pandas.core.generic import NDFrame + from pandas.core.groupby.ops import BaseGrouper + +from pandas.core.arrays.datetimelike import dtype_to_unit + + +class BaseWindow(SelectionMixin): + """Provides utilities for performing windowing operations.""" + + _attributes: list[str] = [] + exclusions: frozenset[Hashable] = frozenset() + _on: Index + + def __init__( + self, + obj: NDFrame, + window=None, + min_periods: int | None = None, + center: bool | None = False, + win_type: str | None = None, + axis: Axis = 0, + on: str | Index | None = None, + closed: str | None = None, + step: int | None = None, + method: str = "single", + *, + selection=None, + ) -> None: + self.obj = obj + self.on = on + self.closed = closed + self.step = step + self.window = window + self.min_periods = min_periods + self.center = center + self.win_type = win_type + self.axis = obj._get_axis_number(axis) if axis is not None else None + self.method = method + self._win_freq_i8: int | None = None + if self.on is None: + if self.axis == 0: + self._on = self.obj.index + else: + # i.e. self.axis == 1 + self._on = self.obj.columns + elif isinstance(self.on, Index): + self._on = self.on + elif isinstance(self.obj, ABCDataFrame) and self.on in self.obj.columns: + self._on = Index(self.obj[self.on]) + else: + raise ValueError( + f"invalid on specified as {self.on}, " + "must be a column (of DataFrame), an Index or None" + ) + + self._selection = selection + self._validate() + + def _validate(self) -> None: + if self.center is not None and not is_bool(self.center): + raise ValueError("center must be a boolean") + if self.min_periods is not None: + if not is_integer(self.min_periods): + raise ValueError("min_periods must be an integer") + if self.min_periods < 0: + raise ValueError("min_periods must be >= 0") + if is_integer(self.window) and self.min_periods > self.window: + raise ValueError( + f"min_periods {self.min_periods} must be <= window {self.window}" + ) + if self.closed is not None and self.closed not in [ + "right", + "both", + "left", + "neither", + ]: + raise ValueError("closed must be 'right', 'left', 'both' or 'neither'") + if not isinstance(self.obj, (ABCSeries, ABCDataFrame)): + raise TypeError(f"invalid type: {type(self)}") + if isinstance(self.window, BaseIndexer): + # Validate that the passed BaseIndexer subclass has + # a get_window_bounds with the correct signature. + get_window_bounds_signature = inspect.signature( + self.window.get_window_bounds + ).parameters.keys() + expected_signature = inspect.signature( + BaseIndexer().get_window_bounds + ).parameters.keys() + if get_window_bounds_signature != expected_signature: + raise ValueError( + f"{type(self.window).__name__} does not implement " + f"the correct signature for get_window_bounds" + ) + if self.method not in ["table", "single"]: + raise ValueError("method must be 'table' or 'single") + if self.step is not None: + if not is_integer(self.step): + raise ValueError("step must be an integer") + if self.step < 0: + raise ValueError("step must be >= 0") + + def _check_window_bounds( + self, start: np.ndarray, end: np.ndarray, num_vals: int + ) -> None: + if len(start) != len(end): + raise ValueError( + f"start ({len(start)}) and end ({len(end)}) bounds must be the " + f"same length" + ) + if len(start) != (num_vals + (self.step or 1) - 1) // (self.step or 1): + raise ValueError( + f"start and end bounds ({len(start)}) must be the same length " + f"as the object ({num_vals}) divided by the step ({self.step}) " + f"if given and rounded up" + ) + + def _slice_axis_for_step(self, index: Index, result: Sized | None = None) -> Index: + """ + Slices the index for a given result and the preset step. + """ + return ( + index + if result is None or len(result) == len(index) + else index[:: self.step] + ) + + def _validate_numeric_only(self, name: str, numeric_only: bool) -> None: + """ + Validate numeric_only argument, raising if invalid for the input. + + Parameters + ---------- + name : str + Name of the operator (kernel). + numeric_only : bool + Value passed by user. + """ + if ( + self._selected_obj.ndim == 1 + and numeric_only + and not is_numeric_dtype(self._selected_obj.dtype) + ): + raise NotImplementedError( + f"{type(self).__name__}.{name} does not implement numeric_only" + ) + + def _make_numeric_only(self, obj: NDFrameT) -> NDFrameT: + """Subset DataFrame to numeric columns. + + Parameters + ---------- + obj : DataFrame + + Returns + ------- + obj subset to numeric-only columns. + """ + result = obj.select_dtypes(include=["number"], exclude=["timedelta"]) + return result + + def _create_data(self, obj: NDFrameT, numeric_only: bool = False) -> NDFrameT: + """ + Split data into blocks & return conformed data. + """ + # filter out the on from the object + if self.on is not None and not isinstance(self.on, Index) and obj.ndim == 2: + obj = obj.reindex(columns=obj.columns.difference([self.on]), copy=False) + if obj.ndim > 1 and (numeric_only or self.axis == 1): + # GH: 20649 in case of mixed dtype and axis=1 we have to convert everything + # to float to calculate the complete row at once. We exclude all non-numeric + # dtypes. + obj = self._make_numeric_only(obj) + if self.axis == 1: + obj = obj.astype("float64", copy=False) + obj._mgr = obj._mgr.consolidate() + return obj + + def _gotitem(self, key, ndim, subset=None): + """ + Sub-classes to define. Return a sliced object. + + Parameters + ---------- + key : str / list of selections + ndim : {1, 2} + requested ndim of result + subset : object, default None + subset to act on + """ + # create a new object to prevent aliasing + if subset is None: + subset = self.obj + + # we need to make a shallow copy of ourselves + # with the same groupby + kwargs = {attr: getattr(self, attr) for attr in self._attributes} + + selection = self._infer_selection(key, subset) + new_win = type(self)(subset, selection=selection, **kwargs) + return new_win + + def __getattr__(self, attr: str): + if attr in self._internal_names_set: + return object.__getattribute__(self, attr) + if attr in self.obj: + return self[attr] + + raise AttributeError( + f"'{type(self).__name__}' object has no attribute '{attr}'" + ) + + def _dir_additions(self): + return self.obj._dir_additions() + + def __repr__(self) -> str: + """ + Provide a nice str repr of our rolling object. + """ + attrs_list = ( + f"{attr_name}={getattr(self, attr_name)}" + for attr_name in self._attributes + if getattr(self, attr_name, None) is not None and attr_name[0] != "_" + ) + attrs = ",".join(attrs_list) + return f"{type(self).__name__} [{attrs}]" + + def __iter__(self) -> Iterator: + obj = self._selected_obj.set_axis(self._on) + obj = self._create_data(obj) + indexer = self._get_window_indexer() + + start, end = indexer.get_window_bounds( + num_values=len(obj), + min_periods=self.min_periods, + center=self.center, + closed=self.closed, + step=self.step, + ) + self._check_window_bounds(start, end, len(obj)) + + for s, e in zip(start, end): + result = obj.iloc[slice(s, e)] + yield result + + def _prep_values(self, values: ArrayLike) -> np.ndarray: + """Convert input to numpy arrays for Cython routines""" + if needs_i8_conversion(values.dtype): + raise NotImplementedError( + f"ops for {type(self).__name__} for this " + f"dtype {values.dtype} are not implemented" + ) + # GH #12373 : rolling functions error on float32 data + # make sure the data is coerced to float64 + try: + if isinstance(values, ExtensionArray): + values = values.to_numpy(np.float64, na_value=np.nan) + else: + values = ensure_float64(values) + except (ValueError, TypeError) as err: + raise TypeError(f"cannot handle this type -> {values.dtype}") from err + + # Convert inf to nan for C funcs + inf = np.isinf(values) + if inf.any(): + values = np.where(inf, np.nan, values) + + return values + + def _insert_on_column(self, result: DataFrame, obj: DataFrame) -> None: + # if we have an 'on' column we want to put it back into + # the results in the same location + from pandas import Series + + if self.on is not None and not self._on.equals(obj.index): + name = self._on.name + extra_col = Series(self._on, index=self.obj.index, name=name, copy=False) + if name in result.columns: + # TODO: sure we want to overwrite results? + result[name] = extra_col + elif name in result.index.names: + pass + elif name in self._selected_obj.columns: + # insert in the same location as we had in _selected_obj + old_cols = self._selected_obj.columns + new_cols = result.columns + old_loc = old_cols.get_loc(name) + overlap = new_cols.intersection(old_cols[:old_loc]) + new_loc = len(overlap) + result.insert(new_loc, name, extra_col) + else: + # insert at the end + result[name] = extra_col + + @property + def _index_array(self) -> npt.NDArray[np.int64] | None: + # TODO: why do we get here with e.g. MultiIndex? + if isinstance(self._on, (PeriodIndex, DatetimeIndex, TimedeltaIndex)): + return self._on.asi8 + elif isinstance(self._on.dtype, ArrowDtype) and self._on.dtype.kind in "mM": + return self._on.to_numpy(dtype=np.int64) + return None + + def _resolve_output(self, out: DataFrame, obj: DataFrame) -> DataFrame: + """Validate and finalize result.""" + if out.shape[1] == 0 and obj.shape[1] > 0: + raise DataError("No numeric types to aggregate") + if out.shape[1] == 0: + return obj.astype("float64") + + self._insert_on_column(out, obj) + return out + + def _get_window_indexer(self) -> BaseIndexer: + """ + Return an indexer class that will compute the window start and end bounds + """ + if isinstance(self.window, BaseIndexer): + return self.window + if self._win_freq_i8 is not None: + return VariableWindowIndexer( + index_array=self._index_array, + window_size=self._win_freq_i8, + center=self.center, + ) + return FixedWindowIndexer(window_size=self.window) + + def _apply_series( + self, homogeneous_func: Callable[..., ArrayLike], name: str | None = None + ) -> Series: + """ + Series version of _apply_columnwise + """ + obj = self._create_data(self._selected_obj) + + if name == "count": + # GH 12541: Special case for count where we support date-like types + obj = notna(obj).astype(int) + try: + values = self._prep_values(obj._values) + except (TypeError, NotImplementedError) as err: + raise DataError("No numeric types to aggregate") from err + + result = homogeneous_func(values) + index = self._slice_axis_for_step(obj.index, result) + return obj._constructor(result, index=index, name=obj.name) + + def _apply_columnwise( + self, + homogeneous_func: Callable[..., ArrayLike], + name: str, + numeric_only: bool = False, + ) -> DataFrame | Series: + """ + Apply the given function to the DataFrame broken down into homogeneous + sub-frames. + """ + self._validate_numeric_only(name, numeric_only) + if self._selected_obj.ndim == 1: + return self._apply_series(homogeneous_func, name) + + obj = self._create_data(self._selected_obj, numeric_only) + if name == "count": + # GH 12541: Special case for count where we support date-like types + obj = notna(obj).astype(int) + obj._mgr = obj._mgr.consolidate() + + if self.axis == 1: + obj = obj.T + + taker = [] + res_values = [] + for i, arr in enumerate(obj._iter_column_arrays()): + # GH#42736 operate column-wise instead of block-wise + # As of 2.0, hfunc will raise for nuisance columns + try: + arr = self._prep_values(arr) + except (TypeError, NotImplementedError) as err: + raise DataError( + f"Cannot aggregate non-numeric type: {arr.dtype}" + ) from err + res = homogeneous_func(arr) + res_values.append(res) + taker.append(i) + + index = self._slice_axis_for_step( + obj.index, res_values[0] if len(res_values) > 0 else None + ) + df = type(obj)._from_arrays( + res_values, + index=index, + columns=obj.columns.take(taker), + verify_integrity=False, + ) + + if self.axis == 1: + df = df.T + + return self._resolve_output(df, obj) + + def _apply_tablewise( + self, + homogeneous_func: Callable[..., ArrayLike], + name: str | None = None, + numeric_only: bool = False, + ) -> DataFrame | Series: + """ + Apply the given function to the DataFrame across the entire object + """ + if self._selected_obj.ndim == 1: + raise ValueError("method='table' not applicable for Series objects.") + obj = self._create_data(self._selected_obj, numeric_only) + values = self._prep_values(obj.to_numpy()) + values = values.T if self.axis == 1 else values + result = homogeneous_func(values) + result = result.T if self.axis == 1 else result + index = self._slice_axis_for_step(obj.index, result) + columns = ( + obj.columns + if result.shape[1] == len(obj.columns) + else obj.columns[:: self.step] + ) + out = obj._constructor(result, index=index, columns=columns) + + return self._resolve_output(out, obj) + + def _apply_pairwise( + self, + target: DataFrame | Series, + other: DataFrame | Series | None, + pairwise: bool | None, + func: Callable[[DataFrame | Series, DataFrame | Series], DataFrame | Series], + numeric_only: bool, + ) -> DataFrame | Series: + """ + Apply the given pairwise function given 2 pandas objects (DataFrame/Series) + """ + target = self._create_data(target, numeric_only) + if other is None: + other = target + # only default unset + pairwise = True if pairwise is None else pairwise + elif not isinstance(other, (ABCDataFrame, ABCSeries)): + raise ValueError("other must be a DataFrame or Series") + elif other.ndim == 2 and numeric_only: + other = self._make_numeric_only(other) + + return flex_binary_moment(target, other, func, pairwise=bool(pairwise)) + + def _apply( + self, + func: Callable[..., Any], + name: str, + numeric_only: bool = False, + numba_args: tuple[Any, ...] = (), + **kwargs, + ): + """ + Rolling statistical measure using supplied function. + + Designed to be used with passed-in Cython array-based functions. + + Parameters + ---------- + func : callable function to apply + name : str, + numba_args : tuple + args to be passed when func is a numba func + **kwargs + additional arguments for rolling function and window function + + Returns + ------- + y : type of input + """ + window_indexer = self._get_window_indexer() + min_periods = ( + self.min_periods + if self.min_periods is not None + else window_indexer.window_size + ) + + def homogeneous_func(values: np.ndarray): + # calculation function + + if values.size == 0: + return values.copy() + + def calc(x): + start, end = window_indexer.get_window_bounds( + num_values=len(x), + min_periods=min_periods, + center=self.center, + closed=self.closed, + step=self.step, + ) + self._check_window_bounds(start, end, len(x)) + + return func(x, start, end, min_periods, *numba_args) + + with np.errstate(all="ignore"): + result = calc(values) + + return result + + if self.method == "single": + return self._apply_columnwise(homogeneous_func, name, numeric_only) + else: + return self._apply_tablewise(homogeneous_func, name, numeric_only) + + def _numba_apply( + self, + func: Callable[..., Any], + engine_kwargs: dict[str, bool] | None = None, + **func_kwargs, + ): + window_indexer = self._get_window_indexer() + min_periods = ( + self.min_periods + if self.min_periods is not None + else window_indexer.window_size + ) + obj = self._create_data(self._selected_obj) + if self.axis == 1: + obj = obj.T + values = self._prep_values(obj.to_numpy()) + if values.ndim == 1: + values = values.reshape(-1, 1) + start, end = window_indexer.get_window_bounds( + num_values=len(values), + min_periods=min_periods, + center=self.center, + closed=self.closed, + step=self.step, + ) + self._check_window_bounds(start, end, len(values)) + # For now, map everything to float to match the Cython impl + # even though it is wrong + # TODO: Could preserve correct dtypes in future + # xref #53214 + dtype_mapping = executor.float_dtype_mapping + aggregator = executor.generate_shared_aggregator( + func, + dtype_mapping, + is_grouped_kernel=False, + **get_jit_arguments(engine_kwargs), + ) + result = aggregator( + values.T, start=start, end=end, min_periods=min_periods, **func_kwargs + ).T + result = result.T if self.axis == 1 else result + index = self._slice_axis_for_step(obj.index, result) + if obj.ndim == 1: + result = result.squeeze() + out = obj._constructor(result, index=index, name=obj.name) + return out + else: + columns = self._slice_axis_for_step(obj.columns, result.T) + out = obj._constructor(result, index=index, columns=columns) + return self._resolve_output(out, obj) + + def aggregate(self, func, *args, **kwargs): + result = ResamplerWindowApply(self, func, args=args, kwargs=kwargs).agg() + if result is None: + return self.apply(func, raw=False, args=args, kwargs=kwargs) + return result + + agg = aggregate + + +class BaseWindowGroupby(BaseWindow): + """ + Provide the groupby windowing facilities. + """ + + _grouper: BaseGrouper + _as_index: bool + _attributes: list[str] = ["_grouper"] + + def __init__( + self, + obj: DataFrame | Series, + *args, + _grouper: BaseGrouper, + _as_index: bool = True, + **kwargs, + ) -> None: + from pandas.core.groupby.ops import BaseGrouper + + if not isinstance(_grouper, BaseGrouper): + raise ValueError("Must pass a BaseGrouper object.") + self._grouper = _grouper + self._as_index = _as_index + # GH 32262: It's convention to keep the grouping column in + # groupby., but unexpected to users in + # groupby.rolling. + obj = obj.drop(columns=self._grouper.names, errors="ignore") + # GH 15354 + if kwargs.get("step") is not None: + raise NotImplementedError("step not implemented for groupby") + super().__init__(obj, *args, **kwargs) + + def _apply( + self, + func: Callable[..., Any], + name: str, + numeric_only: bool = False, + numba_args: tuple[Any, ...] = (), + **kwargs, + ) -> DataFrame | Series: + result = super()._apply( + func, + name, + numeric_only, + numba_args, + **kwargs, + ) + # Reconstruct the resulting MultiIndex + # 1st set of levels = group by labels + # 2nd set of levels = original DataFrame/Series index + grouped_object_index = self.obj.index + grouped_index_name = [*grouped_object_index.names] + groupby_keys = copy.copy(self._grouper.names) + result_index_names = groupby_keys + grouped_index_name + + drop_columns = [ + key + for key in self._grouper.names + if key not in self.obj.index.names or key is None + ] + + if len(drop_columns) != len(groupby_keys): + # Our result will have still kept the column in the result + result = result.drop(columns=drop_columns, errors="ignore") + + codes = self._grouper.codes + levels = copy.copy(self._grouper.levels) + + group_indices = self._grouper.indices.values() + if group_indices: + indexer = np.concatenate(list(group_indices)) + else: + indexer = np.array([], dtype=np.intp) + codes = [c.take(indexer) for c in codes] + + # if the index of the original dataframe needs to be preserved, append + # this index (but reordered) to the codes/levels from the groupby + if grouped_object_index is not None: + idx = grouped_object_index.take(indexer) + if not isinstance(idx, MultiIndex): + idx = MultiIndex.from_arrays([idx]) + codes.extend(list(idx.codes)) + levels.extend(list(idx.levels)) + + result_index = MultiIndex( + levels, codes, names=result_index_names, verify_integrity=False + ) + + result.index = result_index + if not self._as_index: + result = result.reset_index(level=list(range(len(groupby_keys)))) + return result + + def _apply_pairwise( + self, + target: DataFrame | Series, + other: DataFrame | Series | None, + pairwise: bool | None, + func: Callable[[DataFrame | Series, DataFrame | Series], DataFrame | Series], + numeric_only: bool, + ) -> DataFrame | Series: + """ + Apply the given pairwise function given 2 pandas objects (DataFrame/Series) + """ + # Manually drop the grouping column first + target = target.drop(columns=self._grouper.names, errors="ignore") + result = super()._apply_pairwise(target, other, pairwise, func, numeric_only) + # 1) Determine the levels + codes of the groupby levels + if other is not None and not all( + len(group) == len(other) for group in self._grouper.indices.values() + ): + # GH 42915 + # len(other) != len(any group), so must reindex (expand) the result + # from flex_binary_moment to a "transform"-like result + # per groupby combination + old_result_len = len(result) + result = concat( + [ + result.take(gb_indices).reindex(result.index) + for gb_indices in self._grouper.indices.values() + ] + ) + + gb_pairs = ( + com.maybe_make_list(pair) for pair in self._grouper.indices.keys() + ) + groupby_codes = [] + groupby_levels = [] + # e.g. [[1, 2], [4, 5]] as [[1, 4], [2, 5]] + for gb_level_pair in map(list, zip(*gb_pairs)): + labels = np.repeat(np.array(gb_level_pair), old_result_len) + codes, levels = factorize(labels) + groupby_codes.append(codes) + groupby_levels.append(levels) + else: + # pairwise=True or len(other) == len(each group), so repeat + # the groupby labels by the number of columns in the original object + groupby_codes = self._grouper.codes + # error: Incompatible types in assignment (expression has type + # "List[Index]", variable has type "List[Union[ndarray, Index]]") + groupby_levels = self._grouper.levels # type: ignore[assignment] + + group_indices = self._grouper.indices.values() + if group_indices: + indexer = np.concatenate(list(group_indices)) + else: + indexer = np.array([], dtype=np.intp) + + if target.ndim == 1: + repeat_by = 1 + else: + repeat_by = len(target.columns) + groupby_codes = [ + np.repeat(c.take(indexer), repeat_by) for c in groupby_codes + ] + # 2) Determine the levels + codes of the result from super()._apply_pairwise + if isinstance(result.index, MultiIndex): + result_codes = list(result.index.codes) + result_levels = list(result.index.levels) + result_names = list(result.index.names) + else: + idx_codes, idx_levels = factorize(result.index) + result_codes = [idx_codes] + result_levels = [idx_levels] + result_names = [result.index.name] + + # 3) Create the resulting index by combining 1) + 2) + result_codes = groupby_codes + result_codes + result_levels = groupby_levels + result_levels + result_names = self._grouper.names + result_names + + result_index = MultiIndex( + result_levels, result_codes, names=result_names, verify_integrity=False + ) + result.index = result_index + return result + + def _create_data(self, obj: NDFrameT, numeric_only: bool = False) -> NDFrameT: + """ + Split data into blocks & return conformed data. + """ + # Ensure the object we're rolling over is monotonically sorted relative + # to the groups + # GH 36197 + if not obj.empty: + groupby_order = np.concatenate(list(self._grouper.indices.values())).astype( + np.int64 + ) + obj = obj.take(groupby_order) + return super()._create_data(obj, numeric_only) + + def _gotitem(self, key, ndim, subset=None): + # we are setting the index on the actual object + # here so our index is carried through to the selected obj + # when we do the splitting for the groupby + if self.on is not None: + # GH 43355 + subset = self.obj.set_index(self._on) + return super()._gotitem(key, ndim, subset=subset) + + +class Window(BaseWindow): + """ + Provide rolling window calculations. + + Parameters + ---------- + window : int, timedelta, str, offset, or BaseIndexer subclass + Size of the moving window. + + If an integer, the fixed number of observations used for + each window. + + If a timedelta, str, or offset, the time period of each window. Each + window will be a variable sized based on the observations included in + the time-period. This is only valid for datetimelike indexes. + To learn more about the offsets & frequency strings, please see `this link + `__. + + If a BaseIndexer subclass, the window boundaries + based on the defined ``get_window_bounds`` method. Additional rolling + keyword arguments, namely ``min_periods``, ``center``, ``closed`` and + ``step`` will be passed to ``get_window_bounds``. + + min_periods : int, default None + Minimum number of observations in window required to have a value; + otherwise, result is ``np.nan``. + + For a window that is specified by an offset, ``min_periods`` will default to 1. + + For a window that is specified by an integer, ``min_periods`` will default + to the size of the window. + + center : bool, default False + If False, set the window labels as the right edge of the window index. + + If True, set the window labels as the center of the window index. + + win_type : str, default None + If ``None``, all points are evenly weighted. + + If a string, it must be a valid `scipy.signal window function + `__. + + Certain Scipy window types require additional parameters to be passed + in the aggregation function. The additional parameters must match + the keywords specified in the Scipy window type method signature. + + on : str, optional + For a DataFrame, a column label or Index level on which + to calculate the rolling window, rather than the DataFrame's index. + + Provided integer column is ignored and excluded from result since + an integer index is not used to calculate the rolling window. + + axis : int or str, default 0 + If ``0`` or ``'index'``, roll across the rows. + + If ``1`` or ``'columns'``, roll across the columns. + + For `Series` this parameter is unused and defaults to 0. + + .. deprecated:: 2.1.0 + + The axis keyword is deprecated. For ``axis=1``, + transpose the DataFrame first instead. + + closed : str, default None + If ``'right'``, the first point in the window is excluded from calculations. + + If ``'left'``, the last point in the window is excluded from calculations. + + If ``'both'``, the no points in the window are excluded from calculations. + + If ``'neither'``, the first and last points in the window are excluded + from calculations. + + Default ``None`` (``'right'``). + + step : int, default None + + .. versionadded:: 1.5.0 + + Evaluate the window at every ``step`` result, equivalent to slicing as + ``[::step]``. ``window`` must be an integer. Using a step argument other + than None or 1 will produce a result with a different shape than the input. + + method : str {'single', 'table'}, default 'single' + + .. versionadded:: 1.3.0 + + Execute the rolling operation per single column or row (``'single'``) + or over the entire object (``'table'``). + + This argument is only implemented when specifying ``engine='numba'`` + in the method call. + + Returns + ------- + pandas.api.typing.Window or pandas.api.typing.Rolling + An instance of Window is returned if ``win_type`` is passed. Otherwise, + an instance of Rolling is returned. + + See Also + -------- + expanding : Provides expanding transformations. + ewm : Provides exponential weighted functions. + + Notes + ----- + See :ref:`Windowing Operations ` for further usage details + and examples. + + Examples + -------- + >>> df = pd.DataFrame({'B': [0, 1, 2, np.nan, 4]}) + >>> df + B + 0 0.0 + 1 1.0 + 2 2.0 + 3 NaN + 4 4.0 + + **window** + + Rolling sum with a window length of 2 observations. + + >>> df.rolling(2).sum() + B + 0 NaN + 1 1.0 + 2 3.0 + 3 NaN + 4 NaN + + Rolling sum with a window span of 2 seconds. + + >>> df_time = pd.DataFrame({'B': [0, 1, 2, np.nan, 4]}, + ... index=[pd.Timestamp('20130101 09:00:00'), + ... pd.Timestamp('20130101 09:00:02'), + ... pd.Timestamp('20130101 09:00:03'), + ... pd.Timestamp('20130101 09:00:05'), + ... pd.Timestamp('20130101 09:00:06')]) + + >>> df_time + B + 2013-01-01 09:00:00 0.0 + 2013-01-01 09:00:02 1.0 + 2013-01-01 09:00:03 2.0 + 2013-01-01 09:00:05 NaN + 2013-01-01 09:00:06 4.0 + + >>> df_time.rolling('2s').sum() + B + 2013-01-01 09:00:00 0.0 + 2013-01-01 09:00:02 1.0 + 2013-01-01 09:00:03 3.0 + 2013-01-01 09:00:05 NaN + 2013-01-01 09:00:06 4.0 + + Rolling sum with forward looking windows with 2 observations. + + >>> indexer = pd.api.indexers.FixedForwardWindowIndexer(window_size=2) + >>> df.rolling(window=indexer, min_periods=1).sum() + B + 0 1.0 + 1 3.0 + 2 2.0 + 3 4.0 + 4 4.0 + + **min_periods** + + Rolling sum with a window length of 2 observations, but only needs a minimum of 1 + observation to calculate a value. + + >>> df.rolling(2, min_periods=1).sum() + B + 0 0.0 + 1 1.0 + 2 3.0 + 3 2.0 + 4 4.0 + + **center** + + Rolling sum with the result assigned to the center of the window index. + + >>> df.rolling(3, min_periods=1, center=True).sum() + B + 0 1.0 + 1 3.0 + 2 3.0 + 3 6.0 + 4 4.0 + + >>> df.rolling(3, min_periods=1, center=False).sum() + B + 0 0.0 + 1 1.0 + 2 3.0 + 3 3.0 + 4 6.0 + + **step** + + Rolling sum with a window length of 2 observations, minimum of 1 observation to + calculate a value, and a step of 2. + + >>> df.rolling(2, min_periods=1, step=2).sum() + B + 0 0.0 + 2 3.0 + 4 4.0 + + **win_type** + + Rolling sum with a window length of 2, using the Scipy ``'gaussian'`` + window type. ``std`` is required in the aggregation function. + + >>> df.rolling(2, win_type='gaussian').sum(std=3) + B + 0 NaN + 1 0.986207 + 2 2.958621 + 3 NaN + 4 NaN + + **on** + + Rolling sum with a window length of 2 days. + + >>> df = pd.DataFrame({ + ... 'A': [pd.to_datetime('2020-01-01'), + ... pd.to_datetime('2020-01-01'), + ... pd.to_datetime('2020-01-02'),], + ... 'B': [1, 2, 3], }, + ... index=pd.date_range('2020', periods=3)) + + >>> df + A B + 2020-01-01 2020-01-01 1 + 2020-01-02 2020-01-01 2 + 2020-01-03 2020-01-02 3 + + >>> df.rolling('2D', on='A').sum() + A B + 2020-01-01 2020-01-01 1.0 + 2020-01-02 2020-01-01 3.0 + 2020-01-03 2020-01-02 6.0 + """ + + _attributes = [ + "window", + "min_periods", + "center", + "win_type", + "axis", + "on", + "closed", + "step", + "method", + ] + + def _validate(self): + super()._validate() + + if not isinstance(self.win_type, str): + raise ValueError(f"Invalid win_type {self.win_type}") + signal = import_optional_dependency( + "scipy.signal.windows", extra="Scipy is required to generate window weight." + ) + self._scipy_weight_generator = getattr(signal, self.win_type, None) + if self._scipy_weight_generator is None: + raise ValueError(f"Invalid win_type {self.win_type}") + + if isinstance(self.window, BaseIndexer): + raise NotImplementedError( + "BaseIndexer subclasses not implemented with win_types." + ) + if not is_integer(self.window) or self.window < 0: + raise ValueError("window must be an integer 0 or greater") + + if self.method != "single": + raise NotImplementedError("'single' is the only supported method type.") + + def _center_window(self, result: np.ndarray, offset: int) -> np.ndarray: + """ + Center the result in the window for weighted rolling aggregations. + """ + if offset > 0: + lead_indexer = [slice(offset, None)] + result = np.copy(result[tuple(lead_indexer)]) + return result + + def _apply( + self, + func: Callable[[np.ndarray, int, int], np.ndarray], + name: str, + numeric_only: bool = False, + numba_args: tuple[Any, ...] = (), + **kwargs, + ): + """ + Rolling with weights statistical measure using supplied function. + + Designed to be used with passed-in Cython array-based functions. + + Parameters + ---------- + func : callable function to apply + name : str, + numeric_only : bool, default False + Whether to only operate on bool, int, and float columns + numba_args : tuple + unused + **kwargs + additional arguments for scipy windows if necessary + + Returns + ------- + y : type of input + """ + # "None" not callable [misc] + window = self._scipy_weight_generator( # type: ignore[misc] + self.window, **kwargs + ) + offset = (len(window) - 1) // 2 if self.center else 0 + + def homogeneous_func(values: np.ndarray): + # calculation function + + if values.size == 0: + return values.copy() + + def calc(x): + additional_nans = np.array([np.nan] * offset) + x = np.concatenate((x, additional_nans)) + return func( + x, + window, + self.min_periods if self.min_periods is not None else len(window), + ) + + with np.errstate(all="ignore"): + # Our weighted aggregations return memoryviews + result = np.asarray(calc(values)) + + if self.center: + result = self._center_window(result, offset) + + return result + + return self._apply_columnwise(homogeneous_func, name, numeric_only)[ + :: self.step + ] + + @doc( + _shared_docs["aggregate"], + see_also=dedent( + """ + See Also + -------- + pandas.DataFrame.aggregate : Similar DataFrame method. + pandas.Series.aggregate : Similar Series method. + """ + ), + examples=dedent( + """ + Examples + -------- + >>> df = pd.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6], "C": [7, 8, 9]}) + >>> df + A B C + 0 1 4 7 + 1 2 5 8 + 2 3 6 9 + + >>> df.rolling(2, win_type="boxcar").agg("mean") + A B C + 0 NaN NaN NaN + 1 1.5 4.5 7.5 + 2 2.5 5.5 8.5 + """ + ), + klass="Series/DataFrame", + axis="", + ) + def aggregate(self, func, *args, **kwargs): + result = ResamplerWindowApply(self, func, args=args, kwargs=kwargs).agg() + if result is None: + # these must apply directly + result = func(self) + + return result + + agg = aggregate + + @doc( + template_header, + create_section_header("Parameters"), + kwargs_numeric_only, + kwargs_scipy, + create_section_header("Returns"), + template_returns, + create_section_header("See Also"), + template_see_also, + create_section_header("Examples"), + dedent( + """\ + >>> ser = pd.Series([0, 1, 5, 2, 8]) + + To get an instance of :class:`~pandas.core.window.rolling.Window` we need + to pass the parameter `win_type`. + + >>> type(ser.rolling(2, win_type='gaussian')) + + + In order to use the `SciPy` Gaussian window we need to provide the parameters + `M` and `std`. The parameter `M` corresponds to 2 in our example. + We pass the second parameter `std` as a parameter of the following method + (`sum` in this case): + + >>> ser.rolling(2, win_type='gaussian').sum(std=3) + 0 NaN + 1 0.986207 + 2 5.917243 + 3 6.903450 + 4 9.862071 + dtype: float64 + """ + ), + window_method="rolling", + aggregation_description="weighted window sum", + agg_method="sum", + ) + def sum(self, numeric_only: bool = False, **kwargs): + window_func = window_aggregations.roll_weighted_sum + # error: Argument 1 to "_apply" of "Window" has incompatible type + # "Callable[[ndarray, ndarray, int], ndarray]"; expected + # "Callable[[ndarray, int, int], ndarray]" + return self._apply( + window_func, # type: ignore[arg-type] + name="sum", + numeric_only=numeric_only, + **kwargs, + ) + + @doc( + template_header, + create_section_header("Parameters"), + kwargs_numeric_only, + kwargs_scipy, + create_section_header("Returns"), + template_returns, + create_section_header("See Also"), + template_see_also, + create_section_header("Examples"), + dedent( + """\ + >>> ser = pd.Series([0, 1, 5, 2, 8]) + + To get an instance of :class:`~pandas.core.window.rolling.Window` we need + to pass the parameter `win_type`. + + >>> type(ser.rolling(2, win_type='gaussian')) + + + In order to use the `SciPy` Gaussian window we need to provide the parameters + `M` and `std`. The parameter `M` corresponds to 2 in our example. + We pass the second parameter `std` as a parameter of the following method: + + >>> ser.rolling(2, win_type='gaussian').mean(std=3) + 0 NaN + 1 0.5 + 2 3.0 + 3 3.5 + 4 5.0 + dtype: float64 + """ + ), + window_method="rolling", + aggregation_description="weighted window mean", + agg_method="mean", + ) + def mean(self, numeric_only: bool = False, **kwargs): + window_func = window_aggregations.roll_weighted_mean + # error: Argument 1 to "_apply" of "Window" has incompatible type + # "Callable[[ndarray, ndarray, int], ndarray]"; expected + # "Callable[[ndarray, int, int], ndarray]" + return self._apply( + window_func, # type: ignore[arg-type] + name="mean", + numeric_only=numeric_only, + **kwargs, + ) + + @doc( + template_header, + create_section_header("Parameters"), + kwargs_numeric_only, + kwargs_scipy, + create_section_header("Returns"), + template_returns, + create_section_header("See Also"), + template_see_also, + create_section_header("Examples"), + dedent( + """\ + >>> ser = pd.Series([0, 1, 5, 2, 8]) + + To get an instance of :class:`~pandas.core.window.rolling.Window` we need + to pass the parameter `win_type`. + + >>> type(ser.rolling(2, win_type='gaussian')) + + + In order to use the `SciPy` Gaussian window we need to provide the parameters + `M` and `std`. The parameter `M` corresponds to 2 in our example. + We pass the second parameter `std` as a parameter of the following method: + + >>> ser.rolling(2, win_type='gaussian').var(std=3) + 0 NaN + 1 0.5 + 2 8.0 + 3 4.5 + 4 18.0 + dtype: float64 + """ + ), + window_method="rolling", + aggregation_description="weighted window variance", + agg_method="var", + ) + def var(self, ddof: int = 1, numeric_only: bool = False, **kwargs): + window_func = partial(window_aggregations.roll_weighted_var, ddof=ddof) + kwargs.pop("name", None) + return self._apply(window_func, name="var", numeric_only=numeric_only, **kwargs) + + @doc( + template_header, + create_section_header("Parameters"), + kwargs_numeric_only, + kwargs_scipy, + create_section_header("Returns"), + template_returns, + create_section_header("See Also"), + template_see_also, + create_section_header("Examples"), + dedent( + """\ + >>> ser = pd.Series([0, 1, 5, 2, 8]) + + To get an instance of :class:`~pandas.core.window.rolling.Window` we need + to pass the parameter `win_type`. + + >>> type(ser.rolling(2, win_type='gaussian')) + + + In order to use the `SciPy` Gaussian window we need to provide the parameters + `M` and `std`. The parameter `M` corresponds to 2 in our example. + We pass the second parameter `std` as a parameter of the following method: + + >>> ser.rolling(2, win_type='gaussian').std(std=3) + 0 NaN + 1 0.707107 + 2 2.828427 + 3 2.121320 + 4 4.242641 + dtype: float64 + """ + ), + window_method="rolling", + aggregation_description="weighted window standard deviation", + agg_method="std", + ) + def std(self, ddof: int = 1, numeric_only: bool = False, **kwargs): + return zsqrt( + self.var(ddof=ddof, name="std", numeric_only=numeric_only, **kwargs) + ) + + +class RollingAndExpandingMixin(BaseWindow): + def count(self, numeric_only: bool = False): + window_func = window_aggregations.roll_sum + return self._apply(window_func, name="count", numeric_only=numeric_only) + + def apply( + self, + func: Callable[..., Any], + raw: bool = False, + engine: Literal["cython", "numba"] | None = None, + engine_kwargs: dict[str, bool] | None = None, + args: tuple[Any, ...] | None = None, + kwargs: dict[str, Any] | None = None, + ): + if args is None: + args = () + if kwargs is None: + kwargs = {} + + if not is_bool(raw): + raise ValueError("raw parameter must be `True` or `False`") + + numba_args: tuple[Any, ...] = () + if maybe_use_numba(engine): + if raw is False: + raise ValueError("raw must be `True` when using the numba engine") + numba_args = args + if self.method == "single": + apply_func = generate_numba_apply_func( + func, **get_jit_arguments(engine_kwargs, kwargs) + ) + else: + apply_func = generate_numba_table_func( + func, **get_jit_arguments(engine_kwargs, kwargs) + ) + elif engine in ("cython", None): + if engine_kwargs is not None: + raise ValueError("cython engine does not accept engine_kwargs") + apply_func = self._generate_cython_apply_func(args, kwargs, raw, func) + else: + raise ValueError("engine must be either 'numba' or 'cython'") + + return self._apply( + apply_func, + name="apply", + numba_args=numba_args, + ) + + def _generate_cython_apply_func( + self, + args: tuple[Any, ...], + kwargs: dict[str, Any], + raw: bool | np.bool_, + function: Callable[..., Any], + ) -> Callable[[np.ndarray, np.ndarray, np.ndarray, int], np.ndarray]: + from pandas import Series + + window_func = partial( + window_aggregations.roll_apply, + args=args, + kwargs=kwargs, + raw=raw, + function=function, + ) + + def apply_func(values, begin, end, min_periods, raw=raw): + if not raw: + # GH 45912 + values = Series(values, index=self._on, copy=False) + return window_func(values, begin, end, min_periods) + + return apply_func + + def sum( + self, + numeric_only: bool = False, + engine: Literal["cython", "numba"] | None = None, + engine_kwargs: dict[str, bool] | None = None, + ): + if maybe_use_numba(engine): + if self.method == "table": + func = generate_manual_numpy_nan_agg_with_axis(np.nansum) + return self.apply( + func, + raw=True, + engine=engine, + engine_kwargs=engine_kwargs, + ) + else: + from pandas.core._numba.kernels import sliding_sum + + return self._numba_apply(sliding_sum, engine_kwargs) + window_func = window_aggregations.roll_sum + return self._apply(window_func, name="sum", numeric_only=numeric_only) + + def max( + self, + numeric_only: bool = False, + engine: Literal["cython", "numba"] | None = None, + engine_kwargs: dict[str, bool] | None = None, + ): + if maybe_use_numba(engine): + if self.method == "table": + func = generate_manual_numpy_nan_agg_with_axis(np.nanmax) + return self.apply( + func, + raw=True, + engine=engine, + engine_kwargs=engine_kwargs, + ) + else: + from pandas.core._numba.kernels import sliding_min_max + + return self._numba_apply(sliding_min_max, engine_kwargs, is_max=True) + window_func = window_aggregations.roll_max + return self._apply(window_func, name="max", numeric_only=numeric_only) + + def min( + self, + numeric_only: bool = False, + engine: Literal["cython", "numba"] | None = None, + engine_kwargs: dict[str, bool] | None = None, + ): + if maybe_use_numba(engine): + if self.method == "table": + func = generate_manual_numpy_nan_agg_with_axis(np.nanmin) + return self.apply( + func, + raw=True, + engine=engine, + engine_kwargs=engine_kwargs, + ) + else: + from pandas.core._numba.kernels import sliding_min_max + + return self._numba_apply(sliding_min_max, engine_kwargs, is_max=False) + window_func = window_aggregations.roll_min + return self._apply(window_func, name="min", numeric_only=numeric_only) + + def mean( + self, + numeric_only: bool = False, + engine: Literal["cython", "numba"] | None = None, + engine_kwargs: dict[str, bool] | None = None, + ): + if maybe_use_numba(engine): + if self.method == "table": + func = generate_manual_numpy_nan_agg_with_axis(np.nanmean) + return self.apply( + func, + raw=True, + engine=engine, + engine_kwargs=engine_kwargs, + ) + else: + from pandas.core._numba.kernels import sliding_mean + + return self._numba_apply(sliding_mean, engine_kwargs) + window_func = window_aggregations.roll_mean + return self._apply(window_func, name="mean", numeric_only=numeric_only) + + def median( + self, + numeric_only: bool = False, + engine: Literal["cython", "numba"] | None = None, + engine_kwargs: dict[str, bool] | None = None, + ): + if maybe_use_numba(engine): + if self.method == "table": + func = generate_manual_numpy_nan_agg_with_axis(np.nanmedian) + else: + func = np.nanmedian + + return self.apply( + func, + raw=True, + engine=engine, + engine_kwargs=engine_kwargs, + ) + window_func = window_aggregations.roll_median_c + return self._apply(window_func, name="median", numeric_only=numeric_only) + + def std( + self, + ddof: int = 1, + numeric_only: bool = False, + engine: Literal["cython", "numba"] | None = None, + engine_kwargs: dict[str, bool] | None = None, + ): + if maybe_use_numba(engine): + if self.method == "table": + raise NotImplementedError("std not supported with method='table'") + from pandas.core._numba.kernels import sliding_var + + return zsqrt(self._numba_apply(sliding_var, engine_kwargs, ddof=ddof)) + window_func = window_aggregations.roll_var + + def zsqrt_func(values, begin, end, min_periods): + return zsqrt(window_func(values, begin, end, min_periods, ddof=ddof)) + + return self._apply( + zsqrt_func, + name="std", + numeric_only=numeric_only, + ) + + def var( + self, + ddof: int = 1, + numeric_only: bool = False, + engine: Literal["cython", "numba"] | None = None, + engine_kwargs: dict[str, bool] | None = None, + ): + if maybe_use_numba(engine): + if self.method == "table": + raise NotImplementedError("var not supported with method='table'") + from pandas.core._numba.kernels import sliding_var + + return self._numba_apply(sliding_var, engine_kwargs, ddof=ddof) + window_func = partial(window_aggregations.roll_var, ddof=ddof) + return self._apply( + window_func, + name="var", + numeric_only=numeric_only, + ) + + def skew(self, numeric_only: bool = False): + window_func = window_aggregations.roll_skew + return self._apply( + window_func, + name="skew", + numeric_only=numeric_only, + ) + + def sem(self, ddof: int = 1, numeric_only: bool = False): + # Raise here so error message says sem instead of std + self._validate_numeric_only("sem", numeric_only) + return self.std(numeric_only=numeric_only) / ( + self.count(numeric_only=numeric_only) - ddof + ).pow(0.5) + + def kurt(self, numeric_only: bool = False): + window_func = window_aggregations.roll_kurt + return self._apply( + window_func, + name="kurt", + numeric_only=numeric_only, + ) + + def quantile( + self, + q: float, + interpolation: QuantileInterpolation = "linear", + numeric_only: bool = False, + ): + if q == 1.0: + window_func = window_aggregations.roll_max + elif q == 0.0: + window_func = window_aggregations.roll_min + else: + window_func = partial( + window_aggregations.roll_quantile, + quantile=q, + interpolation=interpolation, + ) + + return self._apply(window_func, name="quantile", numeric_only=numeric_only) + + def rank( + self, + method: WindowingRankType = "average", + ascending: bool = True, + pct: bool = False, + numeric_only: bool = False, + ): + window_func = partial( + window_aggregations.roll_rank, + method=method, + ascending=ascending, + percentile=pct, + ) + + return self._apply(window_func, name="rank", numeric_only=numeric_only) + + def cov( + self, + other: DataFrame | Series | None = None, + pairwise: bool | None = None, + ddof: int = 1, + numeric_only: bool = False, + ): + if self.step is not None: + raise NotImplementedError("step not implemented for cov") + self._validate_numeric_only("cov", numeric_only) + + from pandas import Series + + def cov_func(x, y): + x_array = self._prep_values(x) + y_array = self._prep_values(y) + window_indexer = self._get_window_indexer() + min_periods = ( + self.min_periods + if self.min_periods is not None + else window_indexer.window_size + ) + start, end = window_indexer.get_window_bounds( + num_values=len(x_array), + min_periods=min_periods, + center=self.center, + closed=self.closed, + step=self.step, + ) + self._check_window_bounds(start, end, len(x_array)) + + with np.errstate(all="ignore"): + mean_x_y = window_aggregations.roll_mean( + x_array * y_array, start, end, min_periods + ) + mean_x = window_aggregations.roll_mean(x_array, start, end, min_periods) + mean_y = window_aggregations.roll_mean(y_array, start, end, min_periods) + count_x_y = window_aggregations.roll_sum( + notna(x_array + y_array).astype(np.float64), start, end, 0 + ) + result = (mean_x_y - mean_x * mean_y) * (count_x_y / (count_x_y - ddof)) + return Series(result, index=x.index, name=x.name, copy=False) + + return self._apply_pairwise( + self._selected_obj, other, pairwise, cov_func, numeric_only + ) + + def corr( + self, + other: DataFrame | Series | None = None, + pairwise: bool | None = None, + ddof: int = 1, + numeric_only: bool = False, + ): + if self.step is not None: + raise NotImplementedError("step not implemented for corr") + self._validate_numeric_only("corr", numeric_only) + + from pandas import Series + + def corr_func(x, y): + x_array = self._prep_values(x) + y_array = self._prep_values(y) + window_indexer = self._get_window_indexer() + min_periods = ( + self.min_periods + if self.min_periods is not None + else window_indexer.window_size + ) + start, end = window_indexer.get_window_bounds( + num_values=len(x_array), + min_periods=min_periods, + center=self.center, + closed=self.closed, + step=self.step, + ) + self._check_window_bounds(start, end, len(x_array)) + + with np.errstate(all="ignore"): + mean_x_y = window_aggregations.roll_mean( + x_array * y_array, start, end, min_periods + ) + mean_x = window_aggregations.roll_mean(x_array, start, end, min_periods) + mean_y = window_aggregations.roll_mean(y_array, start, end, min_periods) + count_x_y = window_aggregations.roll_sum( + notna(x_array + y_array).astype(np.float64), start, end, 0 + ) + x_var = window_aggregations.roll_var( + x_array, start, end, min_periods, ddof + ) + y_var = window_aggregations.roll_var( + y_array, start, end, min_periods, ddof + ) + numerator = (mean_x_y - mean_x * mean_y) * ( + count_x_y / (count_x_y - ddof) + ) + denominator = (x_var * y_var) ** 0.5 + result = numerator / denominator + return Series(result, index=x.index, name=x.name, copy=False) + + return self._apply_pairwise( + self._selected_obj, other, pairwise, corr_func, numeric_only + ) + + +class Rolling(RollingAndExpandingMixin): + _attributes: list[str] = [ + "window", + "min_periods", + "center", + "win_type", + "axis", + "on", + "closed", + "step", + "method", + ] + + def _validate(self): + super()._validate() + + # we allow rolling on a datetimelike index + if ( + self.obj.empty + or isinstance(self._on, (DatetimeIndex, TimedeltaIndex, PeriodIndex)) + or (isinstance(self._on.dtype, ArrowDtype) and self._on.dtype.kind in "mM") + ) and isinstance(self.window, (str, BaseOffset, timedelta)): + self._validate_datetimelike_monotonic() + + # this will raise ValueError on non-fixed freqs + try: + freq = to_offset(self.window) + except (TypeError, ValueError) as err: + raise ValueError( + f"passed window {self.window} is not " + "compatible with a datetimelike index" + ) from err + if isinstance(self._on, PeriodIndex): + # error: Incompatible types in assignment (expression has type + # "float", variable has type "Optional[int]") + self._win_freq_i8 = freq.nanos / ( # type: ignore[assignment] + self._on.freq.nanos / self._on.freq.n + ) + else: + try: + unit = dtype_to_unit(self._on.dtype) # type: ignore[arg-type] + except TypeError: + # if not a datetime dtype, eg for empty dataframes + unit = "ns" + self._win_freq_i8 = Timedelta(freq.nanos).as_unit(unit)._value + + # min_periods must be an integer + if self.min_periods is None: + self.min_periods = 1 + + if self.step is not None: + raise NotImplementedError( + "step is not supported with frequency windows" + ) + + elif isinstance(self.window, BaseIndexer): + # Passed BaseIndexer subclass should handle all other rolling kwargs + pass + elif not is_integer(self.window) or self.window < 0: + raise ValueError("window must be an integer 0 or greater") + + def _validate_datetimelike_monotonic(self) -> None: + """ + Validate self._on is monotonic (increasing or decreasing) and has + no NaT values for frequency windows. + """ + if self._on.hasnans: + self._raise_monotonic_error("values must not have NaT") + if not (self._on.is_monotonic_increasing or self._on.is_monotonic_decreasing): + self._raise_monotonic_error("values must be monotonic") + + def _raise_monotonic_error(self, msg: str): + on = self.on + if on is None: + if self.axis == 0: + on = "index" + else: + on = "column" + raise ValueError(f"{on} {msg}") + + @doc( + _shared_docs["aggregate"], + see_also=dedent( + """ + See Also + -------- + pandas.Series.rolling : Calling object with Series data. + pandas.DataFrame.rolling : Calling object with DataFrame data. + """ + ), + examples=dedent( + """ + Examples + -------- + >>> df = pd.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6], "C": [7, 8, 9]}) + >>> df + A B C + 0 1 4 7 + 1 2 5 8 + 2 3 6 9 + + >>> df.rolling(2).sum() + A B C + 0 NaN NaN NaN + 1 3.0 9.0 15.0 + 2 5.0 11.0 17.0 + + >>> df.rolling(2).agg({"A": "sum", "B": "min"}) + A B + 0 NaN NaN + 1 3.0 4.0 + 2 5.0 5.0 + """ + ), + klass="Series/Dataframe", + axis="", + ) + def aggregate(self, func, *args, **kwargs): + return super().aggregate(func, *args, **kwargs) + + agg = aggregate + + @doc( + template_header, + create_section_header("Parameters"), + kwargs_numeric_only, + create_section_header("Returns"), + template_returns, + create_section_header("See Also"), + template_see_also, + create_section_header("Examples"), + dedent( + """ + >>> s = pd.Series([2, 3, np.nan, 10]) + >>> s.rolling(2).count() + 0 NaN + 1 2.0 + 2 1.0 + 3 1.0 + dtype: float64 + >>> s.rolling(3).count() + 0 NaN + 1 NaN + 2 2.0 + 3 2.0 + dtype: float64 + >>> s.rolling(4).count() + 0 NaN + 1 NaN + 2 NaN + 3 3.0 + dtype: float64 + """ + ).replace("\n", "", 1), + window_method="rolling", + aggregation_description="count of non NaN observations", + agg_method="count", + ) + def count(self, numeric_only: bool = False): + return super().count(numeric_only) + + @doc( + template_header, + create_section_header("Parameters"), + window_apply_parameters, + create_section_header("Returns"), + template_returns, + create_section_header("See Also"), + template_see_also, + create_section_header("Examples"), + dedent( + """\ + >>> ser = pd.Series([1, 6, 5, 4]) + >>> ser.rolling(2).apply(lambda s: s.sum() - s.min()) + 0 NaN + 1 6.0 + 2 6.0 + 3 5.0 + dtype: float64 + """ + ), + window_method="rolling", + aggregation_description="custom aggregation function", + agg_method="apply", + ) + def apply( + self, + func: Callable[..., Any], + raw: bool = False, + engine: Literal["cython", "numba"] | None = None, + engine_kwargs: dict[str, bool] | None = None, + args: tuple[Any, ...] | None = None, + kwargs: dict[str, Any] | None = None, + ): + return super().apply( + func, + raw=raw, + engine=engine, + engine_kwargs=engine_kwargs, + args=args, + kwargs=kwargs, + ) + + @doc( + template_header, + create_section_header("Parameters"), + kwargs_numeric_only, + window_agg_numba_parameters(), + create_section_header("Returns"), + template_returns, + create_section_header("See Also"), + template_see_also, + create_section_header("Notes"), + numba_notes, + create_section_header("Examples"), + dedent( + """ + >>> s = pd.Series([1, 2, 3, 4, 5]) + >>> s + 0 1 + 1 2 + 2 3 + 3 4 + 4 5 + dtype: int64 + + >>> s.rolling(3).sum() + 0 NaN + 1 NaN + 2 6.0 + 3 9.0 + 4 12.0 + dtype: float64 + + >>> s.rolling(3, center=True).sum() + 0 NaN + 1 6.0 + 2 9.0 + 3 12.0 + 4 NaN + dtype: float64 + + For DataFrame, each sum is computed column-wise. + + >>> df = pd.DataFrame({{"A": s, "B": s ** 2}}) + >>> df + A B + 0 1 1 + 1 2 4 + 2 3 9 + 3 4 16 + 4 5 25 + + >>> df.rolling(3).sum() + A B + 0 NaN NaN + 1 NaN NaN + 2 6.0 14.0 + 3 9.0 29.0 + 4 12.0 50.0 + """ + ).replace("\n", "", 1), + window_method="rolling", + aggregation_description="sum", + agg_method="sum", + ) + def sum( + self, + numeric_only: bool = False, + engine: Literal["cython", "numba"] | None = None, + engine_kwargs: dict[str, bool] | None = None, + ): + return super().sum( + numeric_only=numeric_only, + engine=engine, + engine_kwargs=engine_kwargs, + ) + + @doc( + template_header, + create_section_header("Parameters"), + kwargs_numeric_only, + window_agg_numba_parameters(), + create_section_header("Returns"), + template_returns, + create_section_header("See Also"), + template_see_also, + create_section_header("Notes"), + numba_notes, + create_section_header("Examples"), + dedent( + """\ + >>> ser = pd.Series([1, 2, 3, 4]) + >>> ser.rolling(2).max() + 0 NaN + 1 2.0 + 2 3.0 + 3 4.0 + dtype: float64 + """ + ), + window_method="rolling", + aggregation_description="maximum", + agg_method="max", + ) + def max( + self, + numeric_only: bool = False, + *args, + engine: Literal["cython", "numba"] | None = None, + engine_kwargs: dict[str, bool] | None = None, + **kwargs, + ): + return super().max( + numeric_only=numeric_only, + engine=engine, + engine_kwargs=engine_kwargs, + ) + + @doc( + template_header, + create_section_header("Parameters"), + kwargs_numeric_only, + window_agg_numba_parameters(), + create_section_header("Returns"), + template_returns, + create_section_header("See Also"), + template_see_also, + create_section_header("Notes"), + numba_notes, + create_section_header("Examples"), + dedent( + """ + Performing a rolling minimum with a window size of 3. + + >>> s = pd.Series([4, 3, 5, 2, 6]) + >>> s.rolling(3).min() + 0 NaN + 1 NaN + 2 3.0 + 3 2.0 + 4 2.0 + dtype: float64 + """ + ).replace("\n", "", 1), + window_method="rolling", + aggregation_description="minimum", + agg_method="min", + ) + def min( + self, + numeric_only: bool = False, + engine: Literal["cython", "numba"] | None = None, + engine_kwargs: dict[str, bool] | None = None, + ): + return super().min( + numeric_only=numeric_only, + engine=engine, + engine_kwargs=engine_kwargs, + ) + + @doc( + template_header, + create_section_header("Parameters"), + kwargs_numeric_only, + window_agg_numba_parameters(), + create_section_header("Returns"), + template_returns, + create_section_header("See Also"), + template_see_also, + create_section_header("Notes"), + numba_notes, + create_section_header("Examples"), + dedent( + """ + The below examples will show rolling mean calculations with window sizes of + two and three, respectively. + + >>> s = pd.Series([1, 2, 3, 4]) + >>> s.rolling(2).mean() + 0 NaN + 1 1.5 + 2 2.5 + 3 3.5 + dtype: float64 + + >>> s.rolling(3).mean() + 0 NaN + 1 NaN + 2 2.0 + 3 3.0 + dtype: float64 + """ + ).replace("\n", "", 1), + window_method="rolling", + aggregation_description="mean", + agg_method="mean", + ) + def mean( + self, + numeric_only: bool = False, + engine: Literal["cython", "numba"] | None = None, + engine_kwargs: dict[str, bool] | None = None, + ): + return super().mean( + numeric_only=numeric_only, + engine=engine, + engine_kwargs=engine_kwargs, + ) + + @doc( + template_header, + create_section_header("Parameters"), + kwargs_numeric_only, + window_agg_numba_parameters(), + create_section_header("Returns"), + template_returns, + create_section_header("See Also"), + template_see_also, + create_section_header("Notes"), + numba_notes, + create_section_header("Examples"), + dedent( + """ + Compute the rolling median of a series with a window size of 3. + + >>> s = pd.Series([0, 1, 2, 3, 4]) + >>> s.rolling(3).median() + 0 NaN + 1 NaN + 2 1.0 + 3 2.0 + 4 3.0 + dtype: float64 + """ + ).replace("\n", "", 1), + window_method="rolling", + aggregation_description="median", + agg_method="median", + ) + def median( + self, + numeric_only: bool = False, + engine: Literal["cython", "numba"] | None = None, + engine_kwargs: dict[str, bool] | None = None, + ): + return super().median( + numeric_only=numeric_only, + engine=engine, + engine_kwargs=engine_kwargs, + ) + + @doc( + template_header, + create_section_header("Parameters"), + dedent( + """ + ddof : int, default 1 + Delta Degrees of Freedom. The divisor used in calculations + is ``N - ddof``, where ``N`` represents the number of elements. + """ + ).replace("\n", "", 1), + kwargs_numeric_only, + window_agg_numba_parameters("1.4"), + create_section_header("Returns"), + template_returns, + create_section_header("See Also"), + "numpy.std : Equivalent method for NumPy array.\n", + template_see_also, + create_section_header("Notes"), + dedent( + """ + The default ``ddof`` of 1 used in :meth:`Series.std` is different + than the default ``ddof`` of 0 in :func:`numpy.std`. + + A minimum of one period is required for the rolling calculation.\n + """ + ).replace("\n", "", 1), + create_section_header("Examples"), + dedent( + """ + >>> s = pd.Series([5, 5, 6, 7, 5, 5, 5]) + >>> s.rolling(3).std() + 0 NaN + 1 NaN + 2 0.577350 + 3 1.000000 + 4 1.000000 + 5 1.154701 + 6 0.000000 + dtype: float64 + """ + ).replace("\n", "", 1), + window_method="rolling", + aggregation_description="standard deviation", + agg_method="std", + ) + def std( + self, + ddof: int = 1, + numeric_only: bool = False, + engine: Literal["cython", "numba"] | None = None, + engine_kwargs: dict[str, bool] | None = None, + ): + return super().std( + ddof=ddof, + numeric_only=numeric_only, + engine=engine, + engine_kwargs=engine_kwargs, + ) + + @doc( + template_header, + create_section_header("Parameters"), + dedent( + """ + ddof : int, default 1 + Delta Degrees of Freedom. The divisor used in calculations + is ``N - ddof``, where ``N`` represents the number of elements. + """ + ).replace("\n", "", 1), + kwargs_numeric_only, + window_agg_numba_parameters("1.4"), + create_section_header("Returns"), + template_returns, + create_section_header("See Also"), + "numpy.var : Equivalent method for NumPy array.\n", + template_see_also, + create_section_header("Notes"), + dedent( + """ + The default ``ddof`` of 1 used in :meth:`Series.var` is different + than the default ``ddof`` of 0 in :func:`numpy.var`. + + A minimum of one period is required for the rolling calculation.\n + """ + ).replace("\n", "", 1), + create_section_header("Examples"), + dedent( + """ + >>> s = pd.Series([5, 5, 6, 7, 5, 5, 5]) + >>> s.rolling(3).var() + 0 NaN + 1 NaN + 2 0.333333 + 3 1.000000 + 4 1.000000 + 5 1.333333 + 6 0.000000 + dtype: float64 + """ + ).replace("\n", "", 1), + window_method="rolling", + aggregation_description="variance", + agg_method="var", + ) + def var( + self, + ddof: int = 1, + numeric_only: bool = False, + engine: Literal["cython", "numba"] | None = None, + engine_kwargs: dict[str, bool] | None = None, + ): + return super().var( + ddof=ddof, + numeric_only=numeric_only, + engine=engine, + engine_kwargs=engine_kwargs, + ) + + @doc( + template_header, + create_section_header("Parameters"), + kwargs_numeric_only, + create_section_header("Returns"), + template_returns, + create_section_header("See Also"), + "scipy.stats.skew : Third moment of a probability density.\n", + template_see_also, + create_section_header("Notes"), + dedent( + """ + A minimum of three periods is required for the rolling calculation.\n + """ + ), + create_section_header("Examples"), + dedent( + """\ + >>> ser = pd.Series([1, 5, 2, 7, 15, 6]) + >>> ser.rolling(3).skew().round(6) + 0 NaN + 1 NaN + 2 1.293343 + 3 -0.585583 + 4 0.670284 + 5 1.652317 + dtype: float64 + """ + ), + window_method="rolling", + aggregation_description="unbiased skewness", + agg_method="skew", + ) + def skew(self, numeric_only: bool = False): + return super().skew(numeric_only=numeric_only) + + @doc( + template_header, + create_section_header("Parameters"), + dedent( + """ + ddof : int, default 1 + Delta Degrees of Freedom. The divisor used in calculations + is ``N - ddof``, where ``N`` represents the number of elements. + """ + ).replace("\n", "", 1), + kwargs_numeric_only, + create_section_header("Returns"), + template_returns, + create_section_header("See Also"), + template_see_also, + create_section_header("Notes"), + "A minimum of one period is required for the calculation.\n\n", + create_section_header("Examples"), + dedent( + """ + >>> s = pd.Series([0, 1, 2, 3]) + >>> s.rolling(2, min_periods=1).sem() + 0 NaN + 1 0.707107 + 2 0.707107 + 3 0.707107 + dtype: float64 + """ + ).replace("\n", "", 1), + window_method="rolling", + aggregation_description="standard error of mean", + agg_method="sem", + ) + def sem(self, ddof: int = 1, numeric_only: bool = False): + # Raise here so error message says sem instead of std + self._validate_numeric_only("sem", numeric_only) + return self.std(numeric_only=numeric_only) / ( + self.count(numeric_only) - ddof + ).pow(0.5) + + @doc( + template_header, + create_section_header("Parameters"), + kwargs_numeric_only, + create_section_header("Returns"), + template_returns, + create_section_header("See Also"), + "scipy.stats.kurtosis : Reference SciPy method.\n", + template_see_also, + create_section_header("Notes"), + "A minimum of four periods is required for the calculation.\n\n", + create_section_header("Examples"), + dedent( + """ + The example below will show a rolling calculation with a window size of + four matching the equivalent function call using `scipy.stats`. + + >>> arr = [1, 2, 3, 4, 999] + >>> import scipy.stats + >>> print(f"{{scipy.stats.kurtosis(arr[:-1], bias=False):.6f}}") + -1.200000 + >>> print(f"{{scipy.stats.kurtosis(arr[1:], bias=False):.6f}}") + 3.999946 + >>> s = pd.Series(arr) + >>> s.rolling(4).kurt() + 0 NaN + 1 NaN + 2 NaN + 3 -1.200000 + 4 3.999946 + dtype: float64 + """ + ).replace("\n", "", 1), + window_method="rolling", + aggregation_description="Fisher's definition of kurtosis without bias", + agg_method="kurt", + ) + def kurt(self, numeric_only: bool = False): + return super().kurt(numeric_only=numeric_only) + + @doc( + template_header, + create_section_header("Parameters"), + dedent( + """ + quantile : float + Quantile to compute. 0 <= quantile <= 1. + + .. deprecated:: 2.1.0 + This will be renamed to 'q' in a future version. + interpolation : {{'linear', 'lower', 'higher', 'midpoint', 'nearest'}} + This optional parameter specifies the interpolation method to use, + when the desired quantile lies between two data points `i` and `j`: + + * linear: `i + (j - i) * fraction`, where `fraction` is the + fractional part of the index surrounded by `i` and `j`. + * lower: `i`. + * higher: `j`. + * nearest: `i` or `j` whichever is nearest. + * midpoint: (`i` + `j`) / 2. + """ + ).replace("\n", "", 1), + kwargs_numeric_only, + create_section_header("Returns"), + template_returns, + create_section_header("See Also"), + template_see_also, + create_section_header("Examples"), + dedent( + """ + >>> s = pd.Series([1, 2, 3, 4]) + >>> s.rolling(2).quantile(.4, interpolation='lower') + 0 NaN + 1 1.0 + 2 2.0 + 3 3.0 + dtype: float64 + + >>> s.rolling(2).quantile(.4, interpolation='midpoint') + 0 NaN + 1 1.5 + 2 2.5 + 3 3.5 + dtype: float64 + """ + ).replace("\n", "", 1), + window_method="rolling", + aggregation_description="quantile", + agg_method="quantile", + ) + @deprecate_kwarg(old_arg_name="quantile", new_arg_name="q") + def quantile( + self, + q: float, + interpolation: QuantileInterpolation = "linear", + numeric_only: bool = False, + ): + return super().quantile( + q=q, + interpolation=interpolation, + numeric_only=numeric_only, + ) + + @doc( + template_header, + ".. versionadded:: 1.4.0 \n\n", + create_section_header("Parameters"), + dedent( + """ + method : {{'average', 'min', 'max'}}, default 'average' + How to rank the group of records that have the same value (i.e. ties): + + * average: average rank of the group + * min: lowest rank in the group + * max: highest rank in the group + + ascending : bool, default True + Whether or not the elements should be ranked in ascending order. + pct : bool, default False + Whether or not to display the returned rankings in percentile + form. + """ + ).replace("\n", "", 1), + kwargs_numeric_only, + create_section_header("Returns"), + template_returns, + create_section_header("See Also"), + template_see_also, + create_section_header("Examples"), + dedent( + """ + >>> s = pd.Series([1, 4, 2, 3, 5, 3]) + >>> s.rolling(3).rank() + 0 NaN + 1 NaN + 2 2.0 + 3 2.0 + 4 3.0 + 5 1.5 + dtype: float64 + + >>> s.rolling(3).rank(method="max") + 0 NaN + 1 NaN + 2 2.0 + 3 2.0 + 4 3.0 + 5 2.0 + dtype: float64 + + >>> s.rolling(3).rank(method="min") + 0 NaN + 1 NaN + 2 2.0 + 3 2.0 + 4 3.0 + 5 1.0 + dtype: float64 + """ + ).replace("\n", "", 1), + window_method="rolling", + aggregation_description="rank", + agg_method="rank", + ) + def rank( + self, + method: WindowingRankType = "average", + ascending: bool = True, + pct: bool = False, + numeric_only: bool = False, + ): + return super().rank( + method=method, + ascending=ascending, + pct=pct, + numeric_only=numeric_only, + ) + + @doc( + template_header, + create_section_header("Parameters"), + dedent( + """ + other : Series or DataFrame, optional + If not supplied then will default to self and produce pairwise + output. + pairwise : bool, default None + If False then only matching columns between self and other will be + used and the output will be a DataFrame. + If True then all pairwise combinations will be calculated and the + output will be a MultiIndexed DataFrame in the case of DataFrame + inputs. In the case of missing elements, only complete pairwise + observations will be used. + ddof : int, default 1 + Delta Degrees of Freedom. The divisor used in calculations + is ``N - ddof``, where ``N`` represents the number of elements. + """ + ).replace("\n", "", 1), + kwargs_numeric_only, + create_section_header("Returns"), + template_returns, + create_section_header("See Also"), + template_see_also, + create_section_header("Examples"), + dedent( + """\ + >>> ser1 = pd.Series([1, 2, 3, 4]) + >>> ser2 = pd.Series([1, 4, 5, 8]) + >>> ser1.rolling(2).cov(ser2) + 0 NaN + 1 1.5 + 2 0.5 + 3 1.5 + dtype: float64 + """ + ), + window_method="rolling", + aggregation_description="sample covariance", + agg_method="cov", + ) + def cov( + self, + other: DataFrame | Series | None = None, + pairwise: bool | None = None, + ddof: int = 1, + numeric_only: bool = False, + ): + return super().cov( + other=other, + pairwise=pairwise, + ddof=ddof, + numeric_only=numeric_only, + ) + + @doc( + template_header, + create_section_header("Parameters"), + dedent( + """ + other : Series or DataFrame, optional + If not supplied then will default to self and produce pairwise + output. + pairwise : bool, default None + If False then only matching columns between self and other will be + used and the output will be a DataFrame. + If True then all pairwise combinations will be calculated and the + output will be a MultiIndexed DataFrame in the case of DataFrame + inputs. In the case of missing elements, only complete pairwise + observations will be used. + ddof : int, default 1 + Delta Degrees of Freedom. The divisor used in calculations + is ``N - ddof``, where ``N`` represents the number of elements. + """ + ).replace("\n", "", 1), + kwargs_numeric_only, + create_section_header("Returns"), + template_returns, + create_section_header("See Also"), + dedent( + """ + cov : Similar method to calculate covariance. + numpy.corrcoef : NumPy Pearson's correlation calculation. + """ + ).replace("\n", "", 1), + template_see_also, + create_section_header("Notes"), + dedent( + """ + This function uses Pearson's definition of correlation + (https://en.wikipedia.org/wiki/Pearson_correlation_coefficient). + + When `other` is not specified, the output will be self correlation (e.g. + all 1's), except for :class:`~pandas.DataFrame` inputs with `pairwise` + set to `True`. + + Function will return ``NaN`` for correlations of equal valued sequences; + this is the result of a 0/0 division error. + + When `pairwise` is set to `False`, only matching columns between `self` and + `other` will be used. + + When `pairwise` is set to `True`, the output will be a MultiIndex DataFrame + with the original index on the first level, and the `other` DataFrame + columns on the second level. + + In the case of missing elements, only complete pairwise observations + will be used.\n + """ + ).replace("\n", "", 1), + create_section_header("Examples"), + dedent( + """ + The below example shows a rolling calculation with a window size of + four matching the equivalent function call using :meth:`numpy.corrcoef`. + + >>> v1 = [3, 3, 3, 5, 8] + >>> v2 = [3, 4, 4, 4, 8] + >>> np.corrcoef(v1[:-1], v2[:-1]) + array([[1. , 0.33333333], + [0.33333333, 1. ]]) + >>> np.corrcoef(v1[1:], v2[1:]) + array([[1. , 0.9169493], + [0.9169493, 1. ]]) + >>> s1 = pd.Series(v1) + >>> s2 = pd.Series(v2) + >>> s1.rolling(4).corr(s2) + 0 NaN + 1 NaN + 2 NaN + 3 0.333333 + 4 0.916949 + dtype: float64 + + The below example shows a similar rolling calculation on a + DataFrame using the pairwise option. + + >>> matrix = np.array([[51., 35.], + ... [49., 30.], + ... [47., 32.], + ... [46., 31.], + ... [50., 36.]]) + >>> np.corrcoef(matrix[:-1, 0], matrix[:-1, 1]) + array([[1. , 0.6263001], + [0.6263001, 1. ]]) + >>> np.corrcoef(matrix[1:, 0], matrix[1:, 1]) + array([[1. , 0.55536811], + [0.55536811, 1. ]]) + >>> df = pd.DataFrame(matrix, columns=['X', 'Y']) + >>> df + X Y + 0 51.0 35.0 + 1 49.0 30.0 + 2 47.0 32.0 + 3 46.0 31.0 + 4 50.0 36.0 + >>> df.rolling(4).corr(pairwise=True) + X Y + 0 X NaN NaN + Y NaN NaN + 1 X NaN NaN + Y NaN NaN + 2 X NaN NaN + Y NaN NaN + 3 X 1.000000 0.626300 + Y 0.626300 1.000000 + 4 X 1.000000 0.555368 + Y 0.555368 1.000000 + """ + ).replace("\n", "", 1), + window_method="rolling", + aggregation_description="correlation", + agg_method="corr", + ) + def corr( + self, + other: DataFrame | Series | None = None, + pairwise: bool | None = None, + ddof: int = 1, + numeric_only: bool = False, + ): + return super().corr( + other=other, + pairwise=pairwise, + ddof=ddof, + numeric_only=numeric_only, + ) + + +Rolling.__doc__ = Window.__doc__ + + +class RollingGroupby(BaseWindowGroupby, Rolling): + """ + Provide a rolling groupby implementation. + """ + + _attributes = Rolling._attributes + BaseWindowGroupby._attributes + + def _get_window_indexer(self) -> GroupbyIndexer: + """ + Return an indexer class that will compute the window start and end bounds + + Returns + ------- + GroupbyIndexer + """ + rolling_indexer: type[BaseIndexer] + indexer_kwargs: dict[str, Any] | None = None + index_array = self._index_array + if isinstance(self.window, BaseIndexer): + rolling_indexer = type(self.window) + indexer_kwargs = self.window.__dict__.copy() + assert isinstance(indexer_kwargs, dict) # for mypy + # We'll be using the index of each group later + indexer_kwargs.pop("index_array", None) + window = self.window + elif self._win_freq_i8 is not None: + rolling_indexer = VariableWindowIndexer + # error: Incompatible types in assignment (expression has type + # "int", variable has type "BaseIndexer") + window = self._win_freq_i8 # type: ignore[assignment] + else: + rolling_indexer = FixedWindowIndexer + window = self.window + window_indexer = GroupbyIndexer( + index_array=index_array, + window_size=window, + groupby_indices=self._grouper.indices, + window_indexer=rolling_indexer, + indexer_kwargs=indexer_kwargs, + ) + return window_indexer + + def _validate_datetimelike_monotonic(self): + """ + Validate that each group in self._on is monotonic + """ + # GH 46061 + if self._on.hasnans: + self._raise_monotonic_error("values must not have NaT") + for group_indices in self._grouper.indices.values(): + group_on = self._on.take(group_indices) + if not ( + group_on.is_monotonic_increasing or group_on.is_monotonic_decreasing + ): + on = "index" if self.on is None else self.on + raise ValueError( + f"Each group within {on} must be monotonic. " + f"Sort the values in {on} first." + ) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/io/formats/__init__.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/io/formats/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..5e56b1bc7ba4377cc5de9d68a1424524aef21cb5 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/io/formats/__init__.py @@ -0,0 +1,9 @@ +# ruff: noqa: TCH004 +from typing import TYPE_CHECKING + +if TYPE_CHECKING: + # import modules that have public classes/functions + from pandas.io.formats import style + + # and mark only those modules as public + __all__ = ["style"] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/io/formats/_color_data.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/io/formats/_color_data.py new file mode 100644 index 0000000000000000000000000000000000000000..2e7cb7f29646eb11c0ec83d8a909a8cfd7953182 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/io/formats/_color_data.py @@ -0,0 +1,157 @@ +# GH37967: Enable the use of CSS named colors, as defined in +# matplotlib.colors.CSS4_COLORS, when exporting to Excel. +# This data has been copied here, instead of being imported from matplotlib, +# not to have ``to_excel`` methods require matplotlib. +# source: matplotlib._color_data (3.3.3) +from __future__ import annotations + +CSS4_COLORS = { + "aliceblue": "F0F8FF", + "antiquewhite": "FAEBD7", + "aqua": "00FFFF", + "aquamarine": "7FFFD4", + "azure": "F0FFFF", + "beige": "F5F5DC", + "bisque": "FFE4C4", + "black": "000000", + "blanchedalmond": "FFEBCD", + "blue": "0000FF", + "blueviolet": "8A2BE2", + "brown": "A52A2A", + "burlywood": "DEB887", + "cadetblue": "5F9EA0", + "chartreuse": "7FFF00", + "chocolate": "D2691E", + "coral": "FF7F50", + "cornflowerblue": "6495ED", + "cornsilk": "FFF8DC", + "crimson": "DC143C", + "cyan": "00FFFF", + "darkblue": "00008B", + "darkcyan": "008B8B", + "darkgoldenrod": "B8860B", + "darkgray": "A9A9A9", + "darkgreen": "006400", + "darkgrey": "A9A9A9", + "darkkhaki": "BDB76B", + "darkmagenta": "8B008B", + "darkolivegreen": "556B2F", + "darkorange": "FF8C00", + "darkorchid": "9932CC", + "darkred": "8B0000", + "darksalmon": "E9967A", + "darkseagreen": "8FBC8F", + "darkslateblue": "483D8B", + "darkslategray": "2F4F4F", + "darkslategrey": "2F4F4F", + "darkturquoise": "00CED1", + "darkviolet": "9400D3", + "deeppink": "FF1493", + "deepskyblue": "00BFFF", + "dimgray": "696969", + "dimgrey": "696969", + "dodgerblue": "1E90FF", + "firebrick": "B22222", + "floralwhite": "FFFAF0", + "forestgreen": "228B22", + "fuchsia": "FF00FF", + "gainsboro": "DCDCDC", + "ghostwhite": "F8F8FF", + "gold": "FFD700", + "goldenrod": "DAA520", + "gray": "808080", + "green": "008000", + "greenyellow": "ADFF2F", + "grey": "808080", + "honeydew": "F0FFF0", + "hotpink": "FF69B4", + "indianred": "CD5C5C", + "indigo": "4B0082", + "ivory": "FFFFF0", + "khaki": "F0E68C", + "lavender": "E6E6FA", + "lavenderblush": "FFF0F5", + "lawngreen": "7CFC00", + "lemonchiffon": "FFFACD", + "lightblue": "ADD8E6", + "lightcoral": "F08080", + "lightcyan": "E0FFFF", + "lightgoldenrodyellow": "FAFAD2", + "lightgray": "D3D3D3", + "lightgreen": "90EE90", + "lightgrey": "D3D3D3", + "lightpink": "FFB6C1", + "lightsalmon": "FFA07A", + "lightseagreen": "20B2AA", + "lightskyblue": "87CEFA", + "lightslategray": "778899", + "lightslategrey": "778899", + "lightsteelblue": "B0C4DE", + "lightyellow": "FFFFE0", + "lime": "00FF00", + "limegreen": "32CD32", + "linen": "FAF0E6", + "magenta": "FF00FF", + "maroon": "800000", + "mediumaquamarine": "66CDAA", + "mediumblue": "0000CD", + "mediumorchid": "BA55D3", + "mediumpurple": "9370DB", + "mediumseagreen": "3CB371", + "mediumslateblue": "7B68EE", + "mediumspringgreen": "00FA9A", + "mediumturquoise": "48D1CC", + "mediumvioletred": "C71585", + "midnightblue": "191970", + "mintcream": "F5FFFA", + "mistyrose": "FFE4E1", + "moccasin": "FFE4B5", + "navajowhite": "FFDEAD", + "navy": "000080", + "oldlace": "FDF5E6", + "olive": "808000", + "olivedrab": "6B8E23", + "orange": "FFA500", + "orangered": "FF4500", + "orchid": "DA70D6", + "palegoldenrod": "EEE8AA", + "palegreen": "98FB98", + "paleturquoise": "AFEEEE", + "palevioletred": "DB7093", + "papayawhip": "FFEFD5", + "peachpuff": "FFDAB9", + "peru": "CD853F", + "pink": "FFC0CB", + "plum": "DDA0DD", + "powderblue": "B0E0E6", + "purple": "800080", + "rebeccapurple": "663399", + "red": "FF0000", + "rosybrown": "BC8F8F", + "royalblue": "4169E1", + "saddlebrown": "8B4513", + "salmon": "FA8072", + "sandybrown": "F4A460", + "seagreen": "2E8B57", + "seashell": "FFF5EE", + "sienna": "A0522D", + "silver": "C0C0C0", + "skyblue": "87CEEB", + "slateblue": "6A5ACD", + "slategray": "708090", + "slategrey": "708090", + "snow": "FFFAFA", + "springgreen": "00FF7F", + "steelblue": "4682B4", + "tan": "D2B48C", + "teal": "008080", + "thistle": "D8BFD8", + "tomato": "FF6347", + "turquoise": "40E0D0", + "violet": "EE82EE", + "wheat": "F5DEB3", + "white": "FFFFFF", + "whitesmoke": "F5F5F5", + "yellow": "FFFF00", + "yellowgreen": "9ACD32", +} diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/io/formats/console.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/io/formats/console.py new file mode 100644 index 0000000000000000000000000000000000000000..2a6cbe07629031687c249f70b51bdfbe2dd84041 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/io/formats/console.py @@ -0,0 +1,94 @@ +""" +Internal module for console introspection +""" +from __future__ import annotations + +from shutil import get_terminal_size + + +def get_console_size() -> tuple[int | None, int | None]: + """ + Return console size as tuple = (width, height). + + Returns (None,None) in non-interactive session. + """ + from pandas import get_option + + display_width = get_option("display.width") + display_height = get_option("display.max_rows") + + # Consider + # interactive shell terminal, can detect term size + # interactive non-shell terminal (ipnb/ipqtconsole), cannot detect term + # size non-interactive script, should disregard term size + + # in addition + # width,height have default values, but setting to 'None' signals + # should use Auto-Detection, But only in interactive shell-terminal. + # Simple. yeah. + + if in_interactive_session(): + if in_ipython_frontend(): + # sane defaults for interactive non-shell terminal + # match default for width,height in config_init + from pandas._config.config import get_default_val + + terminal_width = get_default_val("display.width") + terminal_height = get_default_val("display.max_rows") + else: + # pure terminal + terminal_width, terminal_height = get_terminal_size() + else: + terminal_width, terminal_height = None, None + + # Note if the User sets width/Height to None (auto-detection) + # and we're in a script (non-inter), this will return (None,None) + # caller needs to deal. + return display_width or terminal_width, display_height or terminal_height + + +# ---------------------------------------------------------------------- +# Detect our environment + + +def in_interactive_session() -> bool: + """ + Check if we're running in an interactive shell. + + Returns + ------- + bool + True if running under python/ipython interactive shell. + """ + from pandas import get_option + + def check_main(): + try: + import __main__ as main + except ModuleNotFoundError: + return get_option("mode.sim_interactive") + return not hasattr(main, "__file__") or get_option("mode.sim_interactive") + + try: + # error: Name '__IPYTHON__' is not defined + return __IPYTHON__ or check_main() # type: ignore[name-defined] + except NameError: + return check_main() + + +def in_ipython_frontend() -> bool: + """ + Check if we're inside an IPython zmq frontend. + + Returns + ------- + bool + """ + try: + # error: Name 'get_ipython' is not defined + ip = get_ipython() # type: ignore[name-defined] + return "zmq" in str(type(ip)).lower() + except NameError: + pass + + return False diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/io/formats/css.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/io/formats/css.py new file mode 100644 index 0000000000000000000000000000000000000000..ccce60c00a9e02bf3bb7f21c5ec799b7123e8eed --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/io/formats/css.py @@ -0,0 +1,421 @@ +""" +Utilities for interpreting CSS from Stylers for formatting non-HTML outputs. +""" +from __future__ import annotations + +import re +from typing import ( + TYPE_CHECKING, + Callable, +) +import warnings + +from pandas.errors import CSSWarning +from pandas.util._exceptions import find_stack_level + +if TYPE_CHECKING: + from collections.abc import ( + Generator, + Iterable, + Iterator, + ) + + +def _side_expander(prop_fmt: str) -> Callable: + """ + Wrapper to expand shorthand property into top, right, bottom, left properties + + Parameters + ---------- + side : str + The border side to expand into properties + + Returns + ------- + function: Return to call when a 'border(-{side}): {value}' string is encountered + """ + + def expand(self, prop, value: str) -> Generator[tuple[str, str], None, None]: + """ + Expand shorthand property into side-specific property (top, right, bottom, left) + + Parameters + ---------- + prop (str): CSS property name + value (str): String token for property + + Yields + ------ + Tuple (str, str): Expanded property, value + """ + tokens = value.split() + try: + mapping = self.SIDE_SHORTHANDS[len(tokens)] + except KeyError: + warnings.warn( + f'Could not expand "{prop}: {value}"', + CSSWarning, + stacklevel=find_stack_level(), + ) + return + for key, idx in zip(self.SIDES, mapping): + yield prop_fmt.format(key), tokens[idx] + + return expand + + +def _border_expander(side: str = "") -> Callable: + """ + Wrapper to expand 'border' property into border color, style, and width properties + + Parameters + ---------- + side : str + The border side to expand into properties + + Returns + ------- + function: Return to call when a 'border(-{side}): {value}' string is encountered + """ + if side != "": + side = f"-{side}" + + def expand(self, prop, value: str) -> Generator[tuple[str, str], None, None]: + """ + Expand border into color, style, and width tuples + + Parameters + ---------- + prop : str + CSS property name passed to styler + value : str + Value passed to styler for property + + Yields + ------ + Tuple (str, str): Expanded property, value + """ + tokens = value.split() + if len(tokens) == 0 or len(tokens) > 3: + warnings.warn( + f'Too many tokens provided to "{prop}" (expected 1-3)', + CSSWarning, + stacklevel=find_stack_level(), + ) + + # TODO: Can we use current color as initial value to comply with CSS standards? + border_declarations = { + f"border{side}-color": "black", + f"border{side}-style": "none", + f"border{side}-width": "medium", + } + for token in tokens: + if token.lower() in self.BORDER_STYLES: + border_declarations[f"border{side}-style"] = token + elif any(ratio in token.lower() for ratio in self.BORDER_WIDTH_RATIOS): + border_declarations[f"border{side}-width"] = token + else: + border_declarations[f"border{side}-color"] = token + # TODO: Warn user if item entered more than once (e.g. "border: red green") + + # Per CSS, "border" will reset previous "border-*" definitions + yield from self.atomize(border_declarations.items()) + + return expand + + +class CSSResolver: + """ + A callable for parsing and resolving CSS to atomic properties. + """ + + UNIT_RATIOS = { + "pt": ("pt", 1), + "em": ("em", 1), + "rem": ("pt", 12), + "ex": ("em", 0.5), + # 'ch': + "px": ("pt", 0.75), + "pc": ("pt", 12), + "in": ("pt", 72), + "cm": ("in", 1 / 2.54), + "mm": ("in", 1 / 25.4), + "q": ("mm", 0.25), + "!!default": ("em", 0), + } + + FONT_SIZE_RATIOS = UNIT_RATIOS.copy() + FONT_SIZE_RATIOS.update( + { + "%": ("em", 0.01), + "xx-small": ("rem", 0.5), + "x-small": ("rem", 0.625), + "small": ("rem", 0.8), + "medium": ("rem", 1), + "large": ("rem", 1.125), + "x-large": ("rem", 1.5), + "xx-large": ("rem", 2), + "smaller": ("em", 1 / 1.2), + "larger": ("em", 1.2), + "!!default": ("em", 1), + } + ) + + MARGIN_RATIOS = UNIT_RATIOS.copy() + MARGIN_RATIOS.update({"none": ("pt", 0)}) + + BORDER_WIDTH_RATIOS = UNIT_RATIOS.copy() + BORDER_WIDTH_RATIOS.update( + { + "none": ("pt", 0), + "thick": ("px", 4), + "medium": ("px", 2), + "thin": ("px", 1), + # Default: medium only if solid + } + ) + + BORDER_STYLES = [ + "none", + "hidden", + "dotted", + "dashed", + "solid", + "double", + "groove", + "ridge", + "inset", + "outset", + "mediumdashdot", + "dashdotdot", + "hair", + "mediumdashdotdot", + "dashdot", + "slantdashdot", + "mediumdashed", + ] + + SIDE_SHORTHANDS = { + 1: [0, 0, 0, 0], + 2: [0, 1, 0, 1], + 3: [0, 1, 2, 1], + 4: [0, 1, 2, 3], + } + + SIDES = ("top", "right", "bottom", "left") + + CSS_EXPANSIONS = { + **{ + (f"border-{prop}" if prop else "border"): _border_expander(prop) + for prop in ["", "top", "right", "bottom", "left"] + }, + **{ + f"border-{prop}": _side_expander(f"border-{{:s}}-{prop}") + for prop in ["color", "style", "width"] + }, + "margin": _side_expander("margin-{:s}"), + "padding": _side_expander("padding-{:s}"), + } + + def __call__( + self, + declarations: str | Iterable[tuple[str, str]], + inherited: dict[str, str] | None = None, + ) -> dict[str, str]: + """ + The given declarations to atomic properties. + + Parameters + ---------- + declarations_str : str | Iterable[tuple[str, str]] + A CSS string or set of CSS declaration tuples + e.g. "font-weight: bold; background: blue" or + {("font-weight", "bold"), ("background", "blue")} + inherited : dict, optional + Atomic properties indicating the inherited style context in which + declarations_str is to be resolved. ``inherited`` should already + be resolved, i.e. valid output of this method. + + Returns + ------- + dict + Atomic CSS 2.2 properties. + + Examples + -------- + >>> resolve = CSSResolver() + >>> inherited = {'font-family': 'serif', 'font-weight': 'bold'} + >>> out = resolve(''' + ... border-color: BLUE RED; + ... font-size: 1em; + ... font-size: 2em; + ... font-weight: normal; + ... font-weight: inherit; + ... ''', inherited) + >>> sorted(out.items()) # doctest: +NORMALIZE_WHITESPACE + [('border-bottom-color', 'blue'), + ('border-left-color', 'red'), + ('border-right-color', 'red'), + ('border-top-color', 'blue'), + ('font-family', 'serif'), + ('font-size', '24pt'), + ('font-weight', 'bold')] + """ + if isinstance(declarations, str): + declarations = self.parse(declarations) + props = dict(self.atomize(declarations)) + if inherited is None: + inherited = {} + + props = self._update_initial(props, inherited) + props = self._update_font_size(props, inherited) + return self._update_other_units(props) + + def _update_initial( + self, + props: dict[str, str], + inherited: dict[str, str], + ) -> dict[str, str]: + # 1. resolve inherited, initial + for prop, val in inherited.items(): + if prop not in props: + props[prop] = val + + new_props = props.copy() + for prop, val in props.items(): + if val == "inherit": + val = inherited.get(prop, "initial") + + if val in ("initial", None): + # we do not define a complete initial stylesheet + del new_props[prop] + else: + new_props[prop] = val + return new_props + + def _update_font_size( + self, + props: dict[str, str], + inherited: dict[str, str], + ) -> dict[str, str]: + # 2. resolve relative font size + if props.get("font-size"): + props["font-size"] = self.size_to_pt( + props["font-size"], + self._get_font_size(inherited), + conversions=self.FONT_SIZE_RATIOS, + ) + return props + + def _get_font_size(self, props: dict[str, str]) -> float | None: + if props.get("font-size"): + font_size_string = props["font-size"] + return self._get_float_font_size_from_pt(font_size_string) + return None + + def _get_float_font_size_from_pt(self, font_size_string: str) -> float: + assert font_size_string.endswith("pt") + return float(font_size_string.rstrip("pt")) + + def _update_other_units(self, props: dict[str, str]) -> dict[str, str]: + font_size = self._get_font_size(props) + # 3. TODO: resolve other font-relative units + for side in self.SIDES: + prop = f"border-{side}-width" + if prop in props: + props[prop] = self.size_to_pt( + props[prop], + em_pt=font_size, + conversions=self.BORDER_WIDTH_RATIOS, + ) + + for prop in [f"margin-{side}", f"padding-{side}"]: + if prop in props: + # TODO: support % + props[prop] = self.size_to_pt( + props[prop], + em_pt=font_size, + conversions=self.MARGIN_RATIOS, + ) + return props + + def size_to_pt(self, in_val, em_pt=None, conversions=UNIT_RATIOS) -> str: + def _error(): + warnings.warn( + f"Unhandled size: {repr(in_val)}", + CSSWarning, + stacklevel=find_stack_level(), + ) + return self.size_to_pt("1!!default", conversions=conversions) + + match = re.match(r"^(\S*?)([a-zA-Z%!].*)", in_val) + if match is None: + return _error() + + val, unit = match.groups() + if val == "": + # hack for 'large' etc. + val = 1 + else: + try: + val = float(val) + except ValueError: + return _error() + + while unit != "pt": + if unit == "em": + if em_pt is None: + unit = "rem" + else: + val *= em_pt + unit = "pt" + continue + + try: + unit, mul = conversions[unit] + except KeyError: + return _error() + val *= mul + + val = round(val, 5) + if int(val) == val: + size_fmt = f"{int(val):d}pt" + else: + size_fmt = f"{val:f}pt" + return size_fmt + + def atomize(self, declarations: Iterable) -> Generator[tuple[str, str], None, None]: + for prop, value in declarations: + prop = prop.lower() + value = value.lower() + if prop in self.CSS_EXPANSIONS: + expand = self.CSS_EXPANSIONS[prop] + yield from expand(self, prop, value) + else: + yield prop, value + + def parse(self, declarations_str: str) -> Iterator[tuple[str, str]]: + """ + Generates (prop, value) pairs from declarations. + + In a future version may generate parsed tokens from tinycss/tinycss2 + + Parameters + ---------- + declarations_str : str + """ + for decl in declarations_str.split(";"): + if not decl.strip(): + continue + prop, sep, val = decl.partition(":") + prop = prop.strip().lower() + # TODO: don't lowercase case sensitive parts of values (strings) + val = val.strip().lower() + if sep: + yield prop, val + else: + warnings.warn( + f"Ill-formatted attribute: expected a colon in {repr(decl)}", + CSSWarning, + stacklevel=find_stack_level(), + ) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/io/formats/csvs.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/io/formats/csvs.py new file mode 100644 index 0000000000000000000000000000000000000000..50503e862ef433901f40715987c2105f6f16263a --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/io/formats/csvs.py @@ -0,0 +1,330 @@ +""" +Module for formatting output data into CSV files. +""" + +from __future__ import annotations + +from collections.abc import ( + Hashable, + Iterable, + Iterator, + Sequence, +) +import csv as csvlib +import os +from typing import ( + TYPE_CHECKING, + Any, + cast, +) + +import numpy as np + +from pandas._libs import writers as libwriters +from pandas._typing import SequenceNotStr +from pandas.util._decorators import cache_readonly + +from pandas.core.dtypes.generic import ( + ABCDatetimeIndex, + ABCIndex, + ABCMultiIndex, + ABCPeriodIndex, +) +from pandas.core.dtypes.missing import notna + +from pandas.core.indexes.api import Index + +from pandas.io.common import get_handle + +if TYPE_CHECKING: + from pandas._typing import ( + CompressionOptions, + FilePath, + FloatFormatType, + IndexLabel, + StorageOptions, + WriteBuffer, + npt, + ) + + from pandas.io.formats.format import DataFrameFormatter + + +_DEFAULT_CHUNKSIZE_CELLS = 100_000 + + +class CSVFormatter: + cols: npt.NDArray[np.object_] + + def __init__( + self, + formatter: DataFrameFormatter, + path_or_buf: FilePath | WriteBuffer[str] | WriteBuffer[bytes] = "", + sep: str = ",", + cols: Sequence[Hashable] | None = None, + index_label: IndexLabel | None = None, + mode: str = "w", + encoding: str | None = None, + errors: str = "strict", + compression: CompressionOptions = "infer", + quoting: int | None = None, + lineterminator: str | None = "\n", + chunksize: int | None = None, + quotechar: str | None = '"', + date_format: str | None = None, + doublequote: bool = True, + escapechar: str | None = None, + storage_options: StorageOptions | None = None, + ) -> None: + self.fmt = formatter + + self.obj = self.fmt.frame + + self.filepath_or_buffer = path_or_buf + self.encoding = encoding + self.compression: CompressionOptions = compression + self.mode = mode + self.storage_options = storage_options + + self.sep = sep + self.index_label = self._initialize_index_label(index_label) + self.errors = errors + self.quoting = quoting or csvlib.QUOTE_MINIMAL + self.quotechar = self._initialize_quotechar(quotechar) + self.doublequote = doublequote + self.escapechar = escapechar + self.lineterminator = lineterminator or os.linesep + self.date_format = date_format + self.cols = self._initialize_columns(cols) + self.chunksize = self._initialize_chunksize(chunksize) + + @property + def na_rep(self) -> str: + return self.fmt.na_rep + + @property + def float_format(self) -> FloatFormatType | None: + return self.fmt.float_format + + @property + def decimal(self) -> str: + return self.fmt.decimal + + @property + def header(self) -> bool | SequenceNotStr[str]: + return self.fmt.header + + @property + def index(self) -> bool: + return self.fmt.index + + def _initialize_index_label(self, index_label: IndexLabel | None) -> IndexLabel: + if index_label is not False: + if index_label is None: + return self._get_index_label_from_obj() + elif not isinstance(index_label, (list, tuple, np.ndarray, ABCIndex)): + # given a string for a DF with Index + return [index_label] + return index_label + + def _get_index_label_from_obj(self) -> Sequence[Hashable]: + if isinstance(self.obj.index, ABCMultiIndex): + return self._get_index_label_multiindex() + else: + return self._get_index_label_flat() + + def _get_index_label_multiindex(self) -> Sequence[Hashable]: + return [name or "" for name in self.obj.index.names] + + def _get_index_label_flat(self) -> Sequence[Hashable]: + index_label = self.obj.index.name + return [""] if index_label is None else [index_label] + + def _initialize_quotechar(self, quotechar: str | None) -> str | None: + if self.quoting != csvlib.QUOTE_NONE: + # prevents crash in _csv + return quotechar + return None + + @property + def has_mi_columns(self) -> bool: + return bool(isinstance(self.obj.columns, ABCMultiIndex)) + + def _initialize_columns( + self, cols: Iterable[Hashable] | None + ) -> npt.NDArray[np.object_]: + # validate mi options + if self.has_mi_columns: + if cols is not None: + msg = "cannot specify cols with a MultiIndex on the columns" + raise TypeError(msg) + + if cols is not None: + if isinstance(cols, ABCIndex): + cols = cols._get_values_for_csv(**self._number_format) + else: + cols = list(cols) + self.obj = self.obj.loc[:, cols] + + # update columns to include possible multiplicity of dupes + # and make sure cols is just a list of labels + new_cols = self.obj.columns + return new_cols._get_values_for_csv(**self._number_format) + + def _initialize_chunksize(self, chunksize: int | None) -> int: + if chunksize is None: + return (_DEFAULT_CHUNKSIZE_CELLS // (len(self.cols) or 1)) or 1 + return int(chunksize) + + @property + def _number_format(self) -> dict[str, Any]: + """Dictionary used for storing number formatting settings.""" + return { + "na_rep": self.na_rep, + "float_format": self.float_format, + "date_format": self.date_format, + "quoting": self.quoting, + "decimal": self.decimal, + } + + @cache_readonly + def data_index(self) -> Index: + data_index = self.obj.index + if ( + isinstance(data_index, (ABCDatetimeIndex, ABCPeriodIndex)) + and self.date_format is not None + ): + data_index = Index( + [x.strftime(self.date_format) if notna(x) else "" for x in data_index] + ) + elif isinstance(data_index, ABCMultiIndex): + data_index = data_index.remove_unused_levels() + return data_index + + @property + def nlevels(self) -> int: + if self.index: + return getattr(self.data_index, "nlevels", 1) + else: + return 0 + + @property + def _has_aliases(self) -> bool: + return isinstance(self.header, (tuple, list, np.ndarray, ABCIndex)) + + @property + def _need_to_save_header(self) -> bool: + return bool(self._has_aliases or self.header) + + @property + def write_cols(self) -> SequenceNotStr[Hashable]: + if self._has_aliases: + assert not isinstance(self.header, bool) + if len(self.header) != len(self.cols): + raise ValueError( + f"Writing {len(self.cols)} cols but got {len(self.header)} aliases" + ) + return self.header + else: + # self.cols is an ndarray derived from Index._get_values_for_csv, + # so its entries are strings, i.e. hashable + return cast(SequenceNotStr[Hashable], self.cols) + + @property + def encoded_labels(self) -> list[Hashable]: + encoded_labels: list[Hashable] = [] + + if self.index and self.index_label: + assert isinstance(self.index_label, Sequence) + encoded_labels = list(self.index_label) + + if not self.has_mi_columns or self._has_aliases: + encoded_labels += list(self.write_cols) + + return encoded_labels + + def save(self) -> None: + """ + Create the writer & save. + """ + # apply compression and byte/text conversion + with get_handle( + self.filepath_or_buffer, + self.mode, + encoding=self.encoding, + errors=self.errors, + compression=self.compression, + storage_options=self.storage_options, + ) as handles: + # Note: self.encoding is irrelevant here + self.writer = csvlib.writer( + handles.handle, + lineterminator=self.lineterminator, + delimiter=self.sep, + quoting=self.quoting, + doublequote=self.doublequote, + escapechar=self.escapechar, + quotechar=self.quotechar, + ) + + self._save() + + def _save(self) -> None: + if self._need_to_save_header: + self._save_header() + self._save_body() + + def _save_header(self) -> None: + if not self.has_mi_columns or self._has_aliases: + self.writer.writerow(self.encoded_labels) + else: + for row in self._generate_multiindex_header_rows(): + self.writer.writerow(row) + + def _generate_multiindex_header_rows(self) -> Iterator[list[Hashable]]: + columns = self.obj.columns + for i in range(columns.nlevels): + # we need at least 1 index column to write our col names + col_line = [] + if self.index: + # name is the first column + col_line.append(columns.names[i]) + + if isinstance(self.index_label, list) and len(self.index_label) > 1: + col_line.extend([""] * (len(self.index_label) - 1)) + + col_line.extend(columns._get_level_values(i)) + yield col_line + + # Write out the index line if it's not empty. + # Otherwise, we will print out an extraneous + # blank line between the mi and the data rows. + if self.encoded_labels and set(self.encoded_labels) != {""}: + yield self.encoded_labels + [""] * len(columns) + + def _save_body(self) -> None: + nrows = len(self.data_index) + chunks = (nrows // self.chunksize) + 1 + for i in range(chunks): + start_i = i * self.chunksize + end_i = min(start_i + self.chunksize, nrows) + if start_i >= end_i: + break + self._save_chunk(start_i, end_i) + + def _save_chunk(self, start_i: int, end_i: int) -> None: + # create the data for a chunk + slicer = slice(start_i, end_i) + df = self.obj.iloc[slicer] + + res = df._get_values_for_csv(**self._number_format) + data = list(res._iter_column_arrays()) + + ix = self.data_index[slicer]._get_values_for_csv(**self._number_format) + libwriters.write_csv_rows( + data, + ix, + self.nlevels, + self.cols, + self.writer, + ) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/io/formats/excel.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/io/formats/excel.py new file mode 100644 index 0000000000000000000000000000000000000000..5fd23cd7d918ad0efddb1088d79fd78f6079cca7 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/io/formats/excel.py @@ -0,0 +1,962 @@ +""" +Utilities for conversion to writer-agnostic Excel representation. +""" +from __future__ import annotations + +from collections.abc import ( + Hashable, + Iterable, + Mapping, + Sequence, +) +import functools +import itertools +import re +from typing import ( + TYPE_CHECKING, + Any, + Callable, + cast, +) +import warnings + +import numpy as np + +from pandas._libs.lib import is_list_like +from pandas.util._decorators import doc +from pandas.util._exceptions import find_stack_level + +from pandas.core.dtypes import missing +from pandas.core.dtypes.common import ( + is_float, + is_scalar, +) + +from pandas import ( + DataFrame, + Index, + MultiIndex, + PeriodIndex, +) +import pandas.core.common as com +from pandas.core.shared_docs import _shared_docs + +from pandas.io.formats._color_data import CSS4_COLORS +from pandas.io.formats.css import ( + CSSResolver, + CSSWarning, +) +from pandas.io.formats.format import get_level_lengths +from pandas.io.formats.printing import pprint_thing + +if TYPE_CHECKING: + from pandas._typing import ( + FilePath, + IndexLabel, + StorageOptions, + WriteExcelBuffer, + ) + + from pandas import ExcelWriter + + +class ExcelCell: + __fields__ = ("row", "col", "val", "style", "mergestart", "mergeend") + __slots__ = __fields__ + + def __init__( + self, + row: int, + col: int, + val, + style=None, + mergestart: int | None = None, + mergeend: int | None = None, + ) -> None: + self.row = row + self.col = col + self.val = val + self.style = style + self.mergestart = mergestart + self.mergeend = mergeend + + +class CssExcelCell(ExcelCell): + def __init__( + self, + row: int, + col: int, + val, + style: dict | None, + css_styles: dict[tuple[int, int], list[tuple[str, Any]]] | None, + css_row: int, + css_col: int, + css_converter: Callable | None, + **kwargs, + ) -> None: + if css_styles and css_converter: + # Use dict to get only one (case-insensitive) declaration per property + declaration_dict = { + prop.lower(): val for prop, val in css_styles[css_row, css_col] + } + # Convert to frozenset for order-invariant caching + unique_declarations = frozenset(declaration_dict.items()) + style = css_converter(unique_declarations) + + super().__init__(row=row, col=col, val=val, style=style, **kwargs) + + +class CSSToExcelConverter: + """ + A callable for converting CSS declarations to ExcelWriter styles + + Supports parts of CSS 2.2, with minimal CSS 3.0 support (e.g. text-shadow), + focusing on font styling, backgrounds, borders and alignment. + + Operates by first computing CSS styles in a fairly generic + way (see :meth:`compute_css`) then determining Excel style + properties from CSS properties (see :meth:`build_xlstyle`). + + Parameters + ---------- + inherited : str, optional + CSS declarations understood to be the containing scope for the + CSS processed by :meth:`__call__`. + """ + + NAMED_COLORS = CSS4_COLORS + + VERTICAL_MAP = { + "top": "top", + "text-top": "top", + "middle": "center", + "baseline": "bottom", + "bottom": "bottom", + "text-bottom": "bottom", + # OpenXML also has 'justify', 'distributed' + } + + BOLD_MAP = { + "bold": True, + "bolder": True, + "600": True, + "700": True, + "800": True, + "900": True, + "normal": False, + "lighter": False, + "100": False, + "200": False, + "300": False, + "400": False, + "500": False, + } + + ITALIC_MAP = { + "normal": False, + "italic": True, + "oblique": True, + } + + FAMILY_MAP = { + "serif": 1, # roman + "sans-serif": 2, # swiss + "cursive": 4, # script + "fantasy": 5, # decorative + } + + BORDER_STYLE_MAP = { + style.lower(): style + for style in [ + "dashed", + "mediumDashDot", + "dashDotDot", + "hair", + "dotted", + "mediumDashDotDot", + "double", + "dashDot", + "slantDashDot", + "mediumDashed", + ] + } + + # NB: Most of the methods here could be classmethods, as only __init__ + # and __call__ make use of instance attributes. We leave them as + # instancemethods so that users can easily experiment with extensions + # without monkey-patching. + inherited: dict[str, str] | None + + def __init__(self, inherited: str | None = None) -> None: + if inherited is not None: + self.inherited = self.compute_css(inherited) + else: + self.inherited = None + # We should avoid cache on the __call__ method. + # Otherwise once the method __call__ has been called + # garbage collection no longer deletes the instance. + self._call_cached = functools.cache(self._call_uncached) + + compute_css = CSSResolver() + + def __call__( + self, declarations: str | frozenset[tuple[str, str]] + ) -> dict[str, dict[str, str]]: + """ + Convert CSS declarations to ExcelWriter style. + + Parameters + ---------- + declarations : str | frozenset[tuple[str, str]] + CSS string or set of CSS declaration tuples. + e.g. "font-weight: bold; background: blue" or + {("font-weight", "bold"), ("background", "blue")} + + Returns + ------- + xlstyle : dict + A style as interpreted by ExcelWriter when found in + ExcelCell.style. + """ + return self._call_cached(declarations) + + def _call_uncached( + self, declarations: str | frozenset[tuple[str, str]] + ) -> dict[str, dict[str, str]]: + properties = self.compute_css(declarations, self.inherited) + return self.build_xlstyle(properties) + + def build_xlstyle(self, props: Mapping[str, str]) -> dict[str, dict[str, str]]: + out = { + "alignment": self.build_alignment(props), + "border": self.build_border(props), + "fill": self.build_fill(props), + "font": self.build_font(props), + "number_format": self.build_number_format(props), + } + + # TODO: handle cell width and height: needs support in pandas.io.excel + + def remove_none(d: dict[str, str | None]) -> None: + """Remove key where value is None, through nested dicts""" + for k, v in list(d.items()): + if v is None: + del d[k] + elif isinstance(v, dict): + remove_none(v) + if not v: + del d[k] + + remove_none(out) + return out + + def build_alignment(self, props: Mapping[str, str]) -> dict[str, bool | str | None]: + # TODO: text-indent, padding-left -> alignment.indent + return { + "horizontal": props.get("text-align"), + "vertical": self._get_vertical_alignment(props), + "wrap_text": self._get_is_wrap_text(props), + } + + def _get_vertical_alignment(self, props: Mapping[str, str]) -> str | None: + vertical_align = props.get("vertical-align") + if vertical_align: + return self.VERTICAL_MAP.get(vertical_align) + return None + + def _get_is_wrap_text(self, props: Mapping[str, str]) -> bool | None: + if props.get("white-space") is None: + return None + return bool(props["white-space"] not in ("nowrap", "pre", "pre-line")) + + def build_border( + self, props: Mapping[str, str] + ) -> dict[str, dict[str, str | None]]: + return { + side: { + "style": self._border_style( + props.get(f"border-{side}-style"), + props.get(f"border-{side}-width"), + self.color_to_excel(props.get(f"border-{side}-color")), + ), + "color": self.color_to_excel(props.get(f"border-{side}-color")), + } + for side in ["top", "right", "bottom", "left"] + } + + def _border_style(self, style: str | None, width: str | None, color: str | None): + # convert styles and widths to openxml, one of: + # 'dashDot' + # 'dashDotDot' + # 'dashed' + # 'dotted' + # 'double' + # 'hair' + # 'medium' + # 'mediumDashDot' + # 'mediumDashDotDot' + # 'mediumDashed' + # 'slantDashDot' + # 'thick' + # 'thin' + if width is None and style is None and color is None: + # Return None will remove "border" from style dictionary + return None + + if width is None and style is None: + # Return "none" will keep "border" in style dictionary + return "none" + + if style in ("none", "hidden"): + return "none" + + width_name = self._get_width_name(width) + if width_name is None: + return "none" + + if style in (None, "groove", "ridge", "inset", "outset", "solid"): + # not handled + return width_name + + if style == "double": + return "double" + if style == "dotted": + if width_name in ("hair", "thin"): + return "dotted" + return "mediumDashDotDot" + if style == "dashed": + if width_name in ("hair", "thin"): + return "dashed" + return "mediumDashed" + elif style in self.BORDER_STYLE_MAP: + # Excel-specific styles + return self.BORDER_STYLE_MAP[style] + else: + warnings.warn( + f"Unhandled border style format: {repr(style)}", + CSSWarning, + stacklevel=find_stack_level(), + ) + return "none" + + def _get_width_name(self, width_input: str | None) -> str | None: + width = self._width_to_float(width_input) + if width < 1e-5: + return None + elif width < 1.3: + return "thin" + elif width < 2.8: + return "medium" + return "thick" + + def _width_to_float(self, width: str | None) -> float: + if width is None: + width = "2pt" + return self._pt_to_float(width) + + def _pt_to_float(self, pt_string: str) -> float: + assert pt_string.endswith("pt") + return float(pt_string.rstrip("pt")) + + def build_fill(self, props: Mapping[str, str]): + # TODO: perhaps allow for special properties + # -excel-pattern-bgcolor and -excel-pattern-type + fill_color = props.get("background-color") + if fill_color not in (None, "transparent", "none"): + return {"fgColor": self.color_to_excel(fill_color), "patternType": "solid"} + + def build_number_format(self, props: Mapping[str, str]) -> dict[str, str | None]: + fc = props.get("number-format") + fc = fc.replace("§", ";") if isinstance(fc, str) else fc + return {"format_code": fc} + + def build_font( + self, props: Mapping[str, str] + ) -> dict[str, bool | float | str | None]: + font_names = self._get_font_names(props) + decoration = self._get_decoration(props) + return { + "name": font_names[0] if font_names else None, + "family": self._select_font_family(font_names), + "size": self._get_font_size(props), + "bold": self._get_is_bold(props), + "italic": self._get_is_italic(props), + "underline": ("single" if "underline" in decoration else None), + "strike": ("line-through" in decoration) or None, + "color": self.color_to_excel(props.get("color")), + # shadow if nonzero digit before shadow color + "shadow": self._get_shadow(props), + } + + def _get_is_bold(self, props: Mapping[str, str]) -> bool | None: + weight = props.get("font-weight") + if weight: + return self.BOLD_MAP.get(weight) + return None + + def _get_is_italic(self, props: Mapping[str, str]) -> bool | None: + font_style = props.get("font-style") + if font_style: + return self.ITALIC_MAP.get(font_style) + return None + + def _get_decoration(self, props: Mapping[str, str]) -> Sequence[str]: + decoration = props.get("text-decoration") + if decoration is not None: + return decoration.split() + else: + return () + + def _get_underline(self, decoration: Sequence[str]) -> str | None: + if "underline" in decoration: + return "single" + return None + + def _get_shadow(self, props: Mapping[str, str]) -> bool | None: + if "text-shadow" in props: + return bool(re.search("^[^#(]*[1-9]", props["text-shadow"])) + return None + + def _get_font_names(self, props: Mapping[str, str]) -> Sequence[str]: + font_names_tmp = re.findall( + r"""(?x) + ( + "(?:[^"]|\\")+" + | + '(?:[^']|\\')+' + | + [^'",]+ + )(?=,|\s*$) + """, + props.get("font-family", ""), + ) + + font_names = [] + for name in font_names_tmp: + if name[:1] == '"': + name = name[1:-1].replace('\\"', '"') + elif name[:1] == "'": + name = name[1:-1].replace("\\'", "'") + else: + name = name.strip() + if name: + font_names.append(name) + return font_names + + def _get_font_size(self, props: Mapping[str, str]) -> float | None: + size = props.get("font-size") + if size is None: + return size + return self._pt_to_float(size) + + def _select_font_family(self, font_names: Sequence[str]) -> int | None: + family = None + for name in font_names: + family = self.FAMILY_MAP.get(name) + if family: + break + + return family + + def color_to_excel(self, val: str | None) -> str | None: + if val is None: + return None + + if self._is_hex_color(val): + return self._convert_hex_to_excel(val) + + try: + return self.NAMED_COLORS[val] + except KeyError: + warnings.warn( + f"Unhandled color format: {repr(val)}", + CSSWarning, + stacklevel=find_stack_level(), + ) + return None + + def _is_hex_color(self, color_string: str) -> bool: + return bool(color_string.startswith("#")) + + def _convert_hex_to_excel(self, color_string: str) -> str: + code = color_string.lstrip("#") + if self._is_shorthand_color(color_string): + return (code[0] * 2 + code[1] * 2 + code[2] * 2).upper() + else: + return code.upper() + + def _is_shorthand_color(self, color_string: str) -> bool: + """Check if color code is shorthand. + + #FFF is a shorthand as opposed to full #FFFFFF. + """ + code = color_string.lstrip("#") + if len(code) == 3: + return True + elif len(code) == 6: + return False + else: + raise ValueError(f"Unexpected color {color_string}") + + +class ExcelFormatter: + """ + Class for formatting a DataFrame to a list of ExcelCells, + + Parameters + ---------- + df : DataFrame or Styler + na_rep: na representation + float_format : str, default None + Format string for floating point numbers + cols : sequence, optional + Columns to write + header : bool or sequence of str, default True + Write out column names. If a list of string is given it is + assumed to be aliases for the column names + index : bool, default True + output row names (index) + index_label : str or sequence, default None + Column label for index column(s) if desired. If None is given, and + `header` and `index` are True, then the index names are used. A + sequence should be given if the DataFrame uses MultiIndex. + merge_cells : bool, default False + Format MultiIndex and Hierarchical Rows as merged cells. + inf_rep : str, default `'inf'` + representation for np.inf values (which aren't representable in Excel) + A `'-'` sign will be added in front of -inf. + style_converter : callable, optional + This translates Styler styles (CSS) into ExcelWriter styles. + Defaults to ``CSSToExcelConverter()``. + It should have signature css_declarations string -> excel style. + This is only called for body cells. + """ + + max_rows = 2**20 + max_cols = 2**14 + + def __init__( + self, + df, + na_rep: str = "", + float_format: str | None = None, + cols: Sequence[Hashable] | None = None, + header: Sequence[Hashable] | bool = True, + index: bool = True, + index_label: IndexLabel | None = None, + merge_cells: bool = False, + inf_rep: str = "inf", + style_converter: Callable | None = None, + ) -> None: + self.rowcounter = 0 + self.na_rep = na_rep + if not isinstance(df, DataFrame): + self.styler = df + self.styler._compute() # calculate applied styles + df = df.data + if style_converter is None: + style_converter = CSSToExcelConverter() + self.style_converter: Callable | None = style_converter + else: + self.styler = None + self.style_converter = None + self.df = df + if cols is not None: + # all missing, raise + if not len(Index(cols).intersection(df.columns)): + raise KeyError("passes columns are not ALL present dataframe") + + if len(Index(cols).intersection(df.columns)) != len(set(cols)): + # Deprecated in GH#17295, enforced in 1.0.0 + raise KeyError("Not all names specified in 'columns' are found") + + self.df = df.reindex(columns=cols) + + self.columns = self.df.columns + self.float_format = float_format + self.index = index + self.index_label = index_label + self.header = header + self.merge_cells = merge_cells + self.inf_rep = inf_rep + + @property + def header_style(self) -> dict[str, dict[str, str | bool]]: + return { + "font": {"bold": True}, + "borders": { + "top": "thin", + "right": "thin", + "bottom": "thin", + "left": "thin", + }, + "alignment": {"horizontal": "center", "vertical": "top"}, + } + + def _format_value(self, val): + if is_scalar(val) and missing.isna(val): + val = self.na_rep + elif is_float(val): + if missing.isposinf_scalar(val): + val = self.inf_rep + elif missing.isneginf_scalar(val): + val = f"-{self.inf_rep}" + elif self.float_format is not None: + val = float(self.float_format % val) + if getattr(val, "tzinfo", None) is not None: + raise ValueError( + "Excel does not support datetimes with " + "timezones. Please ensure that datetimes " + "are timezone unaware before writing to Excel." + ) + return val + + def _format_header_mi(self) -> Iterable[ExcelCell]: + if self.columns.nlevels > 1: + if not self.index: + raise NotImplementedError( + "Writing to Excel with MultiIndex columns and no " + "index ('index'=False) is not yet implemented." + ) + + if not (self._has_aliases or self.header): + return + + columns = self.columns + level_strs = columns._format_multi( + sparsify=self.merge_cells, include_names=False + ) + level_lengths = get_level_lengths(level_strs) + coloffset = 0 + lnum = 0 + + if self.index and isinstance(self.df.index, MultiIndex): + coloffset = len(self.df.index[0]) - 1 + + if self.merge_cells: + # Format multi-index as a merged cells. + for lnum, name in enumerate(columns.names): + yield ExcelCell( + row=lnum, + col=coloffset, + val=name, + style=self.header_style, + ) + + for lnum, (spans, levels, level_codes) in enumerate( + zip(level_lengths, columns.levels, columns.codes) + ): + values = levels.take(level_codes) + for i, span_val in spans.items(): + mergestart, mergeend = None, None + if span_val > 1: + mergestart, mergeend = lnum, coloffset + i + span_val + yield CssExcelCell( + row=lnum, + col=coloffset + i + 1, + val=values[i], + style=self.header_style, + css_styles=getattr(self.styler, "ctx_columns", None), + css_row=lnum, + css_col=i, + css_converter=self.style_converter, + mergestart=mergestart, + mergeend=mergeend, + ) + else: + # Format in legacy format with dots to indicate levels. + for i, values in enumerate(zip(*level_strs)): + v = ".".join(map(pprint_thing, values)) + yield CssExcelCell( + row=lnum, + col=coloffset + i + 1, + val=v, + style=self.header_style, + css_styles=getattr(self.styler, "ctx_columns", None), + css_row=lnum, + css_col=i, + css_converter=self.style_converter, + ) + + self.rowcounter = lnum + + def _format_header_regular(self) -> Iterable[ExcelCell]: + if self._has_aliases or self.header: + coloffset = 0 + + if self.index: + coloffset = 1 + if isinstance(self.df.index, MultiIndex): + coloffset = len(self.df.index.names) + + colnames = self.columns + if self._has_aliases: + self.header = cast(Sequence, self.header) + if len(self.header) != len(self.columns): + raise ValueError( + f"Writing {len(self.columns)} cols " + f"but got {len(self.header)} aliases" + ) + colnames = self.header + + for colindex, colname in enumerate(colnames): + yield CssExcelCell( + row=self.rowcounter, + col=colindex + coloffset, + val=colname, + style=self.header_style, + css_styles=getattr(self.styler, "ctx_columns", None), + css_row=0, + css_col=colindex, + css_converter=self.style_converter, + ) + + def _format_header(self) -> Iterable[ExcelCell]: + gen: Iterable[ExcelCell] + + if isinstance(self.columns, MultiIndex): + gen = self._format_header_mi() + else: + gen = self._format_header_regular() + + gen2: Iterable[ExcelCell] = () + + if self.df.index.names: + row = [x if x is not None else "" for x in self.df.index.names] + [ + "" + ] * len(self.columns) + if functools.reduce(lambda x, y: x and y, (x != "" for x in row)): + gen2 = ( + ExcelCell(self.rowcounter, colindex, val, self.header_style) + for colindex, val in enumerate(row) + ) + self.rowcounter += 1 + return itertools.chain(gen, gen2) + + def _format_body(self) -> Iterable[ExcelCell]: + if isinstance(self.df.index, MultiIndex): + return self._format_hierarchical_rows() + else: + return self._format_regular_rows() + + def _format_regular_rows(self) -> Iterable[ExcelCell]: + if self._has_aliases or self.header: + self.rowcounter += 1 + + # output index and index_label? + if self.index: + # check aliases + # if list only take first as this is not a MultiIndex + if self.index_label and isinstance( + self.index_label, (list, tuple, np.ndarray, Index) + ): + index_label = self.index_label[0] + # if string good to go + elif self.index_label and isinstance(self.index_label, str): + index_label = self.index_label + else: + index_label = self.df.index.names[0] + + if isinstance(self.columns, MultiIndex): + self.rowcounter += 1 + + if index_label and self.header is not False: + yield ExcelCell(self.rowcounter - 1, 0, index_label, self.header_style) + + # write index_values + index_values = self.df.index + if isinstance(self.df.index, PeriodIndex): + index_values = self.df.index.to_timestamp() + + for idx, idxval in enumerate(index_values): + yield CssExcelCell( + row=self.rowcounter + idx, + col=0, + val=idxval, + style=self.header_style, + css_styles=getattr(self.styler, "ctx_index", None), + css_row=idx, + css_col=0, + css_converter=self.style_converter, + ) + coloffset = 1 + else: + coloffset = 0 + + yield from self._generate_body(coloffset) + + def _format_hierarchical_rows(self) -> Iterable[ExcelCell]: + if self._has_aliases or self.header: + self.rowcounter += 1 + + gcolidx = 0 + + if self.index: + index_labels = self.df.index.names + # check for aliases + if self.index_label and isinstance( + self.index_label, (list, tuple, np.ndarray, Index) + ): + index_labels = self.index_label + + # MultiIndex columns require an extra row + # with index names (blank if None) for + # unambiguous round-trip, unless not merging, + # in which case the names all go on one row Issue #11328 + if isinstance(self.columns, MultiIndex) and self.merge_cells: + self.rowcounter += 1 + + # if index labels are not empty go ahead and dump + if com.any_not_none(*index_labels) and self.header is not False: + for cidx, name in enumerate(index_labels): + yield ExcelCell(self.rowcounter - 1, cidx, name, self.header_style) + + if self.merge_cells: + # Format hierarchical rows as merged cells. + level_strs = self.df.index._format_multi( + sparsify=True, include_names=False + ) + level_lengths = get_level_lengths(level_strs) + + for spans, levels, level_codes in zip( + level_lengths, self.df.index.levels, self.df.index.codes + ): + values = levels.take( + level_codes, + allow_fill=levels._can_hold_na, + fill_value=levels._na_value, + ) + + for i, span_val in spans.items(): + mergestart, mergeend = None, None + if span_val > 1: + mergestart = self.rowcounter + i + span_val - 1 + mergeend = gcolidx + yield CssExcelCell( + row=self.rowcounter + i, + col=gcolidx, + val=values[i], + style=self.header_style, + css_styles=getattr(self.styler, "ctx_index", None), + css_row=i, + css_col=gcolidx, + css_converter=self.style_converter, + mergestart=mergestart, + mergeend=mergeend, + ) + gcolidx += 1 + + else: + # Format hierarchical rows with non-merged values. + for indexcolvals in zip(*self.df.index): + for idx, indexcolval in enumerate(indexcolvals): + yield CssExcelCell( + row=self.rowcounter + idx, + col=gcolidx, + val=indexcolval, + style=self.header_style, + css_styles=getattr(self.styler, "ctx_index", None), + css_row=idx, + css_col=gcolidx, + css_converter=self.style_converter, + ) + gcolidx += 1 + + yield from self._generate_body(gcolidx) + + @property + def _has_aliases(self) -> bool: + """Whether the aliases for column names are present.""" + return is_list_like(self.header) + + def _generate_body(self, coloffset: int) -> Iterable[ExcelCell]: + # Write the body of the frame data series by series. + for colidx in range(len(self.columns)): + series = self.df.iloc[:, colidx] + for i, val in enumerate(series): + yield CssExcelCell( + row=self.rowcounter + i, + col=colidx + coloffset, + val=val, + style=None, + css_styles=getattr(self.styler, "ctx", None), + css_row=i, + css_col=colidx, + css_converter=self.style_converter, + ) + + def get_formatted_cells(self) -> Iterable[ExcelCell]: + for cell in itertools.chain(self._format_header(), self._format_body()): + cell.val = self._format_value(cell.val) + yield cell + + @doc(storage_options=_shared_docs["storage_options"]) + def write( + self, + writer: FilePath | WriteExcelBuffer | ExcelWriter, + sheet_name: str = "Sheet1", + startrow: int = 0, + startcol: int = 0, + freeze_panes: tuple[int, int] | None = None, + engine: str | None = None, + storage_options: StorageOptions | None = None, + engine_kwargs: dict | None = None, + ) -> None: + """ + writer : path-like, file-like, or ExcelWriter object + File path or existing ExcelWriter + sheet_name : str, default 'Sheet1' + Name of sheet which will contain DataFrame + startrow : + upper left cell row to dump data frame + startcol : + upper left cell column to dump data frame + freeze_panes : tuple of integer (length 2), default None + Specifies the one-based bottommost row and rightmost column that + is to be frozen + engine : string, default None + write engine to use if writer is a path - you can also set this + via the options ``io.excel.xlsx.writer``, + or ``io.excel.xlsm.writer``. + + {storage_options} + + engine_kwargs: dict, optional + Arbitrary keyword arguments passed to excel engine. + """ + from pandas.io.excel import ExcelWriter + + num_rows, num_cols = self.df.shape + if num_rows > self.max_rows or num_cols > self.max_cols: + raise ValueError( + f"This sheet is too large! Your sheet size is: {num_rows}, {num_cols} " + f"Max sheet size is: {self.max_rows}, {self.max_cols}" + ) + + if engine_kwargs is None: + engine_kwargs = {} + + formatted_cells = self.get_formatted_cells() + if isinstance(writer, ExcelWriter): + need_save = False + else: + writer = ExcelWriter( + writer, + engine=engine, + storage_options=storage_options, + engine_kwargs=engine_kwargs, + ) + need_save = True + + try: + writer._write_cells( + formatted_cells, + sheet_name, + startrow=startrow, + startcol=startcol, + freeze_panes=freeze_panes, + ) + finally: + # make sure to close opened file handles + if need_save: + writer.close() diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/io/formats/format.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/io/formats/format.py new file mode 100644 index 0000000000000000000000000000000000000000..00c7526edfa4894fab655cb5bbfdf2aa93c4e96d --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/io/formats/format.py @@ -0,0 +1,2058 @@ +""" +Internal module for formatting output data in csv, html, xml, +and latex files. This module also applies to display formatting. +""" +from __future__ import annotations + +from collections.abc import ( + Generator, + Hashable, + Mapping, + Sequence, +) +from contextlib import contextmanager +from csv import QUOTE_NONE +from decimal import Decimal +from functools import partial +from io import StringIO +import math +import re +from shutil import get_terminal_size +from typing import ( + TYPE_CHECKING, + Any, + Callable, + Final, + cast, +) + +import numpy as np + +from pandas._config.config import ( + get_option, + set_option, +) + +from pandas._libs import lib +from pandas._libs.missing import NA +from pandas._libs.tslibs import ( + NaT, + Timedelta, + Timestamp, +) +from pandas._libs.tslibs.nattype import NaTType + +from pandas.core.dtypes.common import ( + is_complex_dtype, + is_float, + is_integer, + is_list_like, + is_numeric_dtype, + is_scalar, +) +from pandas.core.dtypes.dtypes import ( + CategoricalDtype, + DatetimeTZDtype, + ExtensionDtype, +) +from pandas.core.dtypes.missing import ( + isna, + notna, +) + +from pandas.core.arrays import ( + Categorical, + DatetimeArray, + ExtensionArray, + TimedeltaArray, +) +from pandas.core.arrays.string_ import StringDtype +from pandas.core.base import PandasObject +import pandas.core.common as com +from pandas.core.indexes.api import ( + Index, + MultiIndex, + PeriodIndex, + ensure_index, +) +from pandas.core.indexes.datetimes import DatetimeIndex +from pandas.core.indexes.timedeltas import TimedeltaIndex +from pandas.core.reshape.concat import concat + +from pandas.io.common import ( + check_parent_directory, + stringify_path, +) +from pandas.io.formats import printing + +if TYPE_CHECKING: + from pandas._typing import ( + ArrayLike, + Axes, + ColspaceArgType, + ColspaceType, + CompressionOptions, + FilePath, + FloatFormatType, + FormattersType, + IndexLabel, + SequenceNotStr, + StorageOptions, + WriteBuffer, + ) + + from pandas import ( + DataFrame, + Series, + ) + + +common_docstring: Final = """ + Parameters + ---------- + buf : str, Path or StringIO-like, optional, default None + Buffer to write to. If None, the output is returned as a string. + columns : array-like, optional, default None + The subset of columns to write. Writes all columns by default. + col_space : %(col_space_type)s, optional + %(col_space)s. + header : %(header_type)s, optional + %(header)s. + index : bool, optional, default True + Whether to print index (row) labels. + na_rep : str, optional, default 'NaN' + String representation of ``NaN`` to use. + formatters : list, tuple or dict of one-param. functions, optional + Formatter functions to apply to columns' elements by position or + name. + The result of each function must be a unicode string. + List/tuple must be of length equal to the number of columns. + float_format : one-parameter function, optional, default None + Formatter function to apply to columns' elements if they are + floats. This function must return a unicode string and will be + applied only to the non-``NaN`` elements, with ``NaN`` being + handled by ``na_rep``. + sparsify : bool, optional, default True + Set to False for a DataFrame with a hierarchical index to print + every multiindex key at each row. + index_names : bool, optional, default True + Prints the names of the indexes. + justify : str, default None + How to justify the column labels. If None uses the option from + the print configuration (controlled by set_option), 'right' out + of the box. Valid values are + + * left + * right + * center + * justify + * justify-all + * start + * end + * inherit + * match-parent + * initial + * unset. + max_rows : int, optional + Maximum number of rows to display in the console. + max_cols : int, optional + Maximum number of columns to display in the console. + show_dimensions : bool, default False + Display DataFrame dimensions (number of rows by number of columns). + decimal : str, default '.' + Character recognized as decimal separator, e.g. ',' in Europe. + """ + +VALID_JUSTIFY_PARAMETERS = ( + "left", + "right", + "center", + "justify", + "justify-all", + "start", + "end", + "inherit", + "match-parent", + "initial", + "unset", +) + +return_docstring: Final = """ + Returns + ------- + str or None + If buf is None, returns the result as a string. Otherwise returns + None. + """ + + +class SeriesFormatter: + """ + Implement the main logic of Series.to_string, which underlies + Series.__repr__. + """ + + def __init__( + self, + series: Series, + *, + length: bool | str = True, + header: bool = True, + index: bool = True, + na_rep: str = "NaN", + name: bool = False, + float_format: str | None = None, + dtype: bool = True, + max_rows: int | None = None, + min_rows: int | None = None, + ) -> None: + self.series = series + self.buf = StringIO() + self.name = name + self.na_rep = na_rep + self.header = header + self.length = length + self.index = index + self.max_rows = max_rows + self.min_rows = min_rows + + if float_format is None: + float_format = get_option("display.float_format") + self.float_format = float_format + self.dtype = dtype + self.adj = printing.get_adjustment() + + self._chk_truncate() + + def _chk_truncate(self) -> None: + self.tr_row_num: int | None + + min_rows = self.min_rows + max_rows = self.max_rows + # truncation determined by max_rows, actual truncated number of rows + # used below by min_rows + is_truncated_vertically = max_rows and (len(self.series) > max_rows) + series = self.series + if is_truncated_vertically: + max_rows = cast(int, max_rows) + if min_rows: + # if min_rows is set (not None or 0), set max_rows to minimum + # of both + max_rows = min(min_rows, max_rows) + if max_rows == 1: + row_num = max_rows + series = series.iloc[:max_rows] + else: + row_num = max_rows // 2 + series = concat((series.iloc[:row_num], series.iloc[-row_num:])) + self.tr_row_num = row_num + else: + self.tr_row_num = None + self.tr_series = series + self.is_truncated_vertically = is_truncated_vertically + + def _get_footer(self) -> str: + name = self.series.name + footer = "" + + index = self.series.index + if ( + isinstance(index, (DatetimeIndex, PeriodIndex, TimedeltaIndex)) + and index.freq is not None + ): + footer += f"Freq: {index.freqstr}" + + if self.name is not False and name is not None: + if footer: + footer += ", " + + series_name = printing.pprint_thing(name, escape_chars=("\t", "\r", "\n")) + footer += f"Name: {series_name}" + + if self.length is True or ( + self.length == "truncate" and self.is_truncated_vertically + ): + if footer: + footer += ", " + footer += f"Length: {len(self.series)}" + + if self.dtype is not False and self.dtype is not None: + dtype_name = getattr(self.tr_series.dtype, "name", None) + if dtype_name: + if footer: + footer += ", " + footer += f"dtype: {printing.pprint_thing(dtype_name)}" + + # level infos are added to the end and in a new line, like it is done + # for Categoricals + if isinstance(self.tr_series.dtype, CategoricalDtype): + level_info = self.tr_series._values._get_repr_footer() + if footer: + footer += "\n" + footer += level_info + + return str(footer) + + def _get_formatted_values(self) -> list[str]: + return format_array( + self.tr_series._values, + None, + float_format=self.float_format, + na_rep=self.na_rep, + leading_space=self.index, + ) + + def to_string(self) -> str: + series = self.tr_series + footer = self._get_footer() + + if len(series) == 0: + return f"{type(self.series).__name__}([], {footer})" + + index = series.index + have_header = _has_names(index) + if isinstance(index, MultiIndex): + fmt_index = index._format_multi(include_names=True, sparsify=None) + adj = printing.get_adjustment() + fmt_index = adj.adjoin(2, *fmt_index).split("\n") + else: + fmt_index = index._format_flat(include_name=True) + fmt_values = self._get_formatted_values() + + if self.is_truncated_vertically: + n_header_rows = 0 + row_num = self.tr_row_num + row_num = cast(int, row_num) + width = self.adj.len(fmt_values[row_num - 1]) + if width > 3: + dot_str = "..." + else: + dot_str = ".." + # Series uses mode=center because it has single value columns + # DataFrame uses mode=left + dot_str = self.adj.justify([dot_str], width, mode="center")[0] + fmt_values.insert(row_num + n_header_rows, dot_str) + fmt_index.insert(row_num + 1, "") + + if self.index: + result = self.adj.adjoin(3, *[fmt_index[1:], fmt_values]) + else: + result = self.adj.adjoin(3, fmt_values) + + if self.header and have_header: + result = fmt_index[0] + "\n" + result + + if footer: + result += "\n" + footer + + return str("".join(result)) + + +def get_dataframe_repr_params() -> dict[str, Any]: + """Get the parameters used to repr(dataFrame) calls using DataFrame.to_string. + + Supplying these parameters to DataFrame.to_string is equivalent to calling + ``repr(DataFrame)``. This is useful if you want to adjust the repr output. + + .. versionadded:: 1.4.0 + + Example + ------- + >>> import pandas as pd + >>> + >>> df = pd.DataFrame([[1, 2], [3, 4]]) + >>> repr_params = pd.io.formats.format.get_dataframe_repr_params() + >>> repr(df) == df.to_string(**repr_params) + True + """ + from pandas.io.formats import console + + if get_option("display.expand_frame_repr"): + line_width, _ = console.get_console_size() + else: + line_width = None + return { + "max_rows": get_option("display.max_rows"), + "min_rows": get_option("display.min_rows"), + "max_cols": get_option("display.max_columns"), + "max_colwidth": get_option("display.max_colwidth"), + "show_dimensions": get_option("display.show_dimensions"), + "line_width": line_width, + } + + +def get_series_repr_params() -> dict[str, Any]: + """Get the parameters used to repr(Series) calls using Series.to_string. + + Supplying these parameters to Series.to_string is equivalent to calling + ``repr(series)``. This is useful if you want to adjust the series repr output. + + .. versionadded:: 1.4.0 + + Example + ------- + >>> import pandas as pd + >>> + >>> ser = pd.Series([1, 2, 3, 4]) + >>> repr_params = pd.io.formats.format.get_series_repr_params() + >>> repr(ser) == ser.to_string(**repr_params) + True + """ + width, height = get_terminal_size() + max_rows_opt = get_option("display.max_rows") + max_rows = height if max_rows_opt == 0 else max_rows_opt + min_rows = height if max_rows_opt == 0 else get_option("display.min_rows") + + return { + "name": True, + "dtype": True, + "min_rows": min_rows, + "max_rows": max_rows, + "length": get_option("display.show_dimensions"), + } + + +class DataFrameFormatter: + """ + Class for processing dataframe formatting options and data. + + Used by DataFrame.to_string, which backs DataFrame.__repr__. + """ + + __doc__ = __doc__ if __doc__ else "" + __doc__ += common_docstring + return_docstring + + def __init__( + self, + frame: DataFrame, + columns: Axes | None = None, + col_space: ColspaceArgType | None = None, + header: bool | SequenceNotStr[str] = True, + index: bool = True, + na_rep: str = "NaN", + formatters: FormattersType | None = None, + justify: str | None = None, + float_format: FloatFormatType | None = None, + sparsify: bool | None = None, + index_names: bool = True, + max_rows: int | None = None, + min_rows: int | None = None, + max_cols: int | None = None, + show_dimensions: bool | str = False, + decimal: str = ".", + bold_rows: bool = False, + escape: bool = True, + ) -> None: + self.frame = frame + self.columns = self._initialize_columns(columns) + self.col_space = self._initialize_colspace(col_space) + self.header = header + self.index = index + self.na_rep = na_rep + self.formatters = self._initialize_formatters(formatters) + self.justify = self._initialize_justify(justify) + self.float_format = float_format + self.sparsify = self._initialize_sparsify(sparsify) + self.show_index_names = index_names + self.decimal = decimal + self.bold_rows = bold_rows + self.escape = escape + self.max_rows = max_rows + self.min_rows = min_rows + self.max_cols = max_cols + self.show_dimensions = show_dimensions + + self.max_cols_fitted = self._calc_max_cols_fitted() + self.max_rows_fitted = self._calc_max_rows_fitted() + + self.tr_frame = self.frame + self.truncate() + self.adj = printing.get_adjustment() + + def get_strcols(self) -> list[list[str]]: + """ + Render a DataFrame to a list of columns (as lists of strings). + """ + strcols = self._get_strcols_without_index() + + if self.index: + str_index = self._get_formatted_index(self.tr_frame) + strcols.insert(0, str_index) + + return strcols + + @property + def should_show_dimensions(self) -> bool: + return self.show_dimensions is True or ( + self.show_dimensions == "truncate" and self.is_truncated + ) + + @property + def is_truncated(self) -> bool: + return bool(self.is_truncated_horizontally or self.is_truncated_vertically) + + @property + def is_truncated_horizontally(self) -> bool: + return bool(self.max_cols_fitted and (len(self.columns) > self.max_cols_fitted)) + + @property + def is_truncated_vertically(self) -> bool: + return bool(self.max_rows_fitted and (len(self.frame) > self.max_rows_fitted)) + + @property + def dimensions_info(self) -> str: + return f"\n\n[{len(self.frame)} rows x {len(self.frame.columns)} columns]" + + @property + def has_index_names(self) -> bool: + return _has_names(self.frame.index) + + @property + def has_column_names(self) -> bool: + return _has_names(self.frame.columns) + + @property + def show_row_idx_names(self) -> bool: + return all((self.has_index_names, self.index, self.show_index_names)) + + @property + def show_col_idx_names(self) -> bool: + return all((self.has_column_names, self.show_index_names, self.header)) + + @property + def max_rows_displayed(self) -> int: + return min(self.max_rows or len(self.frame), len(self.frame)) + + def _initialize_sparsify(self, sparsify: bool | None) -> bool: + if sparsify is None: + return get_option("display.multi_sparse") + return sparsify + + def _initialize_formatters( + self, formatters: FormattersType | None + ) -> FormattersType: + if formatters is None: + return {} + elif len(self.frame.columns) == len(formatters) or isinstance(formatters, dict): + return formatters + else: + raise ValueError( + f"Formatters length({len(formatters)}) should match " + f"DataFrame number of columns({len(self.frame.columns)})" + ) + + def _initialize_justify(self, justify: str | None) -> str: + if justify is None: + return get_option("display.colheader_justify") + else: + return justify + + def _initialize_columns(self, columns: Axes | None) -> Index: + if columns is not None: + cols = ensure_index(columns) + self.frame = self.frame[cols] + return cols + else: + return self.frame.columns + + def _initialize_colspace(self, col_space: ColspaceArgType | None) -> ColspaceType: + result: ColspaceType + + if col_space is None: + result = {} + elif isinstance(col_space, (int, str)): + result = {"": col_space} + result.update({column: col_space for column in self.frame.columns}) + elif isinstance(col_space, Mapping): + for column in col_space.keys(): + if column not in self.frame.columns and column != "": + raise ValueError( + f"Col_space is defined for an unknown column: {column}" + ) + result = col_space + else: + if len(self.frame.columns) != len(col_space): + raise ValueError( + f"Col_space length({len(col_space)}) should match " + f"DataFrame number of columns({len(self.frame.columns)})" + ) + result = dict(zip(self.frame.columns, col_space)) + return result + + def _calc_max_cols_fitted(self) -> int | None: + """Number of columns fitting the screen.""" + if not self._is_in_terminal(): + return self.max_cols + + width, _ = get_terminal_size() + if self._is_screen_narrow(width): + return width + else: + return self.max_cols + + def _calc_max_rows_fitted(self) -> int | None: + """Number of rows with data fitting the screen.""" + max_rows: int | None + + if self._is_in_terminal(): + _, height = get_terminal_size() + if self.max_rows == 0: + # rows available to fill with actual data + return height - self._get_number_of_auxiliary_rows() + + if self._is_screen_short(height): + max_rows = height + else: + max_rows = self.max_rows + else: + max_rows = self.max_rows + + return self._adjust_max_rows(max_rows) + + def _adjust_max_rows(self, max_rows: int | None) -> int | None: + """Adjust max_rows using display logic. + + See description here: + https://pandas.pydata.org/docs/dev/user_guide/options.html#frequently-used-options + + GH #37359 + """ + if max_rows: + if (len(self.frame) > max_rows) and self.min_rows: + # if truncated, set max_rows showed to min_rows + max_rows = min(self.min_rows, max_rows) + return max_rows + + def _is_in_terminal(self) -> bool: + """Check if the output is to be shown in terminal.""" + return bool(self.max_cols == 0 or self.max_rows == 0) + + def _is_screen_narrow(self, max_width) -> bool: + return bool(self.max_cols == 0 and len(self.frame.columns) > max_width) + + def _is_screen_short(self, max_height) -> bool: + return bool(self.max_rows == 0 and len(self.frame) > max_height) + + def _get_number_of_auxiliary_rows(self) -> int: + """Get number of rows occupied by prompt, dots and dimension info.""" + dot_row = 1 + prompt_row = 1 + num_rows = dot_row + prompt_row + + if self.show_dimensions: + num_rows += len(self.dimensions_info.splitlines()) + + if self.header: + num_rows += 1 + + return num_rows + + def truncate(self) -> None: + """ + Check whether the frame should be truncated. If so, slice the frame up. + """ + if self.is_truncated_horizontally: + self._truncate_horizontally() + + if self.is_truncated_vertically: + self._truncate_vertically() + + def _truncate_horizontally(self) -> None: + """Remove columns, which are not to be displayed and adjust formatters. + + Attributes affected: + - tr_frame + - formatters + - tr_col_num + """ + assert self.max_cols_fitted is not None + col_num = self.max_cols_fitted // 2 + if col_num >= 1: + left = self.tr_frame.iloc[:, :col_num] + right = self.tr_frame.iloc[:, -col_num:] + self.tr_frame = concat((left, right), axis=1) + + # truncate formatter + if isinstance(self.formatters, (list, tuple)): + self.formatters = [ + *self.formatters[:col_num], + *self.formatters[-col_num:], + ] + else: + col_num = cast(int, self.max_cols) + self.tr_frame = self.tr_frame.iloc[:, :col_num] + self.tr_col_num = col_num + + def _truncate_vertically(self) -> None: + """Remove rows, which are not to be displayed. + + Attributes affected: + - tr_frame + - tr_row_num + """ + assert self.max_rows_fitted is not None + row_num = self.max_rows_fitted // 2 + if row_num >= 1: + _len = len(self.tr_frame) + _slice = np.hstack([np.arange(row_num), np.arange(_len - row_num, _len)]) + self.tr_frame = self.tr_frame.iloc[_slice] + else: + row_num = cast(int, self.max_rows) + self.tr_frame = self.tr_frame.iloc[:row_num, :] + self.tr_row_num = row_num + + def _get_strcols_without_index(self) -> list[list[str]]: + strcols: list[list[str]] = [] + + if not is_list_like(self.header) and not self.header: + for i, c in enumerate(self.tr_frame): + fmt_values = self.format_col(i) + fmt_values = _make_fixed_width( + strings=fmt_values, + justify=self.justify, + minimum=int(self.col_space.get(c, 0)), + adj=self.adj, + ) + strcols.append(fmt_values) + return strcols + + if is_list_like(self.header): + # cast here since can't be bool if is_list_like + self.header = cast(list[str], self.header) + if len(self.header) != len(self.columns): + raise ValueError( + f"Writing {len(self.columns)} cols " + f"but got {len(self.header)} aliases" + ) + str_columns = [[label] for label in self.header] + else: + str_columns = self._get_formatted_column_labels(self.tr_frame) + + if self.show_row_idx_names: + for x in str_columns: + x.append("") + + for i, c in enumerate(self.tr_frame): + cheader = str_columns[i] + header_colwidth = max( + int(self.col_space.get(c, 0)), *(self.adj.len(x) for x in cheader) + ) + fmt_values = self.format_col(i) + fmt_values = _make_fixed_width( + fmt_values, self.justify, minimum=header_colwidth, adj=self.adj + ) + + max_len = max(*(self.adj.len(x) for x in fmt_values), header_colwidth) + cheader = self.adj.justify(cheader, max_len, mode=self.justify) + strcols.append(cheader + fmt_values) + + return strcols + + def format_col(self, i: int) -> list[str]: + frame = self.tr_frame + formatter = self._get_formatter(i) + return format_array( + frame.iloc[:, i]._values, + formatter, + float_format=self.float_format, + na_rep=self.na_rep, + space=self.col_space.get(frame.columns[i]), + decimal=self.decimal, + leading_space=self.index, + ) + + def _get_formatter(self, i: str | int) -> Callable | None: + if isinstance(self.formatters, (list, tuple)): + if is_integer(i): + i = cast(int, i) + return self.formatters[i] + else: + return None + else: + if is_integer(i) and i not in self.columns: + i = self.columns[i] + return self.formatters.get(i, None) + + def _get_formatted_column_labels(self, frame: DataFrame) -> list[list[str]]: + from pandas.core.indexes.multi import sparsify_labels + + columns = frame.columns + + if isinstance(columns, MultiIndex): + fmt_columns = columns._format_multi(sparsify=False, include_names=False) + fmt_columns = list(zip(*fmt_columns)) + dtypes = self.frame.dtypes._values + + # if we have a Float level, they don't use leading space at all + restrict_formatting = any(level.is_floating for level in columns.levels) + need_leadsp = dict(zip(fmt_columns, map(is_numeric_dtype, dtypes))) + + def space_format(x, y): + if ( + y not in self.formatters + and need_leadsp[x] + and not restrict_formatting + ): + return " " + y + return y + + str_columns_tuple = list( + zip(*([space_format(x, y) for y in x] for x in fmt_columns)) + ) + if self.sparsify and len(str_columns_tuple): + str_columns_tuple = sparsify_labels(str_columns_tuple) + + str_columns = [list(x) for x in zip(*str_columns_tuple)] + else: + fmt_columns = columns._format_flat(include_name=False) + dtypes = self.frame.dtypes + need_leadsp = dict(zip(fmt_columns, map(is_numeric_dtype, dtypes))) + str_columns = [ + [" " + x if not self._get_formatter(i) and need_leadsp[x] else x] + for i, x in enumerate(fmt_columns) + ] + # self.str_columns = str_columns + return str_columns + + def _get_formatted_index(self, frame: DataFrame) -> list[str]: + # Note: this is only used by to_string() and to_latex(), not by + # to_html(). so safe to cast col_space here. + col_space = {k: cast(int, v) for k, v in self.col_space.items()} + index = frame.index + columns = frame.columns + fmt = self._get_formatter("__index__") + + if isinstance(index, MultiIndex): + fmt_index = index._format_multi( + sparsify=self.sparsify, + include_names=self.show_row_idx_names, + formatter=fmt, + ) + else: + fmt_index = [ + index._format_flat(include_name=self.show_row_idx_names, formatter=fmt) + ] + + fmt_index = [ + tuple( + _make_fixed_width( + list(x), justify="left", minimum=col_space.get("", 0), adj=self.adj + ) + ) + for x in fmt_index + ] + + adjoined = self.adj.adjoin(1, *fmt_index).split("\n") + + # empty space for columns + if self.show_col_idx_names: + col_header = [str(x) for x in self._get_column_name_list()] + else: + col_header = [""] * columns.nlevels + + if self.header: + return col_header + adjoined + else: + return adjoined + + def _get_column_name_list(self) -> list[Hashable]: + names: list[Hashable] = [] + columns = self.frame.columns + if isinstance(columns, MultiIndex): + names.extend("" if name is None else name for name in columns.names) + else: + names.append("" if columns.name is None else columns.name) + return names + + +class DataFrameRenderer: + """Class for creating dataframe output in multiple formats. + + Called in pandas.core.generic.NDFrame: + - to_csv + - to_latex + + Called in pandas.core.frame.DataFrame: + - to_html + - to_string + + Parameters + ---------- + fmt : DataFrameFormatter + Formatter with the formatting options. + """ + + def __init__(self, fmt: DataFrameFormatter) -> None: + self.fmt = fmt + + def to_html( + self, + buf: FilePath | WriteBuffer[str] | None = None, + encoding: str | None = None, + classes: str | list | tuple | None = None, + notebook: bool = False, + border: int | bool | None = None, + table_id: str | None = None, + render_links: bool = False, + ) -> str | None: + """ + Render a DataFrame to a html table. + + Parameters + ---------- + buf : str, path object, file-like object, or None, default None + String, path object (implementing ``os.PathLike[str]``), or file-like + object implementing a string ``write()`` function. If None, the result is + returned as a string. + encoding : str, default “utf-8” + Set character encoding. + classes : str or list-like + classes to include in the `class` attribute of the opening + ```` tag, in addition to the default "dataframe". + notebook : {True, False}, optional, default False + Whether the generated HTML is for IPython Notebook. + border : int + A ``border=border`` attribute is included in the opening + ``
`` tag. Default ``pd.options.display.html.border``. + table_id : str, optional + A css id is included in the opening `
` tag if specified. + render_links : bool, default False + Convert URLs to HTML links. + """ + from pandas.io.formats.html import ( + HTMLFormatter, + NotebookFormatter, + ) + + Klass = NotebookFormatter if notebook else HTMLFormatter + + html_formatter = Klass( + self.fmt, + classes=classes, + border=border, + table_id=table_id, + render_links=render_links, + ) + string = html_formatter.to_string() + return save_to_buffer(string, buf=buf, encoding=encoding) + + def to_string( + self, + buf: FilePath | WriteBuffer[str] | None = None, + encoding: str | None = None, + line_width: int | None = None, + ) -> str | None: + """ + Render a DataFrame to a console-friendly tabular output. + + Parameters + ---------- + buf : str, path object, file-like object, or None, default None + String, path object (implementing ``os.PathLike[str]``), or file-like + object implementing a string ``write()`` function. If None, the result is + returned as a string. + encoding: str, default “utf-8” + Set character encoding. + line_width : int, optional + Width to wrap a line in characters. + """ + from pandas.io.formats.string import StringFormatter + + string_formatter = StringFormatter(self.fmt, line_width=line_width) + string = string_formatter.to_string() + return save_to_buffer(string, buf=buf, encoding=encoding) + + def to_csv( + self, + path_or_buf: FilePath | WriteBuffer[bytes] | WriteBuffer[str] | None = None, + encoding: str | None = None, + sep: str = ",", + columns: Sequence[Hashable] | None = None, + index_label: IndexLabel | None = None, + mode: str = "w", + compression: CompressionOptions = "infer", + quoting: int | None = None, + quotechar: str = '"', + lineterminator: str | None = None, + chunksize: int | None = None, + date_format: str | None = None, + doublequote: bool = True, + escapechar: str | None = None, + errors: str = "strict", + storage_options: StorageOptions | None = None, + ) -> str | None: + """ + Render dataframe as comma-separated file. + """ + from pandas.io.formats.csvs import CSVFormatter + + if path_or_buf is None: + created_buffer = True + path_or_buf = StringIO() + else: + created_buffer = False + + csv_formatter = CSVFormatter( + path_or_buf=path_or_buf, + lineterminator=lineterminator, + sep=sep, + encoding=encoding, + errors=errors, + compression=compression, + quoting=quoting, + cols=columns, + index_label=index_label, + mode=mode, + chunksize=chunksize, + quotechar=quotechar, + date_format=date_format, + doublequote=doublequote, + escapechar=escapechar, + storage_options=storage_options, + formatter=self.fmt, + ) + csv_formatter.save() + + if created_buffer: + assert isinstance(path_or_buf, StringIO) + content = path_or_buf.getvalue() + path_or_buf.close() + return content + + return None + + +def save_to_buffer( + string: str, + buf: FilePath | WriteBuffer[str] | None = None, + encoding: str | None = None, +) -> str | None: + """ + Perform serialization. Write to buf or return as string if buf is None. + """ + with _get_buffer(buf, encoding=encoding) as fd: + fd.write(string) + if buf is None: + # error: "WriteBuffer[str]" has no attribute "getvalue" + return fd.getvalue() # type: ignore[attr-defined] + return None + + +@contextmanager +def _get_buffer( + buf: FilePath | WriteBuffer[str] | None, encoding: str | None = None +) -> Generator[WriteBuffer[str], None, None] | Generator[StringIO, None, None]: + """ + Context manager to open, yield and close buffer for filenames or Path-like + objects, otherwise yield buf unchanged. + """ + if buf is not None: + buf = stringify_path(buf) + else: + buf = StringIO() + + if encoding is None: + encoding = "utf-8" + elif not isinstance(buf, str): + raise ValueError("buf is not a file name and encoding is specified.") + + if hasattr(buf, "write"): + # Incompatible types in "yield" (actual type "Union[str, WriteBuffer[str], + # StringIO]", expected type "Union[WriteBuffer[str], StringIO]") + yield buf # type: ignore[misc] + elif isinstance(buf, str): + check_parent_directory(str(buf)) + with open(buf, "w", encoding=encoding, newline="") as f: + # GH#30034 open instead of codecs.open prevents a file leak + # if we have an invalid encoding argument. + # newline="" is needed to roundtrip correctly on + # windows test_to_latex_filename + yield f + else: + raise TypeError("buf is not a file name and it has no write method") + + +# ---------------------------------------------------------------------- +# Array formatters + + +def format_array( + values: ArrayLike, + formatter: Callable | None, + float_format: FloatFormatType | None = None, + na_rep: str = "NaN", + digits: int | None = None, + space: str | int | None = None, + justify: str = "right", + decimal: str = ".", + leading_space: bool | None = True, + quoting: int | None = None, + fallback_formatter: Callable | None = None, +) -> list[str]: + """ + Format an array for printing. + + Parameters + ---------- + values : np.ndarray or ExtensionArray + formatter + float_format + na_rep + digits + space + justify + decimal + leading_space : bool, optional, default True + Whether the array should be formatted with a leading space. + When an array as a column of a Series or DataFrame, we do want + the leading space to pad between columns. + + When formatting an Index subclass + (e.g. IntervalIndex._get_values_for_csv), we don't want the + leading space since it should be left-aligned. + fallback_formatter + + Returns + ------- + List[str] + """ + fmt_klass: type[_GenericArrayFormatter] + if lib.is_np_dtype(values.dtype, "M"): + fmt_klass = _Datetime64Formatter + values = cast(DatetimeArray, values) + elif isinstance(values.dtype, DatetimeTZDtype): + fmt_klass = _Datetime64TZFormatter + values = cast(DatetimeArray, values) + elif lib.is_np_dtype(values.dtype, "m"): + fmt_klass = _Timedelta64Formatter + values = cast(TimedeltaArray, values) + elif isinstance(values.dtype, ExtensionDtype): + fmt_klass = _ExtensionArrayFormatter + elif lib.is_np_dtype(values.dtype, "fc"): + fmt_klass = FloatArrayFormatter + elif lib.is_np_dtype(values.dtype, "iu"): + fmt_klass = _IntArrayFormatter + else: + fmt_klass = _GenericArrayFormatter + + if space is None: + space = 12 + + if float_format is None: + float_format = get_option("display.float_format") + + if digits is None: + digits = get_option("display.precision") + + fmt_obj = fmt_klass( + values, + digits=digits, + na_rep=na_rep, + float_format=float_format, + formatter=formatter, + space=space, + justify=justify, + decimal=decimal, + leading_space=leading_space, + quoting=quoting, + fallback_formatter=fallback_formatter, + ) + + return fmt_obj.get_result() + + +class _GenericArrayFormatter: + def __init__( + self, + values: ArrayLike, + digits: int = 7, + formatter: Callable | None = None, + na_rep: str = "NaN", + space: str | int = 12, + float_format: FloatFormatType | None = None, + justify: str = "right", + decimal: str = ".", + quoting: int | None = None, + fixed_width: bool = True, + leading_space: bool | None = True, + fallback_formatter: Callable | None = None, + ) -> None: + self.values = values + self.digits = digits + self.na_rep = na_rep + self.space = space + self.formatter = formatter + self.float_format = float_format + self.justify = justify + self.decimal = decimal + self.quoting = quoting + self.fixed_width = fixed_width + self.leading_space = leading_space + self.fallback_formatter = fallback_formatter + + def get_result(self) -> list[str]: + fmt_values = self._format_strings() + return _make_fixed_width(fmt_values, self.justify) + + def _format_strings(self) -> list[str]: + if self.float_format is None: + float_format = get_option("display.float_format") + if float_format is None: + precision = get_option("display.precision") + float_format = lambda x: _trim_zeros_single_float( + f"{x: .{precision:d}f}" + ) + else: + float_format = self.float_format + + if self.formatter is not None: + formatter = self.formatter + elif self.fallback_formatter is not None: + formatter = self.fallback_formatter + else: + quote_strings = self.quoting is not None and self.quoting != QUOTE_NONE + formatter = partial( + printing.pprint_thing, + escape_chars=("\t", "\r", "\n"), + quote_strings=quote_strings, + ) + + def _format(x): + if self.na_rep is not None and is_scalar(x) and isna(x): + if x is None: + return "None" + elif x is NA: + return str(NA) + elif lib.is_float(x) and np.isinf(x): + # TODO(3.0): this will be unreachable when use_inf_as_na + # deprecation is enforced + return str(x) + elif x is NaT or isinstance(x, (np.datetime64, np.timedelta64)): + return "NaT" + return self.na_rep + elif isinstance(x, PandasObject): + return str(x) + elif isinstance(x, StringDtype): + return repr(x) + else: + # object dtype + return str(formatter(x)) + + vals = self.values + if not isinstance(vals, np.ndarray): + raise TypeError( + "ExtensionArray formatting should use _ExtensionArrayFormatter" + ) + inferred = lib.map_infer(vals, is_float) + is_float_type = ( + inferred + # vals may have 2 or more dimensions + & np.all(notna(vals), axis=tuple(range(1, len(vals.shape)))) + ) + leading_space = self.leading_space + if leading_space is None: + leading_space = is_float_type.any() + + fmt_values = [] + for i, v in enumerate(vals): + if (not is_float_type[i] or self.formatter is not None) and leading_space: + fmt_values.append(f" {_format(v)}") + elif is_float_type[i]: + fmt_values.append(float_format(v)) + else: + if leading_space is False: + # False specifically, so that the default is + # to include a space if we get here. + tpl = "{v}" + else: + tpl = " {v}" + fmt_values.append(tpl.format(v=_format(v))) + + return fmt_values + + +class FloatArrayFormatter(_GenericArrayFormatter): + def __init__(self, *args, **kwargs) -> None: + super().__init__(*args, **kwargs) + + # float_format is expected to be a string + # formatter should be used to pass a function + if self.float_format is not None and self.formatter is None: + # GH21625, GH22270 + self.fixed_width = False + if callable(self.float_format): + self.formatter = self.float_format + self.float_format = None + + def _value_formatter( + self, + float_format: FloatFormatType | None = None, + threshold: float | None = None, + ) -> Callable: + """Returns a function to be applied on each value to format it""" + # the float_format parameter supersedes self.float_format + if float_format is None: + float_format = self.float_format + + # we are going to compose different functions, to first convert to + # a string, then replace the decimal symbol, and finally chop according + # to the threshold + + # when there is no float_format, we use str instead of '%g' + # because str(0.0) = '0.0' while '%g' % 0.0 = '0' + if float_format: + + def base_formatter(v): + assert float_format is not None # for mypy + # error: "str" not callable + # error: Unexpected keyword argument "value" for "__call__" of + # "EngFormatter" + return ( + float_format(value=v) # type: ignore[operator,call-arg] + if notna(v) + else self.na_rep + ) + + else: + + def base_formatter(v): + return str(v) if notna(v) else self.na_rep + + if self.decimal != ".": + + def decimal_formatter(v): + return base_formatter(v).replace(".", self.decimal, 1) + + else: + decimal_formatter = base_formatter + + if threshold is None: + return decimal_formatter + + def formatter(value): + if notna(value): + if abs(value) > threshold: + return decimal_formatter(value) + else: + return decimal_formatter(0.0) + else: + return self.na_rep + + return formatter + + def get_result_as_array(self) -> np.ndarray: + """ + Returns the float values converted into strings using + the parameters given at initialisation, as a numpy array + """ + + def format_with_na_rep(values: ArrayLike, formatter: Callable, na_rep: str): + mask = isna(values) + formatted = np.array( + [ + formatter(val) if not m else na_rep + for val, m in zip(values.ravel(), mask.ravel()) + ] + ).reshape(values.shape) + return formatted + + def format_complex_with_na_rep( + values: ArrayLike, formatter: Callable, na_rep: str + ): + real_values = np.real(values).ravel() # type: ignore[arg-type] + imag_values = np.imag(values).ravel() # type: ignore[arg-type] + real_mask, imag_mask = isna(real_values), isna(imag_values) + formatted_lst = [] + for val, real_val, imag_val, re_isna, im_isna in zip( + values.ravel(), + real_values, + imag_values, + real_mask, + imag_mask, + ): + if not re_isna and not im_isna: + formatted_lst.append(formatter(val)) + elif not re_isna: # xxx+nanj + formatted_lst.append(f"{formatter(real_val)}+{na_rep}j") + elif not im_isna: # nan[+/-]xxxj + # The imaginary part may either start with a "-" or a space + imag_formatted = formatter(imag_val).strip() + if imag_formatted.startswith("-"): + formatted_lst.append(f"{na_rep}{imag_formatted}j") + else: + formatted_lst.append(f"{na_rep}+{imag_formatted}j") + else: # nan+nanj + formatted_lst.append(f"{na_rep}+{na_rep}j") + return np.array(formatted_lst).reshape(values.shape) + + if self.formatter is not None: + return format_with_na_rep(self.values, self.formatter, self.na_rep) + + if self.fixed_width: + threshold = get_option("display.chop_threshold") + else: + threshold = None + + # if we have a fixed_width, we'll need to try different float_format + def format_values_with(float_format): + formatter = self._value_formatter(float_format, threshold) + + # default formatter leaves a space to the left when formatting + # floats, must be consistent for left-justifying NaNs (GH #25061) + na_rep = " " + self.na_rep if self.justify == "left" else self.na_rep + + # different formatting strategies for complex and non-complex data + # need to distinguish complex and float NaNs (GH #53762) + values = self.values + is_complex = is_complex_dtype(values) + + # separate the wheat from the chaff + if is_complex: + values = format_complex_with_na_rep(values, formatter, na_rep) + else: + values = format_with_na_rep(values, formatter, na_rep) + + if self.fixed_width: + if is_complex: + result = _trim_zeros_complex(values, self.decimal) + else: + result = _trim_zeros_float(values, self.decimal) + return np.asarray(result, dtype="object") + + return values + + # There is a special default string when we are fixed-width + # The default is otherwise to use str instead of a formatting string + float_format: FloatFormatType | None + if self.float_format is None: + if self.fixed_width: + if self.leading_space is True: + fmt_str = "{value: .{digits:d}f}" + else: + fmt_str = "{value:.{digits:d}f}" + float_format = partial(fmt_str.format, digits=self.digits) + else: + float_format = self.float_format + else: + float_format = lambda value: self.float_format % value + + formatted_values = format_values_with(float_format) + + if not self.fixed_width: + return formatted_values + + # we need do convert to engineering format if some values are too small + # and would appear as 0, or if some values are too big and take too + # much space + + if len(formatted_values) > 0: + maxlen = max(len(x) for x in formatted_values) + too_long = maxlen > self.digits + 6 + else: + too_long = False + + abs_vals = np.abs(self.values) + # this is pretty arbitrary for now + # large values: more that 8 characters including decimal symbol + # and first digit, hence > 1e6 + has_large_values = (abs_vals > 1e6).any() + has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any() + + if has_small_values or (too_long and has_large_values): + if self.leading_space is True: + fmt_str = "{value: .{digits:d}e}" + else: + fmt_str = "{value:.{digits:d}e}" + float_format = partial(fmt_str.format, digits=self.digits) + formatted_values = format_values_with(float_format) + + return formatted_values + + def _format_strings(self) -> list[str]: + return list(self.get_result_as_array()) + + +class _IntArrayFormatter(_GenericArrayFormatter): + def _format_strings(self) -> list[str]: + if self.leading_space is False: + formatter_str = lambda x: f"{x:d}".format(x=x) + else: + formatter_str = lambda x: f"{x: d}".format(x=x) + formatter = self.formatter or formatter_str + fmt_values = [formatter(x) for x in self.values] + return fmt_values + + +class _Datetime64Formatter(_GenericArrayFormatter): + values: DatetimeArray + + def __init__( + self, + values: DatetimeArray, + nat_rep: str = "NaT", + date_format: None = None, + **kwargs, + ) -> None: + super().__init__(values, **kwargs) + self.nat_rep = nat_rep + self.date_format = date_format + + def _format_strings(self) -> list[str]: + """we by definition have DO NOT have a TZ""" + values = self.values + + if self.formatter is not None: + return [self.formatter(x) for x in values] + + fmt_values = values._format_native_types( + na_rep=self.nat_rep, date_format=self.date_format + ) + return fmt_values.tolist() + + +class _ExtensionArrayFormatter(_GenericArrayFormatter): + values: ExtensionArray + + def _format_strings(self) -> list[str]: + values = self.values + + formatter = self.formatter + fallback_formatter = None + if formatter is None: + fallback_formatter = values._formatter(boxed=True) + + if isinstance(values, Categorical): + # Categorical is special for now, so that we can preserve tzinfo + array = values._internal_get_values() + else: + array = np.asarray(values, dtype=object) + + fmt_values = format_array( + array, + formatter, + float_format=self.float_format, + na_rep=self.na_rep, + digits=self.digits, + space=self.space, + justify=self.justify, + decimal=self.decimal, + leading_space=self.leading_space, + quoting=self.quoting, + fallback_formatter=fallback_formatter, + ) + return fmt_values + + +def format_percentiles( + percentiles: (np.ndarray | Sequence[float]), +) -> list[str]: + """ + Outputs rounded and formatted percentiles. + + Parameters + ---------- + percentiles : list-like, containing floats from interval [0,1] + + Returns + ------- + formatted : list of strings + + Notes + ----- + Rounding precision is chosen so that: (1) if any two elements of + ``percentiles`` differ, they remain different after rounding + (2) no entry is *rounded* to 0% or 100%. + Any non-integer is always rounded to at least 1 decimal place. + + Examples + -------- + Keeps all entries different after rounding: + + >>> format_percentiles([0.01999, 0.02001, 0.5, 0.666666, 0.9999]) + ['1.999%', '2.001%', '50%', '66.667%', '99.99%'] + + No element is rounded to 0% or 100% (unless already equal to it). + Duplicates are allowed: + + >>> format_percentiles([0, 0.5, 0.02001, 0.5, 0.666666, 0.9999]) + ['0%', '50%', '2.0%', '50%', '66.67%', '99.99%'] + """ + percentiles = np.asarray(percentiles) + + # It checks for np.nan as well + if ( + not is_numeric_dtype(percentiles) + or not np.all(percentiles >= 0) + or not np.all(percentiles <= 1) + ): + raise ValueError("percentiles should all be in the interval [0,1]") + + percentiles = 100 * percentiles + prec = get_precision(percentiles) + percentiles_round_type = percentiles.round(prec).astype(int) + + int_idx = np.isclose(percentiles_round_type, percentiles) + + if np.all(int_idx): + out = percentiles_round_type.astype(str) + return [i + "%" for i in out] + + unique_pcts = np.unique(percentiles) + prec = get_precision(unique_pcts) + out = np.empty_like(percentiles, dtype=object) + out[int_idx] = percentiles[int_idx].round().astype(int).astype(str) + + out[~int_idx] = percentiles[~int_idx].round(prec).astype(str) + return [i + "%" for i in out] + + +def get_precision(array: np.ndarray | Sequence[float]) -> int: + to_begin = array[0] if array[0] > 0 else None + to_end = 100 - array[-1] if array[-1] < 100 else None + diff = np.ediff1d(array, to_begin=to_begin, to_end=to_end) + diff = abs(diff) + prec = -np.floor(np.log10(np.min(diff))).astype(int) + prec = max(1, prec) + return prec + + +def _format_datetime64(x: NaTType | Timestamp, nat_rep: str = "NaT") -> str: + if x is NaT: + return nat_rep + + # Timestamp.__str__ falls back to datetime.datetime.__str__ = isoformat(sep=' ') + # so it already uses string formatting rather than strftime (faster). + return str(x) + + +def _format_datetime64_dateonly( + x: NaTType | Timestamp, + nat_rep: str = "NaT", + date_format: str | None = None, +) -> str: + if isinstance(x, NaTType): + return nat_rep + + if date_format: + return x.strftime(date_format) + else: + # Timestamp._date_repr relies on string formatting (faster than strftime) + return x._date_repr + + +def get_format_datetime64( + is_dates_only: bool, nat_rep: str = "NaT", date_format: str | None = None +) -> Callable: + """Return a formatter callable taking a datetime64 as input and providing + a string as output""" + + if is_dates_only: + return lambda x: _format_datetime64_dateonly( + x, nat_rep=nat_rep, date_format=date_format + ) + else: + return lambda x: _format_datetime64(x, nat_rep=nat_rep) + + +class _Datetime64TZFormatter(_Datetime64Formatter): + values: DatetimeArray + + def _format_strings(self) -> list[str]: + """we by definition have a TZ""" + ido = self.values._is_dates_only + values = self.values.astype(object) + formatter = self.formatter or get_format_datetime64( + ido, date_format=self.date_format + ) + fmt_values = [formatter(x) for x in values] + + return fmt_values + + +class _Timedelta64Formatter(_GenericArrayFormatter): + values: TimedeltaArray + + def __init__( + self, + values: TimedeltaArray, + nat_rep: str = "NaT", + **kwargs, + ) -> None: + # TODO: nat_rep is never passed, na_rep is. + super().__init__(values, **kwargs) + self.nat_rep = nat_rep + + def _format_strings(self) -> list[str]: + formatter = self.formatter or get_format_timedelta64( + self.values, nat_rep=self.nat_rep, box=False + ) + return [formatter(x) for x in self.values] + + +def get_format_timedelta64( + values: TimedeltaArray, + nat_rep: str | float = "NaT", + box: bool = False, +) -> Callable: + """ + Return a formatter function for a range of timedeltas. + These will all have the same format argument + + If box, then show the return in quotes + """ + even_days = values._is_dates_only + + if even_days: + format = None + else: + format = "long" + + def _formatter(x): + if x is None or (is_scalar(x) and isna(x)): + return nat_rep + + if not isinstance(x, Timedelta): + x = Timedelta(x) + + # Timedelta._repr_base uses string formatting (faster than strftime) + result = x._repr_base(format=format) + if box: + result = f"'{result}'" + return result + + return _formatter + + +def _make_fixed_width( + strings: list[str], + justify: str = "right", + minimum: int | None = None, + adj: printing._TextAdjustment | None = None, +) -> list[str]: + if len(strings) == 0 or justify == "all": + return strings + + if adj is None: + adjustment = printing.get_adjustment() + else: + adjustment = adj + + max_len = max(adjustment.len(x) for x in strings) + + if minimum is not None: + max_len = max(minimum, max_len) + + conf_max = get_option("display.max_colwidth") + if conf_max is not None and max_len > conf_max: + max_len = conf_max + + def just(x: str) -> str: + if conf_max is not None: + if (conf_max > 3) & (adjustment.len(x) > max_len): + x = x[: max_len - 3] + "..." + return x + + strings = [just(x) for x in strings] + result = adjustment.justify(strings, max_len, mode=justify) + return result + + +def _trim_zeros_complex(str_complexes: ArrayLike, decimal: str = ".") -> list[str]: + """ + Separates the real and imaginary parts from the complex number, and + executes the _trim_zeros_float method on each of those. + """ + real_part, imag_part = [], [] + for x in str_complexes: + # Complex numbers are represented as "(-)xxx(+/-)xxxj" + # The split will give [{"", "-"}, "xxx", "+/-", "xxx", "j", ""] + # Therefore, the imaginary part is the 4th and 3rd last elements, + # and the real part is everything before the imaginary part + trimmed = re.split(r"([j+-])", x) + real_part.append("".join(trimmed[:-4])) + imag_part.append("".join(trimmed[-4:-2])) + + # We want to align the lengths of the real and imaginary parts of each complex + # number, as well as the lengths the real (resp. complex) parts of all numbers + # in the array + n = len(str_complexes) + padded_parts = _trim_zeros_float(real_part + imag_part, decimal) + if len(padded_parts) == 0: + return [] + padded_length = max(len(part) for part in padded_parts) - 1 + padded = [ + real_pt # real part, possibly NaN + + imag_pt[0] # +/- + + f"{imag_pt[1:]:>{padded_length}}" # complex part (no sign), possibly nan + + "j" + for real_pt, imag_pt in zip(padded_parts[:n], padded_parts[n:]) + ] + return padded + + +def _trim_zeros_single_float(str_float: str) -> str: + """ + Trims trailing zeros after a decimal point, + leaving just one if necessary. + """ + str_float = str_float.rstrip("0") + if str_float.endswith("."): + str_float += "0" + + return str_float + + +def _trim_zeros_float( + str_floats: ArrayLike | list[str], decimal: str = "." +) -> list[str]: + """ + Trims the maximum number of trailing zeros equally from + all numbers containing decimals, leaving just one if + necessary. + """ + trimmed = str_floats + number_regex = re.compile(rf"^\s*[\+-]?[0-9]+\{decimal}[0-9]*$") + + def is_number_with_decimal(x) -> bool: + return re.match(number_regex, x) is not None + + def should_trim(values: ArrayLike | list[str]) -> bool: + """ + Determine if an array of strings should be trimmed. + + Returns True if all numbers containing decimals (defined by the + above regular expression) within the array end in a zero, otherwise + returns False. + """ + numbers = [x for x in values if is_number_with_decimal(x)] + return len(numbers) > 0 and all(x.endswith("0") for x in numbers) + + while should_trim(trimmed): + trimmed = [x[:-1] if is_number_with_decimal(x) else x for x in trimmed] + + # leave one 0 after the decimal points if need be. + result = [ + x + "0" if is_number_with_decimal(x) and x.endswith(decimal) else x + for x in trimmed + ] + return result + + +def _has_names(index: Index) -> bool: + if isinstance(index, MultiIndex): + return com.any_not_none(*index.names) + else: + return index.name is not None + + +class EngFormatter: + """ + Formats float values according to engineering format. + + Based on matplotlib.ticker.EngFormatter + """ + + # The SI engineering prefixes + ENG_PREFIXES = { + -24: "y", + -21: "z", + -18: "a", + -15: "f", + -12: "p", + -9: "n", + -6: "u", + -3: "m", + 0: "", + 3: "k", + 6: "M", + 9: "G", + 12: "T", + 15: "P", + 18: "E", + 21: "Z", + 24: "Y", + } + + def __init__( + self, accuracy: int | None = None, use_eng_prefix: bool = False + ) -> None: + self.accuracy = accuracy + self.use_eng_prefix = use_eng_prefix + + def __call__(self, num: float) -> str: + """ + Formats a number in engineering notation, appending a letter + representing the power of 1000 of the original number. Some examples: + >>> format_eng = EngFormatter(accuracy=0, use_eng_prefix=True) + >>> format_eng(0) + ' 0' + >>> format_eng = EngFormatter(accuracy=1, use_eng_prefix=True) + >>> format_eng(1_000_000) + ' 1.0M' + >>> format_eng = EngFormatter(accuracy=2, use_eng_prefix=False) + >>> format_eng("-1e-6") + '-1.00E-06' + + @param num: the value to represent + @type num: either a numeric value or a string that can be converted to + a numeric value (as per decimal.Decimal constructor) + + @return: engineering formatted string + """ + dnum = Decimal(str(num)) + + if Decimal.is_nan(dnum): + return "NaN" + + if Decimal.is_infinite(dnum): + return "inf" + + sign = 1 + + if dnum < 0: # pragma: no cover + sign = -1 + dnum = -dnum + + if dnum != 0: + pow10 = Decimal(int(math.floor(dnum.log10() / 3) * 3)) + else: + pow10 = Decimal(0) + + pow10 = pow10.min(max(self.ENG_PREFIXES.keys())) + pow10 = pow10.max(min(self.ENG_PREFIXES.keys())) + int_pow10 = int(pow10) + + if self.use_eng_prefix: + prefix = self.ENG_PREFIXES[int_pow10] + elif int_pow10 < 0: + prefix = f"E-{-int_pow10:02d}" + else: + prefix = f"E+{int_pow10:02d}" + + mant = sign * dnum / (10**pow10) + + if self.accuracy is None: # pragma: no cover + format_str = "{mant: g}{prefix}" + else: + format_str = f"{{mant: .{self.accuracy:d}f}}{{prefix}}" + + formatted = format_str.format(mant=mant, prefix=prefix) + + return formatted + + +def set_eng_float_format(accuracy: int = 3, use_eng_prefix: bool = False) -> None: + """ + Format float representation in DataFrame with SI notation. + + Parameters + ---------- + accuracy : int, default 3 + Number of decimal digits after the floating point. + use_eng_prefix : bool, default False + Whether to represent a value with SI prefixes. + + Returns + ------- + None + + Examples + -------- + >>> df = pd.DataFrame([1e-9, 1e-3, 1, 1e3, 1e6]) + >>> df + 0 + 0 1.000000e-09 + 1 1.000000e-03 + 2 1.000000e+00 + 3 1.000000e+03 + 4 1.000000e+06 + + >>> pd.set_eng_float_format(accuracy=1) + >>> df + 0 + 0 1.0E-09 + 1 1.0E-03 + 2 1.0E+00 + 3 1.0E+03 + 4 1.0E+06 + + >>> pd.set_eng_float_format(use_eng_prefix=True) + >>> df + 0 + 0 1.000n + 1 1.000m + 2 1.000 + 3 1.000k + 4 1.000M + + >>> pd.set_eng_float_format(accuracy=1, use_eng_prefix=True) + >>> df + 0 + 0 1.0n + 1 1.0m + 2 1.0 + 3 1.0k + 4 1.0M + + >>> pd.set_option("display.float_format", None) # unset option + """ + set_option("display.float_format", EngFormatter(accuracy, use_eng_prefix)) + + +def get_level_lengths( + levels: Any, sentinel: bool | object | str = "" +) -> list[dict[int, int]]: + """ + For each index in each level the function returns lengths of indexes. + + Parameters + ---------- + levels : list of lists + List of values on for level. + sentinel : string, optional + Value which states that no new index starts on there. + + Returns + ------- + Returns list of maps. For each level returns map of indexes (key is index + in row and value is length of index). + """ + if len(levels) == 0: + return [] + + control = [True] * len(levels[0]) + + result = [] + for level in levels: + last_index = 0 + + lengths = {} + for i, key in enumerate(level): + if control[i] and key == sentinel: + pass + else: + control[i] = False + lengths[last_index] = i - last_index + last_index = i + + lengths[last_index] = len(level) - last_index + + result.append(lengths) + + return result + + +def buffer_put_lines(buf: WriteBuffer[str], lines: list[str]) -> None: + """ + Appends lines to a buffer. + + Parameters + ---------- + buf + The buffer to write to + lines + The lines to append. + """ + if any(isinstance(x, str) for x in lines): + lines = [str(x) for x in lines] + buf.write("\n".join(lines)) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/io/formats/html.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/io/formats/html.py new file mode 100644 index 0000000000000000000000000000000000000000..794ce77b3b45ec38d9fa58a708939e53bb8ae629 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/io/formats/html.py @@ -0,0 +1,646 @@ +""" +Module for formatting output data in HTML. +""" +from __future__ import annotations + +from textwrap import dedent +from typing import ( + TYPE_CHECKING, + Any, + Final, + cast, +) + +from pandas._config import get_option + +from pandas._libs import lib + +from pandas import ( + MultiIndex, + option_context, +) + +from pandas.io.common import is_url +from pandas.io.formats.format import ( + DataFrameFormatter, + get_level_lengths, +) +from pandas.io.formats.printing import pprint_thing + +if TYPE_CHECKING: + from collections.abc import ( + Hashable, + Iterable, + Mapping, + ) + + +class HTMLFormatter: + """ + Internal class for formatting output data in html. + This class is intended for shared functionality between + DataFrame.to_html() and DataFrame._repr_html_(). + Any logic in common with other output formatting methods + should ideally be inherited from classes in format.py + and this class responsible for only producing html markup. + """ + + indent_delta: Final = 2 + + def __init__( + self, + formatter: DataFrameFormatter, + classes: str | list[str] | tuple[str, ...] | None = None, + border: int | bool | None = None, + table_id: str | None = None, + render_links: bool = False, + ) -> None: + self.fmt = formatter + self.classes = classes + + self.frame = self.fmt.frame + self.columns = self.fmt.tr_frame.columns + self.elements: list[str] = [] + self.bold_rows = self.fmt.bold_rows + self.escape = self.fmt.escape + self.show_dimensions = self.fmt.show_dimensions + if border is None or border is True: + border = cast(int, get_option("display.html.border")) + elif not border: + border = None + + self.border = border + self.table_id = table_id + self.render_links = render_links + + self.col_space = {} + is_multi_index = isinstance(self.columns, MultiIndex) + for column, value in self.fmt.col_space.items(): + col_space_value = f"{value}px" if isinstance(value, int) else value + self.col_space[column] = col_space_value + # GH 53885: Handling case where column is index + # Flatten the data in the multi index and add in the map + if is_multi_index and isinstance(column, tuple): + for column_index in column: + self.col_space[str(column_index)] = col_space_value + + def to_string(self) -> str: + lines = self.render() + if any(isinstance(x, str) for x in lines): + lines = [str(x) for x in lines] + return "\n".join(lines) + + def render(self) -> list[str]: + self._write_table() + + if self.should_show_dimensions: + by = chr(215) # × # noqa: RUF003 + self.write( + f"

{len(self.frame)} rows {by} {len(self.frame.columns)} columns

" + ) + + return self.elements + + @property + def should_show_dimensions(self) -> bool: + return self.fmt.should_show_dimensions + + @property + def show_row_idx_names(self) -> bool: + return self.fmt.show_row_idx_names + + @property + def show_col_idx_names(self) -> bool: + return self.fmt.show_col_idx_names + + @property + def row_levels(self) -> int: + if self.fmt.index: + # showing (row) index + return self.frame.index.nlevels + elif self.show_col_idx_names: + # see gh-22579 + # Column misalignment also occurs for + # a standard index when the columns index is named. + # If the row index is not displayed a column of + # blank cells need to be included before the DataFrame values. + return 1 + # not showing (row) index + return 0 + + def _get_columns_formatted_values(self) -> Iterable: + return self.columns + + @property + def is_truncated(self) -> bool: + return self.fmt.is_truncated + + @property + def ncols(self) -> int: + return len(self.fmt.tr_frame.columns) + + def write(self, s: Any, indent: int = 0) -> None: + rs = pprint_thing(s) + self.elements.append(" " * indent + rs) + + def write_th( + self, s: Any, header: bool = False, indent: int = 0, tags: str | None = None + ) -> None: + """ + Method for writing a formatted . This will + cause min-width to be set if there is one. + indent : int, default 0 + The indentation level of the cell. + tags : str, default None + Tags to include in the cell. + + Returns + ------- + A written ", indent) + else: + self.write(f'', indent) + indent += indent_delta + + for i, s in enumerate(line): + val_tag = tags.get(i, None) + if header or (self.bold_rows and i < nindex_levels): + self.write_th(s, indent=indent, header=header, tags=val_tag) + else: + self.write_td(s, indent, tags=val_tag) + + indent -= indent_delta + self.write("", indent) + + def _write_table(self, indent: int = 0) -> None: + _classes = ["dataframe"] # Default class. + use_mathjax = get_option("display.html.use_mathjax") + if not use_mathjax: + _classes.append("tex2jax_ignore") + if self.classes is not None: + if isinstance(self.classes, str): + self.classes = self.classes.split() + if not isinstance(self.classes, (list, tuple)): + raise TypeError( + "classes must be a string, list, " + f"or tuple, not {type(self.classes)}" + ) + _classes.extend(self.classes) + + if self.table_id is None: + id_section = "" + else: + id_section = f' id="{self.table_id}"' + + if self.border is None: + border_attr = "" + else: + border_attr = f' border="{self.border}"' + + self.write( + f'', + indent, + ) + + if self.fmt.header or self.show_row_idx_names: + self._write_header(indent + self.indent_delta) + + self._write_body(indent + self.indent_delta) + + self.write("
cell. + + If col_space is set on the formatter then that is used for + the value of min-width. + + Parameters + ---------- + s : object + The data to be written inside the cell. + header : bool, default False + Set to True if the is for use inside
cell. + """ + col_space = self.col_space.get(s, None) + + if header and col_space is not None: + tags = tags or "" + tags += f'style="min-width: {col_space};"' + + self._write_cell(s, kind="th", indent=indent, tags=tags) + + def write_td(self, s: Any, indent: int = 0, tags: str | None = None) -> None: + self._write_cell(s, kind="td", indent=indent, tags=tags) + + def _write_cell( + self, s: Any, kind: str = "td", indent: int = 0, tags: str | None = None + ) -> None: + if tags is not None: + start_tag = f"<{kind} {tags}>" + else: + start_tag = f"<{kind}>" + + if self.escape: + # escape & first to prevent double escaping of & + esc = {"&": r"&", "<": r"<", ">": r">"} + else: + esc = {} + + rs = pprint_thing(s, escape_chars=esc).strip() + + if self.render_links and is_url(rs): + rs_unescaped = pprint_thing(s, escape_chars={}).strip() + start_tag += f'' + end_a = "" + else: + end_a = "" + + self.write(f"{start_tag}{rs}{end_a}", indent) + + def write_tr( + self, + line: Iterable, + indent: int = 0, + indent_delta: int = 0, + header: bool = False, + align: str | None = None, + tags: dict[int, str] | None = None, + nindex_levels: int = 0, + ) -> None: + if tags is None: + tags = {} + + if align is None: + self.write("
", indent) + + def _write_col_header(self, indent: int) -> None: + row: list[Hashable] + is_truncated_horizontally = self.fmt.is_truncated_horizontally + if isinstance(self.columns, MultiIndex): + template = 'colspan="{span:d}" halign="left"' + + sentinel: lib.NoDefault | bool + if self.fmt.sparsify: + # GH3547 + sentinel = lib.no_default + else: + sentinel = False + levels = self.columns._format_multi(sparsify=sentinel, include_names=False) + level_lengths = get_level_lengths(levels, sentinel) + inner_lvl = len(level_lengths) - 1 + for lnum, (records, values) in enumerate(zip(level_lengths, levels)): + if is_truncated_horizontally: + # modify the header lines + ins_col = self.fmt.tr_col_num + if self.fmt.sparsify: + recs_new = {} + # Increment tags after ... col. + for tag, span in list(records.items()): + if tag >= ins_col: + recs_new[tag + 1] = span + elif tag + span > ins_col: + recs_new[tag] = span + 1 + if lnum == inner_lvl: + values = ( + values[:ins_col] + ("...",) + values[ins_col:] + ) + else: + # sparse col headers do not receive a ... + values = ( + values[:ins_col] + + (values[ins_col - 1],) + + values[ins_col:] + ) + else: + recs_new[tag] = span + # if ins_col lies between tags, all col headers + # get ... + if tag + span == ins_col: + recs_new[ins_col] = 1 + values = values[:ins_col] + ("...",) + values[ins_col:] + records = recs_new + inner_lvl = len(level_lengths) - 1 + if lnum == inner_lvl: + records[ins_col] = 1 + else: + recs_new = {} + for tag, span in list(records.items()): + if tag >= ins_col: + recs_new[tag + 1] = span + else: + recs_new[tag] = span + recs_new[ins_col] = 1 + records = recs_new + values = values[:ins_col] + ["..."] + values[ins_col:] + + # see gh-22579 + # Column Offset Bug with to_html(index=False) with + # MultiIndex Columns and Index. + # Initially fill row with blank cells before column names. + # TODO: Refactor to remove code duplication with code + # block below for standard columns index. + row = [""] * (self.row_levels - 1) + if self.fmt.index or self.show_col_idx_names: + # see gh-22747 + # If to_html(index_names=False) do not show columns + # index names. + # TODO: Refactor to use _get_column_name_list from + # DataFrameFormatter class and create a + # _get_formatted_column_labels function for code + # parity with DataFrameFormatter class. + if self.fmt.show_index_names: + name = self.columns.names[lnum] + row.append(pprint_thing(name or "")) + else: + row.append("") + + tags = {} + j = len(row) + for i, v in enumerate(values): + if i in records: + if records[i] > 1: + tags[j] = template.format(span=records[i]) + else: + continue + j += 1 + row.append(v) + self.write_tr(row, indent, self.indent_delta, tags=tags, header=True) + else: + # see gh-22579 + # Column misalignment also occurs for + # a standard index when the columns index is named. + # Initially fill row with blank cells before column names. + # TODO: Refactor to remove code duplication with code block + # above for columns MultiIndex. + row = [""] * (self.row_levels - 1) + if self.fmt.index or self.show_col_idx_names: + # see gh-22747 + # If to_html(index_names=False) do not show columns + # index names. + # TODO: Refactor to use _get_column_name_list from + # DataFrameFormatter class. + if self.fmt.show_index_names: + row.append(self.columns.name or "") + else: + row.append("") + row.extend(self._get_columns_formatted_values()) + align = self.fmt.justify + + if is_truncated_horizontally: + ins_col = self.row_levels + self.fmt.tr_col_num + row.insert(ins_col, "...") + + self.write_tr(row, indent, self.indent_delta, header=True, align=align) + + def _write_row_header(self, indent: int) -> None: + is_truncated_horizontally = self.fmt.is_truncated_horizontally + row = [x if x is not None else "" for x in self.frame.index.names] + [""] * ( + self.ncols + (1 if is_truncated_horizontally else 0) + ) + self.write_tr(row, indent, self.indent_delta, header=True) + + def _write_header(self, indent: int) -> None: + self.write("", indent) + + if self.fmt.header: + self._write_col_header(indent + self.indent_delta) + + if self.show_row_idx_names: + self._write_row_header(indent + self.indent_delta) + + self.write("", indent) + + def _get_formatted_values(self) -> dict[int, list[str]]: + with option_context("display.max_colwidth", None): + fmt_values = {i: self.fmt.format_col(i) for i in range(self.ncols)} + return fmt_values + + def _write_body(self, indent: int) -> None: + self.write("", indent) + fmt_values = self._get_formatted_values() + + # write values + if self.fmt.index and isinstance(self.frame.index, MultiIndex): + self._write_hierarchical_rows(fmt_values, indent + self.indent_delta) + else: + self._write_regular_rows(fmt_values, indent + self.indent_delta) + + self.write("", indent) + + def _write_regular_rows( + self, fmt_values: Mapping[int, list[str]], indent: int + ) -> None: + is_truncated_horizontally = self.fmt.is_truncated_horizontally + is_truncated_vertically = self.fmt.is_truncated_vertically + + nrows = len(self.fmt.tr_frame) + + if self.fmt.index: + fmt = self.fmt._get_formatter("__index__") + if fmt is not None: + index_values = self.fmt.tr_frame.index.map(fmt) + else: + # only reached with non-Multi index + index_values = self.fmt.tr_frame.index._format_flat(include_name=False) + + row: list[str] = [] + for i in range(nrows): + if is_truncated_vertically and i == (self.fmt.tr_row_num): + str_sep_row = ["..."] * len(row) + self.write_tr( + str_sep_row, + indent, + self.indent_delta, + tags=None, + nindex_levels=self.row_levels, + ) + + row = [] + if self.fmt.index: + row.append(index_values[i]) + # see gh-22579 + # Column misalignment also occurs for + # a standard index when the columns index is named. + # Add blank cell before data cells. + elif self.show_col_idx_names: + row.append("") + row.extend(fmt_values[j][i] for j in range(self.ncols)) + + if is_truncated_horizontally: + dot_col_ix = self.fmt.tr_col_num + self.row_levels + row.insert(dot_col_ix, "...") + self.write_tr( + row, indent, self.indent_delta, tags=None, nindex_levels=self.row_levels + ) + + def _write_hierarchical_rows( + self, fmt_values: Mapping[int, list[str]], indent: int + ) -> None: + template = 'rowspan="{span}" valign="top"' + + is_truncated_horizontally = self.fmt.is_truncated_horizontally + is_truncated_vertically = self.fmt.is_truncated_vertically + frame = self.fmt.tr_frame + nrows = len(frame) + + assert isinstance(frame.index, MultiIndex) + idx_values = frame.index._format_multi(sparsify=False, include_names=False) + idx_values = list(zip(*idx_values)) + + if self.fmt.sparsify: + # GH3547 + sentinel = lib.no_default + levels = frame.index._format_multi(sparsify=sentinel, include_names=False) + + level_lengths = get_level_lengths(levels, sentinel) + inner_lvl = len(level_lengths) - 1 + if is_truncated_vertically: + # Insert ... row and adjust idx_values and + # level_lengths to take this into account. + ins_row = self.fmt.tr_row_num + inserted = False + for lnum, records in enumerate(level_lengths): + rec_new = {} + for tag, span in list(records.items()): + if tag >= ins_row: + rec_new[tag + 1] = span + elif tag + span > ins_row: + rec_new[tag] = span + 1 + + # GH 14882 - Make sure insertion done once + if not inserted: + dot_row = list(idx_values[ins_row - 1]) + dot_row[-1] = "..." + idx_values.insert(ins_row, tuple(dot_row)) + inserted = True + else: + dot_row = list(idx_values[ins_row]) + dot_row[inner_lvl - lnum] = "..." + idx_values[ins_row] = tuple(dot_row) + else: + rec_new[tag] = span + # If ins_row lies between tags, all cols idx cols + # receive ... + if tag + span == ins_row: + rec_new[ins_row] = 1 + if lnum == 0: + idx_values.insert( + ins_row, tuple(["..."] * len(level_lengths)) + ) + + # GH 14882 - Place ... in correct level + elif inserted: + dot_row = list(idx_values[ins_row]) + dot_row[inner_lvl - lnum] = "..." + idx_values[ins_row] = tuple(dot_row) + level_lengths[lnum] = rec_new + + level_lengths[inner_lvl][ins_row] = 1 + for ix_col in fmt_values: + fmt_values[ix_col].insert(ins_row, "...") + nrows += 1 + + for i in range(nrows): + row = [] + tags = {} + + sparse_offset = 0 + j = 0 + for records, v in zip(level_lengths, idx_values[i]): + if i in records: + if records[i] > 1: + tags[j] = template.format(span=records[i]) + else: + sparse_offset += 1 + continue + + j += 1 + row.append(v) + + row.extend(fmt_values[j][i] for j in range(self.ncols)) + if is_truncated_horizontally: + row.insert( + self.row_levels - sparse_offset + self.fmt.tr_col_num, "..." + ) + self.write_tr( + row, + indent, + self.indent_delta, + tags=tags, + nindex_levels=len(levels) - sparse_offset, + ) + else: + row = [] + for i in range(len(frame)): + if is_truncated_vertically and i == (self.fmt.tr_row_num): + str_sep_row = ["..."] * len(row) + self.write_tr( + str_sep_row, + indent, + self.indent_delta, + tags=None, + nindex_levels=self.row_levels, + ) + + idx_values = list( + zip(*frame.index._format_multi(sparsify=False, include_names=False)) + ) + row = [] + row.extend(idx_values[i]) + row.extend(fmt_values[j][i] for j in range(self.ncols)) + if is_truncated_horizontally: + row.insert(self.row_levels + self.fmt.tr_col_num, "...") + self.write_tr( + row, + indent, + self.indent_delta, + tags=None, + nindex_levels=frame.index.nlevels, + ) + + +class NotebookFormatter(HTMLFormatter): + """ + Internal class for formatting output data in html for display in Jupyter + Notebooks. This class is intended for functionality specific to + DataFrame._repr_html_() and DataFrame.to_html(notebook=True) + """ + + def _get_formatted_values(self) -> dict[int, list[str]]: + return {i: self.fmt.format_col(i) for i in range(self.ncols)} + + def _get_columns_formatted_values(self) -> list[str]: + # only reached with non-Multi Index + return self.columns._format_flat(include_name=False) + + def write_style(self) -> None: + # We use the "scoped" attribute here so that the desired + # style properties for the data frame are not then applied + # throughout the entire notebook. + template_first = """\ + """ + template_select = """\ + .dataframe %s { + %s: %s; + }""" + element_props = [ + ("tbody tr th:only-of-type", "vertical-align", "middle"), + ("tbody tr th", "vertical-align", "top"), + ] + if isinstance(self.columns, MultiIndex): + element_props.append(("thead tr th", "text-align", "left")) + if self.show_row_idx_names: + element_props.append( + ("thead tr:last-of-type th", "text-align", "right") + ) + else: + element_props.append(("thead th", "text-align", "right")) + template_mid = "\n\n".join(template_select % t for t in element_props) + template = dedent(f"{template_first}\n{template_mid}\n{template_last}") + self.write(template) + + def render(self) -> list[str]: + self.write("
") + self.write_style() + super().render() + self.write("
") + return self.elements diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/io/formats/info.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/io/formats/info.py new file mode 100644 index 0000000000000000000000000000000000000000..552affbd053f2bed3f4d5f678ddf8eb293f65b01 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/io/formats/info.py @@ -0,0 +1,1101 @@ +from __future__ import annotations + +from abc import ( + ABC, + abstractmethod, +) +import sys +from textwrap import dedent +from typing import TYPE_CHECKING + +from pandas._config import get_option + +from pandas.io.formats import format as fmt +from pandas.io.formats.printing import pprint_thing + +if TYPE_CHECKING: + from collections.abc import ( + Iterable, + Iterator, + Mapping, + Sequence, + ) + + from pandas._typing import ( + Dtype, + WriteBuffer, + ) + + from pandas import ( + DataFrame, + Index, + Series, + ) + + +frame_max_cols_sub = dedent( + """\ + max_cols : int, optional + When to switch from the verbose to the truncated output. If the + DataFrame has more than `max_cols` columns, the truncated output + is used. By default, the setting in + ``pandas.options.display.max_info_columns`` is used.""" +) + + +show_counts_sub = dedent( + """\ + show_counts : bool, optional + Whether to show the non-null counts. By default, this is shown + only if the DataFrame is smaller than + ``pandas.options.display.max_info_rows`` and + ``pandas.options.display.max_info_columns``. A value of True always + shows the counts, and False never shows the counts.""" +) + + +frame_examples_sub = dedent( + """\ + >>> int_values = [1, 2, 3, 4, 5] + >>> text_values = ['alpha', 'beta', 'gamma', 'delta', 'epsilon'] + >>> float_values = [0.0, 0.25, 0.5, 0.75, 1.0] + >>> df = pd.DataFrame({"int_col": int_values, "text_col": text_values, + ... "float_col": float_values}) + >>> df + int_col text_col float_col + 0 1 alpha 0.00 + 1 2 beta 0.25 + 2 3 gamma 0.50 + 3 4 delta 0.75 + 4 5 epsilon 1.00 + + Prints information of all columns: + + >>> df.info(verbose=True) + + RangeIndex: 5 entries, 0 to 4 + Data columns (total 3 columns): + # Column Non-Null Count Dtype + --- ------ -------------- ----- + 0 int_col 5 non-null int64 + 1 text_col 5 non-null object + 2 float_col 5 non-null float64 + dtypes: float64(1), int64(1), object(1) + memory usage: 248.0+ bytes + + Prints a summary of columns count and its dtypes but not per column + information: + + >>> df.info(verbose=False) + + RangeIndex: 5 entries, 0 to 4 + Columns: 3 entries, int_col to float_col + dtypes: float64(1), int64(1), object(1) + memory usage: 248.0+ bytes + + Pipe output of DataFrame.info to buffer instead of sys.stdout, get + buffer content and writes to a text file: + + >>> import io + >>> buffer = io.StringIO() + >>> df.info(buf=buffer) + >>> s = buffer.getvalue() + >>> with open("df_info.txt", "w", + ... encoding="utf-8") as f: # doctest: +SKIP + ... f.write(s) + 260 + + The `memory_usage` parameter allows deep introspection mode, specially + useful for big DataFrames and fine-tune memory optimization: + + >>> random_strings_array = np.random.choice(['a', 'b', 'c'], 10 ** 6) + >>> df = pd.DataFrame({ + ... 'column_1': np.random.choice(['a', 'b', 'c'], 10 ** 6), + ... 'column_2': np.random.choice(['a', 'b', 'c'], 10 ** 6), + ... 'column_3': np.random.choice(['a', 'b', 'c'], 10 ** 6) + ... }) + >>> df.info() + + RangeIndex: 1000000 entries, 0 to 999999 + Data columns (total 3 columns): + # Column Non-Null Count Dtype + --- ------ -------------- ----- + 0 column_1 1000000 non-null object + 1 column_2 1000000 non-null object + 2 column_3 1000000 non-null object + dtypes: object(3) + memory usage: 22.9+ MB + + >>> df.info(memory_usage='deep') + + RangeIndex: 1000000 entries, 0 to 999999 + Data columns (total 3 columns): + # Column Non-Null Count Dtype + --- ------ -------------- ----- + 0 column_1 1000000 non-null object + 1 column_2 1000000 non-null object + 2 column_3 1000000 non-null object + dtypes: object(3) + memory usage: 165.9 MB""" +) + + +frame_see_also_sub = dedent( + """\ + DataFrame.describe: Generate descriptive statistics of DataFrame + columns. + DataFrame.memory_usage: Memory usage of DataFrame columns.""" +) + + +frame_sub_kwargs = { + "klass": "DataFrame", + "type_sub": " and columns", + "max_cols_sub": frame_max_cols_sub, + "show_counts_sub": show_counts_sub, + "examples_sub": frame_examples_sub, + "see_also_sub": frame_see_also_sub, + "version_added_sub": "", +} + + +series_examples_sub = dedent( + """\ + >>> int_values = [1, 2, 3, 4, 5] + >>> text_values = ['alpha', 'beta', 'gamma', 'delta', 'epsilon'] + >>> s = pd.Series(text_values, index=int_values) + >>> s.info() + + Index: 5 entries, 1 to 5 + Series name: None + Non-Null Count Dtype + -------------- ----- + 5 non-null object + dtypes: object(1) + memory usage: 80.0+ bytes + + Prints a summary excluding information about its values: + + >>> s.info(verbose=False) + + Index: 5 entries, 1 to 5 + dtypes: object(1) + memory usage: 80.0+ bytes + + Pipe output of Series.info to buffer instead of sys.stdout, get + buffer content and writes to a text file: + + >>> import io + >>> buffer = io.StringIO() + >>> s.info(buf=buffer) + >>> s = buffer.getvalue() + >>> with open("df_info.txt", "w", + ... encoding="utf-8") as f: # doctest: +SKIP + ... f.write(s) + 260 + + The `memory_usage` parameter allows deep introspection mode, specially + useful for big Series and fine-tune memory optimization: + + >>> random_strings_array = np.random.choice(['a', 'b', 'c'], 10 ** 6) + >>> s = pd.Series(np.random.choice(['a', 'b', 'c'], 10 ** 6)) + >>> s.info() + + RangeIndex: 1000000 entries, 0 to 999999 + Series name: None + Non-Null Count Dtype + -------------- ----- + 1000000 non-null object + dtypes: object(1) + memory usage: 7.6+ MB + + >>> s.info(memory_usage='deep') + + RangeIndex: 1000000 entries, 0 to 999999 + Series name: None + Non-Null Count Dtype + -------------- ----- + 1000000 non-null object + dtypes: object(1) + memory usage: 55.3 MB""" +) + + +series_see_also_sub = dedent( + """\ + Series.describe: Generate descriptive statistics of Series. + Series.memory_usage: Memory usage of Series.""" +) + + +series_sub_kwargs = { + "klass": "Series", + "type_sub": "", + "max_cols_sub": "", + "show_counts_sub": show_counts_sub, + "examples_sub": series_examples_sub, + "see_also_sub": series_see_also_sub, + "version_added_sub": "\n.. versionadded:: 1.4.0\n", +} + + +INFO_DOCSTRING = dedent( + """ + Print a concise summary of a {klass}. + + This method prints information about a {klass} including + the index dtype{type_sub}, non-null values and memory usage. + {version_added_sub}\ + + Parameters + ---------- + verbose : bool, optional + Whether to print the full summary. By default, the setting in + ``pandas.options.display.max_info_columns`` is followed. + buf : writable buffer, defaults to sys.stdout + Where to send the output. By default, the output is printed to + sys.stdout. Pass a writable buffer if you need to further process + the output. + {max_cols_sub} + memory_usage : bool, str, optional + Specifies whether total memory usage of the {klass} + elements (including the index) should be displayed. By default, + this follows the ``pandas.options.display.memory_usage`` setting. + + True always show memory usage. False never shows memory usage. + A value of 'deep' is equivalent to "True with deep introspection". + Memory usage is shown in human-readable units (base-2 + representation). Without deep introspection a memory estimation is + made based in column dtype and number of rows assuming values + consume the same memory amount for corresponding dtypes. With deep + memory introspection, a real memory usage calculation is performed + at the cost of computational resources. See the + :ref:`Frequently Asked Questions ` for more + details. + {show_counts_sub} + + Returns + ------- + None + This method prints a summary of a {klass} and returns None. + + See Also + -------- + {see_also_sub} + + Examples + -------- + {examples_sub} + """ +) + + +def _put_str(s: str | Dtype, space: int) -> str: + """ + Make string of specified length, padding to the right if necessary. + + Parameters + ---------- + s : Union[str, Dtype] + String to be formatted. + space : int + Length to force string to be of. + + Returns + ------- + str + String coerced to given length. + + Examples + -------- + >>> pd.io.formats.info._put_str("panda", 6) + 'panda ' + >>> pd.io.formats.info._put_str("panda", 4) + 'pand' + """ + return str(s)[:space].ljust(space) + + +def _sizeof_fmt(num: float, size_qualifier: str) -> str: + """ + Return size in human readable format. + + Parameters + ---------- + num : int + Size in bytes. + size_qualifier : str + Either empty, or '+' (if lower bound). + + Returns + ------- + str + Size in human readable format. + + Examples + -------- + >>> _sizeof_fmt(23028, '') + '22.5 KB' + + >>> _sizeof_fmt(23028, '+') + '22.5+ KB' + """ + for x in ["bytes", "KB", "MB", "GB", "TB"]: + if num < 1024.0: + return f"{num:3.1f}{size_qualifier} {x}" + num /= 1024.0 + return f"{num:3.1f}{size_qualifier} PB" + + +def _initialize_memory_usage( + memory_usage: bool | str | None = None, +) -> bool | str: + """Get memory usage based on inputs and display options.""" + if memory_usage is None: + memory_usage = get_option("display.memory_usage") + return memory_usage + + +class _BaseInfo(ABC): + """ + Base class for DataFrameInfo and SeriesInfo. + + Parameters + ---------- + data : DataFrame or Series + Either dataframe or series. + memory_usage : bool or str, optional + If "deep", introspect the data deeply by interrogating object dtypes + for system-level memory consumption, and include it in the returned + values. + """ + + data: DataFrame | Series + memory_usage: bool | str + + @property + @abstractmethod + def dtypes(self) -> Iterable[Dtype]: + """ + Dtypes. + + Returns + ------- + dtypes : sequence + Dtype of each of the DataFrame's columns (or one series column). + """ + + @property + @abstractmethod + def dtype_counts(self) -> Mapping[str, int]: + """Mapping dtype - number of counts.""" + + @property + @abstractmethod + def non_null_counts(self) -> Sequence[int]: + """Sequence of non-null counts for all columns or column (if series).""" + + @property + @abstractmethod + def memory_usage_bytes(self) -> int: + """ + Memory usage in bytes. + + Returns + ------- + memory_usage_bytes : int + Object's total memory usage in bytes. + """ + + @property + def memory_usage_string(self) -> str: + """Memory usage in a form of human readable string.""" + return f"{_sizeof_fmt(self.memory_usage_bytes, self.size_qualifier)}\n" + + @property + def size_qualifier(self) -> str: + size_qualifier = "" + if self.memory_usage: + if self.memory_usage != "deep": + # size_qualifier is just a best effort; not guaranteed to catch + # all cases (e.g., it misses categorical data even with object + # categories) + if ( + "object" in self.dtype_counts + or self.data.index._is_memory_usage_qualified() + ): + size_qualifier = "+" + return size_qualifier + + @abstractmethod + def render( + self, + *, + buf: WriteBuffer[str] | None, + max_cols: int | None, + verbose: bool | None, + show_counts: bool | None, + ) -> None: + pass + + +class DataFrameInfo(_BaseInfo): + """ + Class storing dataframe-specific info. + """ + + def __init__( + self, + data: DataFrame, + memory_usage: bool | str | None = None, + ) -> None: + self.data: DataFrame = data + self.memory_usage = _initialize_memory_usage(memory_usage) + + @property + def dtype_counts(self) -> Mapping[str, int]: + return _get_dataframe_dtype_counts(self.data) + + @property + def dtypes(self) -> Iterable[Dtype]: + """ + Dtypes. + + Returns + ------- + dtypes + Dtype of each of the DataFrame's columns. + """ + return self.data.dtypes + + @property + def ids(self) -> Index: + """ + Column names. + + Returns + ------- + ids : Index + DataFrame's column names. + """ + return self.data.columns + + @property + def col_count(self) -> int: + """Number of columns to be summarized.""" + return len(self.ids) + + @property + def non_null_counts(self) -> Sequence[int]: + """Sequence of non-null counts for all columns or column (if series).""" + return self.data.count() + + @property + def memory_usage_bytes(self) -> int: + deep = self.memory_usage == "deep" + return self.data.memory_usage(index=True, deep=deep).sum() + + def render( + self, + *, + buf: WriteBuffer[str] | None, + max_cols: int | None, + verbose: bool | None, + show_counts: bool | None, + ) -> None: + printer = _DataFrameInfoPrinter( + info=self, + max_cols=max_cols, + verbose=verbose, + show_counts=show_counts, + ) + printer.to_buffer(buf) + + +class SeriesInfo(_BaseInfo): + """ + Class storing series-specific info. + """ + + def __init__( + self, + data: Series, + memory_usage: bool | str | None = None, + ) -> None: + self.data: Series = data + self.memory_usage = _initialize_memory_usage(memory_usage) + + def render( + self, + *, + buf: WriteBuffer[str] | None = None, + max_cols: int | None = None, + verbose: bool | None = None, + show_counts: bool | None = None, + ) -> None: + if max_cols is not None: + raise ValueError( + "Argument `max_cols` can only be passed " + "in DataFrame.info, not Series.info" + ) + printer = _SeriesInfoPrinter( + info=self, + verbose=verbose, + show_counts=show_counts, + ) + printer.to_buffer(buf) + + @property + def non_null_counts(self) -> Sequence[int]: + return [self.data.count()] + + @property + def dtypes(self) -> Iterable[Dtype]: + return [self.data.dtypes] + + @property + def dtype_counts(self) -> Mapping[str, int]: + from pandas.core.frame import DataFrame + + return _get_dataframe_dtype_counts(DataFrame(self.data)) + + @property + def memory_usage_bytes(self) -> int: + """Memory usage in bytes. + + Returns + ------- + memory_usage_bytes : int + Object's total memory usage in bytes. + """ + deep = self.memory_usage == "deep" + return self.data.memory_usage(index=True, deep=deep) + + +class _InfoPrinterAbstract: + """ + Class for printing dataframe or series info. + """ + + def to_buffer(self, buf: WriteBuffer[str] | None = None) -> None: + """Save dataframe info into buffer.""" + table_builder = self._create_table_builder() + lines = table_builder.get_lines() + if buf is None: # pragma: no cover + buf = sys.stdout + fmt.buffer_put_lines(buf, lines) + + @abstractmethod + def _create_table_builder(self) -> _TableBuilderAbstract: + """Create instance of table builder.""" + + +class _DataFrameInfoPrinter(_InfoPrinterAbstract): + """ + Class for printing dataframe info. + + Parameters + ---------- + info : DataFrameInfo + Instance of DataFrameInfo. + max_cols : int, optional + When to switch from the verbose to the truncated output. + verbose : bool, optional + Whether to print the full summary. + show_counts : bool, optional + Whether to show the non-null counts. + """ + + def __init__( + self, + info: DataFrameInfo, + max_cols: int | None = None, + verbose: bool | None = None, + show_counts: bool | None = None, + ) -> None: + self.info = info + self.data = info.data + self.verbose = verbose + self.max_cols = self._initialize_max_cols(max_cols) + self.show_counts = self._initialize_show_counts(show_counts) + + @property + def max_rows(self) -> int: + """Maximum info rows to be displayed.""" + return get_option("display.max_info_rows", len(self.data) + 1) + + @property + def exceeds_info_cols(self) -> bool: + """Check if number of columns to be summarized does not exceed maximum.""" + return bool(self.col_count > self.max_cols) + + @property + def exceeds_info_rows(self) -> bool: + """Check if number of rows to be summarized does not exceed maximum.""" + return bool(len(self.data) > self.max_rows) + + @property + def col_count(self) -> int: + """Number of columns to be summarized.""" + return self.info.col_count + + def _initialize_max_cols(self, max_cols: int | None) -> int: + if max_cols is None: + return get_option("display.max_info_columns", self.col_count + 1) + return max_cols + + def _initialize_show_counts(self, show_counts: bool | None) -> bool: + if show_counts is None: + return bool(not self.exceeds_info_cols and not self.exceeds_info_rows) + else: + return show_counts + + def _create_table_builder(self) -> _DataFrameTableBuilder: + """ + Create instance of table builder based on verbosity and display settings. + """ + if self.verbose: + return _DataFrameTableBuilderVerbose( + info=self.info, + with_counts=self.show_counts, + ) + elif self.verbose is False: # specifically set to False, not necessarily None + return _DataFrameTableBuilderNonVerbose(info=self.info) + elif self.exceeds_info_cols: + return _DataFrameTableBuilderNonVerbose(info=self.info) + else: + return _DataFrameTableBuilderVerbose( + info=self.info, + with_counts=self.show_counts, + ) + + +class _SeriesInfoPrinter(_InfoPrinterAbstract): + """Class for printing series info. + + Parameters + ---------- + info : SeriesInfo + Instance of SeriesInfo. + verbose : bool, optional + Whether to print the full summary. + show_counts : bool, optional + Whether to show the non-null counts. + """ + + def __init__( + self, + info: SeriesInfo, + verbose: bool | None = None, + show_counts: bool | None = None, + ) -> None: + self.info = info + self.data = info.data + self.verbose = verbose + self.show_counts = self._initialize_show_counts(show_counts) + + def _create_table_builder(self) -> _SeriesTableBuilder: + """ + Create instance of table builder based on verbosity. + """ + if self.verbose or self.verbose is None: + return _SeriesTableBuilderVerbose( + info=self.info, + with_counts=self.show_counts, + ) + else: + return _SeriesTableBuilderNonVerbose(info=self.info) + + def _initialize_show_counts(self, show_counts: bool | None) -> bool: + if show_counts is None: + return True + else: + return show_counts + + +class _TableBuilderAbstract(ABC): + """ + Abstract builder for info table. + """ + + _lines: list[str] + info: _BaseInfo + + @abstractmethod + def get_lines(self) -> list[str]: + """Product in a form of list of lines (strings).""" + + @property + def data(self) -> DataFrame | Series: + return self.info.data + + @property + def dtypes(self) -> Iterable[Dtype]: + """Dtypes of each of the DataFrame's columns.""" + return self.info.dtypes + + @property + def dtype_counts(self) -> Mapping[str, int]: + """Mapping dtype - number of counts.""" + return self.info.dtype_counts + + @property + def display_memory_usage(self) -> bool: + """Whether to display memory usage.""" + return bool(self.info.memory_usage) + + @property + def memory_usage_string(self) -> str: + """Memory usage string with proper size qualifier.""" + return self.info.memory_usage_string + + @property + def non_null_counts(self) -> Sequence[int]: + return self.info.non_null_counts + + def add_object_type_line(self) -> None: + """Add line with string representation of dataframe to the table.""" + self._lines.append(str(type(self.data))) + + def add_index_range_line(self) -> None: + """Add line with range of indices to the table.""" + self._lines.append(self.data.index._summary()) + + def add_dtypes_line(self) -> None: + """Add summary line with dtypes present in dataframe.""" + collected_dtypes = [ + f"{key}({val:d})" for key, val in sorted(self.dtype_counts.items()) + ] + self._lines.append(f"dtypes: {', '.join(collected_dtypes)}") + + +class _DataFrameTableBuilder(_TableBuilderAbstract): + """ + Abstract builder for dataframe info table. + + Parameters + ---------- + info : DataFrameInfo. + Instance of DataFrameInfo. + """ + + def __init__(self, *, info: DataFrameInfo) -> None: + self.info: DataFrameInfo = info + + def get_lines(self) -> list[str]: + self._lines = [] + if self.col_count == 0: + self._fill_empty_info() + else: + self._fill_non_empty_info() + return self._lines + + def _fill_empty_info(self) -> None: + """Add lines to the info table, pertaining to empty dataframe.""" + self.add_object_type_line() + self.add_index_range_line() + self._lines.append(f"Empty {type(self.data).__name__}\n") + + @abstractmethod + def _fill_non_empty_info(self) -> None: + """Add lines to the info table, pertaining to non-empty dataframe.""" + + @property + def data(self) -> DataFrame: + """DataFrame.""" + return self.info.data + + @property + def ids(self) -> Index: + """Dataframe columns.""" + return self.info.ids + + @property + def col_count(self) -> int: + """Number of dataframe columns to be summarized.""" + return self.info.col_count + + def add_memory_usage_line(self) -> None: + """Add line containing memory usage.""" + self._lines.append(f"memory usage: {self.memory_usage_string}") + + +class _DataFrameTableBuilderNonVerbose(_DataFrameTableBuilder): + """ + Dataframe info table builder for non-verbose output. + """ + + def _fill_non_empty_info(self) -> None: + """Add lines to the info table, pertaining to non-empty dataframe.""" + self.add_object_type_line() + self.add_index_range_line() + self.add_columns_summary_line() + self.add_dtypes_line() + if self.display_memory_usage: + self.add_memory_usage_line() + + def add_columns_summary_line(self) -> None: + self._lines.append(self.ids._summary(name="Columns")) + + +class _TableBuilderVerboseMixin(_TableBuilderAbstract): + """ + Mixin for verbose info output. + """ + + SPACING: str = " " * 2 + strrows: Sequence[Sequence[str]] + gross_column_widths: Sequence[int] + with_counts: bool + + @property + @abstractmethod + def headers(self) -> Sequence[str]: + """Headers names of the columns in verbose table.""" + + @property + def header_column_widths(self) -> Sequence[int]: + """Widths of header columns (only titles).""" + return [len(col) for col in self.headers] + + def _get_gross_column_widths(self) -> Sequence[int]: + """Get widths of columns containing both headers and actual content.""" + body_column_widths = self._get_body_column_widths() + return [ + max(*widths) + for widths in zip(self.header_column_widths, body_column_widths) + ] + + def _get_body_column_widths(self) -> Sequence[int]: + """Get widths of table content columns.""" + strcols: Sequence[Sequence[str]] = list(zip(*self.strrows)) + return [max(len(x) for x in col) for col in strcols] + + def _gen_rows(self) -> Iterator[Sequence[str]]: + """ + Generator function yielding rows content. + + Each element represents a row comprising a sequence of strings. + """ + if self.with_counts: + return self._gen_rows_with_counts() + else: + return self._gen_rows_without_counts() + + @abstractmethod + def _gen_rows_with_counts(self) -> Iterator[Sequence[str]]: + """Iterator with string representation of body data with counts.""" + + @abstractmethod + def _gen_rows_without_counts(self) -> Iterator[Sequence[str]]: + """Iterator with string representation of body data without counts.""" + + def add_header_line(self) -> None: + header_line = self.SPACING.join( + [ + _put_str(header, col_width) + for header, col_width in zip(self.headers, self.gross_column_widths) + ] + ) + self._lines.append(header_line) + + def add_separator_line(self) -> None: + separator_line = self.SPACING.join( + [ + _put_str("-" * header_colwidth, gross_colwidth) + for header_colwidth, gross_colwidth in zip( + self.header_column_widths, self.gross_column_widths + ) + ] + ) + self._lines.append(separator_line) + + def add_body_lines(self) -> None: + for row in self.strrows: + body_line = self.SPACING.join( + [ + _put_str(col, gross_colwidth) + for col, gross_colwidth in zip(row, self.gross_column_widths) + ] + ) + self._lines.append(body_line) + + def _gen_non_null_counts(self) -> Iterator[str]: + """Iterator with string representation of non-null counts.""" + for count in self.non_null_counts: + yield f"{count} non-null" + + def _gen_dtypes(self) -> Iterator[str]: + """Iterator with string representation of column dtypes.""" + for dtype in self.dtypes: + yield pprint_thing(dtype) + + +class _DataFrameTableBuilderVerbose(_DataFrameTableBuilder, _TableBuilderVerboseMixin): + """ + Dataframe info table builder for verbose output. + """ + + def __init__( + self, + *, + info: DataFrameInfo, + with_counts: bool, + ) -> None: + self.info = info + self.with_counts = with_counts + self.strrows: Sequence[Sequence[str]] = list(self._gen_rows()) + self.gross_column_widths: Sequence[int] = self._get_gross_column_widths() + + def _fill_non_empty_info(self) -> None: + """Add lines to the info table, pertaining to non-empty dataframe.""" + self.add_object_type_line() + self.add_index_range_line() + self.add_columns_summary_line() + self.add_header_line() + self.add_separator_line() + self.add_body_lines() + self.add_dtypes_line() + if self.display_memory_usage: + self.add_memory_usage_line() + + @property + def headers(self) -> Sequence[str]: + """Headers names of the columns in verbose table.""" + if self.with_counts: + return [" # ", "Column", "Non-Null Count", "Dtype"] + return [" # ", "Column", "Dtype"] + + def add_columns_summary_line(self) -> None: + self._lines.append(f"Data columns (total {self.col_count} columns):") + + def _gen_rows_without_counts(self) -> Iterator[Sequence[str]]: + """Iterator with string representation of body data without counts.""" + yield from zip( + self._gen_line_numbers(), + self._gen_columns(), + self._gen_dtypes(), + ) + + def _gen_rows_with_counts(self) -> Iterator[Sequence[str]]: + """Iterator with string representation of body data with counts.""" + yield from zip( + self._gen_line_numbers(), + self._gen_columns(), + self._gen_non_null_counts(), + self._gen_dtypes(), + ) + + def _gen_line_numbers(self) -> Iterator[str]: + """Iterator with string representation of column numbers.""" + for i, _ in enumerate(self.ids): + yield f" {i}" + + def _gen_columns(self) -> Iterator[str]: + """Iterator with string representation of column names.""" + for col in self.ids: + yield pprint_thing(col) + + +class _SeriesTableBuilder(_TableBuilderAbstract): + """ + Abstract builder for series info table. + + Parameters + ---------- + info : SeriesInfo. + Instance of SeriesInfo. + """ + + def __init__(self, *, info: SeriesInfo) -> None: + self.info: SeriesInfo = info + + def get_lines(self) -> list[str]: + self._lines = [] + self._fill_non_empty_info() + return self._lines + + @property + def data(self) -> Series: + """Series.""" + return self.info.data + + def add_memory_usage_line(self) -> None: + """Add line containing memory usage.""" + self._lines.append(f"memory usage: {self.memory_usage_string}") + + @abstractmethod + def _fill_non_empty_info(self) -> None: + """Add lines to the info table, pertaining to non-empty series.""" + + +class _SeriesTableBuilderNonVerbose(_SeriesTableBuilder): + """ + Series info table builder for non-verbose output. + """ + + def _fill_non_empty_info(self) -> None: + """Add lines to the info table, pertaining to non-empty series.""" + self.add_object_type_line() + self.add_index_range_line() + self.add_dtypes_line() + if self.display_memory_usage: + self.add_memory_usage_line() + + +class _SeriesTableBuilderVerbose(_SeriesTableBuilder, _TableBuilderVerboseMixin): + """ + Series info table builder for verbose output. + """ + + def __init__( + self, + *, + info: SeriesInfo, + with_counts: bool, + ) -> None: + self.info = info + self.with_counts = with_counts + self.strrows: Sequence[Sequence[str]] = list(self._gen_rows()) + self.gross_column_widths: Sequence[int] = self._get_gross_column_widths() + + def _fill_non_empty_info(self) -> None: + """Add lines to the info table, pertaining to non-empty series.""" + self.add_object_type_line() + self.add_index_range_line() + self.add_series_name_line() + self.add_header_line() + self.add_separator_line() + self.add_body_lines() + self.add_dtypes_line() + if self.display_memory_usage: + self.add_memory_usage_line() + + def add_series_name_line(self) -> None: + self._lines.append(f"Series name: {self.data.name}") + + @property + def headers(self) -> Sequence[str]: + """Headers names of the columns in verbose table.""" + if self.with_counts: + return ["Non-Null Count", "Dtype"] + return ["Dtype"] + + def _gen_rows_without_counts(self) -> Iterator[Sequence[str]]: + """Iterator with string representation of body data without counts.""" + yield from self._gen_dtypes() + + def _gen_rows_with_counts(self) -> Iterator[Sequence[str]]: + """Iterator with string representation of body data with counts.""" + yield from zip( + self._gen_non_null_counts(), + self._gen_dtypes(), + ) + + +def _get_dataframe_dtype_counts(df: DataFrame) -> Mapping[str, int]: + """ + Create mapping between datatypes and their number of occurrences. + """ + # groupby dtype.name to collect e.g. Categorical columns + return df.dtypes.value_counts().groupby(lambda x: x.name).sum() diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/io/formats/printing.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/io/formats/printing.py new file mode 100644 index 0000000000000000000000000000000000000000..2cc9368f8846a6423655040673df283d111efeda --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/io/formats/printing.py @@ -0,0 +1,572 @@ +""" +Printing tools. +""" +from __future__ import annotations + +from collections.abc import ( + Iterable, + Mapping, + Sequence, +) +import sys +from typing import ( + Any, + Callable, + TypeVar, + Union, +) +from unicodedata import east_asian_width + +from pandas._config import get_option + +from pandas.core.dtypes.inference import is_sequence + +from pandas.io.formats.console import get_console_size + +EscapeChars = Union[Mapping[str, str], Iterable[str]] +_KT = TypeVar("_KT") +_VT = TypeVar("_VT") + + +def adjoin(space: int, *lists: list[str], **kwargs) -> str: + """ + Glues together two sets of strings using the amount of space requested. + The idea is to prettify. + + ---------- + space : int + number of spaces for padding + lists : str + list of str which being joined + strlen : callable + function used to calculate the length of each str. Needed for unicode + handling. + justfunc : callable + function used to justify str. Needed for unicode handling. + """ + strlen = kwargs.pop("strlen", len) + justfunc = kwargs.pop("justfunc", _adj_justify) + + newLists = [] + lengths = [max(map(strlen, x)) + space for x in lists[:-1]] + # not the last one + lengths.append(max(map(len, lists[-1]))) + maxLen = max(map(len, lists)) + for i, lst in enumerate(lists): + nl = justfunc(lst, lengths[i], mode="left") + nl = ([" " * lengths[i]] * (maxLen - len(lst))) + nl + newLists.append(nl) + toJoin = zip(*newLists) + return "\n".join("".join(lines) for lines in toJoin) + + +def _adj_justify(texts: Iterable[str], max_len: int, mode: str = "right") -> list[str]: + """ + Perform ljust, center, rjust against string or list-like + """ + if mode == "left": + return [x.ljust(max_len) for x in texts] + elif mode == "center": + return [x.center(max_len) for x in texts] + else: + return [x.rjust(max_len) for x in texts] + + +# Unicode consolidation +# --------------------- +# +# pprinting utility functions for generating Unicode text or +# bytes(3.x)/str(2.x) representations of objects. +# Try to use these as much as possible rather than rolling your own. +# +# When to use +# ----------- +# +# 1) If you're writing code internal to pandas (no I/O directly involved), +# use pprint_thing(). +# +# It will always return unicode text which can handled by other +# parts of the package without breakage. +# +# 2) if you need to write something out to file, use +# pprint_thing_encoded(encoding). +# +# If no encoding is specified, it defaults to utf-8. Since encoding pure +# ascii with utf-8 is a no-op you can safely use the default utf-8 if you're +# working with straight ascii. + + +def _pprint_seq( + seq: Sequence, _nest_lvl: int = 0, max_seq_items: int | None = None, **kwds +) -> str: + """ + internal. pprinter for iterables. you should probably use pprint_thing() + rather than calling this directly. + + bounds length of printed sequence, depending on options + """ + if isinstance(seq, set): + fmt = "{{{body}}}" + else: + fmt = "[{body}]" if hasattr(seq, "__setitem__") else "({body})" + + if max_seq_items is False: + nitems = len(seq) + else: + nitems = max_seq_items or get_option("max_seq_items") or len(seq) + + s = iter(seq) + # handle sets, no slicing + r = [ + pprint_thing(next(s), _nest_lvl + 1, max_seq_items=max_seq_items, **kwds) + for i in range(min(nitems, len(seq))) + ] + body = ", ".join(r) + + if nitems < len(seq): + body += ", ..." + elif isinstance(seq, tuple) and len(seq) == 1: + body += "," + + return fmt.format(body=body) + + +def _pprint_dict( + seq: Mapping, _nest_lvl: int = 0, max_seq_items: int | None = None, **kwds +) -> str: + """ + internal. pprinter for iterables. you should probably use pprint_thing() + rather than calling this directly. + """ + fmt = "{{{things}}}" + pairs = [] + + pfmt = "{key}: {val}" + + if max_seq_items is False: + nitems = len(seq) + else: + nitems = max_seq_items or get_option("max_seq_items") or len(seq) + + for k, v in list(seq.items())[:nitems]: + pairs.append( + pfmt.format( + key=pprint_thing(k, _nest_lvl + 1, max_seq_items=max_seq_items, **kwds), + val=pprint_thing(v, _nest_lvl + 1, max_seq_items=max_seq_items, **kwds), + ) + ) + + if nitems < len(seq): + return fmt.format(things=", ".join(pairs) + ", ...") + else: + return fmt.format(things=", ".join(pairs)) + + +def pprint_thing( + thing: Any, + _nest_lvl: int = 0, + escape_chars: EscapeChars | None = None, + default_escapes: bool = False, + quote_strings: bool = False, + max_seq_items: int | None = None, +) -> str: + """ + This function is the sanctioned way of converting objects + to a string representation and properly handles nested sequences. + + Parameters + ---------- + thing : anything to be formatted + _nest_lvl : internal use only. pprint_thing() is mutually-recursive + with pprint_sequence, this argument is used to keep track of the + current nesting level, and limit it. + escape_chars : list or dict, optional + Characters to escape. If a dict is passed the values are the + replacements + default_escapes : bool, default False + Whether the input escape characters replaces or adds to the defaults + max_seq_items : int or None, default None + Pass through to other pretty printers to limit sequence printing + + Returns + ------- + str + """ + + def as_escaped_string( + thing: Any, escape_chars: EscapeChars | None = escape_chars + ) -> str: + translate = {"\t": r"\t", "\n": r"\n", "\r": r"\r"} + if isinstance(escape_chars, dict): + if default_escapes: + translate.update(escape_chars) + else: + translate = escape_chars + escape_chars = list(escape_chars.keys()) + else: + escape_chars = escape_chars or () + + result = str(thing) + for c in escape_chars: + result = result.replace(c, translate[c]) + return result + + if hasattr(thing, "__next__"): + return str(thing) + elif isinstance(thing, dict) and _nest_lvl < get_option( + "display.pprint_nest_depth" + ): + result = _pprint_dict( + thing, _nest_lvl, quote_strings=True, max_seq_items=max_seq_items + ) + elif is_sequence(thing) and _nest_lvl < get_option("display.pprint_nest_depth"): + result = _pprint_seq( + thing, + _nest_lvl, + escape_chars=escape_chars, + quote_strings=quote_strings, + max_seq_items=max_seq_items, + ) + elif isinstance(thing, str) and quote_strings: + result = f"'{as_escaped_string(thing)}'" + else: + result = as_escaped_string(thing) + + return result + + +def pprint_thing_encoded( + object, encoding: str = "utf-8", errors: str = "replace" +) -> bytes: + value = pprint_thing(object) # get unicode representation of object + return value.encode(encoding, errors) + + +def enable_data_resource_formatter(enable: bool) -> None: + if "IPython" not in sys.modules: + # definitely not in IPython + return + from IPython import get_ipython + + ip = get_ipython() + if ip is None: + # still not in IPython + return + + formatters = ip.display_formatter.formatters + mimetype = "application/vnd.dataresource+json" + + if enable: + if mimetype not in formatters: + # define tableschema formatter + from IPython.core.formatters import BaseFormatter + from traitlets import ObjectName + + class TableSchemaFormatter(BaseFormatter): + print_method = ObjectName("_repr_data_resource_") + _return_type = (dict,) + + # register it: + formatters[mimetype] = TableSchemaFormatter() + # enable it if it's been disabled: + formatters[mimetype].enabled = True + # unregister tableschema mime-type + elif mimetype in formatters: + formatters[mimetype].enabled = False + + +def default_pprint(thing: Any, max_seq_items: int | None = None) -> str: + return pprint_thing( + thing, + escape_chars=("\t", "\r", "\n"), + quote_strings=True, + max_seq_items=max_seq_items, + ) + + +def format_object_summary( + obj, + formatter: Callable, + is_justify: bool = True, + name: str | None = None, + indent_for_name: bool = True, + line_break_each_value: bool = False, +) -> str: + """ + Return the formatted obj as a unicode string + + Parameters + ---------- + obj : object + must be iterable and support __getitem__ + formatter : callable + string formatter for an element + is_justify : bool + should justify the display + name : name, optional + defaults to the class name of the obj + indent_for_name : bool, default True + Whether subsequent lines should be indented to + align with the name. + line_break_each_value : bool, default False + If True, inserts a line break for each value of ``obj``. + If False, only break lines when the a line of values gets wider + than the display width. + + Returns + ------- + summary string + """ + display_width, _ = get_console_size() + if display_width is None: + display_width = get_option("display.width") or 80 + if name is None: + name = type(obj).__name__ + + if indent_for_name: + name_len = len(name) + space1 = f'\n{(" " * (name_len + 1))}' + space2 = f'\n{(" " * (name_len + 2))}' + else: + space1 = "\n" + space2 = "\n " # space for the opening '[' + + n = len(obj) + if line_break_each_value: + # If we want to vertically align on each value of obj, we need to + # separate values by a line break and indent the values + sep = ",\n " + " " * len(name) + else: + sep = "," + max_seq_items = get_option("display.max_seq_items") or n + + # are we a truncated display + is_truncated = n > max_seq_items + + # adj can optionally handle unicode eastern asian width + adj = get_adjustment() + + def _extend_line( + s: str, line: str, value: str, display_width: int, next_line_prefix: str + ) -> tuple[str, str]: + if adj.len(line.rstrip()) + adj.len(value.rstrip()) >= display_width: + s += line.rstrip() + line = next_line_prefix + line += value + return s, line + + def best_len(values: list[str]) -> int: + if values: + return max(adj.len(x) for x in values) + else: + return 0 + + close = ", " + + if n == 0: + summary = f"[]{close}" + elif n == 1 and not line_break_each_value: + first = formatter(obj[0]) + summary = f"[{first}]{close}" + elif n == 2 and not line_break_each_value: + first = formatter(obj[0]) + last = formatter(obj[-1]) + summary = f"[{first}, {last}]{close}" + else: + if max_seq_items == 1: + # If max_seq_items=1 show only last element + head = [] + tail = [formatter(x) for x in obj[-1:]] + elif n > max_seq_items: + n = min(max_seq_items // 2, 10) + head = [formatter(x) for x in obj[:n]] + tail = [formatter(x) for x in obj[-n:]] + else: + head = [] + tail = [formatter(x) for x in obj] + + # adjust all values to max length if needed + if is_justify: + if line_break_each_value: + # Justify each string in the values of head and tail, so the + # strings will right align when head and tail are stacked + # vertically. + head, tail = _justify(head, tail) + elif is_truncated or not ( + len(", ".join(head)) < display_width + and len(", ".join(tail)) < display_width + ): + # Each string in head and tail should align with each other + max_length = max(best_len(head), best_len(tail)) + head = [x.rjust(max_length) for x in head] + tail = [x.rjust(max_length) for x in tail] + # If we are not truncated and we are only a single + # line, then don't justify + + if line_break_each_value: + # Now head and tail are of type List[Tuple[str]]. Below we + # convert them into List[str], so there will be one string per + # value. Also truncate items horizontally if wider than + # max_space + max_space = display_width - len(space2) + value = tail[0] + max_items = 1 + for num_items in reversed(range(1, len(value) + 1)): + pprinted_seq = _pprint_seq(value, max_seq_items=num_items) + if len(pprinted_seq) < max_space: + max_items = num_items + break + head = [_pprint_seq(x, max_seq_items=max_items) for x in head] + tail = [_pprint_seq(x, max_seq_items=max_items) for x in tail] + + summary = "" + line = space2 + + for head_value in head: + word = head_value + sep + " " + summary, line = _extend_line(summary, line, word, display_width, space2) + + if is_truncated: + # remove trailing space of last line + summary += line.rstrip() + space2 + "..." + line = space2 + + for tail_item in tail[:-1]: + word = tail_item + sep + " " + summary, line = _extend_line(summary, line, word, display_width, space2) + + # last value: no sep added + 1 space of width used for trailing ',' + summary, line = _extend_line(summary, line, tail[-1], display_width - 2, space2) + summary += line + + # right now close is either '' or ', ' + # Now we want to include the ']', but not the maybe space. + close = "]" + close.rstrip(" ") + summary += close + + if len(summary) > (display_width) or line_break_each_value: + summary += space1 + else: # one row + summary += " " + + # remove initial space + summary = "[" + summary[len(space2) :] + + return summary + + +def _justify( + head: list[Sequence[str]], tail: list[Sequence[str]] +) -> tuple[list[tuple[str, ...]], list[tuple[str, ...]]]: + """ + Justify items in head and tail, so they are right-aligned when stacked. + + Parameters + ---------- + head : list-like of list-likes of strings + tail : list-like of list-likes of strings + + Returns + ------- + tuple of list of tuples of strings + Same as head and tail, but items are right aligned when stacked + vertically. + + Examples + -------- + >>> _justify([['a', 'b']], [['abc', 'abcd']]) + ([(' a', ' b')], [('abc', 'abcd')]) + """ + combined = head + tail + + # For each position for the sequences in ``combined``, + # find the length of the largest string. + max_length = [0] * len(combined[0]) + for inner_seq in combined: + length = [len(item) for item in inner_seq] + max_length = [max(x, y) for x, y in zip(max_length, length)] + + # justify each item in each list-like in head and tail using max_length + head_tuples = [ + tuple(x.rjust(max_len) for x, max_len in zip(seq, max_length)) for seq in head + ] + tail_tuples = [ + tuple(x.rjust(max_len) for x, max_len in zip(seq, max_length)) for seq in tail + ] + return head_tuples, tail_tuples + + +class PrettyDict(dict[_KT, _VT]): + """Dict extension to support abbreviated __repr__""" + + def __repr__(self) -> str: + return pprint_thing(self) + + +class _TextAdjustment: + def __init__(self) -> None: + self.encoding = get_option("display.encoding") + + def len(self, text: str) -> int: + return len(text) + + def justify(self, texts: Any, max_len: int, mode: str = "right") -> list[str]: + """ + Perform ljust, center, rjust against string or list-like + """ + if mode == "left": + return [x.ljust(max_len) for x in texts] + elif mode == "center": + return [x.center(max_len) for x in texts] + else: + return [x.rjust(max_len) for x in texts] + + def adjoin(self, space: int, *lists, **kwargs) -> str: + return adjoin(space, *lists, strlen=self.len, justfunc=self.justify, **kwargs) + + +class _EastAsianTextAdjustment(_TextAdjustment): + def __init__(self) -> None: + super().__init__() + if get_option("display.unicode.ambiguous_as_wide"): + self.ambiguous_width = 2 + else: + self.ambiguous_width = 1 + + # Definition of East Asian Width + # https://unicode.org/reports/tr11/ + # Ambiguous width can be changed by option + self._EAW_MAP = {"Na": 1, "N": 1, "W": 2, "F": 2, "H": 1} + + def len(self, text: str) -> int: + """ + Calculate display width considering unicode East Asian Width + """ + if not isinstance(text, str): + return len(text) + + return sum( + self._EAW_MAP.get(east_asian_width(c), self.ambiguous_width) for c in text + ) + + def justify( + self, texts: Iterable[str], max_len: int, mode: str = "right" + ) -> list[str]: + # re-calculate padding space per str considering East Asian Width + def _get_pad(t): + return max_len - self.len(t) + len(t) + + if mode == "left": + return [x.ljust(_get_pad(x)) for x in texts] + elif mode == "center": + return [x.center(_get_pad(x)) for x in texts] + else: + return [x.rjust(_get_pad(x)) for x in texts] + + +def get_adjustment() -> _TextAdjustment: + use_east_asian_width = get_option("display.unicode.east_asian_width") + if use_east_asian_width: + return _EastAsianTextAdjustment() + else: + return _TextAdjustment() diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/io/formats/string.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/io/formats/string.py new file mode 100644 index 0000000000000000000000000000000000000000..cdad388592717dff79fde61bf35a12c0635034c1 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/io/formats/string.py @@ -0,0 +1,206 @@ +""" +Module for formatting output data in console (to string). +""" +from __future__ import annotations + +from shutil import get_terminal_size +from typing import TYPE_CHECKING + +import numpy as np + +from pandas.io.formats.printing import pprint_thing + +if TYPE_CHECKING: + from collections.abc import Iterable + + from pandas.io.formats.format import DataFrameFormatter + + +class StringFormatter: + """Formatter for string representation of a dataframe.""" + + def __init__(self, fmt: DataFrameFormatter, line_width: int | None = None) -> None: + self.fmt = fmt + self.adj = fmt.adj + self.frame = fmt.frame + self.line_width = line_width + + def to_string(self) -> str: + text = self._get_string_representation() + if self.fmt.should_show_dimensions: + text = f"{text}{self.fmt.dimensions_info}" + return text + + def _get_strcols(self) -> list[list[str]]: + strcols = self.fmt.get_strcols() + if self.fmt.is_truncated: + strcols = self._insert_dot_separators(strcols) + return strcols + + def _get_string_representation(self) -> str: + if self.fmt.frame.empty: + return self._empty_info_line + + strcols = self._get_strcols() + + if self.line_width is None: + # no need to wrap around just print the whole frame + return self.adj.adjoin(1, *strcols) + + if self._need_to_wrap_around: + return self._join_multiline(strcols) + + return self._fit_strcols_to_terminal_width(strcols) + + @property + def _empty_info_line(self) -> str: + return ( + f"Empty {type(self.frame).__name__}\n" + f"Columns: {pprint_thing(self.frame.columns)}\n" + f"Index: {pprint_thing(self.frame.index)}" + ) + + @property + def _need_to_wrap_around(self) -> bool: + return bool(self.fmt.max_cols is None or self.fmt.max_cols > 0) + + def _insert_dot_separators(self, strcols: list[list[str]]) -> list[list[str]]: + str_index = self.fmt._get_formatted_index(self.fmt.tr_frame) + index_length = len(str_index) + + if self.fmt.is_truncated_horizontally: + strcols = self._insert_dot_separator_horizontal(strcols, index_length) + + if self.fmt.is_truncated_vertically: + strcols = self._insert_dot_separator_vertical(strcols, index_length) + + return strcols + + @property + def _adjusted_tr_col_num(self) -> int: + return self.fmt.tr_col_num + 1 if self.fmt.index else self.fmt.tr_col_num + + def _insert_dot_separator_horizontal( + self, strcols: list[list[str]], index_length: int + ) -> list[list[str]]: + strcols.insert(self._adjusted_tr_col_num, [" ..."] * index_length) + return strcols + + def _insert_dot_separator_vertical( + self, strcols: list[list[str]], index_length: int + ) -> list[list[str]]: + n_header_rows = index_length - len(self.fmt.tr_frame) + row_num = self.fmt.tr_row_num + for ix, col in enumerate(strcols): + cwidth = self.adj.len(col[row_num]) + + if self.fmt.is_truncated_horizontally: + is_dot_col = ix == self._adjusted_tr_col_num + else: + is_dot_col = False + + if cwidth > 3 or is_dot_col: + dots = "..." + else: + dots = ".." + + if ix == 0 and self.fmt.index: + dot_mode = "left" + elif is_dot_col: + cwidth = 4 + dot_mode = "right" + else: + dot_mode = "right" + + dot_str = self.adj.justify([dots], cwidth, mode=dot_mode)[0] + col.insert(row_num + n_header_rows, dot_str) + return strcols + + def _join_multiline(self, strcols_input: Iterable[list[str]]) -> str: + lwidth = self.line_width + adjoin_width = 1 + strcols = list(strcols_input) + + if self.fmt.index: + idx = strcols.pop(0) + lwidth -= np.array([self.adj.len(x) for x in idx]).max() + adjoin_width + + col_widths = [ + np.array([self.adj.len(x) for x in col]).max() if len(col) > 0 else 0 + for col in strcols + ] + + assert lwidth is not None + col_bins = _binify(col_widths, lwidth) + nbins = len(col_bins) + + str_lst = [] + start = 0 + for i, end in enumerate(col_bins): + row = strcols[start:end] + if self.fmt.index: + row.insert(0, idx) + if nbins > 1: + nrows = len(row[-1]) + if end <= len(strcols) and i < nbins - 1: + row.append([" \\"] + [" "] * (nrows - 1)) + else: + row.append([" "] * nrows) + str_lst.append(self.adj.adjoin(adjoin_width, *row)) + start = end + return "\n\n".join(str_lst) + + def _fit_strcols_to_terminal_width(self, strcols: list[list[str]]) -> str: + from pandas import Series + + lines = self.adj.adjoin(1, *strcols).split("\n") + max_len = Series(lines).str.len().max() + # plus truncate dot col + width, _ = get_terminal_size() + dif = max_len - width + # '+ 1' to avoid too wide repr (GH PR #17023) + adj_dif = dif + 1 + col_lens = Series([Series(ele).str.len().max() for ele in strcols]) + n_cols = len(col_lens) + counter = 0 + while adj_dif > 0 and n_cols > 1: + counter += 1 + mid = round(n_cols / 2) + mid_ix = col_lens.index[mid] + col_len = col_lens[mid_ix] + # adjoin adds one + adj_dif -= col_len + 1 + col_lens = col_lens.drop(mid_ix) + n_cols = len(col_lens) + + # subtract index column + max_cols_fitted = n_cols - self.fmt.index + # GH-21180. Ensure that we print at least two. + max_cols_fitted = max(max_cols_fitted, 2) + self.fmt.max_cols_fitted = max_cols_fitted + + # Call again _truncate to cut frame appropriately + # and then generate string representation + self.fmt.truncate() + strcols = self._get_strcols() + return self.adj.adjoin(1, *strcols) + + +def _binify(cols: list[int], line_width: int) -> list[int]: + adjoin_width = 1 + bins = [] + curr_width = 0 + i_last_column = len(cols) - 1 + for i, w in enumerate(cols): + w_adjoined = w + adjoin_width + curr_width += w_adjoined + if i_last_column == i: + wrap = curr_width + 1 > line_width and i > 0 + else: + wrap = curr_width + 2 > line_width and i > 0 + if wrap: + bins.append(i) + curr_width = w_adjoined + + bins.append(len(cols)) + return bins diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/io/formats/style.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/io/formats/style.py new file mode 100644 index 0000000000000000000000000000000000000000..b62f7581ac2205c8c1821b6030b56f13a77c6379 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/io/formats/style.py @@ -0,0 +1,4136 @@ +""" +Module for applying conditional formatting to DataFrames and Series. +""" +from __future__ import annotations + +from contextlib import contextmanager +import copy +from functools import partial +import operator +from typing import ( + TYPE_CHECKING, + Any, + Callable, + overload, +) +import warnings + +import numpy as np + +from pandas._config import get_option + +from pandas.compat._optional import import_optional_dependency +from pandas.util._decorators import ( + Substitution, + doc, +) +from pandas.util._exceptions import find_stack_level + +import pandas as pd +from pandas import ( + IndexSlice, + RangeIndex, +) +import pandas.core.common as com +from pandas.core.frame import ( + DataFrame, + Series, +) +from pandas.core.generic import NDFrame +from pandas.core.shared_docs import _shared_docs + +from pandas.io.formats.format import save_to_buffer + +jinja2 = import_optional_dependency("jinja2", extra="DataFrame.style requires jinja2.") + +from pandas.io.formats.style_render import ( + CSSProperties, + CSSStyles, + ExtFormatter, + StylerRenderer, + Subset, + Tooltips, + format_table_styles, + maybe_convert_css_to_tuples, + non_reducing_slice, + refactor_levels, +) + +if TYPE_CHECKING: + from collections.abc import ( + Generator, + Hashable, + Sequence, + ) + + from matplotlib.colors import Colormap + + from pandas._typing import ( + Axis, + AxisInt, + FilePath, + IndexLabel, + IntervalClosedType, + Level, + QuantileInterpolation, + Scalar, + StorageOptions, + WriteBuffer, + WriteExcelBuffer, + ) + + from pandas import ExcelWriter + +try: + import matplotlib as mpl + import matplotlib.pyplot as plt + + has_mpl = True +except ImportError: + has_mpl = False + + +@contextmanager +def _mpl(func: Callable) -> Generator[tuple[Any, Any], None, None]: + if has_mpl: + yield plt, mpl + else: + raise ImportError(f"{func.__name__} requires matplotlib.") + + +#### +# Shared Doc Strings + +subset_args = """subset : label, array-like, IndexSlice, optional + A valid 2d input to `DataFrame.loc[]`, or, in the case of a 1d input + or single key, to `DataFrame.loc[:, ]` where the columns are + prioritised, to limit ``data`` to *before* applying the function.""" + +properties_args = """props : str, default None + CSS properties to use for highlighting. If ``props`` is given, ``color`` + is not used.""" + +coloring_args = """color : str, default '{default}' + Background color to use for highlighting.""" + +buffering_args = """buf : str, path object, file-like object, optional + String, path object (implementing ``os.PathLike[str]``), or file-like + object implementing a string ``write()`` function. If ``None``, the result is + returned as a string.""" + +encoding_args = """encoding : str, optional + Character encoding setting for file output (and meta tags if available). + Defaults to ``pandas.options.styler.render.encoding`` value of "utf-8".""" + +# +### + + +class Styler(StylerRenderer): + r""" + Helps style a DataFrame or Series according to the data with HTML and CSS. + + Parameters + ---------- + data : Series or DataFrame + Data to be styled - either a Series or DataFrame. + precision : int, optional + Precision to round floats to. If not given defaults to + ``pandas.options.styler.format.precision``. + + .. versionchanged:: 1.4.0 + table_styles : list-like, default None + List of {selector: (attr, value)} dicts; see Notes. + uuid : str, default None + A unique identifier to avoid CSS collisions; generated automatically. + caption : str, tuple, default None + String caption to attach to the table. Tuple only used for LaTeX dual captions. + table_attributes : str, default None + Items that show up in the opening ```` tag + in addition to automatic (by default) id. + cell_ids : bool, default True + If True, each cell will have an ``id`` attribute in their HTML tag. + The ``id`` takes the form ``T__row_col`` + where ```` is the unique identifier, ```` is the row + number and ```` is the column number. + na_rep : str, optional + Representation for missing values. + If ``na_rep`` is None, no special formatting is applied, and falls back to + ``pandas.options.styler.format.na_rep``. + + uuid_len : int, default 5 + If ``uuid`` is not specified, the length of the ``uuid`` to randomly generate + expressed in hex characters, in range [0, 32]. + decimal : str, optional + Character used as decimal separator for floats, complex and integers. If not + given uses ``pandas.options.styler.format.decimal``. + + .. versionadded:: 1.3.0 + + thousands : str, optional, default None + Character used as thousands separator for floats, complex and integers. If not + given uses ``pandas.options.styler.format.thousands``. + + .. versionadded:: 1.3.0 + + escape : str, optional + Use 'html' to replace the characters ``&``, ``<``, ``>``, ``'``, and ``"`` + in cell display string with HTML-safe sequences. + Use 'latex' to replace the characters ``&``, ``%``, ``$``, ``#``, ``_``, + ``{``, ``}``, ``~``, ``^``, and ``\`` in the cell display string with + LaTeX-safe sequences. Use 'latex-math' to replace the characters + the same way as in 'latex' mode, except for math substrings, + which either are surrounded by two characters ``$`` or start with + the character ``\(`` and end with ``\)``. + If not given uses ``pandas.options.styler.format.escape``. + + .. versionadded:: 1.3.0 + formatter : str, callable, dict, optional + Object to define how values are displayed. See ``Styler.format``. If not given + uses ``pandas.options.styler.format.formatter``. + + .. versionadded:: 1.4.0 + + Attributes + ---------- + env : Jinja2 jinja2.Environment + template_html : Jinja2 Template + template_html_table : Jinja2 Template + template_html_style : Jinja2 Template + template_latex : Jinja2 Template + loader : Jinja2 Loader + + See Also + -------- + DataFrame.style : Return a Styler object containing methods for building + a styled HTML representation for the DataFrame. + + Notes + ----- + Most styling will be done by passing style functions into + ``Styler.apply`` or ``Styler.map``. Style functions should + return values with strings containing CSS ``'attr: value'`` that will + be applied to the indicated cells. + + If using in the Jupyter notebook, Styler has defined a ``_repr_html_`` + to automatically render itself. Otherwise call Styler.to_html to get + the generated HTML. + + CSS classes are attached to the generated HTML + + * Index and Column names include ``index_name`` and ``level`` + where `k` is its level in a MultiIndex + * Index label cells include + + * ``row_heading`` + * ``row`` where `n` is the numeric position of the row + * ``level`` where `k` is the level in a MultiIndex + + * Column label cells include + * ``col_heading`` + * ``col`` where `n` is the numeric position of the column + * ``level`` where `k` is the level in a MultiIndex + + * Blank cells include ``blank`` + * Data cells include ``data`` + * Trimmed cells include ``col_trim`` or ``row_trim``. + + Any, or all, or these classes can be renamed by using the ``css_class_names`` + argument in ``Styler.set_table_classes``, giving a value such as + *{"row": "MY_ROW_CLASS", "col_trim": "", "row_trim": ""}*. + + Examples + -------- + >>> df = pd.DataFrame([[1.0, 2.0, 3.0], [4, 5, 6]], index=['a', 'b'], + ... columns=['A', 'B', 'C']) + >>> pd.io.formats.style.Styler(df, precision=2, + ... caption="My table") # doctest: +SKIP + + Please see: + `Table Visualization <../../user_guide/style.ipynb>`_ for more examples. + """ + + def __init__( + self, + data: DataFrame | Series, + precision: int | None = None, + table_styles: CSSStyles | None = None, + uuid: str | None = None, + caption: str | tuple | list | None = None, + table_attributes: str | None = None, + cell_ids: bool = True, + na_rep: str | None = None, + uuid_len: int = 5, + decimal: str | None = None, + thousands: str | None = None, + escape: str | None = None, + formatter: ExtFormatter | None = None, + ) -> None: + super().__init__( + data=data, + uuid=uuid, + uuid_len=uuid_len, + table_styles=table_styles, + table_attributes=table_attributes, + caption=caption, + cell_ids=cell_ids, + precision=precision, + ) + + # validate ordered args + thousands = thousands or get_option("styler.format.thousands") + decimal = decimal or get_option("styler.format.decimal") + na_rep = na_rep or get_option("styler.format.na_rep") + escape = escape or get_option("styler.format.escape") + formatter = formatter or get_option("styler.format.formatter") + # precision is handled by superclass as default for performance + + self.format( + formatter=formatter, + precision=precision, + na_rep=na_rep, + escape=escape, + decimal=decimal, + thousands=thousands, + ) + + def concat(self, other: Styler) -> Styler: + """ + Append another Styler to combine the output into a single table. + + .. versionadded:: 1.5.0 + + Parameters + ---------- + other : Styler + The other Styler object which has already been styled and formatted. The + data for this Styler must have the same columns as the original, and the + number of index levels must also be the same to render correctly. + + Returns + ------- + Styler + + Notes + ----- + The purpose of this method is to extend existing styled dataframes with other + metrics that may be useful but may not conform to the original's structure. + For example adding a sub total row, or displaying metrics such as means, + variance or counts. + + Styles that are applied using the ``apply``, ``map``, ``apply_index`` + and ``map_index``, and formatting applied with ``format`` and + ``format_index`` will be preserved. + + .. warning:: + Only the output methods ``to_html``, ``to_string`` and ``to_latex`` + currently work with concatenated Stylers. + + Other output methods, including ``to_excel``, **do not** work with + concatenated Stylers. + + The following should be noted: + + - ``table_styles``, ``table_attributes``, ``caption`` and ``uuid`` are all + inherited from the original Styler and not ``other``. + - hidden columns and hidden index levels will be inherited from the + original Styler + - ``css`` will be inherited from the original Styler, and the value of + keys ``data``, ``row_heading`` and ``row`` will be prepended with + ``foot0_``. If more concats are chained, their styles will be prepended + with ``foot1_``, ''foot_2'', etc., and if a concatenated style have + another concatanated style, the second style will be prepended with + ``foot{parent}_foot{child}_``. + + A common use case is to concatenate user defined functions with + ``DataFrame.agg`` or with described statistics via ``DataFrame.describe``. + See examples. + + Examples + -------- + A common use case is adding totals rows, or otherwise, via methods calculated + in ``DataFrame.agg``. + + >>> df = pd.DataFrame([[4, 6], [1, 9], [3, 4], [5, 5], [9, 6]], + ... columns=["Mike", "Jim"], + ... index=["Mon", "Tue", "Wed", "Thurs", "Fri"]) + >>> styler = df.style.concat(df.agg(["sum"]).style) # doctest: +SKIP + + .. figure:: ../../_static/style/footer_simple.png + + Since the concatenated object is a Styler the existing functionality can be + used to conditionally format it as well as the original. + + >>> descriptors = df.agg(["sum", "mean", lambda s: s.dtype]) + >>> descriptors.index = ["Total", "Average", "dtype"] + >>> other = (descriptors.style + ... .highlight_max(axis=1, subset=(["Total", "Average"], slice(None))) + ... .format(subset=("Average", slice(None)), precision=2, decimal=",") + ... .map(lambda v: "font-weight: bold;")) + >>> styler = (df.style + ... .highlight_max(color="salmon") + ... .set_table_styles([{"selector": ".foot_row0", + ... "props": "border-top: 1px solid black;"}])) + >>> styler.concat(other) # doctest: +SKIP + + .. figure:: ../../_static/style/footer_extended.png + + When ``other`` has fewer index levels than the original Styler it is possible + to extend the index in ``other``, with placeholder levels. + + >>> df = pd.DataFrame([[1], [2]], + ... index=pd.MultiIndex.from_product([[0], [1, 2]])) + >>> descriptors = df.agg(["sum"]) + >>> descriptors.index = pd.MultiIndex.from_product([[""], descriptors.index]) + >>> df.style.concat(descriptors.style) # doctest: +SKIP + """ + if not isinstance(other, Styler): + raise TypeError("`other` must be of type `Styler`") + if not self.data.columns.equals(other.data.columns): + raise ValueError("`other.data` must have same columns as `Styler.data`") + if not self.data.index.nlevels == other.data.index.nlevels: + raise ValueError( + "number of index levels must be same in `other` " + "as in `Styler`. See documentation for suggestions." + ) + self.concatenated.append(other) + return self + + def _repr_html_(self) -> str | None: + """ + Hooks into Jupyter notebook rich display system, which calls _repr_html_ by + default if an object is returned at the end of a cell. + """ + if get_option("styler.render.repr") == "html": + return self.to_html() + return None + + def _repr_latex_(self) -> str | None: + if get_option("styler.render.repr") == "latex": + return self.to_latex() + return None + + def set_tooltips( + self, + ttips: DataFrame, + props: CSSProperties | None = None, + css_class: str | None = None, + ) -> Styler: + """ + Set the DataFrame of strings on ``Styler`` generating ``:hover`` tooltips. + + These string based tooltips are only applicable to ``
`` HTML elements, + and cannot be used for column or index headers. + + .. versionadded:: 1.3.0 + + Parameters + ---------- + ttips : DataFrame + DataFrame containing strings that will be translated to tooltips, mapped + by identical column and index values that must exist on the underlying + Styler data. None, NaN values, and empty strings will be ignored and + not affect the rendered HTML. + props : list-like or str, optional + List of (attr, value) tuples or a valid CSS string. If ``None`` adopts + the internal default values described in notes. + css_class : str, optional + Name of the tooltip class used in CSS, should conform to HTML standards. + Only useful if integrating tooltips with external CSS. If ``None`` uses the + internal default value 'pd-t'. + + Returns + ------- + Styler + + Notes + ----- + Tooltips are created by adding `` to each data cell + and then manipulating the table level CSS to attach pseudo hover and pseudo + after selectors to produce the required the results. + + The default properties for the tooltip CSS class are: + + - visibility: hidden + - position: absolute + - z-index: 1 + - background-color: black + - color: white + - transform: translate(-20px, -20px) + + The property 'visibility: hidden;' is a key prerequisite to the hover + functionality, and should always be included in any manual properties + specification, using the ``props`` argument. + + Tooltips are not designed to be efficient, and can add large amounts of + additional HTML for larger tables, since they also require that ``cell_ids`` + is forced to `True`. + + Examples + -------- + Basic application + + >>> df = pd.DataFrame(data=[[0, 1], [2, 3]]) + >>> ttips = pd.DataFrame( + ... data=[["Min", ""], [np.nan, "Max"]], columns=df.columns, index=df.index + ... ) + >>> s = df.style.set_tooltips(ttips).to_html() + + Optionally controlling the tooltip visual display + + >>> df.style.set_tooltips(ttips, css_class='tt-add', props=[ + ... ('visibility', 'hidden'), + ... ('position', 'absolute'), + ... ('z-index', 1)]) # doctest: +SKIP + >>> df.style.set_tooltips(ttips, css_class='tt-add', + ... props='visibility:hidden; position:absolute; z-index:1;') + ... # doctest: +SKIP + """ + if not self.cell_ids: + # tooltips not optimised for individual cell check. requires reasonable + # redesign and more extensive code for a feature that might be rarely used. + raise NotImplementedError( + "Tooltips can only render with 'cell_ids' is True." + ) + if not ttips.index.is_unique or not ttips.columns.is_unique: + raise KeyError( + "Tooltips render only if `ttips` has unique index and columns." + ) + if self.tooltips is None: # create a default instance if necessary + self.tooltips = Tooltips() + self.tooltips.tt_data = ttips + if props: + self.tooltips.class_properties = props + if css_class: + self.tooltips.class_name = css_class + + return self + + @doc( + NDFrame.to_excel, + klass="Styler", + storage_options=_shared_docs["storage_options"], + storage_options_versionadded="1.5.0", + ) + def to_excel( + self, + excel_writer: FilePath | WriteExcelBuffer | ExcelWriter, + sheet_name: str = "Sheet1", + na_rep: str = "", + float_format: str | None = None, + columns: Sequence[Hashable] | None = None, + header: Sequence[Hashable] | bool = True, + index: bool = True, + index_label: IndexLabel | None = None, + startrow: int = 0, + startcol: int = 0, + engine: str | None = None, + merge_cells: bool = True, + encoding: str | None = None, + inf_rep: str = "inf", + verbose: bool = True, + freeze_panes: tuple[int, int] | None = None, + storage_options: StorageOptions | None = None, + ) -> None: + from pandas.io.formats.excel import ExcelFormatter + + formatter = ExcelFormatter( + self, + na_rep=na_rep, + cols=columns, + header=header, + float_format=float_format, + index=index, + index_label=index_label, + merge_cells=merge_cells, + inf_rep=inf_rep, + ) + formatter.write( + excel_writer, + sheet_name=sheet_name, + startrow=startrow, + startcol=startcol, + freeze_panes=freeze_panes, + engine=engine, + storage_options=storage_options, + ) + + @overload + def to_latex( + self, + buf: FilePath | WriteBuffer[str], + *, + column_format: str | None = ..., + position: str | None = ..., + position_float: str | None = ..., + hrules: bool | None = ..., + clines: str | None = ..., + label: str | None = ..., + caption: str | tuple | None = ..., + sparse_index: bool | None = ..., + sparse_columns: bool | None = ..., + multirow_align: str | None = ..., + multicol_align: str | None = ..., + siunitx: bool = ..., + environment: str | None = ..., + encoding: str | None = ..., + convert_css: bool = ..., + ) -> None: + ... + + @overload + def to_latex( + self, + buf: None = ..., + *, + column_format: str | None = ..., + position: str | None = ..., + position_float: str | None = ..., + hrules: bool | None = ..., + clines: str | None = ..., + label: str | None = ..., + caption: str | tuple | None = ..., + sparse_index: bool | None = ..., + sparse_columns: bool | None = ..., + multirow_align: str | None = ..., + multicol_align: str | None = ..., + siunitx: bool = ..., + environment: str | None = ..., + encoding: str | None = ..., + convert_css: bool = ..., + ) -> str: + ... + + def to_latex( + self, + buf: FilePath | WriteBuffer[str] | None = None, + *, + column_format: str | None = None, + position: str | None = None, + position_float: str | None = None, + hrules: bool | None = None, + clines: str | None = None, + label: str | None = None, + caption: str | tuple | None = None, + sparse_index: bool | None = None, + sparse_columns: bool | None = None, + multirow_align: str | None = None, + multicol_align: str | None = None, + siunitx: bool = False, + environment: str | None = None, + encoding: str | None = None, + convert_css: bool = False, + ) -> str | None: + r""" + Write Styler to a file, buffer or string in LaTeX format. + + .. versionadded:: 1.3.0 + + Parameters + ---------- + buf : str, path object, file-like object, or None, default None + String, path object (implementing ``os.PathLike[str]``), or file-like + object implementing a string ``write()`` function. If None, the result is + returned as a string. + column_format : str, optional + The LaTeX column specification placed in location: + + \\begin{tabular}{} + + Defaults to 'l' for index and + non-numeric data columns, and, for numeric data columns, + to 'r' by default, or 'S' if ``siunitx`` is ``True``. + position : str, optional + The LaTeX positional argument (e.g. 'h!') for tables, placed in location: + + ``\\begin{table}[]``. + position_float : {"centering", "raggedleft", "raggedright"}, optional + The LaTeX float command placed in location: + + \\begin{table}[] + + \\ + + Cannot be used if ``environment`` is "longtable". + hrules : bool + Set to `True` to add \\toprule, \\midrule and \\bottomrule from the + {booktabs} LaTeX package. + Defaults to ``pandas.options.styler.latex.hrules``, which is `False`. + + .. versionchanged:: 1.4.0 + clines : str, optional + Use to control adding \\cline commands for the index labels separation. + Possible values are: + + - `None`: no cline commands are added (default). + - `"all;data"`: a cline is added for every index value extending the + width of the table, including data entries. + - `"all;index"`: as above with lines extending only the width of the + index entries. + - `"skip-last;data"`: a cline is added for each index value except the + last level (which is never sparsified), extending the widtn of the + table. + - `"skip-last;index"`: as above with lines extending only the width of the + index entries. + + .. versionadded:: 1.4.0 + label : str, optional + The LaTeX label included as: \\label{
}. + If tuple, i.e ("full caption", "short caption"), the caption included + as: \\caption[]{}. + sparse_index : bool, optional + Whether to sparsify the display of a hierarchical index. Setting to False + will display each explicit level element in a hierarchical key for each row. + Defaults to ``pandas.options.styler.sparse.index``, which is `True`. + sparse_columns : bool, optional + Whether to sparsify the display of a hierarchical index. Setting to False + will display each explicit level element in a hierarchical key for each + column. Defaults to ``pandas.options.styler.sparse.columns``, which + is `True`. + multirow_align : {"c", "t", "b", "naive"}, optional + If sparsifying hierarchical MultiIndexes whether to align text centrally, + at the top or bottom using the multirow package. If not given defaults to + ``pandas.options.styler.latex.multirow_align``, which is `"c"`. + If "naive" is given renders without multirow. + + .. versionchanged:: 1.4.0 + multicol_align : {"r", "c", "l", "naive-l", "naive-r"}, optional + If sparsifying hierarchical MultiIndex columns whether to align text at + the left, centrally, or at the right. If not given defaults to + ``pandas.options.styler.latex.multicol_align``, which is "r". + If a naive option is given renders without multicol. + Pipe decorators can also be added to non-naive values to draw vertical + rules, e.g. "\|r" will draw a rule on the left side of right aligned merged + cells. + + .. versionchanged:: 1.4.0 + siunitx : bool, default False + Set to ``True`` to structure LaTeX compatible with the {siunitx} package. + environment : str, optional + If given, the environment that will replace 'table' in ``\\begin{table}``. + If 'longtable' is specified then a more suitable template is + rendered. If not given defaults to + ``pandas.options.styler.latex.environment``, which is `None`. + + .. versionadded:: 1.4.0 + encoding : str, optional + Character encoding setting. Defaults + to ``pandas.options.styler.render.encoding``, which is "utf-8". + convert_css : bool, default False + Convert simple cell-styles from CSS to LaTeX format. Any CSS not found in + conversion table is dropped. A style can be forced by adding option + `--latex`. See notes. + + Returns + ------- + str or None + If `buf` is None, returns the result as a string. Otherwise returns `None`. + + See Also + -------- + Styler.format: Format the text display value of cells. + + Notes + ----- + **Latex Packages** + + For the following features we recommend the following LaTeX inclusions: + + ===================== ========================================================== + Feature Inclusion + ===================== ========================================================== + sparse columns none: included within default {tabular} environment + sparse rows \\usepackage{multirow} + hrules \\usepackage{booktabs} + colors \\usepackage[table]{xcolor} + siunitx \\usepackage{siunitx} + bold (with siunitx) | \\usepackage{etoolbox} + | \\robustify\\bfseries + | \\sisetup{detect-all = true} *(within {document})* + italic (with siunitx) | \\usepackage{etoolbox} + | \\robustify\\itshape + | \\sisetup{detect-all = true} *(within {document})* + environment \\usepackage{longtable} if arg is "longtable" + | or any other relevant environment package + hyperlinks \\usepackage{hyperref} + ===================== ========================================================== + + **Cell Styles** + + LaTeX styling can only be rendered if the accompanying styling functions have + been constructed with appropriate LaTeX commands. All styling + functionality is built around the concept of a CSS ``(, )`` + pair (see `Table Visualization <../../user_guide/style.ipynb>`_), and this + should be replaced by a LaTeX + ``(, )`` approach. Each cell will be styled individually + using nested LaTeX commands with their accompanied options. + + For example the following code will highlight and bold a cell in HTML-CSS: + + >>> df = pd.DataFrame([[1,2], [3,4]]) + >>> s = df.style.highlight_max(axis=None, + ... props='background-color:red; font-weight:bold;') + >>> s.to_html() # doctest: +SKIP + + The equivalent using LaTeX only commands is the following: + + >>> s = df.style.highlight_max(axis=None, + ... props='cellcolor:{red}; bfseries: ;') + >>> s.to_latex() # doctest: +SKIP + + Internally these structured LaTeX ``(, )`` pairs + are translated to the + ``display_value`` with the default structure: + ``\ ``. + Where there are multiple commands the latter is nested recursively, so that + the above example highlighted cell is rendered as + ``\cellcolor{red} \bfseries 4``. + + Occasionally this format does not suit the applied command, or + combination of LaTeX packages that is in use, so additional flags can be + added to the ````, within the tuple, to result in different + positions of required braces (the **default** being the same as ``--nowrap``): + + =================================== ============================================ + Tuple Format Output Structure + =================================== ============================================ + (,) \\ + (, ``--nowrap``) \\ + (, ``--rwrap``) \\{} + (, ``--wrap``) {\\ } + (, ``--lwrap``) {\\} + (, ``--dwrap``) {\\}{} + =================================== ============================================ + + For example the `textbf` command for font-weight + should always be used with `--rwrap` so ``('textbf', '--rwrap')`` will render a + working cell, wrapped with braces, as ``\textbf{}``. + + A more comprehensive example is as follows: + + >>> df = pd.DataFrame([[1, 2.2, "dogs"], [3, 4.4, "cats"], [2, 6.6, "cows"]], + ... index=["ix1", "ix2", "ix3"], + ... columns=["Integers", "Floats", "Strings"]) + >>> s = df.style.highlight_max( + ... props='cellcolor:[HTML]{FFFF00}; color:{red};' + ... 'textit:--rwrap; textbf:--rwrap;' + ... ) + >>> s.to_latex() # doctest: +SKIP + + .. figure:: ../../_static/style/latex_1.png + + **Table Styles** + + Internally Styler uses its ``table_styles`` object to parse the + ``column_format``, ``position``, ``position_float``, and ``label`` + input arguments. These arguments are added to table styles in the format: + + .. code-block:: python + + set_table_styles([ + {"selector": "column_format", "props": f":{column_format};"}, + {"selector": "position", "props": f":{position};"}, + {"selector": "position_float", "props": f":{position_float};"}, + {"selector": "label", "props": f":{{{label.replace(':','§')}}};"} + ], overwrite=False) + + Exception is made for the ``hrules`` argument which, in fact, controls all three + commands: ``toprule``, ``bottomrule`` and ``midrule`` simultaneously. Instead of + setting ``hrules`` to ``True``, it is also possible to set each + individual rule definition, by manually setting the ``table_styles``, + for example below we set a regular ``toprule``, set an ``hline`` for + ``bottomrule`` and exclude the ``midrule``: + + .. code-block:: python + + set_table_styles([ + {'selector': 'toprule', 'props': ':toprule;'}, + {'selector': 'bottomrule', 'props': ':hline;'}, + ], overwrite=False) + + If other ``commands`` are added to table styles they will be detected, and + positioned immediately above the '\\begin{tabular}' command. For example to + add odd and even row coloring, from the {colortbl} package, in format + ``\rowcolors{1}{pink}{red}``, use: + + .. code-block:: python + + set_table_styles([ + {'selector': 'rowcolors', 'props': ':{1}{pink}{red};'} + ], overwrite=False) + + A more comprehensive example using these arguments is as follows: + + >>> df.columns = pd.MultiIndex.from_tuples([ + ... ("Numeric", "Integers"), + ... ("Numeric", "Floats"), + ... ("Non-Numeric", "Strings") + ... ]) + >>> df.index = pd.MultiIndex.from_tuples([ + ... ("L0", "ix1"), ("L0", "ix2"), ("L1", "ix3") + ... ]) + >>> s = df.style.highlight_max( + ... props='cellcolor:[HTML]{FFFF00}; color:{red}; itshape:; bfseries:;' + ... ) + >>> s.to_latex( + ... column_format="rrrrr", position="h", position_float="centering", + ... hrules=True, label="table:5", caption="Styled LaTeX Table", + ... multirow_align="t", multicol_align="r" + ... ) # doctest: +SKIP + + .. figure:: ../../_static/style/latex_2.png + + **Formatting** + + To format values :meth:`Styler.format` should be used prior to calling + `Styler.to_latex`, as well as other methods such as :meth:`Styler.hide` + for example: + + >>> s.clear() + >>> s.table_styles = [] + >>> s.caption = None + >>> s.format({ + ... ("Numeric", "Integers"): '\${}', + ... ("Numeric", "Floats"): '{:.3f}', + ... ("Non-Numeric", "Strings"): str.upper + ... }) # doctest: +SKIP + Numeric Non-Numeric + Integers Floats Strings + L0 ix1 $1 2.200 DOGS + ix2 $3 4.400 CATS + L1 ix3 $2 6.600 COWS + + >>> s.to_latex() # doctest: +SKIP + \begin{tabular}{llrrl} + {} & {} & \multicolumn{2}{r}{Numeric} & {Non-Numeric} \\ + {} & {} & {Integers} & {Floats} & {Strings} \\ + \multirow[c]{2}{*}{L0} & ix1 & \\$1 & 2.200 & DOGS \\ + & ix2 & \$3 & 4.400 & CATS \\ + L1 & ix3 & \$2 & 6.600 & COWS \\ + \end{tabular} + + **CSS Conversion** + + This method can convert a Styler constructured with HTML-CSS to LaTeX using + the following limited conversions. + + ================== ==================== ============= ========================== + CSS Attribute CSS value LaTeX Command LaTeX Options + ================== ==================== ============= ========================== + font-weight | bold | bfseries + | bolder | bfseries + font-style | italic | itshape + | oblique | slshape + background-color | red cellcolor | {red}--lwrap + | #fe01ea | [HTML]{FE01EA}--lwrap + | #f0e | [HTML]{FF00EE}--lwrap + | rgb(128,255,0) | [rgb]{0.5,1,0}--lwrap + | rgba(128,0,0,0.5) | [rgb]{0.5,0,0}--lwrap + | rgb(25%,255,50%) | [rgb]{0.25,1,0.5}--lwrap + color | red color | {red} + | #fe01ea | [HTML]{FE01EA} + | #f0e | [HTML]{FF00EE} + | rgb(128,255,0) | [rgb]{0.5,1,0} + | rgba(128,0,0,0.5) | [rgb]{0.5,0,0} + | rgb(25%,255,50%) | [rgb]{0.25,1,0.5} + ================== ==================== ============= ========================== + + It is also possible to add user-defined LaTeX only styles to a HTML-CSS Styler + using the ``--latex`` flag, and to add LaTeX parsing options that the + converter will detect within a CSS-comment. + + >>> df = pd.DataFrame([[1]]) + >>> df.style.set_properties( + ... **{"font-weight": "bold /* --dwrap */", "Huge": "--latex--rwrap"} + ... ).to_latex(convert_css=True) # doctest: +SKIP + \begin{tabular}{lr} + {} & {0} \\ + 0 & {\bfseries}{\Huge{1}} \\ + \end{tabular} + + Examples + -------- + Below we give a complete step by step example adding some advanced features + and noting some common gotchas. + + First we create the DataFrame and Styler as usual, including MultiIndex rows + and columns, which allow for more advanced formatting options: + + >>> cidx = pd.MultiIndex.from_arrays([ + ... ["Equity", "Equity", "Equity", "Equity", + ... "Stats", "Stats", "Stats", "Stats", "Rating"], + ... ["Energy", "Energy", "Consumer", "Consumer", "", "", "", "", ""], + ... ["BP", "Shell", "H&M", "Unilever", + ... "Std Dev", "Variance", "52w High", "52w Low", ""] + ... ]) + >>> iidx = pd.MultiIndex.from_arrays([ + ... ["Equity", "Equity", "Equity", "Equity"], + ... ["Energy", "Energy", "Consumer", "Consumer"], + ... ["BP", "Shell", "H&M", "Unilever"] + ... ]) + >>> styler = pd.DataFrame([ + ... [1, 0.8, 0.66, 0.72, 32.1678, 32.1678**2, 335.12, 240.89, "Buy"], + ... [0.8, 1.0, 0.69, 0.79, 1.876, 1.876**2, 14.12, 19.78, "Hold"], + ... [0.66, 0.69, 1.0, 0.86, 7, 7**2, 210.9, 140.6, "Buy"], + ... [0.72, 0.79, 0.86, 1.0, 213.76, 213.76**2, 2807, 3678, "Sell"], + ... ], columns=cidx, index=iidx).style + + Second we will format the display and, since our table is quite wide, will + hide the repeated level-0 of the index: + + >>> (styler.format(subset="Equity", precision=2) + ... .format(subset="Stats", precision=1, thousands=",") + ... .format(subset="Rating", formatter=str.upper) + ... .format_index(escape="latex", axis=1) + ... .format_index(escape="latex", axis=0) + ... .hide(level=0, axis=0)) # doctest: +SKIP + + Note that one of the string entries of the index and column headers is "H&M". + Without applying the `escape="latex"` option to the `format_index` method the + resultant LaTeX will fail to render, and the error returned is quite + difficult to debug. Using the appropriate escape the "&" is converted to "\\&". + + Thirdly we will apply some (CSS-HTML) styles to our object. We will use a + builtin method and also define our own method to highlight the stock + recommendation: + + >>> def rating_color(v): + ... if v == "Buy": color = "#33ff85" + ... elif v == "Sell": color = "#ff5933" + ... else: color = "#ffdd33" + ... return f"color: {color}; font-weight: bold;" + >>> (styler.background_gradient(cmap="inferno", subset="Equity", vmin=0, vmax=1) + ... .map(rating_color, subset="Rating")) # doctest: +SKIP + + All the above styles will work with HTML (see below) and LaTeX upon conversion: + + .. figure:: ../../_static/style/latex_stocks_html.png + + However, we finally want to add one LaTeX only style + (from the {graphicx} package), that is not easy to convert from CSS and + pandas does not support it. Notice the `--latex` flag used here, + as well as `--rwrap` to ensure this is formatted correctly and + not ignored upon conversion. + + >>> styler.map_index( + ... lambda v: "rotatebox:{45}--rwrap--latex;", level=2, axis=1 + ... ) # doctest: +SKIP + + Finally we render our LaTeX adding in other options as required: + + >>> styler.to_latex( + ... caption="Selected stock correlation and simple statistics.", + ... clines="skip-last;data", + ... convert_css=True, + ... position_float="centering", + ... multicol_align="|c|", + ... hrules=True, + ... ) # doctest: +SKIP + \begin{table} + \centering + \caption{Selected stock correlation and simple statistics.} + \begin{tabular}{llrrrrrrrrl} + \toprule + & & \multicolumn{4}{|c|}{Equity} & \multicolumn{4}{|c|}{Stats} & Rating \\ + & & \multicolumn{2}{|c|}{Energy} & \multicolumn{2}{|c|}{Consumer} & + \multicolumn{4}{|c|}{} & \\ + & & \rotatebox{45}{BP} & \rotatebox{45}{Shell} & \rotatebox{45}{H\&M} & + \rotatebox{45}{Unilever} & \rotatebox{45}{Std Dev} & \rotatebox{45}{Variance} & + \rotatebox{45}{52w High} & \rotatebox{45}{52w Low} & \rotatebox{45}{} \\ + \midrule + \multirow[c]{2}{*}{Energy} & BP & {\cellcolor[HTML]{FCFFA4}} + \color[HTML]{000000} 1.00 & {\cellcolor[HTML]{FCA50A}} \color[HTML]{000000} + 0.80 & {\cellcolor[HTML]{EB6628}} \color[HTML]{F1F1F1} 0.66 & + {\cellcolor[HTML]{F68013}} \color[HTML]{F1F1F1} 0.72 & 32.2 & 1,034.8 & 335.1 + & 240.9 & \color[HTML]{33FF85} \bfseries BUY \\ + & Shell & {\cellcolor[HTML]{FCA50A}} \color[HTML]{000000} 0.80 & + {\cellcolor[HTML]{FCFFA4}} \color[HTML]{000000} 1.00 & + {\cellcolor[HTML]{F1731D}} \color[HTML]{F1F1F1} 0.69 & + {\cellcolor[HTML]{FCA108}} \color[HTML]{000000} 0.79 & 1.9 & 3.5 & 14.1 & + 19.8 & \color[HTML]{FFDD33} \bfseries HOLD \\ + \cline{1-11} + \multirow[c]{2}{*}{Consumer} & H\&M & {\cellcolor[HTML]{EB6628}} + \color[HTML]{F1F1F1} 0.66 & {\cellcolor[HTML]{F1731D}} \color[HTML]{F1F1F1} + 0.69 & {\cellcolor[HTML]{FCFFA4}} \color[HTML]{000000} 1.00 & + {\cellcolor[HTML]{FAC42A}} \color[HTML]{000000} 0.86 & 7.0 & 49.0 & 210.9 & + 140.6 & \color[HTML]{33FF85} \bfseries BUY \\ + & Unilever & {\cellcolor[HTML]{F68013}} \color[HTML]{F1F1F1} 0.72 & + {\cellcolor[HTML]{FCA108}} \color[HTML]{000000} 0.79 & + {\cellcolor[HTML]{FAC42A}} \color[HTML]{000000} 0.86 & + {\cellcolor[HTML]{FCFFA4}} \color[HTML]{000000} 1.00 & 213.8 & 45,693.3 & + 2,807.0 & 3,678.0 & \color[HTML]{FF5933} \bfseries SELL \\ + \cline{1-11} + \bottomrule + \end{tabular} + \end{table} + + .. figure:: ../../_static/style/latex_stocks.png + """ + obj = self._copy(deepcopy=True) # manipulate table_styles on obj, not self + + table_selectors = ( + [style["selector"] for style in self.table_styles] + if self.table_styles is not None + else [] + ) + + if column_format is not None: + # add more recent setting to table_styles + obj.set_table_styles( + [{"selector": "column_format", "props": f":{column_format}"}], + overwrite=False, + ) + elif "column_format" in table_selectors: + pass # adopt what has been previously set in table_styles + else: + # create a default: set float, complex, int cols to 'r' ('S'), index to 'l' + _original_columns = self.data.columns + self.data.columns = RangeIndex(stop=len(self.data.columns)) + numeric_cols = self.data._get_numeric_data().columns.to_list() + self.data.columns = _original_columns + column_format = "" + for level in range(self.index.nlevels): + column_format += "" if self.hide_index_[level] else "l" + for ci, _ in enumerate(self.data.columns): + if ci not in self.hidden_columns: + column_format += ( + ("r" if not siunitx else "S") if ci in numeric_cols else "l" + ) + obj.set_table_styles( + [{"selector": "column_format", "props": f":{column_format}"}], + overwrite=False, + ) + + if position: + obj.set_table_styles( + [{"selector": "position", "props": f":{position}"}], + overwrite=False, + ) + + if position_float: + if environment == "longtable": + raise ValueError( + "`position_float` cannot be used in 'longtable' `environment`" + ) + if position_float not in ["raggedright", "raggedleft", "centering"]: + raise ValueError( + f"`position_float` should be one of " + f"'raggedright', 'raggedleft', 'centering', " + f"got: '{position_float}'" + ) + obj.set_table_styles( + [{"selector": "position_float", "props": f":{position_float}"}], + overwrite=False, + ) + + hrules = get_option("styler.latex.hrules") if hrules is None else hrules + if hrules: + obj.set_table_styles( + [ + {"selector": "toprule", "props": ":toprule"}, + {"selector": "midrule", "props": ":midrule"}, + {"selector": "bottomrule", "props": ":bottomrule"}, + ], + overwrite=False, + ) + + if label: + obj.set_table_styles( + [{"selector": "label", "props": f":{{{label.replace(':', '§')}}}"}], + overwrite=False, + ) + + if caption: + obj.set_caption(caption) + + if sparse_index is None: + sparse_index = get_option("styler.sparse.index") + if sparse_columns is None: + sparse_columns = get_option("styler.sparse.columns") + environment = environment or get_option("styler.latex.environment") + multicol_align = multicol_align or get_option("styler.latex.multicol_align") + multirow_align = multirow_align or get_option("styler.latex.multirow_align") + latex = obj._render_latex( + sparse_index=sparse_index, + sparse_columns=sparse_columns, + multirow_align=multirow_align, + multicol_align=multicol_align, + environment=environment, + convert_css=convert_css, + siunitx=siunitx, + clines=clines, + ) + + encoding = ( + (encoding or get_option("styler.render.encoding")) + if isinstance(buf, str) # i.e. a filepath + else encoding + ) + return save_to_buffer(latex, buf=buf, encoding=encoding) + + @overload + def to_html( + self, + buf: FilePath | WriteBuffer[str], + *, + table_uuid: str | None = ..., + table_attributes: str | None = ..., + sparse_index: bool | None = ..., + sparse_columns: bool | None = ..., + bold_headers: bool = ..., + caption: str | None = ..., + max_rows: int | None = ..., + max_columns: int | None = ..., + encoding: str | None = ..., + doctype_html: bool = ..., + exclude_styles: bool = ..., + **kwargs, + ) -> None: + ... + + @overload + def to_html( + self, + buf: None = ..., + *, + table_uuid: str | None = ..., + table_attributes: str | None = ..., + sparse_index: bool | None = ..., + sparse_columns: bool | None = ..., + bold_headers: bool = ..., + caption: str | None = ..., + max_rows: int | None = ..., + max_columns: int | None = ..., + encoding: str | None = ..., + doctype_html: bool = ..., + exclude_styles: bool = ..., + **kwargs, + ) -> str: + ... + + @Substitution(buf=buffering_args, encoding=encoding_args) + def to_html( + self, + buf: FilePath | WriteBuffer[str] | None = None, + *, + table_uuid: str | None = None, + table_attributes: str | None = None, + sparse_index: bool | None = None, + sparse_columns: bool | None = None, + bold_headers: bool = False, + caption: str | None = None, + max_rows: int | None = None, + max_columns: int | None = None, + encoding: str | None = None, + doctype_html: bool = False, + exclude_styles: bool = False, + **kwargs, + ) -> str | None: + """ + Write Styler to a file, buffer or string in HTML-CSS format. + + .. versionadded:: 1.3.0 + + Parameters + ---------- + %(buf)s + table_uuid : str, optional + Id attribute assigned to the HTML element in the format: + + ``
`` + + If not given uses Styler's initially assigned value. + table_attributes : str, optional + Attributes to assign within the `
` HTML element in the format: + + ``
>`` + + If not given defaults to Styler's preexisting value. + sparse_index : bool, optional + Whether to sparsify the display of a hierarchical index. Setting to False + will display each explicit level element in a hierarchical key for each row. + Defaults to ``pandas.options.styler.sparse.index`` value. + + .. versionadded:: 1.4.0 + sparse_columns : bool, optional + Whether to sparsify the display of a hierarchical index. Setting to False + will display each explicit level element in a hierarchical key for each + column. Defaults to ``pandas.options.styler.sparse.columns`` value. + + .. versionadded:: 1.4.0 + bold_headers : bool, optional + Adds "font-weight: bold;" as a CSS property to table style header cells. + + .. versionadded:: 1.4.0 + caption : str, optional + Set, or overwrite, the caption on Styler before rendering. + + .. versionadded:: 1.4.0 + max_rows : int, optional + The maximum number of rows that will be rendered. Defaults to + ``pandas.options.styler.render.max_rows/max_columns``. + + .. versionadded:: 1.4.0 + max_columns : int, optional + The maximum number of columns that will be rendered. Defaults to + ``pandas.options.styler.render.max_columns``, which is None. + + Rows and columns may be reduced if the number of total elements is + large. This value is set to ``pandas.options.styler.render.max_elements``, + which is 262144 (18 bit browser rendering). + + .. versionadded:: 1.4.0 + %(encoding)s + doctype_html : bool, default False + Whether to output a fully structured HTML file including all + HTML elements, or just the core `` +
+ + + + + + + ... + """ + obj = self._copy(deepcopy=True) # manipulate table_styles on obj, not self + + if table_uuid: + obj.set_uuid(table_uuid) + + if table_attributes: + obj.set_table_attributes(table_attributes) + + if sparse_index is None: + sparse_index = get_option("styler.sparse.index") + if sparse_columns is None: + sparse_columns = get_option("styler.sparse.columns") + + if bold_headers: + obj.set_table_styles( + [{"selector": "th", "props": "font-weight: bold;"}], overwrite=False + ) + + if caption is not None: + obj.set_caption(caption) + + # Build HTML string.. + html = obj._render_html( + sparse_index=sparse_index, + sparse_columns=sparse_columns, + max_rows=max_rows, + max_cols=max_columns, + exclude_styles=exclude_styles, + encoding=encoding or get_option("styler.render.encoding"), + doctype_html=doctype_html, + **kwargs, + ) + + return save_to_buffer( + html, buf=buf, encoding=(encoding if buf is not None else None) + ) + + @overload + def to_string( + self, + buf: FilePath | WriteBuffer[str], + *, + encoding: str | None = ..., + sparse_index: bool | None = ..., + sparse_columns: bool | None = ..., + max_rows: int | None = ..., + max_columns: int | None = ..., + delimiter: str = ..., + ) -> None: + ... + + @overload + def to_string( + self, + buf: None = ..., + *, + encoding: str | None = ..., + sparse_index: bool | None = ..., + sparse_columns: bool | None = ..., + max_rows: int | None = ..., + max_columns: int | None = ..., + delimiter: str = ..., + ) -> str: + ... + + @Substitution(buf=buffering_args, encoding=encoding_args) + def to_string( + self, + buf: FilePath | WriteBuffer[str] | None = None, + *, + encoding: str | None = None, + sparse_index: bool | None = None, + sparse_columns: bool | None = None, + max_rows: int | None = None, + max_columns: int | None = None, + delimiter: str = " ", + ) -> str | None: + """ + Write Styler to a file, buffer or string in text format. + + .. versionadded:: 1.5.0 + + Parameters + ---------- + %(buf)s + %(encoding)s + sparse_index : bool, optional + Whether to sparsify the display of a hierarchical index. Setting to False + will display each explicit level element in a hierarchical key for each row. + Defaults to ``pandas.options.styler.sparse.index`` value. + sparse_columns : bool, optional + Whether to sparsify the display of a hierarchical index. Setting to False + will display each explicit level element in a hierarchical key for each + column. Defaults to ``pandas.options.styler.sparse.columns`` value. + max_rows : int, optional + The maximum number of rows that will be rendered. Defaults to + ``pandas.options.styler.render.max_rows``, which is None. + max_columns : int, optional + The maximum number of columns that will be rendered. Defaults to + ``pandas.options.styler.render.max_columns``, which is None. + + Rows and columns may be reduced if the number of total elements is + large. This value is set to ``pandas.options.styler.render.max_elements``, + which is 262144 (18 bit browser rendering). + delimiter : str, default single space + The separator between data elements. + + Returns + ------- + str or None + If `buf` is None, returns the result as a string. Otherwise returns `None`. + + Examples + -------- + >>> df = pd.DataFrame({'A': [1, 2], 'B': [3, 4]}) + >>> df.style.to_string() + ' A B\\n0 1 3\\n1 2 4\\n' + """ + obj = self._copy(deepcopy=True) + + if sparse_index is None: + sparse_index = get_option("styler.sparse.index") + if sparse_columns is None: + sparse_columns = get_option("styler.sparse.columns") + + text = obj._render_string( + sparse_columns=sparse_columns, + sparse_index=sparse_index, + max_rows=max_rows, + max_cols=max_columns, + delimiter=delimiter, + ) + return save_to_buffer( + text, buf=buf, encoding=(encoding if buf is not None else None) + ) + + def set_td_classes(self, classes: DataFrame) -> Styler: + """ + Set the ``class`` attribute of ``
 AB
`` HTML elements. + + Parameters + ---------- + classes : DataFrame + DataFrame containing strings that will be translated to CSS classes, + mapped by identical column and index key values that must exist on the + underlying Styler data. None, NaN values, and empty strings will + be ignored and not affect the rendered HTML. + + Returns + ------- + Styler + + See Also + -------- + Styler.set_table_styles: Set the table styles included within the ``' + '' + ' ' + ' ' + ' ' + ' ' + ' ' + ' ' + '
0
1
' + """ + if not classes.index.is_unique or not classes.columns.is_unique: + raise KeyError( + "Classes render only if `classes` has unique index and columns." + ) + classes = classes.reindex_like(self.data) + + for r, row_tup in enumerate(classes.itertuples()): + for c, value in enumerate(row_tup[1:]): + if not (pd.isna(value) or value == ""): + self.cell_context[(r, c)] = str(value) + + return self + + def _update_ctx(self, attrs: DataFrame) -> None: + """ + Update the state of the ``Styler`` for data cells. + + Collects a mapping of {index_label: [('', ''), ..]}. + + Parameters + ---------- + attrs : DataFrame + should contain strings of ': ;: ' + Whitespace shouldn't matter and the final trailing ';' shouldn't + matter. + """ + if not self.index.is_unique or not self.columns.is_unique: + raise KeyError( + "`Styler.apply` and `.map` are not compatible " + "with non-unique index or columns." + ) + + for cn in attrs.columns: + j = self.columns.get_loc(cn) + ser = attrs[cn] + for rn, c in ser.items(): + if not c or pd.isna(c): + continue + css_list = maybe_convert_css_to_tuples(c) + i = self.index.get_loc(rn) + self.ctx[(i, j)].extend(css_list) + + def _update_ctx_header(self, attrs: DataFrame, axis: AxisInt) -> None: + """ + Update the state of the ``Styler`` for header cells. + + Collects a mapping of {index_label: [('', ''), ..]}. + + Parameters + ---------- + attrs : Series + Should contain strings of ': ;: ', and an + integer index. + Whitespace shouldn't matter and the final trailing ';' shouldn't + matter. + axis : int + Identifies whether the ctx object being updated is the index or columns + """ + for j in attrs.columns: + ser = attrs[j] + for i, c in ser.items(): + if not c: + continue + css_list = maybe_convert_css_to_tuples(c) + if axis == 0: + self.ctx_index[(i, j)].extend(css_list) + else: + self.ctx_columns[(j, i)].extend(css_list) + + def _copy(self, deepcopy: bool = False) -> Styler: + """ + Copies a Styler, allowing for deepcopy or shallow copy + + Copying a Styler aims to recreate a new Styler object which contains the same + data and styles as the original. + + Data dependent attributes [copied and NOT exported]: + - formatting (._display_funcs) + - hidden index values or column values (.hidden_rows, .hidden_columns) + - tooltips + - cell_context (cell css classes) + - ctx (cell css styles) + - caption + - concatenated stylers + + Non-data dependent attributes [copied and exported]: + - css + - hidden index state and hidden columns state (.hide_index_, .hide_columns_) + - table_attributes + - table_styles + - applied styles (_todo) + + """ + # GH 40675, 52728 + styler = type(self)( + self.data, # populates attributes 'data', 'columns', 'index' as shallow + ) + shallow = [ # simple string or boolean immutables + "hide_index_", + "hide_columns_", + "hide_column_names", + "hide_index_names", + "table_attributes", + "cell_ids", + "caption", + "uuid", + "uuid_len", + "template_latex", # also copy templates if these have been customised + "template_html_style", + "template_html_table", + "template_html", + ] + deep = [ # nested lists or dicts + "css", + "concatenated", + "_display_funcs", + "_display_funcs_index", + "_display_funcs_columns", + "hidden_rows", + "hidden_columns", + "ctx", + "ctx_index", + "ctx_columns", + "cell_context", + "_todo", + "table_styles", + "tooltips", + ] + + for attr in shallow: + setattr(styler, attr, getattr(self, attr)) + + for attr in deep: + val = getattr(self, attr) + setattr(styler, attr, copy.deepcopy(val) if deepcopy else val) + + return styler + + def __copy__(self) -> Styler: + return self._copy(deepcopy=False) + + def __deepcopy__(self, memo) -> Styler: + return self._copy(deepcopy=True) + + def clear(self) -> None: + """ + Reset the ``Styler``, removing any previously applied styles. + + Returns None. + + Examples + -------- + >>> df = pd.DataFrame({'A': [1, 2], 'B': [3, np.nan]}) + + After any added style: + + >>> df.style.highlight_null(color='yellow') # doctest: +SKIP + + Remove it with: + + >>> df.style.clear() # doctest: +SKIP + + Please see: + `Table Visualization <../../user_guide/style.ipynb>`_ for more examples. + """ + # create default GH 40675 + clean_copy = Styler(self.data, uuid=self.uuid) + clean_attrs = [a for a in clean_copy.__dict__ if not callable(a)] + self_attrs = [a for a in self.__dict__ if not callable(a)] # maybe more attrs + for attr in clean_attrs: + setattr(self, attr, getattr(clean_copy, attr)) + for attr in set(self_attrs).difference(clean_attrs): + delattr(self, attr) + + def _apply( + self, + func: Callable, + axis: Axis | None = 0, + subset: Subset | None = None, + **kwargs, + ) -> Styler: + subset = slice(None) if subset is None else subset + subset = non_reducing_slice(subset) + data = self.data.loc[subset] + if data.empty: + result = DataFrame() + elif axis is None: + result = func(data, **kwargs) + if not isinstance(result, DataFrame): + if not isinstance(result, np.ndarray): + raise TypeError( + f"Function {repr(func)} must return a DataFrame or ndarray " + f"when passed to `Styler.apply` with axis=None" + ) + if data.shape != result.shape: + raise ValueError( + f"Function {repr(func)} returned ndarray with wrong shape.\n" + f"Result has shape: {result.shape}\n" + f"Expected shape: {data.shape}" + ) + result = DataFrame(result, index=data.index, columns=data.columns) + else: + axis = self.data._get_axis_number(axis) + if axis == 0: + result = data.apply(func, axis=0, **kwargs) + else: + result = data.T.apply(func, axis=0, **kwargs).T # see GH 42005 + + if isinstance(result, Series): + raise ValueError( + f"Function {repr(func)} resulted in the apply method collapsing to a " + f"Series.\nUsually, this is the result of the function returning a " + f"single value, instead of list-like." + ) + msg = ( + f"Function {repr(func)} created invalid {{0}} labels.\nUsually, this is " + f"the result of the function returning a " + f"{'Series' if axis is not None else 'DataFrame'} which contains invalid " + f"labels, or returning an incorrectly shaped, list-like object which " + f"cannot be mapped to labels, possibly due to applying the function along " + f"the wrong axis.\n" + f"Result {{0}} has shape: {{1}}\n" + f"Expected {{0}} shape: {{2}}" + ) + if not all(result.index.isin(data.index)): + raise ValueError(msg.format("index", result.index.shape, data.index.shape)) + if not all(result.columns.isin(data.columns)): + raise ValueError( + msg.format("columns", result.columns.shape, data.columns.shape) + ) + self._update_ctx(result) + return self + + @Substitution(subset=subset_args) + def apply( + self, + func: Callable, + axis: Axis | None = 0, + subset: Subset | None = None, + **kwargs, + ) -> Styler: + """ + Apply a CSS-styling function column-wise, row-wise, or table-wise. + + Updates the HTML representation with the result. + + Parameters + ---------- + func : function + ``func`` should take a Series if ``axis`` in [0,1] and return a list-like + object of same length, or a Series, not necessarily of same length, with + valid index labels considering ``subset``. + ``func`` should take a DataFrame if ``axis`` is ``None`` and return either + an ndarray with the same shape or a DataFrame, not necessarily of the same + shape, with valid index and columns labels considering ``subset``. + + .. versionchanged:: 1.3.0 + + .. versionchanged:: 1.4.0 + + axis : {0 or 'index', 1 or 'columns', None}, default 0 + Apply to each column (``axis=0`` or ``'index'``), to each row + (``axis=1`` or ``'columns'``), or to the entire DataFrame at once + with ``axis=None``. + %(subset)s + **kwargs : dict + Pass along to ``func``. + + Returns + ------- + Styler + + See Also + -------- + Styler.map_index: Apply a CSS-styling function to headers elementwise. + Styler.apply_index: Apply a CSS-styling function to headers level-wise. + Styler.map: Apply a CSS-styling function elementwise. + + Notes + ----- + The elements of the output of ``func`` should be CSS styles as strings, in the + format 'attribute: value; attribute2: value2; ...' or, + if nothing is to be applied to that element, an empty string or ``None``. + + This is similar to ``DataFrame.apply``, except that ``axis=None`` + applies the function to the entire DataFrame at once, + rather than column-wise or row-wise. + + Examples + -------- + >>> def highlight_max(x, color): + ... return np.where(x == np.nanmax(x.to_numpy()), f"color: {color};", None) + >>> df = pd.DataFrame(np.random.randn(5, 2), columns=["A", "B"]) + >>> df.style.apply(highlight_max, color='red') # doctest: +SKIP + >>> df.style.apply(highlight_max, color='blue', axis=1) # doctest: +SKIP + >>> df.style.apply(highlight_max, color='green', axis=None) # doctest: +SKIP + + Using ``subset`` to restrict application to a single column or multiple columns + + >>> df.style.apply(highlight_max, color='red', subset="A") + ... # doctest: +SKIP + >>> df.style.apply(highlight_max, color='red', subset=["A", "B"]) + ... # doctest: +SKIP + + Using a 2d input to ``subset`` to select rows in addition to columns + + >>> df.style.apply(highlight_max, color='red', subset=([0, 1, 2], slice(None))) + ... # doctest: +SKIP + >>> df.style.apply(highlight_max, color='red', subset=(slice(0, 5, 2), "A")) + ... # doctest: +SKIP + + Using a function which returns a Series / DataFrame of unequal length but + containing valid index labels + + >>> df = pd.DataFrame([[1, 2], [3, 4], [4, 6]], index=["A1", "A2", "Total"]) + >>> total_style = pd.Series("font-weight: bold;", index=["Total"]) + >>> df.style.apply(lambda s: total_style) # doctest: +SKIP + + See `Table Visualization <../../user_guide/style.ipynb>`_ user guide for + more details. + """ + self._todo.append( + (lambda instance: getattr(instance, "_apply"), (func, axis, subset), kwargs) + ) + return self + + def _apply_index( + self, + func: Callable, + axis: Axis = 0, + level: Level | list[Level] | None = None, + method: str = "apply", + **kwargs, + ) -> Styler: + axis = self.data._get_axis_number(axis) + obj = self.index if axis == 0 else self.columns + + levels_ = refactor_levels(level, obj) + data = DataFrame(obj.to_list()).loc[:, levels_] + + if method == "apply": + result = data.apply(func, axis=0, **kwargs) + elif method == "map": + result = data.map(func, **kwargs) + + self._update_ctx_header(result, axis) + return self + + @doc( + this="apply", + wise="level-wise", + alt="map", + altwise="elementwise", + func="take a Series and return a string array of the same length", + input_note="the index as a Series, if an Index, or a level of a MultiIndex", + output_note="an identically sized array of CSS styles as strings", + var="s", + ret='np.where(s == "B", "background-color: yellow;", "")', + ret2='["background-color: yellow;" if "x" in v else "" for v in s]', + ) + def apply_index( + self, + func: Callable, + axis: AxisInt | str = 0, + level: Level | list[Level] | None = None, + **kwargs, + ) -> Styler: + """ + Apply a CSS-styling function to the index or column headers, {wise}. + + Updates the HTML representation with the result. + + .. versionadded:: 1.4.0 + + .. versionadded:: 2.1.0 + Styler.applymap_index was deprecated and renamed to Styler.map_index. + + Parameters + ---------- + func : function + ``func`` should {func}. + axis : {{0, 1, "index", "columns"}} + The headers over which to apply the function. + level : int, str, list, optional + If index is MultiIndex the level(s) over which to apply the function. + **kwargs : dict + Pass along to ``func``. + + Returns + ------- + Styler + + See Also + -------- + Styler.{alt}_index: Apply a CSS-styling function to headers {altwise}. + Styler.apply: Apply a CSS-styling function column-wise, row-wise, or table-wise. + Styler.map: Apply a CSS-styling function elementwise. + + Notes + ----- + Each input to ``func`` will be {input_note}. The output of ``func`` should be + {output_note}, in the format 'attribute: value; attribute2: value2; ...' + or, if nothing is to be applied to that element, an empty string or ``None``. + + Examples + -------- + Basic usage to conditionally highlight values in the index. + + >>> df = pd.DataFrame([[1,2], [3,4]], index=["A", "B"]) + >>> def color_b(s): + ... return {ret} + >>> df.style.{this}_index(color_b) # doctest: +SKIP + + .. figure:: ../../_static/style/appmaphead1.png + + Selectively applying to specific levels of MultiIndex columns. + + >>> midx = pd.MultiIndex.from_product([['ix', 'jy'], [0, 1], ['x3', 'z4']]) + >>> df = pd.DataFrame([np.arange(8)], columns=midx) + >>> def highlight_x({var}): + ... return {ret2} + >>> df.style.{this}_index(highlight_x, axis="columns", level=[0, 2]) + ... # doctest: +SKIP + + .. figure:: ../../_static/style/appmaphead2.png + """ + self._todo.append( + ( + lambda instance: getattr(instance, "_apply_index"), + (func, axis, level, "apply"), + kwargs, + ) + ) + return self + + @doc( + apply_index, + this="map", + wise="elementwise", + alt="apply", + altwise="level-wise", + func="take a scalar and return a string", + input_note="an index value, if an Index, or a level value of a MultiIndex", + output_note="CSS styles as a string", + var="v", + ret='"background-color: yellow;" if v == "B" else None', + ret2='"background-color: yellow;" if "x" in v else None', + ) + def map_index( + self, + func: Callable, + axis: AxisInt | str = 0, + level: Level | list[Level] | None = None, + **kwargs, + ) -> Styler: + self._todo.append( + ( + lambda instance: getattr(instance, "_apply_index"), + (func, axis, level, "map"), + kwargs, + ) + ) + return self + + def applymap_index( + self, + func: Callable, + axis: AxisInt | str = 0, + level: Level | list[Level] | None = None, + **kwargs, + ) -> Styler: + """ + Apply a CSS-styling function to the index or column headers, elementwise. + + .. deprecated:: 2.1.0 + + Styler.applymap_index has been deprecated. Use Styler.map_index instead. + + Parameters + ---------- + func : function + ``func`` should take a scalar and return a string. + axis : {{0, 1, "index", "columns"}} + The headers over which to apply the function. + level : int, str, list, optional + If index is MultiIndex the level(s) over which to apply the function. + **kwargs : dict + Pass along to ``func``. + + Returns + ------- + Styler + """ + warnings.warn( + "Styler.applymap_index has been deprecated. Use Styler.map_index instead.", + FutureWarning, + stacklevel=find_stack_level(), + ) + return self.map_index(func, axis, level, **kwargs) + + def _map(self, func: Callable, subset: Subset | None = None, **kwargs) -> Styler: + func = partial(func, **kwargs) # map doesn't take kwargs? + if subset is None: + subset = IndexSlice[:] + subset = non_reducing_slice(subset) + result = self.data.loc[subset].map(func) + self._update_ctx(result) + return self + + @Substitution(subset=subset_args) + def map(self, func: Callable, subset: Subset | None = None, **kwargs) -> Styler: + """ + Apply a CSS-styling function elementwise. + + Updates the HTML representation with the result. + + Parameters + ---------- + func : function + ``func`` should take a scalar and return a string. + %(subset)s + **kwargs : dict + Pass along to ``func``. + + Returns + ------- + Styler + + See Also + -------- + Styler.map_index: Apply a CSS-styling function to headers elementwise. + Styler.apply_index: Apply a CSS-styling function to headers level-wise. + Styler.apply: Apply a CSS-styling function column-wise, row-wise, or table-wise. + + Notes + ----- + The elements of the output of ``func`` should be CSS styles as strings, in the + format 'attribute: value; attribute2: value2; ...' or, + if nothing is to be applied to that element, an empty string or ``None``. + + Examples + -------- + >>> def color_negative(v, color): + ... return f"color: {color};" if v < 0 else None + >>> df = pd.DataFrame(np.random.randn(5, 2), columns=["A", "B"]) + >>> df.style.map(color_negative, color='red') # doctest: +SKIP + + Using ``subset`` to restrict application to a single column or multiple columns + + >>> df.style.map(color_negative, color='red', subset="A") + ... # doctest: +SKIP + >>> df.style.map(color_negative, color='red', subset=["A", "B"]) + ... # doctest: +SKIP + + Using a 2d input to ``subset`` to select rows in addition to columns + + >>> df.style.map(color_negative, color='red', + ... subset=([0,1,2], slice(None))) # doctest: +SKIP + >>> df.style.map(color_negative, color='red', subset=(slice(0,5,2), "A")) + ... # doctest: +SKIP + + See `Table Visualization <../../user_guide/style.ipynb>`_ user guide for + more details. + """ + self._todo.append( + (lambda instance: getattr(instance, "_map"), (func, subset), kwargs) + ) + return self + + @Substitution(subset=subset_args) + def applymap( + self, func: Callable, subset: Subset | None = None, **kwargs + ) -> Styler: + """ + Apply a CSS-styling function elementwise. + + .. deprecated:: 2.1.0 + + Styler.applymap has been deprecated. Use Styler.map instead. + + Parameters + ---------- + func : function + ``func`` should take a scalar and return a string. + %(subset)s + **kwargs : dict + Pass along to ``func``. + + Returns + ------- + Styler + """ + warnings.warn( + "Styler.applymap has been deprecated. Use Styler.map instead.", + FutureWarning, + stacklevel=find_stack_level(), + ) + return self.map(func, subset, **kwargs) + + def set_table_attributes(self, attributes: str) -> Styler: + """ + Set the table attributes added to the ```` HTML element. + + These are items in addition to automatic (by default) ``id`` attribute. + + Parameters + ---------- + attributes : str + + Returns + ------- + Styler + + See Also + -------- + Styler.set_table_styles: Set the table styles included within the `` block + + Parameters + ---------- + sparsify_index : bool + Whether index_headers section will add rowspan attributes (>1) to elements. + + Returns + ------- + body : list + The associated HTML elements needed for template rendering. + """ + rlabels = self.data.index.tolist() + if not isinstance(self.data.index, MultiIndex): + rlabels = [[x] for x in rlabels] + + body: list = [] + visible_row_count: int = 0 + for r, row_tup in [ + z for z in enumerate(self.data.itertuples()) if z[0] not in self.hidden_rows + ]: + visible_row_count += 1 + if self._check_trim( + visible_row_count, + max_rows, + body, + "row", + ): + break + + body_row = self._generate_body_row( + (r, row_tup, rlabels), max_cols, idx_lengths + ) + body.append(body_row) + return body + + def _check_trim( + self, + count: int, + max: int, + obj: list, + element: str, + css: str | None = None, + value: str = "...", + ) -> bool: + """ + Indicates whether to break render loops and append a trimming indicator + + Parameters + ---------- + count : int + The loop count of previous visible items. + max : int + The allowable rendered items in the loop. + obj : list + The current render collection of the rendered items. + element : str + The type of element to append in the case a trimming indicator is needed. + css : str, optional + The css to add to the trimming indicator element. + value : str, optional + The value of the elements display if necessary. + + Returns + ------- + result : bool + Whether a trimming element was required and appended. + """ + if count > max: + if element == "row": + obj.append(self._generate_trimmed_row(max)) + else: + obj.append(_element(element, css, value, True, attributes="")) + return True + return False + + def _generate_trimmed_row(self, max_cols: int) -> list: + """ + When a render has too many rows we generate a trimming row containing "..." + + Parameters + ---------- + max_cols : int + Number of permissible columns + + Returns + ------- + list of elements + """ + index_headers = [ + _element( + "th", + ( + f"{self.css['row_heading']} {self.css['level']}{c} " + f"{self.css['row_trim']}" + ), + "...", + not self.hide_index_[c], + attributes="", + ) + for c in range(self.data.index.nlevels) + ] + + data: list = [] + visible_col_count: int = 0 + for c, _ in enumerate(self.columns): + data_element_visible = c not in self.hidden_columns + if data_element_visible: + visible_col_count += 1 + if self._check_trim( + visible_col_count, + max_cols, + data, + "td", + f"{self.css['data']} {self.css['row_trim']} {self.css['col_trim']}", + ): + break + + data.append( + _element( + "td", + f"{self.css['data']} {self.css['col']}{c} {self.css['row_trim']}", + "...", + data_element_visible, + attributes="", + ) + ) + + return index_headers + data + + def _generate_body_row( + self, + iter: tuple, + max_cols: int, + idx_lengths: dict, + ): + """ + Generate a regular row for the body section of appropriate format. + + +--------------------------------------------+---------------------------+ + | index_header_0 ... index_header_n | data_by_column ... | + +--------------------------------------------+---------------------------+ + + Parameters + ---------- + iter : tuple + Iterable from outer scope: row number, row data tuple, row index labels. + max_cols : int + Number of permissible columns. + idx_lengths : dict + A map of the sparsification structure of the index + + Returns + ------- + list of elements + """ + r, row_tup, rlabels = iter + + index_headers = [] + for c, value in enumerate(rlabels[r]): + header_element_visible = ( + _is_visible(r, c, idx_lengths) and not self.hide_index_[c] + ) + header_element = _element( + "th", + ( + f"{self.css['row_heading']} {self.css['level']}{c} " + f"{self.css['row']}{r}" + ), + value, + header_element_visible, + display_value=self._display_funcs_index[(r, c)](value), + attributes=( + f'rowspan="{idx_lengths.get((c, r), 0)}"' + if idx_lengths.get((c, r), 0) > 1 + else "" + ), + ) + + if self.cell_ids: + header_element[ + "id" + ] = f"{self.css['level']}{c}_{self.css['row']}{r}" # id is given + if ( + header_element_visible + and (r, c) in self.ctx_index + and self.ctx_index[r, c] + ): + # always add id if a style is specified + header_element["id"] = f"{self.css['level']}{c}_{self.css['row']}{r}" + self.cellstyle_map_index[tuple(self.ctx_index[r, c])].append( + f"{self.css['level']}{c}_{self.css['row']}{r}" + ) + + index_headers.append(header_element) + + data: list = [] + visible_col_count: int = 0 + for c, value in enumerate(row_tup[1:]): + data_element_visible = ( + c not in self.hidden_columns and r not in self.hidden_rows + ) + if data_element_visible: + visible_col_count += 1 + if self._check_trim( + visible_col_count, + max_cols, + data, + "td", + f"{self.css['data']} {self.css['row']}{r} {self.css['col_trim']}", + ): + break + + # add custom classes from cell context + cls = "" + if (r, c) in self.cell_context: + cls = " " + self.cell_context[r, c] + + data_element = _element( + "td", + ( + f"{self.css['data']} {self.css['row']}{r} " + f"{self.css['col']}{c}{cls}" + ), + value, + data_element_visible, + attributes="", + display_value=self._display_funcs[(r, c)](value), + ) + + if self.cell_ids: + data_element["id"] = f"{self.css['row']}{r}_{self.css['col']}{c}" + if data_element_visible and (r, c) in self.ctx and self.ctx[r, c]: + # always add id if needed due to specified style + data_element["id"] = f"{self.css['row']}{r}_{self.css['col']}{c}" + self.cellstyle_map[tuple(self.ctx[r, c])].append( + f"{self.css['row']}{r}_{self.css['col']}{c}" + ) + + data.append(data_element) + + return index_headers + data + + def _translate_latex(self, d: dict, clines: str | None) -> None: + r""" + Post-process the default render dict for the LaTeX template format. + + Processing items included are: + - Remove hidden columns from the non-headers part of the body. + - Place cellstyles directly in td cells rather than use cellstyle_map. + - Remove hidden indexes or reinsert missing th elements if part of multiindex + or multirow sparsification (so that \multirow and \multicol work correctly). + """ + index_levels = self.index.nlevels + visible_index_level_n = index_levels - sum(self.hide_index_) + d["head"] = [ + [ + {**col, "cellstyle": self.ctx_columns[r, c - visible_index_level_n]} + for c, col in enumerate(row) + if col["is_visible"] + ] + for r, row in enumerate(d["head"]) + ] + + def _concatenated_visible_rows(obj, n, row_indices): + """ + Extract all visible row indices recursively from concatenated stylers. + """ + row_indices.extend( + [r + n for r in range(len(obj.index)) if r not in obj.hidden_rows] + ) + n += len(obj.index) + for concatenated in obj.concatenated: + n = _concatenated_visible_rows(concatenated, n, row_indices) + return n + + def concatenated_visible_rows(obj): + row_indices: list[int] = [] + _concatenated_visible_rows(obj, 0, row_indices) + # TODO try to consolidate the concat visible rows + # methods to a single function / recursion for simplicity + return row_indices + + body = [] + for r, row in zip(concatenated_visible_rows(self), d["body"]): + # note: cannot enumerate d["body"] because rows were dropped if hidden + # during _translate_body so must zip to acquire the true r-index associated + # with the ctx obj which contains the cell styles. + if all(self.hide_index_): + row_body_headers = [] + else: + row_body_headers = [ + { + **col, + "display_value": col["display_value"] + if col["is_visible"] + else "", + "cellstyle": self.ctx_index[r, c], + } + for c, col in enumerate(row[:index_levels]) + if (col["type"] == "th" and not self.hide_index_[c]) + ] + + row_body_cells = [ + {**col, "cellstyle": self.ctx[r, c]} + for c, col in enumerate(row[index_levels:]) + if (col["is_visible"] and col["type"] == "td") + ] + + body.append(row_body_headers + row_body_cells) + d["body"] = body + + # clines are determined from info on index_lengths and hidden_rows and input + # to a dict defining which row clines should be added in the template. + if clines not in [ + None, + "all;data", + "all;index", + "skip-last;data", + "skip-last;index", + ]: + raise ValueError( + f"`clines` value of {clines} is invalid. Should either be None or one " + f"of 'all;data', 'all;index', 'skip-last;data', 'skip-last;index'." + ) + if clines is not None: + data_len = len(row_body_cells) if "data" in clines and d["body"] else 0 + + d["clines"] = defaultdict(list) + visible_row_indexes: list[int] = [ + r for r in range(len(self.data.index)) if r not in self.hidden_rows + ] + visible_index_levels: list[int] = [ + i for i in range(index_levels) if not self.hide_index_[i] + ] + for rn, r in enumerate(visible_row_indexes): + for lvln, lvl in enumerate(visible_index_levels): + if lvl == index_levels - 1 and "skip-last" in clines: + continue + idx_len = d["index_lengths"].get((lvl, r), None) + if idx_len is not None: # i.e. not a sparsified entry + d["clines"][rn + idx_len].append( + f"\\cline{{{lvln+1}-{len(visible_index_levels)+data_len}}}" + ) + + def format( + self, + formatter: ExtFormatter | None = None, + subset: Subset | None = None, + na_rep: str | None = None, + precision: int | None = None, + decimal: str = ".", + thousands: str | None = None, + escape: str | None = None, + hyperlinks: str | None = None, + ) -> StylerRenderer: + r""" + Format the text display value of cells. + + Parameters + ---------- + formatter : str, callable, dict or None + Object to define how values are displayed. See notes. + subset : label, array-like, IndexSlice, optional + A valid 2d input to `DataFrame.loc[]`, or, in the case of a 1d input + or single key, to `DataFrame.loc[:, ]` where the columns are + prioritised, to limit ``data`` to *before* applying the function. + na_rep : str, optional + Representation for missing values. + If ``na_rep`` is None, no special formatting is applied. + precision : int, optional + Floating point precision to use for display purposes, if not determined by + the specified ``formatter``. + + .. versionadded:: 1.3.0 + + decimal : str, default "." + Character used as decimal separator for floats, complex and integers. + + .. versionadded:: 1.3.0 + + thousands : str, optional, default None + Character used as thousands separator for floats, complex and integers. + + .. versionadded:: 1.3.0 + + escape : str, optional + Use 'html' to replace the characters ``&``, ``<``, ``>``, ``'``, and ``"`` + in cell display string with HTML-safe sequences. + Use 'latex' to replace the characters ``&``, ``%``, ``$``, ``#``, ``_``, + ``{``, ``}``, ``~``, ``^``, and ``\`` in the cell display string with + LaTeX-safe sequences. + Use 'latex-math' to replace the characters the same way as in 'latex' mode, + except for math substrings, which either are surrounded + by two characters ``$`` or start with the character ``\(`` and + end with ``\)``. Escaping is done before ``formatter``. + + .. versionadded:: 1.3.0 + + hyperlinks : {"html", "latex"}, optional + Convert string patterns containing https://, http://, ftp:// or www. to + HTML tags as clickable URL hyperlinks if "html", or LaTeX \href + commands if "latex". + + .. versionadded:: 1.4.0 + + Returns + ------- + Styler + + See Also + -------- + Styler.format_index: Format the text display value of index labels. + + Notes + ----- + This method assigns a formatting function, ``formatter``, to each cell in the + DataFrame. If ``formatter`` is ``None``, then the default formatter is used. + If a callable then that function should take a data value as input and return + a displayable representation, such as a string. If ``formatter`` is + given as a string this is assumed to be a valid Python format specification + and is wrapped to a callable as ``string.format(x)``. If a ``dict`` is given, + keys should correspond to column names, and values should be string or + callable, as above. + + The default formatter currently expresses floats and complex numbers with the + pandas display precision unless using the ``precision`` argument here. The + default formatter does not adjust the representation of missing values unless + the ``na_rep`` argument is used. + + The ``subset`` argument defines which region to apply the formatting function + to. If the ``formatter`` argument is given in dict form but does not include + all columns within the subset then these columns will have the default formatter + applied. Any columns in the formatter dict excluded from the subset will + be ignored. + + When using a ``formatter`` string the dtypes must be compatible, otherwise a + `ValueError` will be raised. + + When instantiating a Styler, default formatting can be applied be setting the + ``pandas.options``: + + - ``styler.format.formatter``: default None. + - ``styler.format.na_rep``: default None. + - ``styler.format.precision``: default 6. + - ``styler.format.decimal``: default ".". + - ``styler.format.thousands``: default None. + - ``styler.format.escape``: default None. + + .. warning:: + `Styler.format` is ignored when using the output format `Styler.to_excel`, + since Excel and Python have inherrently different formatting structures. + However, it is possible to use the `number-format` pseudo CSS attribute + to force Excel permissible formatting. See examples. + + Examples + -------- + Using ``na_rep`` and ``precision`` with the default ``formatter`` + + >>> df = pd.DataFrame([[np.nan, 1.0, 'A'], [2.0, np.nan, 3.0]]) + >>> df.style.format(na_rep='MISS', precision=3) # doctest: +SKIP + 0 1 2 + 0 MISS 1.000 A + 1 2.000 MISS 3.000 + + Using a ``formatter`` specification on consistent column dtypes + + >>> df.style.format('{:.2f}', na_rep='MISS', subset=[0,1]) # doctest: +SKIP + 0 1 2 + 0 MISS 1.00 A + 1 2.00 MISS 3.000000 + + Using the default ``formatter`` for unspecified columns + + >>> df.style.format({0: '{:.2f}', 1: '£ {:.1f}'}, na_rep='MISS', precision=1) + ... # doctest: +SKIP + 0 1 2 + 0 MISS £ 1.0 A + 1 2.00 MISS 3.0 + + Multiple ``na_rep`` or ``precision`` specifications under the default + ``formatter``. + + >>> (df.style.format(na_rep='MISS', precision=1, subset=[0]) + ... .format(na_rep='PASS', precision=2, subset=[1, 2])) # doctest: +SKIP + 0 1 2 + 0 MISS 1.00 A + 1 2.0 PASS 3.00 + + Using a callable ``formatter`` function. + + >>> func = lambda s: 'STRING' if isinstance(s, str) else 'FLOAT' + >>> df.style.format({0: '{:.1f}', 2: func}, precision=4, na_rep='MISS') + ... # doctest: +SKIP + 0 1 2 + 0 MISS 1.0000 STRING + 1 2.0 MISS FLOAT + + Using a ``formatter`` with HTML ``escape`` and ``na_rep``. + + >>> df = pd.DataFrame([['
', '"A&B"', None]]) + >>> s = df.style.format( + ... '
{0}', escape="html", na_rep="NA" + ... ) + >>> s.to_html() # doctest: +SKIP + ... +
+ + + ... + + Using a ``formatter`` with ``escape`` in 'latex' mode. + + >>> df = pd.DataFrame([["123"], ["~ ^"], ["$%#"]]) + >>> df.style.format("\\textbf{{{}}}", escape="latex").to_latex() + ... # doctest: +SKIP + \begin{tabular}{ll} + & 0 \\ + 0 & \textbf{123} \\ + 1 & \textbf{\textasciitilde \space \textasciicircum } \\ + 2 & \textbf{\$\%\#} \\ + \end{tabular} + + Applying ``escape`` in 'latex-math' mode. In the example below + we enter math mode using the character ``$``. + + >>> df = pd.DataFrame([[r"$\sum_{i=1}^{10} a_i$ a~b $\alpha \ + ... = \frac{\beta}{\zeta^2}$"], ["%#^ $ \$x^2 $"]]) + >>> df.style.format(escape="latex-math").to_latex() + ... # doctest: +SKIP + \begin{tabular}{ll} + & 0 \\ + 0 & $\sum_{i=1}^{10} a_i$ a\textasciitilde b $\alpha = \frac{\beta}{\zeta^2}$ \\ + 1 & \%\#\textasciicircum \space $ \$x^2 $ \\ + \end{tabular} + + We can use the character ``\(`` to enter math mode and the character ``\)`` + to close math mode. + + >>> df = pd.DataFrame([[r"\(\sum_{i=1}^{10} a_i\) a~b \(\alpha \ + ... = \frac{\beta}{\zeta^2}\)"], ["%#^ \( \$x^2 \)"]]) + >>> df.style.format(escape="latex-math").to_latex() + ... # doctest: +SKIP + \begin{tabular}{ll} + & 0 \\ + 0 & \(\sum_{i=1}^{10} a_i\) a\textasciitilde b \(\alpha + = \frac{\beta}{\zeta^2}\) \\ + 1 & \%\#\textasciicircum \space \( \$x^2 \) \\ + \end{tabular} + + If we have in one DataFrame cell a combination of both shorthands + for math formulas, the shorthand with the sign ``$`` will be applied. + + >>> df = pd.DataFrame([[r"\( x^2 \) $x^2$"], \ + ... [r"$\frac{\beta}{\zeta}$ \(\frac{\beta}{\zeta}\)"]]) + >>> df.style.format(escape="latex-math").to_latex() + ... # doctest: +SKIP + \begin{tabular}{ll} + & 0 \\ + 0 & \textbackslash ( x\textasciicircum 2 \textbackslash ) $x^2$ \\ + 1 & $\frac{\beta}{\zeta}$ \textbackslash (\textbackslash + frac\{\textbackslash beta\}\{\textbackslash zeta\}\textbackslash ) \\ + \end{tabular} + + Pandas defines a `number-format` pseudo CSS attribute instead of the `.format` + method to create `to_excel` permissible formatting. Note that semi-colons are + CSS protected characters but used as separators in Excel's format string. + Replace semi-colons with the section separator character (ASCII-245) when + defining the formatting here. + + >>> df = pd.DataFrame({"A": [1, 0, -1]}) + >>> pseudo_css = "number-format: 0§[Red](0)§-§@;" + >>> filename = "formatted_file.xlsx" + >>> df.style.map(lambda v: pseudo_css).to_excel(filename) # doctest: +SKIP + + .. figure:: ../../_static/style/format_excel_css.png + """ + if all( + ( + formatter is None, + subset is None, + precision is None, + decimal == ".", + thousands is None, + na_rep is None, + escape is None, + hyperlinks is None, + ) + ): + self._display_funcs.clear() + return self # clear the formatter / revert to default and avoid looping + + subset = slice(None) if subset is None else subset + subset = non_reducing_slice(subset) + data = self.data.loc[subset] + + if not isinstance(formatter, dict): + formatter = {col: formatter for col in data.columns} + + cis = self.columns.get_indexer_for(data.columns) + ris = self.index.get_indexer_for(data.index) + for ci in cis: + format_func = _maybe_wrap_formatter( + formatter.get(self.columns[ci]), + na_rep=na_rep, + precision=precision, + decimal=decimal, + thousands=thousands, + escape=escape, + hyperlinks=hyperlinks, + ) + for ri in ris: + self._display_funcs[(ri, ci)] = format_func + + return self + + def format_index( + self, + formatter: ExtFormatter | None = None, + axis: Axis = 0, + level: Level | list[Level] | None = None, + na_rep: str | None = None, + precision: int | None = None, + decimal: str = ".", + thousands: str | None = None, + escape: str | None = None, + hyperlinks: str | None = None, + ) -> StylerRenderer: + r""" + Format the text display value of index labels or column headers. + + .. versionadded:: 1.4.0 + + Parameters + ---------- + formatter : str, callable, dict or None + Object to define how values are displayed. See notes. + axis : {0, "index", 1, "columns"} + Whether to apply the formatter to the index or column headers. + level : int, str, list + The level(s) over which to apply the generic formatter. + na_rep : str, optional + Representation for missing values. + If ``na_rep`` is None, no special formatting is applied. + precision : int, optional + Floating point precision to use for display purposes, if not determined by + the specified ``formatter``. + decimal : str, default "." + Character used as decimal separator for floats, complex and integers. + thousands : str, optional, default None + Character used as thousands separator for floats, complex and integers. + escape : str, optional + Use 'html' to replace the characters ``&``, ``<``, ``>``, ``'``, and ``"`` + in cell display string with HTML-safe sequences. + Use 'latex' to replace the characters ``&``, ``%``, ``$``, ``#``, ``_``, + ``{``, ``}``, ``~``, ``^``, and ``\`` in the cell display string with + LaTeX-safe sequences. + Escaping is done before ``formatter``. + hyperlinks : {"html", "latex"}, optional + Convert string patterns containing https://, http://, ftp:// or www. to + HTML tags as clickable URL hyperlinks if "html", or LaTeX \href + commands if "latex". + + Returns + ------- + Styler + + See Also + -------- + Styler.format: Format the text display value of data cells. + + Notes + ----- + This method assigns a formatting function, ``formatter``, to each level label + in the DataFrame's index or column headers. If ``formatter`` is ``None``, + then the default formatter is used. + If a callable then that function should take a label value as input and return + a displayable representation, such as a string. If ``formatter`` is + given as a string this is assumed to be a valid Python format specification + and is wrapped to a callable as ``string.format(x)``. If a ``dict`` is given, + keys should correspond to MultiIndex level numbers or names, and values should + be string or callable, as above. + + The default formatter currently expresses floats and complex numbers with the + pandas display precision unless using the ``precision`` argument here. The + default formatter does not adjust the representation of missing values unless + the ``na_rep`` argument is used. + + The ``level`` argument defines which levels of a MultiIndex to apply the + method to. If the ``formatter`` argument is given in dict form but does + not include all levels within the level argument then these unspecified levels + will have the default formatter applied. Any levels in the formatter dict + specifically excluded from the level argument will be ignored. + + When using a ``formatter`` string the dtypes must be compatible, otherwise a + `ValueError` will be raised. + + .. warning:: + `Styler.format_index` is ignored when using the output format + `Styler.to_excel`, since Excel and Python have inherrently different + formatting structures. + However, it is possible to use the `number-format` pseudo CSS attribute + to force Excel permissible formatting. See documentation for `Styler.format`. + + Examples + -------- + Using ``na_rep`` and ``precision`` with the default ``formatter`` + + >>> df = pd.DataFrame([[1, 2, 3]], columns=[2.0, np.nan, 4.0]) + >>> df.style.format_index(axis=1, na_rep='MISS', precision=3) # doctest: +SKIP + 2.000 MISS 4.000 + 0 1 2 3 + + Using a ``formatter`` specification on consistent dtypes in a level + + >>> df.style.format_index('{:.2f}', axis=1, na_rep='MISS') # doctest: +SKIP + 2.00 MISS 4.00 + 0 1 2 3 + + Using the default ``formatter`` for unspecified levels + + >>> df = pd.DataFrame([[1, 2, 3]], + ... columns=pd.MultiIndex.from_arrays([["a", "a", "b"],[2, np.nan, 4]])) + >>> df.style.format_index({0: lambda v: v.upper()}, axis=1, precision=1) + ... # doctest: +SKIP + A B + 2.0 nan 4.0 + 0 1 2 3 + + Using a callable ``formatter`` function. + + >>> func = lambda s: 'STRING' if isinstance(s, str) else 'FLOAT' + >>> df.style.format_index(func, axis=1, na_rep='MISS') + ... # doctest: +SKIP + STRING STRING + FLOAT MISS FLOAT + 0 1 2 3 + + Using a ``formatter`` with HTML ``escape`` and ``na_rep``. + + >>> df = pd.DataFrame([[1, 2, 3]], columns=['"A"', 'A&B', None]) + >>> s = df.style.format_index('$ {0}', axis=1, escape="html", na_rep="NA") + ... # doctest: +SKIP + + + or element. + """ + if "display_value" not in kwargs: + kwargs["display_value"] = value + return { + "type": html_element, + "value": value, + "class": html_class, + "is_visible": is_visible, + **kwargs, + } + + +def _get_trimming_maximums( + rn, + cn, + max_elements, + max_rows=None, + max_cols=None, + scaling_factor: float = 0.8, +) -> tuple[int, int]: + """ + Recursively reduce the number of rows and columns to satisfy max elements. + + Parameters + ---------- + rn, cn : int + The number of input rows / columns + max_elements : int + The number of allowable elements + max_rows, max_cols : int, optional + Directly specify an initial maximum rows or columns before compression. + scaling_factor : float + Factor at which to reduce the number of rows / columns to fit. + + Returns + ------- + rn, cn : tuple + New rn and cn values that satisfy the max_elements constraint + """ + + def scale_down(rn, cn): + if cn >= rn: + return rn, int(cn * scaling_factor) + else: + return int(rn * scaling_factor), cn + + if max_rows: + rn = max_rows if rn > max_rows else rn + if max_cols: + cn = max_cols if cn > max_cols else cn + + while rn * cn > max_elements: + rn, cn = scale_down(rn, cn) + + return rn, cn + + +def _get_level_lengths( + index: Index, + sparsify: bool, + max_index: int, + hidden_elements: Sequence[int] | None = None, +): + """ + Given an index, find the level length for each element. + + Parameters + ---------- + index : Index + Index or columns to determine lengths of each element + sparsify : bool + Whether to hide or show each distinct element in a MultiIndex + max_index : int + The maximum number of elements to analyse along the index due to trimming + hidden_elements : sequence of int + Index positions of elements hidden from display in the index affecting + length + + Returns + ------- + Dict : + Result is a dictionary of (level, initial_position): span + """ + if isinstance(index, MultiIndex): + levels = index._format_multi(sparsify=lib.no_default, include_names=False) + else: + levels = index._format_flat(include_name=False) + + if hidden_elements is None: + hidden_elements = [] + + lengths = {} + if not isinstance(index, MultiIndex): + for i, value in enumerate(levels): + if i not in hidden_elements: + lengths[(0, i)] = 1 + return lengths + + for i, lvl in enumerate(levels): + visible_row_count = 0 # used to break loop due to display trimming + for j, row in enumerate(lvl): + if visible_row_count > max_index: + break + if not sparsify: + # then lengths will always equal 1 since no aggregation. + if j not in hidden_elements: + lengths[(i, j)] = 1 + visible_row_count += 1 + elif (row is not lib.no_default) and (j not in hidden_elements): + # this element has not been sparsified so must be the start of section + last_label = j + lengths[(i, last_label)] = 1 + visible_row_count += 1 + elif row is not lib.no_default: + # even if the above is hidden, keep track of it in case length > 1 and + # later elements are visible + last_label = j + lengths[(i, last_label)] = 0 + elif j not in hidden_elements: + # then element must be part of sparsified section and is visible + visible_row_count += 1 + if visible_row_count > max_index: + break # do not add a length since the render trim limit reached + if lengths[(i, last_label)] == 0: + # if previous iteration was first-of-section but hidden then offset + last_label = j + lengths[(i, last_label)] = 1 + else: + # else add to previous iteration + lengths[(i, last_label)] += 1 + + non_zero_lengths = { + element: length for element, length in lengths.items() if length >= 1 + } + + return non_zero_lengths + + +def _is_visible(idx_row, idx_col, lengths) -> bool: + """ + Index -> {(idx_row, idx_col): bool}). + """ + return (idx_col, idx_row) in lengths + + +def format_table_styles(styles: CSSStyles) -> CSSStyles: + """ + looks for multiple CSS selectors and separates them: + [{'selector': 'td, th', 'props': 'a:v;'}] + ---> [{'selector': 'td', 'props': 'a:v;'}, + {'selector': 'th', 'props': 'a:v;'}] + """ + return [ + {"selector": selector, "props": css_dict["props"]} + for css_dict in styles + for selector in css_dict["selector"].split(",") + ] + + +def _default_formatter(x: Any, precision: int, thousands: bool = False) -> Any: + """ + Format the display of a value + + Parameters + ---------- + x : Any + Input variable to be formatted + precision : Int + Floating point precision used if ``x`` is float or complex. + thousands : bool, default False + Whether to group digits with thousands separated with ",". + + Returns + ------- + value : Any + Matches input type, or string if input is float or complex or int with sep. + """ + if is_float(x) or is_complex(x): + return f"{x:,.{precision}f}" if thousands else f"{x:.{precision}f}" + elif is_integer(x): + return f"{x:,}" if thousands else str(x) + return x + + +def _wrap_decimal_thousands( + formatter: Callable, decimal: str, thousands: str | None +) -> Callable: + """ + Takes a string formatting function and wraps logic to deal with thousands and + decimal parameters, in the case that they are non-standard and that the input + is a (float, complex, int). + """ + + def wrapper(x): + if is_float(x) or is_integer(x) or is_complex(x): + if decimal != "." and thousands is not None and thousands != ",": + return ( + formatter(x) + .replace(",", "§_§-") # rare string to avoid "," <-> "." clash. + .replace(".", decimal) + .replace("§_§-", thousands) + ) + elif decimal != "." and (thousands is None or thousands == ","): + return formatter(x).replace(".", decimal) + elif decimal == "." and thousands is not None and thousands != ",": + return formatter(x).replace(",", thousands) + return formatter(x) + + return wrapper + + +def _str_escape(x, escape): + """if escaping: only use on str, else return input""" + if isinstance(x, str): + if escape == "html": + return escape_html(x) + elif escape == "latex": + return _escape_latex(x) + elif escape == "latex-math": + return _escape_latex_math(x) + else: + raise ValueError( + f"`escape` only permitted in {{'html', 'latex', 'latex-math'}}, \ +got {escape}" + ) + return x + + +def _render_href(x, format): + """uses regex to detect a common URL pattern and converts to href tag in format.""" + if isinstance(x, str): + if format == "html": + href = '{0}' + elif format == "latex": + href = r"\href{{{0}}}{{{0}}}" + else: + raise ValueError("``hyperlinks`` format can only be 'html' or 'latex'") + pat = r"((http|ftp)s?:\/\/|www.)[\w/\-?=%.:@]+\.[\w/\-&?=%.,':;~!@#$*()\[\]]+" + return re.sub(pat, lambda m: href.format(m.group(0)), x) + return x + + +def _maybe_wrap_formatter( + formatter: BaseFormatter | None = None, + na_rep: str | None = None, + precision: int | None = None, + decimal: str = ".", + thousands: str | None = None, + escape: str | None = None, + hyperlinks: str | None = None, +) -> Callable: + """ + Allows formatters to be expressed as str, callable or None, where None returns + a default formatting function. wraps with na_rep, and precision where they are + available. + """ + # Get initial func from input string, input callable, or from default factory + if isinstance(formatter, str): + func_0 = lambda x: formatter.format(x) + elif callable(formatter): + func_0 = formatter + elif formatter is None: + precision = ( + get_option("styler.format.precision") if precision is None else precision + ) + func_0 = partial( + _default_formatter, precision=precision, thousands=(thousands is not None) + ) + else: + raise TypeError(f"'formatter' expected str or callable, got {type(formatter)}") + + # Replace chars if escaping + if escape is not None: + func_1 = lambda x: func_0(_str_escape(x, escape=escape)) + else: + func_1 = func_0 + + # Replace decimals and thousands if non-standard inputs detected + if decimal != "." or (thousands is not None and thousands != ","): + func_2 = _wrap_decimal_thousands(func_1, decimal=decimal, thousands=thousands) + else: + func_2 = func_1 + + # Render links + if hyperlinks is not None: + func_3 = lambda x: func_2(_render_href(x, format=hyperlinks)) + else: + func_3 = func_2 + + # Replace missing values if na_rep + if na_rep is None: + return func_3 + else: + return lambda x: na_rep if (isna(x) is True) else func_3(x) + + +def non_reducing_slice(slice_: Subset): + """ + Ensure that a slice doesn't reduce to a Series or Scalar. + + Any user-passed `subset` should have this called on it + to make sure we're always working with DataFrames. + """ + # default to column slice, like DataFrame + # ['A', 'B'] -> IndexSlices[:, ['A', 'B']] + kinds = (ABCSeries, np.ndarray, Index, list, str) + if isinstance(slice_, kinds): + slice_ = IndexSlice[:, slice_] + + def pred(part) -> bool: + """ + Returns + ------- + bool + True if slice does *not* reduce, + False if `part` is a tuple. + """ + # true when slice does *not* reduce, False when part is a tuple, + # i.e. MultiIndex slice + if isinstance(part, tuple): + # GH#39421 check for sub-slice: + return any((isinstance(s, slice) or is_list_like(s)) for s in part) + else: + return isinstance(part, slice) or is_list_like(part) + + if not is_list_like(slice_): + if not isinstance(slice_, slice): + # a 1-d slice, like df.loc[1] + slice_ = [[slice_]] + else: + # slice(a, b, c) + slice_ = [slice_] # to tuplize later + else: + # error: Item "slice" of "Union[slice, Sequence[Any]]" has no attribute + # "__iter__" (not iterable) -> is specifically list_like in conditional + slice_ = [p if pred(p) else [p] for p in slice_] # type: ignore[union-attr] + return tuple(slice_) + + +def maybe_convert_css_to_tuples(style: CSSProperties) -> CSSList: + """ + Convert css-string to sequence of tuples format if needed. + 'color:red; border:1px solid black;' -> [('color', 'red'), + ('border','1px solid red')] + """ + if isinstance(style, str): + s = style.split(";") + try: + return [ + (x.split(":")[0].strip(), x.split(":")[1].strip()) + for x in s + if x.strip() != "" + ] + except IndexError: + raise ValueError( + "Styles supplied as string must follow CSS rule formats, " + f"for example 'attr: val;'. '{style}' was given." + ) + return style + + +def refactor_levels( + level: Level | list[Level] | None, + obj: Index, +) -> list[int]: + """ + Returns a consistent levels arg for use in ``hide_index`` or ``hide_columns``. + + Parameters + ---------- + level : int, str, list + Original ``level`` arg supplied to above methods. + obj: + Either ``self.index`` or ``self.columns`` + + Returns + ------- + list : refactored arg with a list of levels to hide + """ + if level is None: + levels_: list[int] = list(range(obj.nlevels)) + elif isinstance(level, int): + levels_ = [level] + elif isinstance(level, str): + levels_ = [obj._get_level_number(level)] + elif isinstance(level, list): + levels_ = [ + obj._get_level_number(lev) if not isinstance(lev, int) else lev + for lev in level + ] + else: + raise ValueError("`level` must be of type `int`, `str` or list of such") + return levels_ + + +class Tooltips: + """ + An extension to ``Styler`` that allows for and manipulates tooltips on hover + of ``
<div></div>"A&B"NA$ "A"$ A&BNA + ... + + Using a ``formatter`` with LaTeX ``escape``. + + >>> df = pd.DataFrame([[1, 2, 3]], columns=["123", "~", "$%#"]) + >>> df.style.format_index("\\textbf{{{}}}", escape="latex", axis=1).to_latex() + ... # doctest: +SKIP + \begin{tabular}{lrrr} + {} & {\textbf{123}} & {\textbf{\textasciitilde }} & {\textbf{\$\%\#}} \\ + 0 & 1 & 2 & 3 \\ + \end{tabular} + """ + axis = self.data._get_axis_number(axis) + if axis == 0: + display_funcs_, obj = self._display_funcs_index, self.index + else: + display_funcs_, obj = self._display_funcs_columns, self.columns + levels_ = refactor_levels(level, obj) + + if all( + ( + formatter is None, + level is None, + precision is None, + decimal == ".", + thousands is None, + na_rep is None, + escape is None, + hyperlinks is None, + ) + ): + display_funcs_.clear() + return self # clear the formatter / revert to default and avoid looping + + if not isinstance(formatter, dict): + formatter = {level: formatter for level in levels_} + else: + formatter = { + obj._get_level_number(level): formatter_ + for level, formatter_ in formatter.items() + } + + for lvl in levels_: + format_func = _maybe_wrap_formatter( + formatter.get(lvl), + na_rep=na_rep, + precision=precision, + decimal=decimal, + thousands=thousands, + escape=escape, + hyperlinks=hyperlinks, + ) + + for idx in [(i, lvl) if axis == 0 else (lvl, i) for i in range(len(obj))]: + display_funcs_[idx] = format_func + + return self + + def relabel_index( + self, + labels: Sequence | Index, + axis: Axis = 0, + level: Level | list[Level] | None = None, + ) -> StylerRenderer: + r""" + Relabel the index, or column header, keys to display a set of specified values. + + .. versionadded:: 1.5.0 + + Parameters + ---------- + labels : list-like or Index + New labels to display. Must have same length as the underlying values not + hidden. + axis : {"index", 0, "columns", 1} + Apply to the index or columns. + level : int, str, list, optional + The level(s) over which to apply the new labels. If `None` will apply + to all levels of an Index or MultiIndex which are not hidden. + + Returns + ------- + Styler + + See Also + -------- + Styler.format_index: Format the text display value of index or column headers. + Styler.hide: Hide the index, column headers, or specified data from display. + + Notes + ----- + As part of Styler, this method allows the display of an index to be + completely user-specified without affecting the underlying DataFrame data, + index, or column headers. This means that the flexibility of indexing is + maintained whilst the final display is customisable. + + Since Styler is designed to be progressively constructed with method chaining, + this method is adapted to react to the **currently specified hidden elements**. + This is useful because it means one does not have to specify all the new + labels if the majority of an index, or column headers, have already been hidden. + The following produce equivalent display (note the length of ``labels`` in + each case). + + .. code-block:: python + + # relabel first, then hide + df = pd.DataFrame({"col": ["a", "b", "c"]}) + df.style.relabel_index(["A", "B", "C"]).hide([0,1]) + # hide first, then relabel + df = pd.DataFrame({"col": ["a", "b", "c"]}) + df.style.hide([0,1]).relabel_index(["C"]) + + This method should be used, rather than :meth:`Styler.format_index`, in one of + the following cases (see examples): + + - A specified set of labels are required which are not a function of the + underlying index keys. + - The function of the underlying index keys requires a counter variable, + such as those available upon enumeration. + + Examples + -------- + Basic use + + >>> df = pd.DataFrame({"col": ["a", "b", "c"]}) + >>> df.style.relabel_index(["A", "B", "C"]) # doctest: +SKIP + col + A a + B b + C c + + Chaining with pre-hidden elements + + >>> df.style.hide([0,1]).relabel_index(["C"]) # doctest: +SKIP + col + C c + + Using a MultiIndex + + >>> midx = pd.MultiIndex.from_product([[0, 1], [0, 1], [0, 1]]) + >>> df = pd.DataFrame({"col": list(range(8))}, index=midx) + >>> styler = df.style # doctest: +SKIP + col + 0 0 0 0 + 1 1 + 1 0 2 + 1 3 + 1 0 0 4 + 1 5 + 1 0 6 + 1 7 + >>> styler.hide((midx.get_level_values(0)==0)|(midx.get_level_values(1)==0)) + ... # doctest: +SKIP + >>> styler.hide(level=[0,1]) # doctest: +SKIP + >>> styler.relabel_index(["binary6", "binary7"]) # doctest: +SKIP + col + binary6 6 + binary7 7 + + We can also achieve the above by indexing first and then re-labeling + + >>> styler = df.loc[[(1,1,0), (1,1,1)]].style + >>> styler.hide(level=[0,1]).relabel_index(["binary6", "binary7"]) + ... # doctest: +SKIP + col + binary6 6 + binary7 7 + + Defining a formatting function which uses an enumeration counter. Also note + that the value of the index key is passed in the case of string labels so it + can also be inserted into the label, using curly brackets (or double curly + brackets if the string if pre-formatted), + + >>> df = pd.DataFrame({"samples": np.random.rand(10)}) + >>> styler = df.loc[np.random.randint(0,10,3)].style + >>> styler.relabel_index([f"sample{i+1} ({{}})" for i in range(3)]) + ... # doctest: +SKIP + samples + sample1 (5) 0.315811 + sample2 (0) 0.495941 + sample3 (2) 0.067946 + """ + axis = self.data._get_axis_number(axis) + if axis == 0: + display_funcs_, obj = self._display_funcs_index, self.index + hidden_labels, hidden_lvls = self.hidden_rows, self.hide_index_ + else: + display_funcs_, obj = self._display_funcs_columns, self.columns + hidden_labels, hidden_lvls = self.hidden_columns, self.hide_columns_ + visible_len = len(obj) - len(set(hidden_labels)) + if len(labels) != visible_len: + raise ValueError( + "``labels`` must be of length equal to the number of " + f"visible labels along ``axis`` ({visible_len})." + ) + + if level is None: + level = [i for i in range(obj.nlevels) if not hidden_lvls[i]] + levels_ = refactor_levels(level, obj) + + def alias_(x, value): + if isinstance(value, str): + return value.format(x) + return value + + for ai, i in enumerate([i for i in range(len(obj)) if i not in hidden_labels]): + if len(levels_) == 1: + idx = (i, levels_[0]) if axis == 0 else (levels_[0], i) + display_funcs_[idx] = partial(alias_, value=labels[ai]) + else: + for aj, lvl in enumerate(levels_): + idx = (i, lvl) if axis == 0 else (lvl, i) + display_funcs_[idx] = partial(alias_, value=labels[ai][aj]) + + return self + + +def _element( + html_element: str, + html_class: str | None, + value: Any, + is_visible: bool, + **kwargs, +) -> dict: + """ + Template to return container with information for a `` cells in the HTML result. + + Parameters + ---------- + css_name: str, default "pd-t" + Name of the CSS class that controls visualisation of tooltips. + css_props: list-like, default; see Notes + List of (attr, value) tuples defining properties of the CSS class. + tooltips: DataFrame, default empty + DataFrame of strings aligned with underlying Styler data for tooltip + display. + + Notes + ----- + The default properties for the tooltip CSS class are: + + - visibility: hidden + - position: absolute + - z-index: 1 + - background-color: black + - color: white + - transform: translate(-20px, -20px) + + Hidden visibility is a key prerequisite to the hover functionality, and should + always be included in any manual properties specification. + """ + + def __init__( + self, + css_props: CSSProperties = [ + ("visibility", "hidden"), + ("position", "absolute"), + ("z-index", 1), + ("background-color", "black"), + ("color", "white"), + ("transform", "translate(-20px, -20px)"), + ], + css_name: str = "pd-t", + tooltips: DataFrame = DataFrame(), + ) -> None: + self.class_name = css_name + self.class_properties = css_props + self.tt_data = tooltips + self.table_styles: CSSStyles = [] + + @property + def _class_styles(self): + """ + Combine the ``_Tooltips`` CSS class name and CSS properties to the format + required to extend the underlying ``Styler`` `table_styles` to allow + tooltips to render in HTML. + + Returns + ------- + styles : List + """ + return [ + { + "selector": f".{self.class_name}", + "props": maybe_convert_css_to_tuples(self.class_properties), + } + ] + + def _pseudo_css(self, uuid: str, name: str, row: int, col: int, text: str): + """ + For every table data-cell that has a valid tooltip (not None, NaN or + empty string) must create two pseudo CSS entries for the specific + element id which are added to overall table styles: + an on hover visibility change and a content change + dependent upon the user's chosen display string. + + For example: + [{"selector": "T__row1_col1:hover .pd-t", + "props": [("visibility", "visible")]}, + {"selector": "T__row1_col1 .pd-t::after", + "props": [("content", "Some Valid Text String")]}] + + Parameters + ---------- + uuid: str + The uuid of the Styler instance + name: str + The css-name of the class used for styling tooltips + row : int + The row index of the specified tooltip string data + col : int + The col index of the specified tooltip string data + text : str + The textual content of the tooltip to be displayed in HTML. + + Returns + ------- + pseudo_css : List + """ + selector_id = "#T_" + uuid + "_row" + str(row) + "_col" + str(col) + return [ + { + "selector": selector_id + f":hover .{name}", + "props": [("visibility", "visible")], + }, + { + "selector": selector_id + f" .{name}::after", + "props": [("content", f'"{text}"')], + }, + ] + + def _translate(self, styler: StylerRenderer, d: dict): + """ + Mutate the render dictionary to allow for tooltips: + + - Add ```` HTML element to each data cells ``display_value``. Ignores + headers. + - Add table level CSS styles to control pseudo classes. + + Parameters + ---------- + styler_data : DataFrame + Underlying ``Styler`` DataFrame used for reindexing. + uuid : str + The underlying ``Styler`` uuid for CSS id. + d : dict + The dictionary prior to final render + + Returns + ------- + render_dict : Dict + """ + self.tt_data = self.tt_data.reindex_like(styler.data) + if self.tt_data.empty: + return d + + name = self.class_name + mask = (self.tt_data.isna()) | (self.tt_data.eq("")) # empty string = no ttip + self.table_styles = [ + style + for sublist in [ + self._pseudo_css(styler.uuid, name, i, j, str(self.tt_data.iloc[i, j])) + for i in range(len(self.tt_data.index)) + for j in range(len(self.tt_data.columns)) + if not ( + mask.iloc[i, j] + or i in styler.hidden_rows + or j in styler.hidden_columns + ) + ] + for style in sublist + ] + + if self.table_styles: + # add span class to every cell only if at least 1 non-empty tooltip + for row in d["body"]: + for item in row: + if item["type"] == "td": + item["display_value"] = ( + str(item["display_value"]) + + f'' + ) + d["table_styles"].extend(self._class_styles) + d["table_styles"].extend(self.table_styles) + + return d + + +def _parse_latex_table_wrapping(table_styles: CSSStyles, caption: str | None) -> bool: + """ + Indicate whether LaTeX {tabular} should be wrapped with a {table} environment. + + Parses the `table_styles` and detects any selectors which must be included outside + of {tabular}, i.e. indicating that wrapping must occur, and therefore return True, + or if a caption exists and requires similar. + """ + IGNORED_WRAPPERS = ["toprule", "midrule", "bottomrule", "column_format"] + # ignored selectors are included with {tabular} so do not need wrapping + return ( + table_styles is not None + and any(d["selector"] not in IGNORED_WRAPPERS for d in table_styles) + ) or caption is not None + + +def _parse_latex_table_styles(table_styles: CSSStyles, selector: str) -> str | None: + """ + Return the first 'props' 'value' from ``tables_styles`` identified by ``selector``. + + Examples + -------- + >>> table_styles = [{'selector': 'foo', 'props': [('attr','value')]}, + ... {'selector': 'bar', 'props': [('attr', 'overwritten')]}, + ... {'selector': 'bar', 'props': [('a1', 'baz'), ('a2', 'ignore')]}] + >>> _parse_latex_table_styles(table_styles, selector='bar') + 'baz' + + Notes + ----- + The replacement of "§" with ":" is to avoid the CSS problem where ":" has structural + significance and cannot be used in LaTeX labels, but is often required by them. + """ + for style in table_styles[::-1]: # in reverse for most recently applied style + if style["selector"] == selector: + return str(style["props"][0][1]).replace("§", ":") + return None + + +def _parse_latex_cell_styles( + latex_styles: CSSList, display_value: str, convert_css: bool = False +) -> str: + r""" + Mutate the ``display_value`` string including LaTeX commands from ``latex_styles``. + + This method builds a recursive latex chain of commands based on the + CSSList input, nested around ``display_value``. + + If a CSS style is given as ('', '') this is translated to + '\{display_value}', and this value is treated as the + display value for the next iteration. + + The most recent style forms the inner component, for example for styles: + `[('c1', 'o1'), ('c2', 'o2')]` this returns: `\c1o1{\c2o2{display_value}}` + + Sometimes latex commands have to be wrapped with curly braces in different ways: + We create some parsing flags to identify the different behaviours: + + - `--rwrap` : `\{}` + - `--wrap` : `{\ }` + - `--nowrap` : `\ ` + - `--lwrap` : `{\} ` + - `--dwrap` : `{\}{}` + + For example for styles: + `[('c1', 'o1--wrap'), ('c2', 'o2')]` this returns: `{\c1o1 \c2o2{display_value}} + """ + if convert_css: + latex_styles = _parse_latex_css_conversion(latex_styles) + for command, options in latex_styles[::-1]: # in reverse for most recent style + formatter = { + "--wrap": f"{{\\{command}--to_parse {display_value}}}", + "--nowrap": f"\\{command}--to_parse {display_value}", + "--lwrap": f"{{\\{command}--to_parse}} {display_value}", + "--rwrap": f"\\{command}--to_parse{{{display_value}}}", + "--dwrap": f"{{\\{command}--to_parse}}{{{display_value}}}", + } + display_value = f"\\{command}{options} {display_value}" + for arg in ["--nowrap", "--wrap", "--lwrap", "--rwrap", "--dwrap"]: + if arg in str(options): + display_value = formatter[arg].replace( + "--to_parse", _parse_latex_options_strip(value=options, arg=arg) + ) + break # only ever one purposeful entry + return display_value + + +def _parse_latex_header_span( + cell: dict[str, Any], + multirow_align: str, + multicol_align: str, + wrap: bool = False, + convert_css: bool = False, +) -> str: + r""" + Refactor the cell `display_value` if a 'colspan' or 'rowspan' attribute is present. + + 'rowspan' and 'colspan' do not occur simultaneouly. If they are detected then + the `display_value` is altered to a LaTeX `multirow` or `multicol` command + respectively, with the appropriate cell-span. + + ``wrap`` is used to enclose the `display_value` in braces which is needed for + column headers using an siunitx package. + + Requires the package {multirow}, whereas multicol support is usually built in + to the {tabular} environment. + + Examples + -------- + >>> cell = {'cellstyle': '', 'display_value':'text', 'attributes': 'colspan="3"'} + >>> _parse_latex_header_span(cell, 't', 'c') + '\\multicolumn{3}{c}{text}' + """ + display_val = _parse_latex_cell_styles( + cell["cellstyle"], cell["display_value"], convert_css + ) + if "attributes" in cell: + attrs = cell["attributes"] + if 'colspan="' in attrs: + colspan = attrs[attrs.find('colspan="') + 9 :] # len('colspan="') = 9 + colspan = int(colspan[: colspan.find('"')]) + if "naive-l" == multicol_align: + out = f"{{{display_val}}}" if wrap else f"{display_val}" + blanks = " & {}" if wrap else " &" + return out + blanks * (colspan - 1) + elif "naive-r" == multicol_align: + out = f"{{{display_val}}}" if wrap else f"{display_val}" + blanks = "{} & " if wrap else "& " + return blanks * (colspan - 1) + out + return f"\\multicolumn{{{colspan}}}{{{multicol_align}}}{{{display_val}}}" + elif 'rowspan="' in attrs: + if multirow_align == "naive": + return display_val + rowspan = attrs[attrs.find('rowspan="') + 9 :] + rowspan = int(rowspan[: rowspan.find('"')]) + return f"\\multirow[{multirow_align}]{{{rowspan}}}{{*}}{{{display_val}}}" + if wrap: + return f"{{{display_val}}}" + else: + return display_val + + +def _parse_latex_options_strip(value: str | float, arg: str) -> str: + """ + Strip a css_value which may have latex wrapping arguments, css comment identifiers, + and whitespaces, to a valid string for latex options parsing. + + For example: 'red /* --wrap */ ' --> 'red' + """ + return str(value).replace(arg, "").replace("/*", "").replace("*/", "").strip() + + +def _parse_latex_css_conversion(styles: CSSList) -> CSSList: + """ + Convert CSS (attribute,value) pairs to equivalent LaTeX (command,options) pairs. + + Ignore conversion if tagged with `--latex` option, skipped if no conversion found. + """ + + def font_weight(value, arg): + if value in ("bold", "bolder"): + return "bfseries", f"{arg}" + return None + + def font_style(value, arg): + if value == "italic": + return "itshape", f"{arg}" + if value == "oblique": + return "slshape", f"{arg}" + return None + + def color(value, user_arg, command, comm_arg): + """ + CSS colors have 5 formats to process: + + - 6 digit hex code: "#ff23ee" --> [HTML]{FF23EE} + - 3 digit hex code: "#f0e" --> [HTML]{FF00EE} + - rgba: rgba(128, 255, 0, 0.5) --> [rgb]{0.502, 1.000, 0.000} + - rgb: rgb(128, 255, 0,) --> [rbg]{0.502, 1.000, 0.000} + - string: red --> {red} + + Additionally rgb or rgba can be expressed in % which is also parsed. + """ + arg = user_arg if user_arg != "" else comm_arg + + if value[0] == "#" and len(value) == 7: # color is hex code + return command, f"[HTML]{{{value[1:].upper()}}}{arg}" + if value[0] == "#" and len(value) == 4: # color is short hex code + val = f"{value[1].upper()*2}{value[2].upper()*2}{value[3].upper()*2}" + return command, f"[HTML]{{{val}}}{arg}" + elif value[:3] == "rgb": # color is rgb or rgba + r = re.findall("(?<=\\()[0-9\\s%]+(?=,)", value)[0].strip() + r = float(r[:-1]) / 100 if "%" in r else int(r) / 255 + g = re.findall("(?<=,)[0-9\\s%]+(?=,)", value)[0].strip() + g = float(g[:-1]) / 100 if "%" in g else int(g) / 255 + if value[3] == "a": # color is rgba + b = re.findall("(?<=,)[0-9\\s%]+(?=,)", value)[1].strip() + else: # color is rgb + b = re.findall("(?<=,)[0-9\\s%]+(?=\\))", value)[0].strip() + b = float(b[:-1]) / 100 if "%" in b else int(b) / 255 + return command, f"[rgb]{{{r:.3f}, {g:.3f}, {b:.3f}}}{arg}" + else: + return command, f"{{{value}}}{arg}" # color is likely string-named + + CONVERTED_ATTRIBUTES: dict[str, Callable] = { + "font-weight": font_weight, + "background-color": partial(color, command="cellcolor", comm_arg="--lwrap"), + "color": partial(color, command="color", comm_arg=""), + "font-style": font_style, + } + + latex_styles: CSSList = [] + for attribute, value in styles: + if isinstance(value, str) and "--latex" in value: + # return the style without conversion but drop '--latex' + latex_styles.append((attribute, value.replace("--latex", ""))) + if attribute in CONVERTED_ATTRIBUTES: + arg = "" + for x in ["--wrap", "--nowrap", "--lwrap", "--dwrap", "--rwrap"]: + if x in str(value): + arg, value = x, _parse_latex_options_strip(value, x) + break + latex_style = CONVERTED_ATTRIBUTES[attribute](value, arg) + if latex_style is not None: + latex_styles.extend([latex_style]) + return latex_styles + + +def _escape_latex(s: str) -> str: + r""" + Replace the characters ``&``, ``%``, ``$``, ``#``, ``_``, ``{``, ``}``, + ``~``, ``^``, and ``\`` in the string with LaTeX-safe sequences. + + Use this if you need to display text that might contain such characters in LaTeX. + + Parameters + ---------- + s : str + Input to be escaped + + Return + ------ + str : + Escaped string + """ + return ( + s.replace("\\", "ab2§=§8yz") # rare string for final conversion: avoid \\ clash + .replace("ab2§=§8yz ", "ab2§=§8yz\\space ") # since \backslash gobbles spaces + .replace("&", "\\&") + .replace("%", "\\%") + .replace("$", "\\$") + .replace("#", "\\#") + .replace("_", "\\_") + .replace("{", "\\{") + .replace("}", "\\}") + .replace("~ ", "~\\space ") # since \textasciitilde gobbles spaces + .replace("~", "\\textasciitilde ") + .replace("^ ", "^\\space ") # since \textasciicircum gobbles spaces + .replace("^", "\\textasciicircum ") + .replace("ab2§=§8yz", "\\textbackslash ") + ) + + +def _math_mode_with_dollar(s: str) -> str: + r""" + All characters in LaTeX math mode are preserved. + + The substrings in LaTeX math mode, which start with + the character ``$`` and end with ``$``, are preserved + without escaping. Otherwise regular LaTeX escaping applies. + + Parameters + ---------- + s : str + Input to be escaped + + Return + ------ + str : + Escaped string + """ + s = s.replace(r"\$", r"rt8§=§7wz") + pattern = re.compile(r"\$.*?\$") + pos = 0 + ps = pattern.search(s, pos) + res = [] + while ps: + res.append(_escape_latex(s[pos : ps.span()[0]])) + res.append(ps.group()) + pos = ps.span()[1] + ps = pattern.search(s, pos) + + res.append(_escape_latex(s[pos : len(s)])) + return "".join(res).replace(r"rt8§=§7wz", r"\$") + + +def _math_mode_with_parentheses(s: str) -> str: + r""" + All characters in LaTeX math mode are preserved. + + The substrings in LaTeX math mode, which start with + the character ``\(`` and end with ``\)``, are preserved + without escaping. Otherwise regular LaTeX escaping applies. + + Parameters + ---------- + s : str + Input to be escaped + + Return + ------ + str : + Escaped string + """ + s = s.replace(r"\(", r"LEFT§=§6yzLEFT").replace(r"\)", r"RIGHTab5§=§RIGHT") + res = [] + for item in re.split(r"LEFT§=§6yz|ab5§=§RIGHT", s): + if item.startswith("LEFT") and item.endswith("RIGHT"): + res.append(item.replace("LEFT", r"\(").replace("RIGHT", r"\)")) + elif "LEFT" in item and "RIGHT" in item: + res.append( + _escape_latex(item).replace("LEFT", r"\(").replace("RIGHT", r"\)") + ) + else: + res.append( + _escape_latex(item) + .replace("LEFT", r"\textbackslash (") + .replace("RIGHT", r"\textbackslash )") + ) + return "".join(res) + + +def _escape_latex_math(s: str) -> str: + r""" + All characters in LaTeX math mode are preserved. + + The substrings in LaTeX math mode, which either are surrounded + by two characters ``$`` or start with the character ``\(`` and end with ``\)``, + are preserved without escaping. Otherwise regular LaTeX escaping applies. + + Parameters + ---------- + s : str + Input to be escaped + + Return + ------ + str : + Escaped string + """ + s = s.replace(r"\$", r"rt8§=§7wz") + ps_d = re.compile(r"\$.*?\$").search(s, 0) + ps_p = re.compile(r"\(.*?\)").search(s, 0) + mode = [] + if ps_d: + mode.append(ps_d.span()[0]) + if ps_p: + mode.append(ps_p.span()[0]) + if len(mode) == 0: + return _escape_latex(s.replace(r"rt8§=§7wz", r"\$")) + if s[mode[0]] == r"$": + return _math_mode_with_dollar(s.replace(r"rt8§=§7wz", r"\$")) + if s[mode[0] - 1 : mode[0] + 1] == r"\(": + return _math_mode_with_parentheses(s.replace(r"rt8§=§7wz", r"\$")) + else: + return _escape_latex(s.replace(r"rt8§=§7wz", r"\$")) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/io/formats/xml.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/io/formats/xml.py new file mode 100644 index 0000000000000000000000000000000000000000..f56fca8d7ef4446727bfa34166b0c6b5a2856338 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/io/formats/xml.py @@ -0,0 +1,560 @@ +""" +:mod:`pandas.io.formats.xml` is a module for formatting data in XML. +""" +from __future__ import annotations + +import codecs +import io +from typing import ( + TYPE_CHECKING, + Any, + final, +) +import warnings + +from pandas.errors import AbstractMethodError +from pandas.util._decorators import ( + cache_readonly, + doc, +) + +from pandas.core.dtypes.common import is_list_like +from pandas.core.dtypes.missing import isna + +from pandas.core.shared_docs import _shared_docs + +from pandas.io.common import get_handle +from pandas.io.xml import ( + get_data_from_filepath, + preprocess_data, +) + +if TYPE_CHECKING: + from pandas._typing import ( + CompressionOptions, + FilePath, + ReadBuffer, + StorageOptions, + WriteBuffer, + ) + + from pandas import DataFrame + + +@doc( + storage_options=_shared_docs["storage_options"], + compression_options=_shared_docs["compression_options"] % "path_or_buffer", +) +class _BaseXMLFormatter: + """ + Subclass for formatting data in XML. + + Parameters + ---------- + path_or_buffer : str or file-like + This can be either a string of raw XML, a valid URL, + file or file-like object. + + index : bool + Whether to include index in xml document. + + row_name : str + Name for root of xml document. Default is 'data'. + + root_name : str + Name for row elements of xml document. Default is 'row'. + + na_rep : str + Missing data representation. + + attrs_cols : list + List of columns to write as attributes in row element. + + elem_cols : list + List of columns to write as children in row element. + + namespaces : dict + The namespaces to define in XML document as dicts with key + being namespace and value the URI. + + prefix : str + The prefix for each element in XML document including root. + + encoding : str + Encoding of xml object or document. + + xml_declaration : bool + Whether to include xml declaration at top line item in xml. + + pretty_print : bool + Whether to write xml document with line breaks and indentation. + + stylesheet : str or file-like + A URL, file, file-like object, or a raw string containing XSLT. + + {compression_options} + + .. versionchanged:: 1.4.0 Zstandard support. + + {storage_options} + + See also + -------- + pandas.io.formats.xml.EtreeXMLFormatter + pandas.io.formats.xml.LxmlXMLFormatter + + """ + + def __init__( + self, + frame: DataFrame, + path_or_buffer: FilePath | WriteBuffer[bytes] | WriteBuffer[str] | None = None, + index: bool = True, + root_name: str | None = "data", + row_name: str | None = "row", + na_rep: str | None = None, + attr_cols: list[str] | None = None, + elem_cols: list[str] | None = None, + namespaces: dict[str | None, str] | None = None, + prefix: str | None = None, + encoding: str = "utf-8", + xml_declaration: bool | None = True, + pretty_print: bool | None = True, + stylesheet: FilePath | ReadBuffer[str] | ReadBuffer[bytes] | None = None, + compression: CompressionOptions = "infer", + storage_options: StorageOptions | None = None, + ) -> None: + self.frame = frame + self.path_or_buffer = path_or_buffer + self.index = index + self.root_name = root_name + self.row_name = row_name + self.na_rep = na_rep + self.attr_cols = attr_cols + self.elem_cols = elem_cols + self.namespaces = namespaces + self.prefix = prefix + self.encoding = encoding + self.xml_declaration = xml_declaration + self.pretty_print = pretty_print + self.stylesheet = stylesheet + self.compression: CompressionOptions = compression + self.storage_options = storage_options + + self.orig_cols = self.frame.columns.tolist() + self.frame_dicts = self._process_dataframe() + + self._validate_columns() + self._validate_encoding() + self.prefix_uri = self._get_prefix_uri() + self._handle_indexes() + + def _build_tree(self) -> bytes: + """ + Build tree from data. + + This method initializes the root and builds attributes and elements + with optional namespaces. + """ + raise AbstractMethodError(self) + + @final + def _validate_columns(self) -> None: + """ + Validate elems_cols and attrs_cols. + + This method will check if columns is list-like. + + Raises + ------ + ValueError + * If value is not a list and less then length of nodes. + """ + if self.attr_cols and not is_list_like(self.attr_cols): + raise TypeError( + f"{type(self.attr_cols).__name__} is not a valid type for attr_cols" + ) + + if self.elem_cols and not is_list_like(self.elem_cols): + raise TypeError( + f"{type(self.elem_cols).__name__} is not a valid type for elem_cols" + ) + + @final + def _validate_encoding(self) -> None: + """ + Validate encoding. + + This method will check if encoding is among listed under codecs. + + Raises + ------ + LookupError + * If encoding is not available in codecs. + """ + + codecs.lookup(self.encoding) + + @final + def _process_dataframe(self) -> dict[int | str, dict[str, Any]]: + """ + Adjust Data Frame to fit xml output. + + This method will adjust underlying data frame for xml output, + including optionally replacing missing values and including indexes. + """ + + df = self.frame + + if self.index: + df = df.reset_index() + + if self.na_rep is not None: + with warnings.catch_warnings(): + warnings.filterwarnings( + "ignore", + "Downcasting object dtype arrays", + category=FutureWarning, + ) + df = df.fillna(self.na_rep) + + return df.to_dict(orient="index") + + @final + def _handle_indexes(self) -> None: + """ + Handle indexes. + + This method will add indexes into attr_cols or elem_cols. + """ + + if not self.index: + return + + first_key = next(iter(self.frame_dicts)) + indexes: list[str] = [ + x for x in self.frame_dicts[first_key].keys() if x not in self.orig_cols + ] + + if self.attr_cols: + self.attr_cols = indexes + self.attr_cols + + if self.elem_cols: + self.elem_cols = indexes + self.elem_cols + + def _get_prefix_uri(self) -> str: + """ + Get uri of namespace prefix. + + This method retrieves corresponding URI to prefix in namespaces. + + Raises + ------ + KeyError + *If prefix is not included in namespace dict. + """ + + raise AbstractMethodError(self) + + @final + def _other_namespaces(self) -> dict: + """ + Define other namespaces. + + This method will build dictionary of namespaces attributes + for root element, conditionally with optional namespaces and + prefix. + """ + + nmsp_dict: dict[str, str] = {} + if self.namespaces: + nmsp_dict = { + f"xmlns{p if p=='' else f':{p}'}": n + for p, n in self.namespaces.items() + if n != self.prefix_uri[1:-1] + } + + return nmsp_dict + + @final + def _build_attribs(self, d: dict[str, Any], elem_row: Any) -> Any: + """ + Create attributes of row. + + This method adds attributes using attr_cols to row element and + works with tuples for multindex or hierarchical columns. + """ + + if not self.attr_cols: + return elem_row + + for col in self.attr_cols: + attr_name = self._get_flat_col_name(col) + try: + if not isna(d[col]): + elem_row.attrib[attr_name] = str(d[col]) + except KeyError: + raise KeyError(f"no valid column, {col}") + return elem_row + + @final + def _get_flat_col_name(self, col: str | tuple) -> str: + flat_col = col + if isinstance(col, tuple): + flat_col = ( + "".join([str(c) for c in col]).strip() + if "" in col + else "_".join([str(c) for c in col]).strip() + ) + return f"{self.prefix_uri}{flat_col}" + + @cache_readonly + def _sub_element_cls(self): + raise AbstractMethodError(self) + + @final + def _build_elems(self, d: dict[str, Any], elem_row: Any) -> None: + """ + Create child elements of row. + + This method adds child elements using elem_cols to row element and + works with tuples for multindex or hierarchical columns. + """ + sub_element_cls = self._sub_element_cls + + if not self.elem_cols: + return + + for col in self.elem_cols: + elem_name = self._get_flat_col_name(col) + try: + val = None if isna(d[col]) or d[col] == "" else str(d[col]) + sub_element_cls(elem_row, elem_name).text = val + except KeyError: + raise KeyError(f"no valid column, {col}") + + @final + def write_output(self) -> str | None: + xml_doc = self._build_tree() + + if self.path_or_buffer is not None: + with get_handle( + self.path_or_buffer, + "wb", + compression=self.compression, + storage_options=self.storage_options, + is_text=False, + ) as handles: + handles.handle.write(xml_doc) + return None + + else: + return xml_doc.decode(self.encoding).rstrip() + + +class EtreeXMLFormatter(_BaseXMLFormatter): + """ + Class for formatting data in xml using Python standard library + modules: `xml.etree.ElementTree` and `xml.dom.minidom`. + """ + + def _build_tree(self) -> bytes: + from xml.etree.ElementTree import ( + Element, + SubElement, + tostring, + ) + + self.root = Element( + f"{self.prefix_uri}{self.root_name}", attrib=self._other_namespaces() + ) + + for d in self.frame_dicts.values(): + elem_row = SubElement(self.root, f"{self.prefix_uri}{self.row_name}") + + if not self.attr_cols and not self.elem_cols: + self.elem_cols = list(d.keys()) + self._build_elems(d, elem_row) + + else: + elem_row = self._build_attribs(d, elem_row) + self._build_elems(d, elem_row) + + self.out_xml = tostring( + self.root, + method="xml", + encoding=self.encoding, + xml_declaration=self.xml_declaration, + ) + + if self.pretty_print: + self.out_xml = self._prettify_tree() + + if self.stylesheet is not None: + raise ValueError( + "To use stylesheet, you need lxml installed and selected as parser." + ) + + return self.out_xml + + def _get_prefix_uri(self) -> str: + from xml.etree.ElementTree import register_namespace + + uri = "" + if self.namespaces: + for p, n in self.namespaces.items(): + if isinstance(p, str) and isinstance(n, str): + register_namespace(p, n) + if self.prefix: + try: + uri = f"{{{self.namespaces[self.prefix]}}}" + except KeyError: + raise KeyError(f"{self.prefix} is not included in namespaces") + elif "" in self.namespaces: + uri = f'{{{self.namespaces[""]}}}' + else: + uri = "" + + return uri + + @cache_readonly + def _sub_element_cls(self): + from xml.etree.ElementTree import SubElement + + return SubElement + + def _prettify_tree(self) -> bytes: + """ + Output tree for pretty print format. + + This method will pretty print xml with line breaks and indentation. + """ + + from xml.dom.minidom import parseString + + dom = parseString(self.out_xml) + + return dom.toprettyxml(indent=" ", encoding=self.encoding) + + +class LxmlXMLFormatter(_BaseXMLFormatter): + """ + Class for formatting data in xml using Python standard library + modules: `xml.etree.ElementTree` and `xml.dom.minidom`. + """ + + def __init__(self, *args, **kwargs) -> None: + super().__init__(*args, **kwargs) + + self._convert_empty_str_key() + + def _build_tree(self) -> bytes: + """ + Build tree from data. + + This method initializes the root and builds attributes and elements + with optional namespaces. + """ + from lxml.etree import ( + Element, + SubElement, + tostring, + ) + + self.root = Element(f"{self.prefix_uri}{self.root_name}", nsmap=self.namespaces) + + for d in self.frame_dicts.values(): + elem_row = SubElement(self.root, f"{self.prefix_uri}{self.row_name}") + + if not self.attr_cols and not self.elem_cols: + self.elem_cols = list(d.keys()) + self._build_elems(d, elem_row) + + else: + elem_row = self._build_attribs(d, elem_row) + self._build_elems(d, elem_row) + + self.out_xml = tostring( + self.root, + pretty_print=self.pretty_print, + method="xml", + encoding=self.encoding, + xml_declaration=self.xml_declaration, + ) + + if self.stylesheet is not None: + self.out_xml = self._transform_doc() + + return self.out_xml + + def _convert_empty_str_key(self) -> None: + """ + Replace zero-length string in `namespaces`. + + This method will replace '' with None to align to `lxml` + requirement that empty string prefixes are not allowed. + """ + + if self.namespaces and "" in self.namespaces.keys(): + self.namespaces[None] = self.namespaces.pop("", "default") + + def _get_prefix_uri(self) -> str: + uri = "" + if self.namespaces: + if self.prefix: + try: + uri = f"{{{self.namespaces[self.prefix]}}}" + except KeyError: + raise KeyError(f"{self.prefix} is not included in namespaces") + elif "" in self.namespaces: + uri = f'{{{self.namespaces[""]}}}' + else: + uri = "" + + return uri + + @cache_readonly + def _sub_element_cls(self): + from lxml.etree import SubElement + + return SubElement + + def _transform_doc(self) -> bytes: + """ + Parse stylesheet from file or buffer and run it. + + This method will parse stylesheet object into tree for parsing + conditionally by its specific object type, then transforms + original tree with XSLT script. + """ + from lxml.etree import ( + XSLT, + XMLParser, + fromstring, + parse, + ) + + style_doc = self.stylesheet + assert style_doc is not None # is ensured by caller + + handle_data = get_data_from_filepath( + filepath_or_buffer=style_doc, + encoding=self.encoding, + compression=self.compression, + storage_options=self.storage_options, + ) + + with preprocess_data(handle_data) as xml_data: + curr_parser = XMLParser(encoding=self.encoding) + + if isinstance(xml_data, io.StringIO): + xsl_doc = fromstring( + xml_data.getvalue().encode(self.encoding), parser=curr_parser + ) + else: + xsl_doc = parse(xml_data, parser=curr_parser) + + transformer = XSLT(xsl_doc) + new_doc = transformer(self.root) + + return bytes(new_doc) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/io/json/__init__.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/io/json/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..8f4e7a62834b57c151189cdd2994a55d1ad9f7de --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/io/json/__init__.py @@ -0,0 +1,15 @@ +from pandas.io.json._json import ( + read_json, + to_json, + ujson_dumps, + ujson_loads, +) +from pandas.io.json._table_schema import build_table_schema + +__all__ = [ + "ujson_dumps", + "ujson_loads", + "read_json", + "to_json", + "build_table_schema", +] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/io/json/_json.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/io/json/_json.py new file mode 100644 index 0000000000000000000000000000000000000000..9414f4521502999881238cf0b6e0dcb5c7ab1ad5 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/io/json/_json.py @@ -0,0 +1,1505 @@ +from __future__ import annotations + +from abc import ( + ABC, + abstractmethod, +) +from collections import abc +from io import StringIO +from itertools import islice +from typing import ( + TYPE_CHECKING, + Any, + Callable, + Generic, + Literal, + TypeVar, + final, + overload, +) +import warnings + +import numpy as np + +from pandas._libs import lib +from pandas._libs.json import ( + ujson_dumps, + ujson_loads, +) +from pandas._libs.tslibs import iNaT +from pandas.compat._optional import import_optional_dependency +from pandas.errors import AbstractMethodError +from pandas.util._decorators import doc +from pandas.util._exceptions import find_stack_level +from pandas.util._validators import check_dtype_backend + +from pandas.core.dtypes.common import ( + ensure_str, + is_string_dtype, +) +from pandas.core.dtypes.dtypes import PeriodDtype + +from pandas import ( + ArrowDtype, + DataFrame, + Index, + MultiIndex, + Series, + isna, + notna, + to_datetime, +) +from pandas.core.reshape.concat import concat +from pandas.core.shared_docs import _shared_docs + +from pandas.io.common import ( + IOHandles, + dedup_names, + extension_to_compression, + file_exists, + get_handle, + is_fsspec_url, + is_potential_multi_index, + is_url, + stringify_path, +) +from pandas.io.json._normalize import convert_to_line_delimits +from pandas.io.json._table_schema import ( + build_table_schema, + parse_table_schema, +) +from pandas.io.parsers.readers import validate_integer + +if TYPE_CHECKING: + from collections.abc import ( + Hashable, + Mapping, + ) + from types import TracebackType + + from pandas._typing import ( + CompressionOptions, + DtypeArg, + DtypeBackend, + FilePath, + IndexLabel, + JSONEngine, + JSONSerializable, + ReadBuffer, + Self, + StorageOptions, + WriteBuffer, + ) + + from pandas.core.generic import NDFrame + +FrameSeriesStrT = TypeVar("FrameSeriesStrT", bound=Literal["frame", "series"]) + + +# interface to/from +@overload +def to_json( + path_or_buf: FilePath | WriteBuffer[str] | WriteBuffer[bytes], + obj: NDFrame, + orient: str | None = ..., + date_format: str = ..., + double_precision: int = ..., + force_ascii: bool = ..., + date_unit: str = ..., + default_handler: Callable[[Any], JSONSerializable] | None = ..., + lines: bool = ..., + compression: CompressionOptions = ..., + index: bool | None = ..., + indent: int = ..., + storage_options: StorageOptions = ..., + mode: Literal["a", "w"] = ..., +) -> None: + ... + + +@overload +def to_json( + path_or_buf: None, + obj: NDFrame, + orient: str | None = ..., + date_format: str = ..., + double_precision: int = ..., + force_ascii: bool = ..., + date_unit: str = ..., + default_handler: Callable[[Any], JSONSerializable] | None = ..., + lines: bool = ..., + compression: CompressionOptions = ..., + index: bool | None = ..., + indent: int = ..., + storage_options: StorageOptions = ..., + mode: Literal["a", "w"] = ..., +) -> str: + ... + + +def to_json( + path_or_buf: FilePath | WriteBuffer[str] | WriteBuffer[bytes] | None, + obj: NDFrame, + orient: str | None = None, + date_format: str = "epoch", + double_precision: int = 10, + force_ascii: bool = True, + date_unit: str = "ms", + default_handler: Callable[[Any], JSONSerializable] | None = None, + lines: bool = False, + compression: CompressionOptions = "infer", + index: bool | None = None, + indent: int = 0, + storage_options: StorageOptions | None = None, + mode: Literal["a", "w"] = "w", +) -> str | None: + if orient in ["records", "values"] and index is True: + raise ValueError( + "'index=True' is only valid when 'orient' is 'split', 'table', " + "'index', or 'columns'." + ) + elif orient in ["index", "columns"] and index is False: + raise ValueError( + "'index=False' is only valid when 'orient' is 'split', 'table', " + "'records', or 'values'." + ) + elif index is None: + # will be ignored for orient='records' and 'values' + index = True + + if lines and orient != "records": + raise ValueError("'lines' keyword only valid when 'orient' is records") + + if mode not in ["a", "w"]: + msg = ( + f"mode={mode} is not a valid option." + "Only 'w' and 'a' are currently supported." + ) + raise ValueError(msg) + + if mode == "a" and (not lines or orient != "records"): + msg = ( + "mode='a' (append) is only supported when " + "lines is True and orient is 'records'" + ) + raise ValueError(msg) + + if orient == "table" and isinstance(obj, Series): + obj = obj.to_frame(name=obj.name or "values") + + writer: type[Writer] + if orient == "table" and isinstance(obj, DataFrame): + writer = JSONTableWriter + elif isinstance(obj, Series): + writer = SeriesWriter + elif isinstance(obj, DataFrame): + writer = FrameWriter + else: + raise NotImplementedError("'obj' should be a Series or a DataFrame") + + s = writer( + obj, + orient=orient, + date_format=date_format, + double_precision=double_precision, + ensure_ascii=force_ascii, + date_unit=date_unit, + default_handler=default_handler, + index=index, + indent=indent, + ).write() + + if lines: + s = convert_to_line_delimits(s) + + if path_or_buf is not None: + # apply compression and byte/text conversion + with get_handle( + path_or_buf, mode, compression=compression, storage_options=storage_options + ) as handles: + handles.handle.write(s) + else: + return s + return None + + +class Writer(ABC): + _default_orient: str + + def __init__( + self, + obj: NDFrame, + orient: str | None, + date_format: str, + double_precision: int, + ensure_ascii: bool, + date_unit: str, + index: bool, + default_handler: Callable[[Any], JSONSerializable] | None = None, + indent: int = 0, + ) -> None: + self.obj = obj + + if orient is None: + orient = self._default_orient + + self.orient = orient + self.date_format = date_format + self.double_precision = double_precision + self.ensure_ascii = ensure_ascii + self.date_unit = date_unit + self.default_handler = default_handler + self.index = index + self.indent = indent + + self.is_copy = None + self._format_axes() + + def _format_axes(self) -> None: + raise AbstractMethodError(self) + + def write(self) -> str: + iso_dates = self.date_format == "iso" + return ujson_dumps( + self.obj_to_write, + orient=self.orient, + double_precision=self.double_precision, + ensure_ascii=self.ensure_ascii, + date_unit=self.date_unit, + iso_dates=iso_dates, + default_handler=self.default_handler, + indent=self.indent, + ) + + @property + @abstractmethod + def obj_to_write(self) -> NDFrame | Mapping[IndexLabel, Any]: + """Object to write in JSON format.""" + + +class SeriesWriter(Writer): + _default_orient = "index" + + @property + def obj_to_write(self) -> NDFrame | Mapping[IndexLabel, Any]: + if not self.index and self.orient == "split": + return {"name": self.obj.name, "data": self.obj.values} + else: + return self.obj + + def _format_axes(self) -> None: + if not self.obj.index.is_unique and self.orient == "index": + raise ValueError(f"Series index must be unique for orient='{self.orient}'") + + +class FrameWriter(Writer): + _default_orient = "columns" + + @property + def obj_to_write(self) -> NDFrame | Mapping[IndexLabel, Any]: + if not self.index and self.orient == "split": + obj_to_write = self.obj.to_dict(orient="split") + del obj_to_write["index"] + else: + obj_to_write = self.obj + return obj_to_write + + def _format_axes(self) -> None: + """ + Try to format axes if they are datelike. + """ + if not self.obj.index.is_unique and self.orient in ("index", "columns"): + raise ValueError( + f"DataFrame index must be unique for orient='{self.orient}'." + ) + if not self.obj.columns.is_unique and self.orient in ( + "index", + "columns", + "records", + ): + raise ValueError( + f"DataFrame columns must be unique for orient='{self.orient}'." + ) + + +class JSONTableWriter(FrameWriter): + _default_orient = "records" + + def __init__( + self, + obj, + orient: str | None, + date_format: str, + double_precision: int, + ensure_ascii: bool, + date_unit: str, + index: bool, + default_handler: Callable[[Any], JSONSerializable] | None = None, + indent: int = 0, + ) -> None: + """ + Adds a `schema` attribute with the Table Schema, resets + the index (can't do in caller, because the schema inference needs + to know what the index is, forces orient to records, and forces + date_format to 'iso'. + """ + super().__init__( + obj, + orient, + date_format, + double_precision, + ensure_ascii, + date_unit, + index, + default_handler=default_handler, + indent=indent, + ) + + if date_format != "iso": + msg = ( + "Trying to write with `orient='table'` and " + f"`date_format='{date_format}'`. Table Schema requires dates " + "to be formatted with `date_format='iso'`" + ) + raise ValueError(msg) + + self.schema = build_table_schema(obj, index=self.index) + + # NotImplemented on a column MultiIndex + if obj.ndim == 2 and isinstance(obj.columns, MultiIndex): + raise NotImplementedError( + "orient='table' is not supported for MultiIndex columns" + ) + + # TODO: Do this timedelta properly in objToJSON.c See GH #15137 + if ( + (obj.ndim == 1) + and (obj.name in set(obj.index.names)) + or len(obj.columns.intersection(obj.index.names)) + ): + msg = "Overlapping names between the index and columns" + raise ValueError(msg) + + obj = obj.copy() + timedeltas = obj.select_dtypes(include=["timedelta"]).columns + if len(timedeltas): + obj[timedeltas] = obj[timedeltas].map(lambda x: x.isoformat()) + # Convert PeriodIndex to datetimes before serializing + if isinstance(obj.index.dtype, PeriodDtype): + obj.index = obj.index.to_timestamp() + + # exclude index from obj if index=False + if not self.index: + self.obj = obj.reset_index(drop=True) + else: + self.obj = obj.reset_index(drop=False) + self.date_format = "iso" + self.orient = "records" + self.index = index + + @property + def obj_to_write(self) -> NDFrame | Mapping[IndexLabel, Any]: + return {"schema": self.schema, "data": self.obj} + + +@overload +def read_json( + path_or_buf: FilePath | ReadBuffer[str] | ReadBuffer[bytes], + *, + orient: str | None = ..., + typ: Literal["frame"] = ..., + dtype: DtypeArg | None = ..., + convert_axes: bool | None = ..., + convert_dates: bool | list[str] = ..., + keep_default_dates: bool = ..., + precise_float: bool = ..., + date_unit: str | None = ..., + encoding: str | None = ..., + encoding_errors: str | None = ..., + lines: bool = ..., + chunksize: int, + compression: CompressionOptions = ..., + nrows: int | None = ..., + storage_options: StorageOptions = ..., + dtype_backend: DtypeBackend | lib.NoDefault = ..., + engine: JSONEngine = ..., +) -> JsonReader[Literal["frame"]]: + ... + + +@overload +def read_json( + path_or_buf: FilePath | ReadBuffer[str] | ReadBuffer[bytes], + *, + orient: str | None = ..., + typ: Literal["series"], + dtype: DtypeArg | None = ..., + convert_axes: bool | None = ..., + convert_dates: bool | list[str] = ..., + keep_default_dates: bool = ..., + precise_float: bool = ..., + date_unit: str | None = ..., + encoding: str | None = ..., + encoding_errors: str | None = ..., + lines: bool = ..., + chunksize: int, + compression: CompressionOptions = ..., + nrows: int | None = ..., + storage_options: StorageOptions = ..., + dtype_backend: DtypeBackend | lib.NoDefault = ..., + engine: JSONEngine = ..., +) -> JsonReader[Literal["series"]]: + ... + + +@overload +def read_json( + path_or_buf: FilePath | ReadBuffer[str] | ReadBuffer[bytes], + *, + orient: str | None = ..., + typ: Literal["series"], + dtype: DtypeArg | None = ..., + convert_axes: bool | None = ..., + convert_dates: bool | list[str] = ..., + keep_default_dates: bool = ..., + precise_float: bool = ..., + date_unit: str | None = ..., + encoding: str | None = ..., + encoding_errors: str | None = ..., + lines: bool = ..., + chunksize: None = ..., + compression: CompressionOptions = ..., + nrows: int | None = ..., + storage_options: StorageOptions = ..., + dtype_backend: DtypeBackend | lib.NoDefault = ..., + engine: JSONEngine = ..., +) -> Series: + ... + + +@overload +def read_json( + path_or_buf: FilePath | ReadBuffer[str] | ReadBuffer[bytes], + *, + orient: str | None = ..., + typ: Literal["frame"] = ..., + dtype: DtypeArg | None = ..., + convert_axes: bool | None = ..., + convert_dates: bool | list[str] = ..., + keep_default_dates: bool = ..., + precise_float: bool = ..., + date_unit: str | None = ..., + encoding: str | None = ..., + encoding_errors: str | None = ..., + lines: bool = ..., + chunksize: None = ..., + compression: CompressionOptions = ..., + nrows: int | None = ..., + storage_options: StorageOptions = ..., + dtype_backend: DtypeBackend | lib.NoDefault = ..., + engine: JSONEngine = ..., +) -> DataFrame: + ... + + +@doc( + storage_options=_shared_docs["storage_options"], + decompression_options=_shared_docs["decompression_options"] % "path_or_buf", +) +def read_json( + path_or_buf: FilePath | ReadBuffer[str] | ReadBuffer[bytes], + *, + orient: str | None = None, + typ: Literal["frame", "series"] = "frame", + dtype: DtypeArg | None = None, + convert_axes: bool | None = None, + convert_dates: bool | list[str] = True, + keep_default_dates: bool = True, + precise_float: bool = False, + date_unit: str | None = None, + encoding: str | None = None, + encoding_errors: str | None = "strict", + lines: bool = False, + chunksize: int | None = None, + compression: CompressionOptions = "infer", + nrows: int | None = None, + storage_options: StorageOptions | None = None, + dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default, + engine: JSONEngine = "ujson", +) -> DataFrame | Series | JsonReader: + """ + Convert a JSON string to pandas object. + + Parameters + ---------- + path_or_buf : a valid JSON str, path object or file-like object + Any valid string path is acceptable. The string could be a URL. Valid + URL schemes include http, ftp, s3, and file. For file URLs, a host is + expected. A local file could be: + ``file://localhost/path/to/table.json``. + + If you want to pass in a path object, pandas accepts any + ``os.PathLike``. + + By file-like object, we refer to objects with a ``read()`` method, + such as a file handle (e.g. via builtin ``open`` function) + or ``StringIO``. + + .. deprecated:: 2.1.0 + Passing json literal strings is deprecated. + + orient : str, optional + Indication of expected JSON string format. + Compatible JSON strings can be produced by ``to_json()`` with a + corresponding orient value. + The set of possible orients is: + + - ``'split'`` : dict like + ``{{index -> [index], columns -> [columns], data -> [values]}}`` + - ``'records'`` : list like + ``[{{column -> value}}, ... , {{column -> value}}]`` + - ``'index'`` : dict like ``{{index -> {{column -> value}}}}`` + - ``'columns'`` : dict like ``{{column -> {{index -> value}}}}`` + - ``'values'`` : just the values array + - ``'table'`` : dict like ``{{'schema': {{schema}}, 'data': {{data}}}}`` + + The allowed and default values depend on the value + of the `typ` parameter. + + * when ``typ == 'series'``, + + - allowed orients are ``{{'split','records','index'}}`` + - default is ``'index'`` + - The Series index must be unique for orient ``'index'``. + + * when ``typ == 'frame'``, + + - allowed orients are ``{{'split','records','index', + 'columns','values', 'table'}}`` + - default is ``'columns'`` + - The DataFrame index must be unique for orients ``'index'`` and + ``'columns'``. + - The DataFrame columns must be unique for orients ``'index'``, + ``'columns'``, and ``'records'``. + + typ : {{'frame', 'series'}}, default 'frame' + The type of object to recover. + + dtype : bool or dict, default None + If True, infer dtypes; if a dict of column to dtype, then use those; + if False, then don't infer dtypes at all, applies only to the data. + + For all ``orient`` values except ``'table'``, default is True. + + convert_axes : bool, default None + Try to convert the axes to the proper dtypes. + + For all ``orient`` values except ``'table'``, default is True. + + convert_dates : bool or list of str, default True + If True then default datelike columns may be converted (depending on + keep_default_dates). + If False, no dates will be converted. + If a list of column names, then those columns will be converted and + default datelike columns may also be converted (depending on + keep_default_dates). + + keep_default_dates : bool, default True + If parsing dates (convert_dates is not False), then try to parse the + default datelike columns. + A column label is datelike if + + * it ends with ``'_at'``, + + * it ends with ``'_time'``, + + * it begins with ``'timestamp'``, + + * it is ``'modified'``, or + + * it is ``'date'``. + + precise_float : bool, default False + Set to enable usage of higher precision (strtod) function when + decoding string to double values. Default (False) is to use fast but + less precise builtin functionality. + + date_unit : str, default None + The timestamp unit to detect if converting dates. The default behaviour + is to try and detect the correct precision, but if this is not desired + then pass one of 's', 'ms', 'us' or 'ns' to force parsing only seconds, + milliseconds, microseconds or nanoseconds respectively. + + encoding : str, default is 'utf-8' + The encoding to use to decode py3 bytes. + + encoding_errors : str, optional, default "strict" + How encoding errors are treated. `List of possible values + `_ . + + .. versionadded:: 1.3.0 + + lines : bool, default False + Read the file as a json object per line. + + chunksize : int, optional + Return JsonReader object for iteration. + See the `line-delimited json docs + `_ + for more information on ``chunksize``. + This can only be passed if `lines=True`. + If this is None, the file will be read into memory all at once. + {decompression_options} + + .. versionchanged:: 1.4.0 Zstandard support. + + nrows : int, optional + The number of lines from the line-delimited jsonfile that has to be read. + This can only be passed if `lines=True`. + If this is None, all the rows will be returned. + + {storage_options} + + dtype_backend : {{'numpy_nullable', 'pyarrow'}}, default 'numpy_nullable' + Back-end data type applied to the resultant :class:`DataFrame` + (still experimental). Behaviour is as follows: + + * ``"numpy_nullable"``: returns nullable-dtype-backed :class:`DataFrame` + (default). + * ``"pyarrow"``: returns pyarrow-backed nullable :class:`ArrowDtype` + DataFrame. + + .. versionadded:: 2.0 + + engine : {{"ujson", "pyarrow"}}, default "ujson" + Parser engine to use. The ``"pyarrow"`` engine is only available when + ``lines=True``. + + .. versionadded:: 2.0 + + Returns + ------- + Series, DataFrame, or pandas.api.typing.JsonReader + A JsonReader is returned when ``chunksize`` is not ``0`` or ``None``. + Otherwise, the type returned depends on the value of ``typ``. + + See Also + -------- + DataFrame.to_json : Convert a DataFrame to a JSON string. + Series.to_json : Convert a Series to a JSON string. + json_normalize : Normalize semi-structured JSON data into a flat table. + + Notes + ----- + Specific to ``orient='table'``, if a :class:`DataFrame` with a literal + :class:`Index` name of `index` gets written with :func:`to_json`, the + subsequent read operation will incorrectly set the :class:`Index` name to + ``None``. This is because `index` is also used by :func:`DataFrame.to_json` + to denote a missing :class:`Index` name, and the subsequent + :func:`read_json` operation cannot distinguish between the two. The same + limitation is encountered with a :class:`MultiIndex` and any names + beginning with ``'level_'``. + + Examples + -------- + >>> from io import StringIO + >>> df = pd.DataFrame([['a', 'b'], ['c', 'd']], + ... index=['row 1', 'row 2'], + ... columns=['col 1', 'col 2']) + + Encoding/decoding a Dataframe using ``'split'`` formatted JSON: + + >>> df.to_json(orient='split') + '\ +{{\ +"columns":["col 1","col 2"],\ +"index":["row 1","row 2"],\ +"data":[["a","b"],["c","d"]]\ +}}\ +' + >>> pd.read_json(StringIO(_), orient='split') + col 1 col 2 + row 1 a b + row 2 c d + + Encoding/decoding a Dataframe using ``'index'`` formatted JSON: + + >>> df.to_json(orient='index') + '{{"row 1":{{"col 1":"a","col 2":"b"}},"row 2":{{"col 1":"c","col 2":"d"}}}}' + + >>> pd.read_json(StringIO(_), orient='index') + col 1 col 2 + row 1 a b + row 2 c d + + Encoding/decoding a Dataframe using ``'records'`` formatted JSON. + Note that index labels are not preserved with this encoding. + + >>> df.to_json(orient='records') + '[{{"col 1":"a","col 2":"b"}},{{"col 1":"c","col 2":"d"}}]' + >>> pd.read_json(StringIO(_), orient='records') + col 1 col 2 + 0 a b + 1 c d + + Encoding with Table Schema + + >>> df.to_json(orient='table') + '\ +{{"schema":{{"fields":[\ +{{"name":"index","type":"string"}},\ +{{"name":"col 1","type":"string"}},\ +{{"name":"col 2","type":"string"}}],\ +"primaryKey":["index"],\ +"pandas_version":"1.4.0"}},\ +"data":[\ +{{"index":"row 1","col 1":"a","col 2":"b"}},\ +{{"index":"row 2","col 1":"c","col 2":"d"}}]\ +}}\ +' + + The following example uses ``dtype_backend="numpy_nullable"`` + + >>> data = '''{{"index": {{"0": 0, "1": 1}}, + ... "a": {{"0": 1, "1": null}}, + ... "b": {{"0": 2.5, "1": 4.5}}, + ... "c": {{"0": true, "1": false}}, + ... "d": {{"0": "a", "1": "b"}}, + ... "e": {{"0": 1577.2, "1": 1577.1}}}}''' + >>> pd.read_json(StringIO(data), dtype_backend="numpy_nullable") + index a b c d e + 0 0 1 2.5 True a 1577.2 + 1 1 4.5 False b 1577.1 + """ + if orient == "table" and dtype: + raise ValueError("cannot pass both dtype and orient='table'") + if orient == "table" and convert_axes: + raise ValueError("cannot pass both convert_axes and orient='table'") + + check_dtype_backend(dtype_backend) + + if dtype is None and orient != "table": + # error: Incompatible types in assignment (expression has type "bool", variable + # has type "Union[ExtensionDtype, str, dtype[Any], Type[str], Type[float], + # Type[int], Type[complex], Type[bool], Type[object], Dict[Hashable, + # Union[ExtensionDtype, Union[str, dtype[Any]], Type[str], Type[float], + # Type[int], Type[complex], Type[bool], Type[object]]], None]") + dtype = True # type: ignore[assignment] + if convert_axes is None and orient != "table": + convert_axes = True + + json_reader = JsonReader( + path_or_buf, + orient=orient, + typ=typ, + dtype=dtype, + convert_axes=convert_axes, + convert_dates=convert_dates, + keep_default_dates=keep_default_dates, + precise_float=precise_float, + date_unit=date_unit, + encoding=encoding, + lines=lines, + chunksize=chunksize, + compression=compression, + nrows=nrows, + storage_options=storage_options, + encoding_errors=encoding_errors, + dtype_backend=dtype_backend, + engine=engine, + ) + + if chunksize: + return json_reader + else: + return json_reader.read() + + +class JsonReader(abc.Iterator, Generic[FrameSeriesStrT]): + """ + JsonReader provides an interface for reading in a JSON file. + + If initialized with ``lines=True`` and ``chunksize``, can be iterated over + ``chunksize`` lines at a time. Otherwise, calling ``read`` reads in the + whole document. + """ + + def __init__( + self, + filepath_or_buffer, + orient, + typ: FrameSeriesStrT, + dtype, + convert_axes: bool | None, + convert_dates, + keep_default_dates: bool, + precise_float: bool, + date_unit, + encoding, + lines: bool, + chunksize: int | None, + compression: CompressionOptions, + nrows: int | None, + storage_options: StorageOptions | None = None, + encoding_errors: str | None = "strict", + dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default, + engine: JSONEngine = "ujson", + ) -> None: + self.orient = orient + self.typ = typ + self.dtype = dtype + self.convert_axes = convert_axes + self.convert_dates = convert_dates + self.keep_default_dates = keep_default_dates + self.precise_float = precise_float + self.date_unit = date_unit + self.encoding = encoding + self.engine = engine + self.compression = compression + self.storage_options = storage_options + self.lines = lines + self.chunksize = chunksize + self.nrows_seen = 0 + self.nrows = nrows + self.encoding_errors = encoding_errors + self.handles: IOHandles[str] | None = None + self.dtype_backend = dtype_backend + + if self.engine not in {"pyarrow", "ujson"}: + raise ValueError( + f"The engine type {self.engine} is currently not supported." + ) + if self.chunksize is not None: + self.chunksize = validate_integer("chunksize", self.chunksize, 1) + if not self.lines: + raise ValueError("chunksize can only be passed if lines=True") + if self.engine == "pyarrow": + raise ValueError( + "currently pyarrow engine doesn't support chunksize parameter" + ) + if self.nrows is not None: + self.nrows = validate_integer("nrows", self.nrows, 0) + if not self.lines: + raise ValueError("nrows can only be passed if lines=True") + if ( + isinstance(filepath_or_buffer, str) + and not self.lines + and "\n" in filepath_or_buffer + ): + warnings.warn( + "Passing literal json to 'read_json' is deprecated and " + "will be removed in a future version. To read from a " + "literal string, wrap it in a 'StringIO' object.", + FutureWarning, + stacklevel=find_stack_level(), + ) + if self.engine == "pyarrow": + if not self.lines: + raise ValueError( + "currently pyarrow engine only supports " + "the line-delimited JSON format" + ) + self.data = filepath_or_buffer + elif self.engine == "ujson": + data = self._get_data_from_filepath(filepath_or_buffer) + self.data = self._preprocess_data(data) + + def _preprocess_data(self, data): + """ + At this point, the data either has a `read` attribute (e.g. a file + object or a StringIO) or is a string that is a JSON document. + + If self.chunksize, we prepare the data for the `__next__` method. + Otherwise, we read it into memory for the `read` method. + """ + if hasattr(data, "read") and not (self.chunksize or self.nrows): + with self: + data = data.read() + if not hasattr(data, "read") and (self.chunksize or self.nrows): + data = StringIO(data) + + return data + + def _get_data_from_filepath(self, filepath_or_buffer): + """ + The function read_json accepts three input types: + 1. filepath (string-like) + 2. file-like object (e.g. open file object, StringIO) + 3. JSON string + + This method turns (1) into (2) to simplify the rest of the processing. + It returns input types (2) and (3) unchanged. + + It raises FileNotFoundError if the input is a string ending in + one of .json, .json.gz, .json.bz2, etc. but no such file exists. + """ + # if it is a string but the file does not exist, it might be a JSON string + filepath_or_buffer = stringify_path(filepath_or_buffer) + if ( + not isinstance(filepath_or_buffer, str) + or is_url(filepath_or_buffer) + or is_fsspec_url(filepath_or_buffer) + or file_exists(filepath_or_buffer) + ): + self.handles = get_handle( + filepath_or_buffer, + "r", + encoding=self.encoding, + compression=self.compression, + storage_options=self.storage_options, + errors=self.encoding_errors, + ) + filepath_or_buffer = self.handles.handle + elif ( + isinstance(filepath_or_buffer, str) + and filepath_or_buffer.lower().endswith( + (".json",) + tuple(f".json{c}" for c in extension_to_compression) + ) + and not file_exists(filepath_or_buffer) + ): + raise FileNotFoundError(f"File {filepath_or_buffer} does not exist") + else: + warnings.warn( + "Passing literal json to 'read_json' is deprecated and " + "will be removed in a future version. To read from a " + "literal string, wrap it in a 'StringIO' object.", + FutureWarning, + stacklevel=find_stack_level(), + ) + return filepath_or_buffer + + def _combine_lines(self, lines) -> str: + """ + Combines a list of JSON objects into one JSON object. + """ + return ( + f'[{",".join([line for line in (line.strip() for line in lines) if line])}]' + ) + + @overload + def read(self: JsonReader[Literal["frame"]]) -> DataFrame: + ... + + @overload + def read(self: JsonReader[Literal["series"]]) -> Series: + ... + + @overload + def read(self: JsonReader[Literal["frame", "series"]]) -> DataFrame | Series: + ... + + def read(self) -> DataFrame | Series: + """ + Read the whole JSON input into a pandas object. + """ + obj: DataFrame | Series + with self: + if self.engine == "pyarrow": + pyarrow_json = import_optional_dependency("pyarrow.json") + pa_table = pyarrow_json.read_json(self.data) + + mapping: type[ArrowDtype] | None | Callable + if self.dtype_backend == "pyarrow": + mapping = ArrowDtype + elif self.dtype_backend == "numpy_nullable": + from pandas.io._util import _arrow_dtype_mapping + + mapping = _arrow_dtype_mapping().get + else: + mapping = None + + return pa_table.to_pandas(types_mapper=mapping) + elif self.engine == "ujson": + if self.lines: + if self.chunksize: + obj = concat(self) + elif self.nrows: + lines = list(islice(self.data, self.nrows)) + lines_json = self._combine_lines(lines) + obj = self._get_object_parser(lines_json) + else: + data = ensure_str(self.data) + data_lines = data.split("\n") + obj = self._get_object_parser(self._combine_lines(data_lines)) + else: + obj = self._get_object_parser(self.data) + if self.dtype_backend is not lib.no_default: + return obj.convert_dtypes( + infer_objects=False, dtype_backend=self.dtype_backend + ) + else: + return obj + + def _get_object_parser(self, json) -> DataFrame | Series: + """ + Parses a json document into a pandas object. + """ + typ = self.typ + dtype = self.dtype + kwargs = { + "orient": self.orient, + "dtype": self.dtype, + "convert_axes": self.convert_axes, + "convert_dates": self.convert_dates, + "keep_default_dates": self.keep_default_dates, + "precise_float": self.precise_float, + "date_unit": self.date_unit, + "dtype_backend": self.dtype_backend, + } + obj = None + if typ == "frame": + obj = FrameParser(json, **kwargs).parse() + + if typ == "series" or obj is None: + if not isinstance(dtype, bool): + kwargs["dtype"] = dtype + obj = SeriesParser(json, **kwargs).parse() + + return obj + + def close(self) -> None: + """ + If we opened a stream earlier, in _get_data_from_filepath, we should + close it. + + If an open stream or file was passed, we leave it open. + """ + if self.handles is not None: + self.handles.close() + + def __iter__(self) -> Self: + return self + + @overload + def __next__(self: JsonReader[Literal["frame"]]) -> DataFrame: + ... + + @overload + def __next__(self: JsonReader[Literal["series"]]) -> Series: + ... + + @overload + def __next__(self: JsonReader[Literal["frame", "series"]]) -> DataFrame | Series: + ... + + def __next__(self) -> DataFrame | Series: + if self.nrows and self.nrows_seen >= self.nrows: + self.close() + raise StopIteration + + lines = list(islice(self.data, self.chunksize)) + if not lines: + self.close() + raise StopIteration + + try: + lines_json = self._combine_lines(lines) + obj = self._get_object_parser(lines_json) + + # Make sure that the returned objects have the right index. + obj.index = range(self.nrows_seen, self.nrows_seen + len(obj)) + self.nrows_seen += len(obj) + except Exception as ex: + self.close() + raise ex + + if self.dtype_backend is not lib.no_default: + return obj.convert_dtypes( + infer_objects=False, dtype_backend=self.dtype_backend + ) + else: + return obj + + def __enter__(self) -> Self: + return self + + def __exit__( + self, + exc_type: type[BaseException] | None, + exc_value: BaseException | None, + traceback: TracebackType | None, + ) -> None: + self.close() + + +class Parser: + _split_keys: tuple[str, ...] + _default_orient: str + + _STAMP_UNITS = ("s", "ms", "us", "ns") + _MIN_STAMPS = { + "s": 31536000, + "ms": 31536000000, + "us": 31536000000000, + "ns": 31536000000000000, + } + json: str + + def __init__( + self, + json: str, + orient, + dtype: DtypeArg | None = None, + convert_axes: bool = True, + convert_dates: bool | list[str] = True, + keep_default_dates: bool = False, + precise_float: bool = False, + date_unit=None, + dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default, + ) -> None: + self.json = json + + if orient is None: + orient = self._default_orient + + self.orient = orient + + self.dtype = dtype + + if date_unit is not None: + date_unit = date_unit.lower() + if date_unit not in self._STAMP_UNITS: + raise ValueError(f"date_unit must be one of {self._STAMP_UNITS}") + self.min_stamp = self._MIN_STAMPS[date_unit] + else: + self.min_stamp = self._MIN_STAMPS["s"] + + self.precise_float = precise_float + self.convert_axes = convert_axes + self.convert_dates = convert_dates + self.date_unit = date_unit + self.keep_default_dates = keep_default_dates + self.obj: DataFrame | Series | None = None + self.dtype_backend = dtype_backend + + @final + def check_keys_split(self, decoded: dict) -> None: + """ + Checks that dict has only the appropriate keys for orient='split'. + """ + bad_keys = set(decoded.keys()).difference(set(self._split_keys)) + if bad_keys: + bad_keys_joined = ", ".join(bad_keys) + raise ValueError(f"JSON data had unexpected key(s): {bad_keys_joined}") + + @final + def parse(self): + self._parse() + + if self.obj is None: + return None + if self.convert_axes: + self._convert_axes() + self._try_convert_types() + return self.obj + + def _parse(self) -> None: + raise AbstractMethodError(self) + + @final + def _convert_axes(self) -> None: + """ + Try to convert axes. + """ + obj = self.obj + assert obj is not None # for mypy + for axis_name in obj._AXIS_ORDERS: + ax = obj._get_axis(axis_name) + ser = Series(ax, dtype=ax.dtype, copy=False) + new_ser, result = self._try_convert_data( + name=axis_name, + data=ser, + use_dtypes=False, + convert_dates=True, + is_axis=True, + ) + if result: + new_axis = Index(new_ser, dtype=new_ser.dtype, copy=False) + setattr(self.obj, axis_name, new_axis) + + def _try_convert_types(self) -> None: + raise AbstractMethodError(self) + + @final + def _try_convert_data( + self, + name: Hashable, + data: Series, + use_dtypes: bool = True, + convert_dates: bool | list[str] = True, + is_axis: bool = False, + ) -> tuple[Series, bool]: + """ + Try to parse a Series into a column by inferring dtype. + """ + # don't try to coerce, unless a force conversion + if use_dtypes: + if not self.dtype: + if all(notna(data)): + return data, False + + with warnings.catch_warnings(): + warnings.filterwarnings( + "ignore", + "Downcasting object dtype arrays", + category=FutureWarning, + ) + filled = data.fillna(np.nan) + + return filled, True + + elif self.dtype is True: + pass + else: + # dtype to force + dtype = ( + self.dtype.get(name) if isinstance(self.dtype, dict) else self.dtype + ) + if dtype is not None: + try: + return data.astype(dtype), True + except (TypeError, ValueError): + return data, False + + if convert_dates: + new_data, result = self._try_convert_to_date(data) + if result: + return new_data, True + + converted = False + if self.dtype_backend is not lib.no_default and not is_axis: + # Fall through for conversion later on + return data, True + elif is_string_dtype(data.dtype): + # try float + try: + data = data.astype("float64") + converted = True + except (TypeError, ValueError): + pass + + if data.dtype.kind == "f" and data.dtype != "float64": + # coerce floats to 64 + try: + data = data.astype("float64") + converted = True + except (TypeError, ValueError): + pass + + # don't coerce 0-len data + if len(data) and data.dtype in ("float", "object"): + # coerce ints if we can + try: + new_data = data.astype("int64") + if (new_data == data).all(): + data = new_data + converted = True + except (TypeError, ValueError, OverflowError): + pass + + if data.dtype == "int" and data.dtype != "int64": + # coerce ints to 64 + try: + data = data.astype("int64") + converted = True + except (TypeError, ValueError): + pass + + # if we have an index, we want to preserve dtypes + if name == "index" and len(data): + if self.orient == "split": + return data, False + + return data, converted + + @final + def _try_convert_to_date(self, data: Series) -> tuple[Series, bool]: + """ + Try to parse a ndarray like into a date column. + + Try to coerce object in epoch/iso formats and integer/float in epoch + formats. Return a boolean if parsing was successful. + """ + # no conversion on empty + if not len(data): + return data, False + + new_data = data + + if new_data.dtype == "string": + new_data = new_data.astype(object) + + if new_data.dtype == "object": + try: + new_data = data.astype("int64") + except OverflowError: + return data, False + except (TypeError, ValueError): + pass + + # ignore numbers that are out of range + if issubclass(new_data.dtype.type, np.number): + in_range = ( + isna(new_data._values) + | (new_data > self.min_stamp) + | (new_data._values == iNaT) + ) + if not in_range.all(): + return data, False + + date_units = (self.date_unit,) if self.date_unit else self._STAMP_UNITS + for date_unit in date_units: + try: + with warnings.catch_warnings(): + warnings.filterwarnings( + "ignore", + ".*parsing datetimes with mixed time " + "zones will raise an error", + category=FutureWarning, + ) + new_data = to_datetime(new_data, errors="raise", unit=date_unit) + except (ValueError, OverflowError, TypeError): + continue + return new_data, True + return data, False + + +class SeriesParser(Parser): + _default_orient = "index" + _split_keys = ("name", "index", "data") + obj: Series | None + + def _parse(self) -> None: + data = ujson_loads(self.json, precise_float=self.precise_float) + + if self.orient == "split": + decoded = {str(k): v for k, v in data.items()} + self.check_keys_split(decoded) + self.obj = Series(**decoded) + else: + self.obj = Series(data) + + def _try_convert_types(self) -> None: + if self.obj is None: + return + obj, result = self._try_convert_data( + "data", self.obj, convert_dates=self.convert_dates + ) + if result: + self.obj = obj + + +class FrameParser(Parser): + _default_orient = "columns" + _split_keys = ("columns", "index", "data") + obj: DataFrame | None + + def _parse(self) -> None: + json = self.json + orient = self.orient + + if orient == "columns": + self.obj = DataFrame( + ujson_loads(json, precise_float=self.precise_float), dtype=None + ) + elif orient == "split": + decoded = { + str(k): v + for k, v in ujson_loads(json, precise_float=self.precise_float).items() + } + self.check_keys_split(decoded) + orig_names = [ + (tuple(col) if isinstance(col, list) else col) + for col in decoded["columns"] + ] + decoded["columns"] = dedup_names( + orig_names, + is_potential_multi_index(orig_names, None), + ) + self.obj = DataFrame(dtype=None, **decoded) + elif orient == "index": + self.obj = DataFrame.from_dict( + ujson_loads(json, precise_float=self.precise_float), + dtype=None, + orient="index", + ) + elif orient == "table": + self.obj = parse_table_schema(json, precise_float=self.precise_float) + else: + self.obj = DataFrame( + ujson_loads(json, precise_float=self.precise_float), dtype=None + ) + + def _process_converter( + self, + f: Callable[[Hashable, Series], tuple[Series, bool]], + filt: Callable[[Hashable], bool] | None = None, + ) -> None: + """ + Take a conversion function and possibly recreate the frame. + """ + if filt is None: + filt = lambda col: True + + obj = self.obj + assert obj is not None # for mypy + + needs_new_obj = False + new_obj = {} + for i, (col, c) in enumerate(obj.items()): + if filt(col): + new_data, result = f(col, c) + if result: + c = new_data + needs_new_obj = True + new_obj[i] = c + + if needs_new_obj: + # possibly handle dup columns + new_frame = DataFrame(new_obj, index=obj.index) + new_frame.columns = obj.columns + self.obj = new_frame + + def _try_convert_types(self) -> None: + if self.obj is None: + return + if self.convert_dates: + self._try_convert_dates() + + self._process_converter( + lambda col, c: self._try_convert_data(col, c, convert_dates=False) + ) + + def _try_convert_dates(self) -> None: + if self.obj is None: + return + + # our columns to parse + convert_dates_list_bool = self.convert_dates + if isinstance(convert_dates_list_bool, bool): + convert_dates_list_bool = [] + convert_dates = set(convert_dates_list_bool) + + def is_ok(col) -> bool: + """ + Return if this col is ok to try for a date parse. + """ + if col in convert_dates: + return True + if not self.keep_default_dates: + return False + if not isinstance(col, str): + return False + + col_lower = col.lower() + if ( + col_lower.endswith(("_at", "_time")) + or col_lower == "modified" + or col_lower == "date" + or col_lower == "datetime" + or col_lower.startswith("timestamp") + ): + return True + return False + + self._process_converter(lambda col, c: self._try_convert_to_date(c), filt=is_ok) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/io/json/_normalize.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/io/json/_normalize.py new file mode 100644 index 0000000000000000000000000000000000000000..b1e2210f9d8940a0931b07e1631350089140ff95 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/io/json/_normalize.py @@ -0,0 +1,544 @@ +# --------------------------------------------------------------------- +# JSON normalization routines +from __future__ import annotations + +from collections import ( + abc, + defaultdict, +) +import copy +from typing import ( + TYPE_CHECKING, + Any, + DefaultDict, +) + +import numpy as np + +from pandas._libs.writers import convert_json_to_lines + +import pandas as pd +from pandas import DataFrame + +if TYPE_CHECKING: + from collections.abc import Iterable + + from pandas._typing import ( + IgnoreRaise, + Scalar, + ) + + +def convert_to_line_delimits(s: str) -> str: + """ + Helper function that converts JSON lists to line delimited JSON. + """ + # Determine we have a JSON list to turn to lines otherwise just return the + # json object, only lists can + if not s[0] == "[" and s[-1] == "]": + return s + s = s[1:-1] + + return convert_json_to_lines(s) + + +def nested_to_record( + ds, + prefix: str = "", + sep: str = ".", + level: int = 0, + max_level: int | None = None, +): + """ + A simplified json_normalize + + Converts a nested dict into a flat dict ("record"), unlike json_normalize, + it does not attempt to extract a subset of the data. + + Parameters + ---------- + ds : dict or list of dicts + prefix: the prefix, optional, default: "" + sep : str, default '.' + Nested records will generate names separated by sep, + e.g., for sep='.', { 'foo' : { 'bar' : 0 } } -> foo.bar + level: int, optional, default: 0 + The number of levels in the json string. + + max_level: int, optional, default: None + The max depth to normalize. + + Returns + ------- + d - dict or list of dicts, matching `ds` + + Examples + -------- + >>> nested_to_record( + ... dict(flat1=1, dict1=dict(c=1, d=2), nested=dict(e=dict(c=1, d=2), d=2)) + ... ) + {\ +'flat1': 1, \ +'dict1.c': 1, \ +'dict1.d': 2, \ +'nested.e.c': 1, \ +'nested.e.d': 2, \ +'nested.d': 2\ +} + """ + singleton = False + if isinstance(ds, dict): + ds = [ds] + singleton = True + new_ds = [] + for d in ds: + new_d = copy.deepcopy(d) + for k, v in d.items(): + # each key gets renamed with prefix + if not isinstance(k, str): + k = str(k) + if level == 0: + newkey = k + else: + newkey = prefix + sep + k + + # flatten if type is dict and + # current dict level < maximum level provided and + # only dicts gets recurse-flattened + # only at level>1 do we rename the rest of the keys + if not isinstance(v, dict) or ( + max_level is not None and level >= max_level + ): + if level != 0: # so we skip copying for top level, common case + v = new_d.pop(k) + new_d[newkey] = v + continue + + v = new_d.pop(k) + new_d.update(nested_to_record(v, newkey, sep, level + 1, max_level)) + new_ds.append(new_d) + + if singleton: + return new_ds[0] + return new_ds + + +def _normalise_json( + data: Any, + key_string: str, + normalized_dict: dict[str, Any], + separator: str, +) -> dict[str, Any]: + """ + Main recursive function + Designed for the most basic use case of pd.json_normalize(data) + intended as a performance improvement, see #15621 + + Parameters + ---------- + data : Any + Type dependent on types contained within nested Json + key_string : str + New key (with separator(s) in) for data + normalized_dict : dict + The new normalized/flattened Json dict + separator : str, default '.' + Nested records will generate names separated by sep, + e.g., for sep='.', { 'foo' : { 'bar' : 0 } } -> foo.bar + """ + if isinstance(data, dict): + for key, value in data.items(): + new_key = f"{key_string}{separator}{key}" + + if not key_string: + new_key = new_key.removeprefix(separator) + + _normalise_json( + data=value, + key_string=new_key, + normalized_dict=normalized_dict, + separator=separator, + ) + else: + normalized_dict[key_string] = data + return normalized_dict + + +def _normalise_json_ordered(data: dict[str, Any], separator: str) -> dict[str, Any]: + """ + Order the top level keys and then recursively go to depth + + Parameters + ---------- + data : dict or list of dicts + separator : str, default '.' + Nested records will generate names separated by sep, + e.g., for sep='.', { 'foo' : { 'bar' : 0 } } -> foo.bar + + Returns + ------- + dict or list of dicts, matching `normalised_json_object` + """ + top_dict_ = {k: v for k, v in data.items() if not isinstance(v, dict)} + nested_dict_ = _normalise_json( + data={k: v for k, v in data.items() if isinstance(v, dict)}, + key_string="", + normalized_dict={}, + separator=separator, + ) + return {**top_dict_, **nested_dict_} + + +def _simple_json_normalize( + ds: dict | list[dict], + sep: str = ".", +) -> dict | list[dict] | Any: + """ + A optimized basic json_normalize + + Converts a nested dict into a flat dict ("record"), unlike + json_normalize and nested_to_record it doesn't do anything clever. + But for the most basic use cases it enhances performance. + E.g. pd.json_normalize(data) + + Parameters + ---------- + ds : dict or list of dicts + sep : str, default '.' + Nested records will generate names separated by sep, + e.g., for sep='.', { 'foo' : { 'bar' : 0 } } -> foo.bar + + Returns + ------- + frame : DataFrame + d - dict or list of dicts, matching `normalised_json_object` + + Examples + -------- + >>> _simple_json_normalize( + ... { + ... "flat1": 1, + ... "dict1": {"c": 1, "d": 2}, + ... "nested": {"e": {"c": 1, "d": 2}, "d": 2}, + ... } + ... ) + {\ +'flat1': 1, \ +'dict1.c': 1, \ +'dict1.d': 2, \ +'nested.e.c': 1, \ +'nested.e.d': 2, \ +'nested.d': 2\ +} + + """ + normalised_json_object = {} + # expect a dictionary, as most jsons are. However, lists are perfectly valid + if isinstance(ds, dict): + normalised_json_object = _normalise_json_ordered(data=ds, separator=sep) + elif isinstance(ds, list): + normalised_json_list = [_simple_json_normalize(row, sep=sep) for row in ds] + return normalised_json_list + return normalised_json_object + + +def json_normalize( + data: dict | list[dict], + record_path: str | list | None = None, + meta: str | list[str | list[str]] | None = None, + meta_prefix: str | None = None, + record_prefix: str | None = None, + errors: IgnoreRaise = "raise", + sep: str = ".", + max_level: int | None = None, +) -> DataFrame: + """ + Normalize semi-structured JSON data into a flat table. + + Parameters + ---------- + data : dict or list of dicts + Unserialized JSON objects. + record_path : str or list of str, default None + Path in each object to list of records. If not passed, data will be + assumed to be an array of records. + meta : list of paths (str or list of str), default None + Fields to use as metadata for each record in resulting table. + meta_prefix : str, default None + If True, prefix records with dotted (?) path, e.g. foo.bar.field if + meta is ['foo', 'bar']. + record_prefix : str, default None + If True, prefix records with dotted (?) path, e.g. foo.bar.field if + path to records is ['foo', 'bar']. + errors : {'raise', 'ignore'}, default 'raise' + Configures error handling. + + * 'ignore' : will ignore KeyError if keys listed in meta are not + always present. + * 'raise' : will raise KeyError if keys listed in meta are not + always present. + sep : str, default '.' + Nested records will generate names separated by sep. + e.g., for sep='.', {'foo': {'bar': 0}} -> foo.bar. + max_level : int, default None + Max number of levels(depth of dict) to normalize. + if None, normalizes all levels. + + Returns + ------- + frame : DataFrame + Normalize semi-structured JSON data into a flat table. + + Examples + -------- + >>> data = [ + ... {"id": 1, "name": {"first": "Coleen", "last": "Volk"}}, + ... {"name": {"given": "Mark", "family": "Regner"}}, + ... {"id": 2, "name": "Faye Raker"}, + ... ] + >>> pd.json_normalize(data) + id name.first name.last name.given name.family name + 0 1.0 Coleen Volk NaN NaN NaN + 1 NaN NaN NaN Mark Regner NaN + 2 2.0 NaN NaN NaN NaN Faye Raker + + >>> data = [ + ... { + ... "id": 1, + ... "name": "Cole Volk", + ... "fitness": {"height": 130, "weight": 60}, + ... }, + ... {"name": "Mark Reg", "fitness": {"height": 130, "weight": 60}}, + ... { + ... "id": 2, + ... "name": "Faye Raker", + ... "fitness": {"height": 130, "weight": 60}, + ... }, + ... ] + >>> pd.json_normalize(data, max_level=0) + id name fitness + 0 1.0 Cole Volk {'height': 130, 'weight': 60} + 1 NaN Mark Reg {'height': 130, 'weight': 60} + 2 2.0 Faye Raker {'height': 130, 'weight': 60} + + Normalizes nested data up to level 1. + + >>> data = [ + ... { + ... "id": 1, + ... "name": "Cole Volk", + ... "fitness": {"height": 130, "weight": 60}, + ... }, + ... {"name": "Mark Reg", "fitness": {"height": 130, "weight": 60}}, + ... { + ... "id": 2, + ... "name": "Faye Raker", + ... "fitness": {"height": 130, "weight": 60}, + ... }, + ... ] + >>> pd.json_normalize(data, max_level=1) + id name fitness.height fitness.weight + 0 1.0 Cole Volk 130 60 + 1 NaN Mark Reg 130 60 + 2 2.0 Faye Raker 130 60 + + >>> data = [ + ... { + ... "state": "Florida", + ... "shortname": "FL", + ... "info": {"governor": "Rick Scott"}, + ... "counties": [ + ... {"name": "Dade", "population": 12345}, + ... {"name": "Broward", "population": 40000}, + ... {"name": "Palm Beach", "population": 60000}, + ... ], + ... }, + ... { + ... "state": "Ohio", + ... "shortname": "OH", + ... "info": {"governor": "John Kasich"}, + ... "counties": [ + ... {"name": "Summit", "population": 1234}, + ... {"name": "Cuyahoga", "population": 1337}, + ... ], + ... }, + ... ] + >>> result = pd.json_normalize( + ... data, "counties", ["state", "shortname", ["info", "governor"]] + ... ) + >>> result + name population state shortname info.governor + 0 Dade 12345 Florida FL Rick Scott + 1 Broward 40000 Florida FL Rick Scott + 2 Palm Beach 60000 Florida FL Rick Scott + 3 Summit 1234 Ohio OH John Kasich + 4 Cuyahoga 1337 Ohio OH John Kasich + + >>> data = {"A": [1, 2]} + >>> pd.json_normalize(data, "A", record_prefix="Prefix.") + Prefix.0 + 0 1 + 1 2 + + Returns normalized data with columns prefixed with the given string. + """ + + def _pull_field( + js: dict[str, Any], spec: list | str, extract_record: bool = False + ) -> Scalar | Iterable: + """Internal function to pull field""" + result = js + try: + if isinstance(spec, list): + for field in spec: + if result is None: + raise KeyError(field) + result = result[field] + else: + result = result[spec] + except KeyError as e: + if extract_record: + raise KeyError( + f"Key {e} not found. If specifying a record_path, all elements of " + f"data should have the path." + ) from e + if errors == "ignore": + return np.nan + else: + raise KeyError( + f"Key {e} not found. To replace missing values of {e} with " + f"np.nan, pass in errors='ignore'" + ) from e + + return result + + def _pull_records(js: dict[str, Any], spec: list | str) -> list: + """ + Internal function to pull field for records, and similar to + _pull_field, but require to return list. And will raise error + if has non iterable value. + """ + result = _pull_field(js, spec, extract_record=True) + + # GH 31507 GH 30145, GH 26284 if result is not list, raise TypeError if not + # null, otherwise return an empty list + if not isinstance(result, list): + if pd.isnull(result): + result = [] + else: + raise TypeError( + f"{js} has non list value {result} for path {spec}. " + "Must be list or null." + ) + return result + + if isinstance(data, list) and not data: + return DataFrame() + elif isinstance(data, dict): + # A bit of a hackjob + data = [data] + elif isinstance(data, abc.Iterable) and not isinstance(data, str): + # GH35923 Fix pd.json_normalize to not skip the first element of a + # generator input + data = list(data) + else: + raise NotImplementedError + + # check to see if a simple recursive function is possible to + # improve performance (see #15621) but only for cases such + # as pd.Dataframe(data) or pd.Dataframe(data, sep) + if ( + record_path is None + and meta is None + and meta_prefix is None + and record_prefix is None + and max_level is None + ): + return DataFrame(_simple_json_normalize(data, sep=sep)) + + if record_path is None: + if any([isinstance(x, dict) for x in y.values()] for y in data): + # naive normalization, this is idempotent for flat records + # and potentially will inflate the data considerably for + # deeply nested structures: + # {VeryLong: { b: 1,c:2}} -> {VeryLong.b:1 ,VeryLong.c:@} + # + # TODO: handle record value which are lists, at least error + # reasonably + data = nested_to_record(data, sep=sep, max_level=max_level) + return DataFrame(data) + elif not isinstance(record_path, list): + record_path = [record_path] + + if meta is None: + meta = [] + elif not isinstance(meta, list): + meta = [meta] + + _meta = [m if isinstance(m, list) else [m] for m in meta] + + # Disastrously inefficient for now + records: list = [] + lengths = [] + + meta_vals: DefaultDict = defaultdict(list) + meta_keys = [sep.join(val) for val in _meta] + + def _recursive_extract(data, path, seen_meta, level: int = 0) -> None: + if isinstance(data, dict): + data = [data] + if len(path) > 1: + for obj in data: + for val, key in zip(_meta, meta_keys): + if level + 1 == len(val): + seen_meta[key] = _pull_field(obj, val[-1]) + + _recursive_extract(obj[path[0]], path[1:], seen_meta, level=level + 1) + else: + for obj in data: + recs = _pull_records(obj, path[0]) + recs = [ + nested_to_record(r, sep=sep, max_level=max_level) + if isinstance(r, dict) + else r + for r in recs + ] + + # For repeating the metadata later + lengths.append(len(recs)) + for val, key in zip(_meta, meta_keys): + if level + 1 > len(val): + meta_val = seen_meta[key] + else: + meta_val = _pull_field(obj, val[level:]) + meta_vals[key].append(meta_val) + records.extend(recs) + + _recursive_extract(data, record_path, {}, level=0) + + result = DataFrame(records) + + if record_prefix is not None: + result = result.rename(columns=lambda x: f"{record_prefix}{x}") + + # Data types, a problem + for k, v in meta_vals.items(): + if meta_prefix is not None: + k = meta_prefix + k + + if k in result: + raise ValueError( + f"Conflicting metadata name {k}, need distinguishing prefix " + ) + # GH 37782 + + values = np.array(v, dtype=object) + + if values.ndim > 1: + # GH 37782 + values = np.empty((len(v),), dtype=object) + for i, v in enumerate(v): + values[i] = v + + result[k] = values.repeat(lengths) + return result diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/io/json/_table_schema.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/io/json/_table_schema.py new file mode 100644 index 0000000000000000000000000000000000000000..4d9fba72cf1733e8893e194c29d5a00bc3ad1d5c --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/io/json/_table_schema.py @@ -0,0 +1,389 @@ +""" +Table Schema builders + +https://specs.frictionlessdata.io/table-schema/ +""" +from __future__ import annotations + +from typing import ( + TYPE_CHECKING, + Any, + cast, +) +import warnings + +from pandas._libs import lib +from pandas._libs.json import ujson_loads +from pandas._libs.tslibs import timezones +from pandas._libs.tslibs.dtypes import freq_to_period_freqstr +from pandas.util._exceptions import find_stack_level + +from pandas.core.dtypes.base import _registry as registry +from pandas.core.dtypes.common import ( + is_bool_dtype, + is_integer_dtype, + is_numeric_dtype, + is_string_dtype, +) +from pandas.core.dtypes.dtypes import ( + CategoricalDtype, + DatetimeTZDtype, + ExtensionDtype, + PeriodDtype, +) + +from pandas import DataFrame +import pandas.core.common as com + +from pandas.tseries.frequencies import to_offset + +if TYPE_CHECKING: + from pandas._typing import ( + DtypeObj, + JSONSerializable, + ) + + from pandas import Series + from pandas.core.indexes.multi import MultiIndex + + +TABLE_SCHEMA_VERSION = "1.4.0" + + +def as_json_table_type(x: DtypeObj) -> str: + """ + Convert a NumPy / pandas type to its corresponding json_table. + + Parameters + ---------- + x : np.dtype or ExtensionDtype + + Returns + ------- + str + the Table Schema data types + + Notes + ----- + This table shows the relationship between NumPy / pandas dtypes, + and Table Schema dtypes. + + ============== ================= + Pandas type Table Schema type + ============== ================= + int64 integer + float64 number + bool boolean + datetime64[ns] datetime + timedelta64[ns] duration + object str + categorical any + =============== ================= + """ + if is_integer_dtype(x): + return "integer" + elif is_bool_dtype(x): + return "boolean" + elif is_numeric_dtype(x): + return "number" + elif lib.is_np_dtype(x, "M") or isinstance(x, (DatetimeTZDtype, PeriodDtype)): + return "datetime" + elif lib.is_np_dtype(x, "m"): + return "duration" + elif isinstance(x, ExtensionDtype): + return "any" + elif is_string_dtype(x): + return "string" + else: + return "any" + + +def set_default_names(data): + """Sets index names to 'index' for regular, or 'level_x' for Multi""" + if com.all_not_none(*data.index.names): + nms = data.index.names + if len(nms) == 1 and data.index.name == "index": + warnings.warn( + "Index name of 'index' is not round-trippable.", + stacklevel=find_stack_level(), + ) + elif len(nms) > 1 and any(x.startswith("level_") for x in nms): + warnings.warn( + "Index names beginning with 'level_' are not round-trippable.", + stacklevel=find_stack_level(), + ) + return data + + data = data.copy() + if data.index.nlevels > 1: + data.index.names = com.fill_missing_names(data.index.names) + else: + data.index.name = data.index.name or "index" + return data + + +def convert_pandas_type_to_json_field(arr) -> dict[str, JSONSerializable]: + dtype = arr.dtype + name: JSONSerializable + if arr.name is None: + name = "values" + else: + name = arr.name + field: dict[str, JSONSerializable] = { + "name": name, + "type": as_json_table_type(dtype), + } + + if isinstance(dtype, CategoricalDtype): + cats = dtype.categories + ordered = dtype.ordered + + field["constraints"] = {"enum": list(cats)} + field["ordered"] = ordered + elif isinstance(dtype, PeriodDtype): + field["freq"] = dtype.freq.freqstr + elif isinstance(dtype, DatetimeTZDtype): + if timezones.is_utc(dtype.tz): + # timezone.utc has no "zone" attr + field["tz"] = "UTC" + else: + # error: "tzinfo" has no attribute "zone" + field["tz"] = dtype.tz.zone # type: ignore[attr-defined] + elif isinstance(dtype, ExtensionDtype): + field["extDtype"] = dtype.name + return field + + +def convert_json_field_to_pandas_type(field) -> str | CategoricalDtype: + """ + Converts a JSON field descriptor into its corresponding NumPy / pandas type + + Parameters + ---------- + field + A JSON field descriptor + + Returns + ------- + dtype + + Raises + ------ + ValueError + If the type of the provided field is unknown or currently unsupported + + Examples + -------- + >>> convert_json_field_to_pandas_type({"name": "an_int", "type": "integer"}) + 'int64' + + >>> convert_json_field_to_pandas_type( + ... { + ... "name": "a_categorical", + ... "type": "any", + ... "constraints": {"enum": ["a", "b", "c"]}, + ... "ordered": True, + ... } + ... ) + CategoricalDtype(categories=['a', 'b', 'c'], ordered=True, categories_dtype=object) + + >>> convert_json_field_to_pandas_type({"name": "a_datetime", "type": "datetime"}) + 'datetime64[ns]' + + >>> convert_json_field_to_pandas_type( + ... {"name": "a_datetime_with_tz", "type": "datetime", "tz": "US/Central"} + ... ) + 'datetime64[ns, US/Central]' + """ + typ = field["type"] + if typ == "string": + return "object" + elif typ == "integer": + return field.get("extDtype", "int64") + elif typ == "number": + return field.get("extDtype", "float64") + elif typ == "boolean": + return field.get("extDtype", "bool") + elif typ == "duration": + return "timedelta64" + elif typ == "datetime": + if field.get("tz"): + return f"datetime64[ns, {field['tz']}]" + elif field.get("freq"): + # GH#9586 rename frequency M to ME for offsets + offset = to_offset(field["freq"]) + freq_n, freq_name = offset.n, offset.name + freq = freq_to_period_freqstr(freq_n, freq_name) + # GH#47747 using datetime over period to minimize the change surface + return f"period[{freq}]" + else: + return "datetime64[ns]" + elif typ == "any": + if "constraints" in field and "ordered" in field: + return CategoricalDtype( + categories=field["constraints"]["enum"], ordered=field["ordered"] + ) + elif "extDtype" in field: + return registry.find(field["extDtype"]) + else: + return "object" + + raise ValueError(f"Unsupported or invalid field type: {typ}") + + +def build_table_schema( + data: DataFrame | Series, + index: bool = True, + primary_key: bool | None = None, + version: bool = True, +) -> dict[str, JSONSerializable]: + """ + Create a Table schema from ``data``. + + Parameters + ---------- + data : Series, DataFrame + index : bool, default True + Whether to include ``data.index`` in the schema. + primary_key : bool or None, default True + Column names to designate as the primary key. + The default `None` will set `'primaryKey'` to the index + level or levels if the index is unique. + version : bool, default True + Whether to include a field `pandas_version` with the version + of pandas that last revised the table schema. This version + can be different from the installed pandas version. + + Returns + ------- + dict + + Notes + ----- + See `Table Schema + `__ for + conversion types. + Timedeltas as converted to ISO8601 duration format with + 9 decimal places after the seconds field for nanosecond precision. + + Categoricals are converted to the `any` dtype, and use the `enum` field + constraint to list the allowed values. The `ordered` attribute is included + in an `ordered` field. + + Examples + -------- + >>> from pandas.io.json._table_schema import build_table_schema + >>> df = pd.DataFrame( + ... {'A': [1, 2, 3], + ... 'B': ['a', 'b', 'c'], + ... 'C': pd.date_range('2016-01-01', freq='d', periods=3), + ... }, index=pd.Index(range(3), name='idx')) + >>> build_table_schema(df) + {'fields': \ +[{'name': 'idx', 'type': 'integer'}, \ +{'name': 'A', 'type': 'integer'}, \ +{'name': 'B', 'type': 'string'}, \ +{'name': 'C', 'type': 'datetime'}], \ +'primaryKey': ['idx'], \ +'pandas_version': '1.4.0'} + """ + if index is True: + data = set_default_names(data) + + schema: dict[str, Any] = {} + fields = [] + + if index: + if data.index.nlevels > 1: + data.index = cast("MultiIndex", data.index) + for level, name in zip(data.index.levels, data.index.names): + new_field = convert_pandas_type_to_json_field(level) + new_field["name"] = name + fields.append(new_field) + else: + fields.append(convert_pandas_type_to_json_field(data.index)) + + if data.ndim > 1: + for column, s in data.items(): + fields.append(convert_pandas_type_to_json_field(s)) + else: + fields.append(convert_pandas_type_to_json_field(data)) + + schema["fields"] = fields + if index and data.index.is_unique and primary_key is None: + if data.index.nlevels == 1: + schema["primaryKey"] = [data.index.name] + else: + schema["primaryKey"] = data.index.names + elif primary_key is not None: + schema["primaryKey"] = primary_key + + if version: + schema["pandas_version"] = TABLE_SCHEMA_VERSION + return schema + + +def parse_table_schema(json, precise_float: bool) -> DataFrame: + """ + Builds a DataFrame from a given schema + + Parameters + ---------- + json : + A JSON table schema + precise_float : bool + Flag controlling precision when decoding string to double values, as + dictated by ``read_json`` + + Returns + ------- + df : DataFrame + + Raises + ------ + NotImplementedError + If the JSON table schema contains either timezone or timedelta data + + Notes + ----- + Because :func:`DataFrame.to_json` uses the string 'index' to denote a + name-less :class:`Index`, this function sets the name of the returned + :class:`DataFrame` to ``None`` when said string is encountered with a + normal :class:`Index`. For a :class:`MultiIndex`, the same limitation + applies to any strings beginning with 'level_'. Therefore, an + :class:`Index` name of 'index' and :class:`MultiIndex` names starting + with 'level_' are not supported. + + See Also + -------- + build_table_schema : Inverse function. + pandas.read_json + """ + table = ujson_loads(json, precise_float=precise_float) + col_order = [field["name"] for field in table["schema"]["fields"]] + df = DataFrame(table["data"], columns=col_order)[col_order] + + dtypes = { + field["name"]: convert_json_field_to_pandas_type(field) + for field in table["schema"]["fields"] + } + + # No ISO constructor for Timedelta as of yet, so need to raise + if "timedelta64" in dtypes.values(): + raise NotImplementedError( + 'table="orient" can not yet read ISO-formatted Timedelta data' + ) + + df = df.astype(dtypes) + + if "primaryKey" in table["schema"]: + df = df.set_index(table["schema"]["primaryKey"]) + if len(df.index.names) == 1: + if df.index.name == "index": + df.index.name = None + else: + df.index.names = [ + None if x.startswith("level_") else x for x in df.index.names + ] + + return df diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/io/parsers/__init__.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/io/parsers/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..ff11968db15f0f7c6057a46c252a91daee7b9cd9 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/io/parsers/__init__.py @@ -0,0 +1,9 @@ +from pandas.io.parsers.readers import ( + TextFileReader, + TextParser, + read_csv, + read_fwf, + read_table, +) + +__all__ = ["TextFileReader", "TextParser", "read_csv", "read_fwf", "read_table"] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/io/parsers/arrow_parser_wrapper.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/io/parsers/arrow_parser_wrapper.py new file mode 100644 index 0000000000000000000000000000000000000000..890b22154648e6b12d636c5df3595d105ff02ac9 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/io/parsers/arrow_parser_wrapper.py @@ -0,0 +1,303 @@ +from __future__ import annotations + +from typing import TYPE_CHECKING +import warnings + +from pandas._config import using_pyarrow_string_dtype + +from pandas._libs import lib +from pandas.compat._optional import import_optional_dependency +from pandas.errors import ( + ParserError, + ParserWarning, +) +from pandas.util._exceptions import find_stack_level + +from pandas.core.dtypes.common import pandas_dtype +from pandas.core.dtypes.inference import is_integer + +import pandas as pd +from pandas import DataFrame + +from pandas.io._util import ( + _arrow_dtype_mapping, + arrow_string_types_mapper, +) +from pandas.io.parsers.base_parser import ParserBase + +if TYPE_CHECKING: + from pandas._typing import ReadBuffer + + +class ArrowParserWrapper(ParserBase): + """ + Wrapper for the pyarrow engine for read_csv() + """ + + def __init__(self, src: ReadBuffer[bytes], **kwds) -> None: + super().__init__(kwds) + self.kwds = kwds + self.src = src + + self._parse_kwds() + + def _parse_kwds(self) -> None: + """ + Validates keywords before passing to pyarrow. + """ + encoding: str | None = self.kwds.get("encoding") + self.encoding = "utf-8" if encoding is None else encoding + + na_values = self.kwds["na_values"] + if isinstance(na_values, dict): + raise ValueError( + "The pyarrow engine doesn't support passing a dict for na_values" + ) + self.na_values = list(self.kwds["na_values"]) + + def _get_pyarrow_options(self) -> None: + """ + Rename some arguments to pass to pyarrow + """ + mapping = { + "usecols": "include_columns", + "na_values": "null_values", + "escapechar": "escape_char", + "skip_blank_lines": "ignore_empty_lines", + "decimal": "decimal_point", + "quotechar": "quote_char", + } + for pandas_name, pyarrow_name in mapping.items(): + if pandas_name in self.kwds and self.kwds.get(pandas_name) is not None: + self.kwds[pyarrow_name] = self.kwds.pop(pandas_name) + + # Date format handling + # If we get a string, we need to convert it into a list for pyarrow + # If we get a dict, we want to parse those separately + date_format = self.date_format + if isinstance(date_format, str): + date_format = [date_format] + else: + # In case of dict, we don't want to propagate through, so + # just set to pyarrow default of None + + # Ideally, in future we disable pyarrow dtype inference (read in as string) + # to prevent misreads. + date_format = None + self.kwds["timestamp_parsers"] = date_format + + self.parse_options = { + option_name: option_value + for option_name, option_value in self.kwds.items() + if option_value is not None + and option_name + in ("delimiter", "quote_char", "escape_char", "ignore_empty_lines") + } + + on_bad_lines = self.kwds.get("on_bad_lines") + if on_bad_lines is not None: + if callable(on_bad_lines): + self.parse_options["invalid_row_handler"] = on_bad_lines + elif on_bad_lines == ParserBase.BadLineHandleMethod.ERROR: + self.parse_options[ + "invalid_row_handler" + ] = None # PyArrow raises an exception by default + elif on_bad_lines == ParserBase.BadLineHandleMethod.WARN: + + def handle_warning(invalid_row) -> str: + warnings.warn( + f"Expected {invalid_row.expected_columns} columns, but found " + f"{invalid_row.actual_columns}: {invalid_row.text}", + ParserWarning, + stacklevel=find_stack_level(), + ) + return "skip" + + self.parse_options["invalid_row_handler"] = handle_warning + elif on_bad_lines == ParserBase.BadLineHandleMethod.SKIP: + self.parse_options["invalid_row_handler"] = lambda _: "skip" + + self.convert_options = { + option_name: option_value + for option_name, option_value in self.kwds.items() + if option_value is not None + and option_name + in ( + "include_columns", + "null_values", + "true_values", + "false_values", + "decimal_point", + "timestamp_parsers", + ) + } + self.convert_options["strings_can_be_null"] = "" in self.kwds["null_values"] + # autogenerated column names are prefixed with 'f' in pyarrow.csv + if self.header is None and "include_columns" in self.convert_options: + self.convert_options["include_columns"] = [ + f"f{n}" for n in self.convert_options["include_columns"] + ] + + self.read_options = { + "autogenerate_column_names": self.header is None, + "skip_rows": self.header + if self.header is not None + else self.kwds["skiprows"], + "encoding": self.encoding, + } + + def _finalize_pandas_output(self, frame: DataFrame) -> DataFrame: + """ + Processes data read in based on kwargs. + + Parameters + ---------- + frame: DataFrame + The DataFrame to process. + + Returns + ------- + DataFrame + The processed DataFrame. + """ + num_cols = len(frame.columns) + multi_index_named = True + if self.header is None: + if self.names is None: + if self.header is None: + self.names = range(num_cols) + if len(self.names) != num_cols: + # usecols is passed through to pyarrow, we only handle index col here + # The only way self.names is not the same length as number of cols is + # if we have int index_col. We should just pad the names(they will get + # removed anyways) to expected length then. + self.names = list(range(num_cols - len(self.names))) + self.names + multi_index_named = False + frame.columns = self.names + # we only need the frame not the names + _, frame = self._do_date_conversions(frame.columns, frame) + if self.index_col is not None: + index_to_set = self.index_col.copy() + for i, item in enumerate(self.index_col): + if is_integer(item): + index_to_set[i] = frame.columns[item] + # String case + elif item not in frame.columns: + raise ValueError(f"Index {item} invalid") + + # Process dtype for index_col and drop from dtypes + if self.dtype is not None: + key, new_dtype = ( + (item, self.dtype.get(item)) + if self.dtype.get(item) is not None + else (frame.columns[item], self.dtype.get(frame.columns[item])) + ) + if new_dtype is not None: + frame[key] = frame[key].astype(new_dtype) + del self.dtype[key] + + frame.set_index(index_to_set, drop=True, inplace=True) + # Clear names if headerless and no name given + if self.header is None and not multi_index_named: + frame.index.names = [None] * len(frame.index.names) + + if self.dtype is not None: + # Ignore non-existent columns from dtype mapping + # like other parsers do + if isinstance(self.dtype, dict): + self.dtype = { + k: pandas_dtype(v) + for k, v in self.dtype.items() + if k in frame.columns + } + else: + self.dtype = pandas_dtype(self.dtype) + try: + frame = frame.astype(self.dtype) + except TypeError as e: + # GH#44901 reraise to keep api consistent + raise ValueError(e) + return frame + + def _validate_usecols(self, usecols) -> None: + if lib.is_list_like(usecols) and not all(isinstance(x, str) for x in usecols): + raise ValueError( + "The pyarrow engine does not allow 'usecols' to be integer " + "column positions. Pass a list of string column names instead." + ) + elif callable(usecols): + raise ValueError( + "The pyarrow engine does not allow 'usecols' to be a callable." + ) + + def read(self) -> DataFrame: + """ + Reads the contents of a CSV file into a DataFrame and + processes it according to the kwargs passed in the + constructor. + + Returns + ------- + DataFrame + The DataFrame created from the CSV file. + """ + pa = import_optional_dependency("pyarrow") + pyarrow_csv = import_optional_dependency("pyarrow.csv") + self._get_pyarrow_options() + + try: + convert_options = pyarrow_csv.ConvertOptions(**self.convert_options) + except TypeError: + include = self.convert_options.get("include_columns", None) + if include is not None: + self._validate_usecols(include) + + nulls = self.convert_options.get("null_values", set()) + if not lib.is_list_like(nulls) or not all( + isinstance(x, str) for x in nulls + ): + raise TypeError( + "The 'pyarrow' engine requires all na_values to be strings" + ) + + raise + + try: + table = pyarrow_csv.read_csv( + self.src, + read_options=pyarrow_csv.ReadOptions(**self.read_options), + parse_options=pyarrow_csv.ParseOptions(**self.parse_options), + convert_options=convert_options, + ) + except pa.ArrowInvalid as e: + raise ParserError(e) from e + + dtype_backend = self.kwds["dtype_backend"] + + # Convert all pa.null() cols -> float64 (non nullable) + # else Int64 (nullable case, see below) + if dtype_backend is lib.no_default: + new_schema = table.schema + new_type = pa.float64() + for i, arrow_type in enumerate(table.schema.types): + if pa.types.is_null(arrow_type): + new_schema = new_schema.set( + i, new_schema.field(i).with_type(new_type) + ) + + table = table.cast(new_schema) + + if dtype_backend == "pyarrow": + frame = table.to_pandas(types_mapper=pd.ArrowDtype) + elif dtype_backend == "numpy_nullable": + # Modify the default mapping to also + # map null to Int64 (to match other engines) + dtype_mapping = _arrow_dtype_mapping() + dtype_mapping[pa.null()] = pd.Int64Dtype() + frame = table.to_pandas(types_mapper=dtype_mapping.get) + elif using_pyarrow_string_dtype(): + frame = table.to_pandas(types_mapper=arrow_string_types_mapper()) + + else: + frame = table.to_pandas() + return self._finalize_pandas_output(frame) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/io/parsers/base_parser.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/io/parsers/base_parser.py new file mode 100644 index 0000000000000000000000000000000000000000..09f0f2af8e5c6b55bff173ff74cd290fdf61cbae --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/io/parsers/base_parser.py @@ -0,0 +1,1448 @@ +from __future__ import annotations + +from collections import defaultdict +from copy import copy +import csv +import datetime +from enum import Enum +import itertools +from typing import ( + TYPE_CHECKING, + Any, + Callable, + cast, + final, + overload, +) +import warnings + +import numpy as np + +from pandas._libs import ( + lib, + parsers, +) +import pandas._libs.ops as libops +from pandas._libs.parsers import STR_NA_VALUES +from pandas._libs.tslibs import parsing +from pandas.compat._optional import import_optional_dependency +from pandas.errors import ( + ParserError, + ParserWarning, +) +from pandas.util._exceptions import find_stack_level + +from pandas.core.dtypes.astype import astype_array +from pandas.core.dtypes.common import ( + ensure_object, + is_bool_dtype, + is_dict_like, + is_extension_array_dtype, + is_float_dtype, + is_integer, + is_integer_dtype, + is_list_like, + is_object_dtype, + is_scalar, + is_string_dtype, + pandas_dtype, +) +from pandas.core.dtypes.dtypes import ( + CategoricalDtype, + ExtensionDtype, +) +from pandas.core.dtypes.missing import isna + +from pandas import ( + ArrowDtype, + DataFrame, + DatetimeIndex, + StringDtype, + concat, +) +from pandas.core import algorithms +from pandas.core.arrays import ( + ArrowExtensionArray, + BaseMaskedArray, + BooleanArray, + Categorical, + ExtensionArray, + FloatingArray, + IntegerArray, +) +from pandas.core.arrays.boolean import BooleanDtype +from pandas.core.indexes.api import ( + Index, + MultiIndex, + default_index, + ensure_index_from_sequences, +) +from pandas.core.series import Series +from pandas.core.tools import datetimes as tools + +from pandas.io.common import is_potential_multi_index + +if TYPE_CHECKING: + from collections.abc import ( + Hashable, + Iterable, + Mapping, + Sequence, + ) + + from pandas._typing import ( + ArrayLike, + DtypeArg, + DtypeObj, + Scalar, + ) + + +class ParserBase: + class BadLineHandleMethod(Enum): + ERROR = 0 + WARN = 1 + SKIP = 2 + + _implicit_index: bool + _first_chunk: bool + keep_default_na: bool + dayfirst: bool + cache_dates: bool + keep_date_col: bool + usecols_dtype: str | None + + def __init__(self, kwds) -> None: + self._implicit_index = False + + self.names = kwds.get("names") + self.orig_names: Sequence[Hashable] | None = None + + self.index_col = kwds.get("index_col", None) + self.unnamed_cols: set = set() + self.index_names: Sequence[Hashable] | None = None + self.col_names: Sequence[Hashable] | None = None + + self.parse_dates = _validate_parse_dates_arg(kwds.pop("parse_dates", False)) + self._parse_date_cols: Iterable = [] + self.date_parser = kwds.pop("date_parser", lib.no_default) + self.date_format = kwds.pop("date_format", None) + self.dayfirst = kwds.pop("dayfirst", False) + self.keep_date_col = kwds.pop("keep_date_col", False) + + self.na_values = kwds.get("na_values") + self.na_fvalues = kwds.get("na_fvalues") + self.na_filter = kwds.get("na_filter", False) + self.keep_default_na = kwds.get("keep_default_na", True) + + self.dtype = copy(kwds.get("dtype", None)) + self.converters = kwds.get("converters") + self.dtype_backend = kwds.get("dtype_backend") + + self.true_values = kwds.get("true_values") + self.false_values = kwds.get("false_values") + self.cache_dates = kwds.pop("cache_dates", True) + + self._date_conv = _make_date_converter( + date_parser=self.date_parser, + date_format=self.date_format, + dayfirst=self.dayfirst, + cache_dates=self.cache_dates, + ) + + # validate header options for mi + self.header = kwds.get("header") + if is_list_like(self.header, allow_sets=False): + if kwds.get("usecols"): + raise ValueError( + "cannot specify usecols when specifying a multi-index header" + ) + if kwds.get("names"): + raise ValueError( + "cannot specify names when specifying a multi-index header" + ) + + # validate index_col that only contains integers + if self.index_col is not None: + # In this case we can pin down index_col as list[int] + if is_integer(self.index_col): + self.index_col = [self.index_col] + elif not ( + is_list_like(self.index_col, allow_sets=False) + and all(map(is_integer, self.index_col)) + ): + raise ValueError( + "index_col must only contain row numbers " + "when specifying a multi-index header" + ) + else: + self.index_col = list(self.index_col) + + self._name_processed = False + + self._first_chunk = True + + self.usecols, self.usecols_dtype = self._validate_usecols_arg(kwds["usecols"]) + + # Fallback to error to pass a sketchy test(test_override_set_noconvert_columns) + # Normally, this arg would get pre-processed earlier on + self.on_bad_lines = kwds.get("on_bad_lines", self.BadLineHandleMethod.ERROR) + + def _validate_parse_dates_presence(self, columns: Sequence[Hashable]) -> Iterable: + """ + Check if parse_dates are in columns. + + If user has provided names for parse_dates, check if those columns + are available. + + Parameters + ---------- + columns : list + List of names of the dataframe. + + Returns + ------- + The names of the columns which will get parsed later if a dict or list + is given as specification. + + Raises + ------ + ValueError + If column to parse_date is not in dataframe. + + """ + cols_needed: Iterable + if is_dict_like(self.parse_dates): + cols_needed = itertools.chain(*self.parse_dates.values()) + elif is_list_like(self.parse_dates): + # a column in parse_dates could be represented + # ColReference = Union[int, str] + # DateGroups = List[ColReference] + # ParseDates = Union[DateGroups, List[DateGroups], + # Dict[ColReference, DateGroups]] + cols_needed = itertools.chain.from_iterable( + col if is_list_like(col) and not isinstance(col, tuple) else [col] + for col in self.parse_dates + ) + else: + cols_needed = [] + + cols_needed = list(cols_needed) + + # get only columns that are references using names (str), not by index + missing_cols = ", ".join( + sorted( + { + col + for col in cols_needed + if isinstance(col, str) and col not in columns + } + ) + ) + if missing_cols: + raise ValueError( + f"Missing column provided to 'parse_dates': '{missing_cols}'" + ) + # Convert positions to actual column names + return [ + col if (isinstance(col, str) or col in columns) else columns[col] + for col in cols_needed + ] + + def close(self) -> None: + pass + + @final + @property + def _has_complex_date_col(self) -> bool: + return isinstance(self.parse_dates, dict) or ( + isinstance(self.parse_dates, list) + and len(self.parse_dates) > 0 + and isinstance(self.parse_dates[0], list) + ) + + @final + def _should_parse_dates(self, i: int) -> bool: + if lib.is_bool(self.parse_dates): + return bool(self.parse_dates) + else: + if self.index_names is not None: + name = self.index_names[i] + else: + name = None + j = i if self.index_col is None else self.index_col[i] + + return (j in self.parse_dates) or ( + name is not None and name in self.parse_dates + ) + + @final + def _extract_multi_indexer_columns( + self, + header, + index_names: Sequence[Hashable] | None, + passed_names: bool = False, + ) -> tuple[ + Sequence[Hashable], Sequence[Hashable] | None, Sequence[Hashable] | None, bool + ]: + """ + Extract and return the names, index_names, col_names if the column + names are a MultiIndex. + + Parameters + ---------- + header: list of lists + The header rows + index_names: list, optional + The names of the future index + passed_names: bool, default False + A flag specifying if names where passed + + """ + if len(header) < 2: + return header[0], index_names, None, passed_names + + # the names are the tuples of the header that are not the index cols + # 0 is the name of the index, assuming index_col is a list of column + # numbers + ic = self.index_col + if ic is None: + ic = [] + + if not isinstance(ic, (list, tuple, np.ndarray)): + ic = [ic] + sic = set(ic) + + # clean the index_names + index_names = header.pop(-1) + index_names, _, _ = self._clean_index_names(index_names, self.index_col) + + # extract the columns + field_count = len(header[0]) + + # check if header lengths are equal + if not all(len(header_iter) == field_count for header_iter in header[1:]): + raise ParserError("Header rows must have an equal number of columns.") + + def extract(r): + return tuple(r[i] for i in range(field_count) if i not in sic) + + columns = list(zip(*(extract(r) for r in header))) + names = columns.copy() + for single_ic in sorted(ic): + names.insert(single_ic, single_ic) + + # Clean the column names (if we have an index_col). + if len(ic): + col_names = [ + r[ic[0]] + if ((r[ic[0]] is not None) and r[ic[0]] not in self.unnamed_cols) + else None + for r in header + ] + else: + col_names = [None] * len(header) + + passed_names = True + + return names, index_names, col_names, passed_names + + @final + def _maybe_make_multi_index_columns( + self, + columns: Sequence[Hashable], + col_names: Sequence[Hashable] | None = None, + ) -> Sequence[Hashable] | MultiIndex: + # possibly create a column mi here + if is_potential_multi_index(columns): + list_columns = cast(list[tuple], columns) + return MultiIndex.from_tuples(list_columns, names=col_names) + return columns + + @final + def _make_index( + self, data, alldata, columns, indexnamerow: list[Scalar] | None = None + ) -> tuple[Index | None, Sequence[Hashable] | MultiIndex]: + index: Index | None + if not is_index_col(self.index_col) or not self.index_col: + index = None + + elif not self._has_complex_date_col: + simple_index = self._get_simple_index(alldata, columns) + index = self._agg_index(simple_index) + elif self._has_complex_date_col: + if not self._name_processed: + (self.index_names, _, self.index_col) = self._clean_index_names( + list(columns), self.index_col + ) + self._name_processed = True + date_index = self._get_complex_date_index(data, columns) + index = self._agg_index(date_index, try_parse_dates=False) + + # add names for the index + if indexnamerow: + coffset = len(indexnamerow) - len(columns) + assert index is not None + index = index.set_names(indexnamerow[:coffset]) + + # maybe create a mi on the columns + columns = self._maybe_make_multi_index_columns(columns, self.col_names) + + return index, columns + + @final + def _get_simple_index(self, data, columns): + def ix(col): + if not isinstance(col, str): + return col + raise ValueError(f"Index {col} invalid") + + to_remove = [] + index = [] + for idx in self.index_col: + i = ix(idx) + to_remove.append(i) + index.append(data[i]) + + # remove index items from content and columns, don't pop in + # loop + for i in sorted(to_remove, reverse=True): + data.pop(i) + if not self._implicit_index: + columns.pop(i) + + return index + + @final + def _get_complex_date_index(self, data, col_names): + def _get_name(icol): + if isinstance(icol, str): + return icol + + if col_names is None: + raise ValueError(f"Must supply column order to use {icol!s} as index") + + for i, c in enumerate(col_names): + if i == icol: + return c + + to_remove = [] + index = [] + for idx in self.index_col: + name = _get_name(idx) + to_remove.append(name) + index.append(data[name]) + + # remove index items from content and columns, don't pop in + # loop + for c in sorted(to_remove, reverse=True): + data.pop(c) + col_names.remove(c) + + return index + + @final + def _clean_mapping(self, mapping): + """converts col numbers to names""" + if not isinstance(mapping, dict): + return mapping + clean = {} + # for mypy + assert self.orig_names is not None + + for col, v in mapping.items(): + if isinstance(col, int) and col not in self.orig_names: + col = self.orig_names[col] + clean[col] = v + if isinstance(mapping, defaultdict): + remaining_cols = set(self.orig_names) - set(clean.keys()) + clean.update({col: mapping[col] for col in remaining_cols}) + return clean + + @final + def _agg_index(self, index, try_parse_dates: bool = True) -> Index: + arrays = [] + converters = self._clean_mapping(self.converters) + + for i, arr in enumerate(index): + if try_parse_dates and self._should_parse_dates(i): + arr = self._date_conv( + arr, + col=self.index_names[i] if self.index_names is not None else None, + ) + + if self.na_filter: + col_na_values = self.na_values + col_na_fvalues = self.na_fvalues + else: + col_na_values = set() + col_na_fvalues = set() + + if isinstance(self.na_values, dict): + assert self.index_names is not None + col_name = self.index_names[i] + if col_name is not None: + col_na_values, col_na_fvalues = _get_na_values( + col_name, self.na_values, self.na_fvalues, self.keep_default_na + ) + + clean_dtypes = self._clean_mapping(self.dtype) + + cast_type = None + index_converter = False + if self.index_names is not None: + if isinstance(clean_dtypes, dict): + cast_type = clean_dtypes.get(self.index_names[i], None) + + if isinstance(converters, dict): + index_converter = converters.get(self.index_names[i]) is not None + + try_num_bool = not ( + cast_type and is_string_dtype(cast_type) or index_converter + ) + + arr, _ = self._infer_types( + arr, col_na_values | col_na_fvalues, cast_type is None, try_num_bool + ) + arrays.append(arr) + + names = self.index_names + index = ensure_index_from_sequences(arrays, names) + + return index + + @final + def _convert_to_ndarrays( + self, + dct: Mapping, + na_values, + na_fvalues, + verbose: bool = False, + converters=None, + dtypes=None, + ): + result = {} + for c, values in dct.items(): + conv_f = None if converters is None else converters.get(c, None) + if isinstance(dtypes, dict): + cast_type = dtypes.get(c, None) + else: + # single dtype or None + cast_type = dtypes + + if self.na_filter: + col_na_values, col_na_fvalues = _get_na_values( + c, na_values, na_fvalues, self.keep_default_na + ) + else: + col_na_values, col_na_fvalues = set(), set() + + if c in self._parse_date_cols: + # GH#26203 Do not convert columns which get converted to dates + # but replace nans to ensure to_datetime works + mask = algorithms.isin(values, set(col_na_values) | col_na_fvalues) + np.putmask(values, mask, np.nan) + result[c] = values + continue + + if conv_f is not None: + # conv_f applied to data before inference + if cast_type is not None: + warnings.warn( + ( + "Both a converter and dtype were specified " + f"for column {c} - only the converter will be used." + ), + ParserWarning, + stacklevel=find_stack_level(), + ) + + try: + values = lib.map_infer(values, conv_f) + except ValueError: + mask = algorithms.isin(values, list(na_values)).view(np.uint8) + values = lib.map_infer_mask(values, conv_f, mask) + + cvals, na_count = self._infer_types( + values, + set(col_na_values) | col_na_fvalues, + cast_type is None, + try_num_bool=False, + ) + else: + is_ea = is_extension_array_dtype(cast_type) + is_str_or_ea_dtype = is_ea or is_string_dtype(cast_type) + # skip inference if specified dtype is object + # or casting to an EA + try_num_bool = not (cast_type and is_str_or_ea_dtype) + + # general type inference and conversion + cvals, na_count = self._infer_types( + values, + set(col_na_values) | col_na_fvalues, + cast_type is None, + try_num_bool, + ) + + # type specified in dtype param or cast_type is an EA + if cast_type is not None: + cast_type = pandas_dtype(cast_type) + if cast_type and (cvals.dtype != cast_type or is_ea): + if not is_ea and na_count > 0: + if is_bool_dtype(cast_type): + raise ValueError(f"Bool column has NA values in column {c}") + cvals = self._cast_types(cvals, cast_type, c) + + result[c] = cvals + if verbose and na_count: + print(f"Filled {na_count} NA values in column {c!s}") + return result + + @final + def _set_noconvert_dtype_columns( + self, col_indices: list[int], names: Sequence[Hashable] + ) -> set[int]: + """ + Set the columns that should not undergo dtype conversions. + + Currently, any column that is involved with date parsing will not + undergo such conversions. If usecols is specified, the positions of the columns + not to cast is relative to the usecols not to all columns. + + Parameters + ---------- + col_indices: The indices specifying order and positions of the columns + names: The column names which order is corresponding with the order + of col_indices + + Returns + ------- + A set of integers containing the positions of the columns not to convert. + """ + usecols: list[int] | list[str] | None + noconvert_columns = set() + if self.usecols_dtype == "integer": + # A set of integers will be converted to a list in + # the correct order every single time. + usecols = sorted(self.usecols) + elif callable(self.usecols) or self.usecols_dtype not in ("empty", None): + # The names attribute should have the correct columns + # in the proper order for indexing with parse_dates. + usecols = col_indices + else: + # Usecols is empty. + usecols = None + + def _set(x) -> int: + if usecols is not None and is_integer(x): + x = usecols[x] + + if not is_integer(x): + x = col_indices[names.index(x)] + + return x + + if isinstance(self.parse_dates, list): + for val in self.parse_dates: + if isinstance(val, list): + for k in val: + noconvert_columns.add(_set(k)) + else: + noconvert_columns.add(_set(val)) + + elif isinstance(self.parse_dates, dict): + for val in self.parse_dates.values(): + if isinstance(val, list): + for k in val: + noconvert_columns.add(_set(k)) + else: + noconvert_columns.add(_set(val)) + + elif self.parse_dates: + if isinstance(self.index_col, list): + for k in self.index_col: + noconvert_columns.add(_set(k)) + elif self.index_col is not None: + noconvert_columns.add(_set(self.index_col)) + + return noconvert_columns + + @final + def _infer_types( + self, values, na_values, no_dtype_specified, try_num_bool: bool = True + ) -> tuple[ArrayLike, int]: + """ + Infer types of values, possibly casting + + Parameters + ---------- + values : ndarray + na_values : set + no_dtype_specified: Specifies if we want to cast explicitly + try_num_bool : bool, default try + try to cast values to numeric (first preference) or boolean + + Returns + ------- + converted : ndarray or ExtensionArray + na_count : int + """ + na_count = 0 + if issubclass(values.dtype.type, (np.number, np.bool_)): + # If our array has numeric dtype, we don't have to check for strings in isin + na_values = np.array([val for val in na_values if not isinstance(val, str)]) + mask = algorithms.isin(values, na_values) + na_count = mask.astype("uint8", copy=False).sum() + if na_count > 0: + if is_integer_dtype(values): + values = values.astype(np.float64) + np.putmask(values, mask, np.nan) + return values, na_count + + dtype_backend = self.dtype_backend + non_default_dtype_backend = ( + no_dtype_specified and dtype_backend is not lib.no_default + ) + result: ArrayLike + + if try_num_bool and is_object_dtype(values.dtype): + # exclude e.g DatetimeIndex here + try: + result, result_mask = lib.maybe_convert_numeric( + values, + na_values, + False, + convert_to_masked_nullable=non_default_dtype_backend, # type: ignore[arg-type] + ) + except (ValueError, TypeError): + # e.g. encountering datetime string gets ValueError + # TypeError can be raised in floatify + na_count = parsers.sanitize_objects(values, na_values) + result = values + else: + if non_default_dtype_backend: + if result_mask is None: + result_mask = np.zeros(result.shape, dtype=np.bool_) + + if result_mask.all(): + result = IntegerArray( + np.ones(result_mask.shape, dtype=np.int64), result_mask + ) + elif is_integer_dtype(result): + result = IntegerArray(result, result_mask) + elif is_bool_dtype(result): + result = BooleanArray(result, result_mask) + elif is_float_dtype(result): + result = FloatingArray(result, result_mask) + + na_count = result_mask.sum() + else: + na_count = isna(result).sum() + else: + result = values + if values.dtype == np.object_: + na_count = parsers.sanitize_objects(values, na_values) + + if result.dtype == np.object_ and try_num_bool: + result, bool_mask = libops.maybe_convert_bool( + np.asarray(values), + true_values=self.true_values, + false_values=self.false_values, + convert_to_masked_nullable=non_default_dtype_backend, # type: ignore[arg-type] + ) + if result.dtype == np.bool_ and non_default_dtype_backend: + if bool_mask is None: + bool_mask = np.zeros(result.shape, dtype=np.bool_) + result = BooleanArray(result, bool_mask) + elif result.dtype == np.object_ and non_default_dtype_backend: + # read_excel sends array of datetime objects + if not lib.is_datetime_array(result, skipna=True): + dtype = StringDtype() + cls = dtype.construct_array_type() + result = cls._from_sequence(values, dtype=dtype) + + if dtype_backend == "pyarrow": + pa = import_optional_dependency("pyarrow") + if isinstance(result, np.ndarray): + result = ArrowExtensionArray(pa.array(result, from_pandas=True)) + elif isinstance(result, BaseMaskedArray): + if result._mask.all(): + # We want an arrow null array here + result = ArrowExtensionArray(pa.array([None] * len(result))) + else: + result = ArrowExtensionArray( + pa.array(result._data, mask=result._mask) + ) + else: + result = ArrowExtensionArray( + pa.array(result.to_numpy(), from_pandas=True) + ) + + return result, na_count + + @final + def _cast_types(self, values: ArrayLike, cast_type: DtypeObj, column) -> ArrayLike: + """ + Cast values to specified type + + Parameters + ---------- + values : ndarray or ExtensionArray + cast_type : np.dtype or ExtensionDtype + dtype to cast values to + column : string + column name - used only for error reporting + + Returns + ------- + converted : ndarray or ExtensionArray + """ + if isinstance(cast_type, CategoricalDtype): + known_cats = cast_type.categories is not None + + if not is_object_dtype(values.dtype) and not known_cats: + # TODO: this is for consistency with + # c-parser which parses all categories + # as strings + values = lib.ensure_string_array( + values, skipna=False, convert_na_value=False + ) + + cats = Index(values).unique().dropna() + values = Categorical._from_inferred_categories( + cats, cats.get_indexer(values), cast_type, true_values=self.true_values + ) + + # use the EA's implementation of casting + elif isinstance(cast_type, ExtensionDtype): + array_type = cast_type.construct_array_type() + try: + if isinstance(cast_type, BooleanDtype): + # error: Unexpected keyword argument "true_values" for + # "_from_sequence_of_strings" of "ExtensionArray" + return array_type._from_sequence_of_strings( # type: ignore[call-arg] + values, + dtype=cast_type, + true_values=self.true_values, + false_values=self.false_values, + ) + else: + return array_type._from_sequence_of_strings(values, dtype=cast_type) + except NotImplementedError as err: + raise NotImplementedError( + f"Extension Array: {array_type} must implement " + "_from_sequence_of_strings in order to be used in parser methods" + ) from err + + elif isinstance(values, ExtensionArray): + values = values.astype(cast_type, copy=False) + elif issubclass(cast_type.type, str): + # TODO: why skipna=True here and False above? some tests depend + # on it here, but nothing fails if we change it above + # (as no tests get there as of 2022-12-06) + values = lib.ensure_string_array( + values, skipna=True, convert_na_value=False + ) + else: + try: + values = astype_array(values, cast_type, copy=True) + except ValueError as err: + raise ValueError( + f"Unable to convert column {column} to type {cast_type}" + ) from err + return values + + @overload + def _do_date_conversions( + self, + names: Index, + data: DataFrame, + ) -> tuple[Sequence[Hashable] | Index, DataFrame]: + ... + + @overload + def _do_date_conversions( + self, + names: Sequence[Hashable], + data: Mapping[Hashable, ArrayLike], + ) -> tuple[Sequence[Hashable], Mapping[Hashable, ArrayLike]]: + ... + + @final + def _do_date_conversions( + self, + names: Sequence[Hashable] | Index, + data: Mapping[Hashable, ArrayLike] | DataFrame, + ) -> tuple[Sequence[Hashable] | Index, Mapping[Hashable, ArrayLike] | DataFrame]: + # returns data, columns + + if self.parse_dates is not None: + data, names = _process_date_conversion( + data, + self._date_conv, + self.parse_dates, + self.index_col, + self.index_names, + names, + keep_date_col=self.keep_date_col, + dtype_backend=self.dtype_backend, + ) + + return names, data + + @final + def _check_data_length( + self, + columns: Sequence[Hashable], + data: Sequence[ArrayLike], + ) -> None: + """Checks if length of data is equal to length of column names. + + One set of trailing commas is allowed. self.index_col not False + results in a ParserError previously when lengths do not match. + + Parameters + ---------- + columns: list of column names + data: list of array-likes containing the data column-wise. + """ + if not self.index_col and len(columns) != len(data) and columns: + empty_str = is_object_dtype(data[-1]) and data[-1] == "" + # error: No overload variant of "__ror__" of "ndarray" matches + # argument type "ExtensionArray" + empty_str_or_na = empty_str | isna(data[-1]) # type: ignore[operator] + if len(columns) == len(data) - 1 and np.all(empty_str_or_na): + return + warnings.warn( + "Length of header or names does not match length of data. This leads " + "to a loss of data with index_col=False.", + ParserWarning, + stacklevel=find_stack_level(), + ) + + @overload + def _evaluate_usecols( + self, + usecols: set[int] | Callable[[Hashable], object], + names: Sequence[Hashable], + ) -> set[int]: + ... + + @overload + def _evaluate_usecols( + self, usecols: set[str], names: Sequence[Hashable] + ) -> set[str]: + ... + + @final + def _evaluate_usecols( + self, + usecols: Callable[[Hashable], object] | set[str] | set[int], + names: Sequence[Hashable], + ) -> set[str] | set[int]: + """ + Check whether or not the 'usecols' parameter + is a callable. If so, enumerates the 'names' + parameter and returns a set of indices for + each entry in 'names' that evaluates to True. + If not a callable, returns 'usecols'. + """ + if callable(usecols): + return {i for i, name in enumerate(names) if usecols(name)} + return usecols + + @final + def _validate_usecols_names(self, usecols, names: Sequence): + """ + Validates that all usecols are present in a given + list of names. If not, raise a ValueError that + shows what usecols are missing. + + Parameters + ---------- + usecols : iterable of usecols + The columns to validate are present in names. + names : iterable of names + The column names to check against. + + Returns + ------- + usecols : iterable of usecols + The `usecols` parameter if the validation succeeds. + + Raises + ------ + ValueError : Columns were missing. Error message will list them. + """ + missing = [c for c in usecols if c not in names] + if len(missing) > 0: + raise ValueError( + f"Usecols do not match columns, columns expected but not found: " + f"{missing}" + ) + + return usecols + + @final + def _validate_usecols_arg(self, usecols): + """ + Validate the 'usecols' parameter. + + Checks whether or not the 'usecols' parameter contains all integers + (column selection by index), strings (column by name) or is a callable. + Raises a ValueError if that is not the case. + + Parameters + ---------- + usecols : list-like, callable, or None + List of columns to use when parsing or a callable that can be used + to filter a list of table columns. + + Returns + ------- + usecols_tuple : tuple + A tuple of (verified_usecols, usecols_dtype). + + 'verified_usecols' is either a set if an array-like is passed in or + 'usecols' if a callable or None is passed in. + + 'usecols_dtype` is the inferred dtype of 'usecols' if an array-like + is passed in or None if a callable or None is passed in. + """ + msg = ( + "'usecols' must either be list-like of all strings, all unicode, " + "all integers or a callable." + ) + if usecols is not None: + if callable(usecols): + return usecols, None + + if not is_list_like(usecols): + # see gh-20529 + # + # Ensure it is iterable container but not string. + raise ValueError(msg) + + usecols_dtype = lib.infer_dtype(usecols, skipna=False) + + if usecols_dtype not in ("empty", "integer", "string"): + raise ValueError(msg) + + usecols = set(usecols) + + return usecols, usecols_dtype + return usecols, None + + @final + def _clean_index_names(self, columns, index_col) -> tuple[list | None, list, list]: + if not is_index_col(index_col): + return None, columns, index_col + + columns = list(columns) + + # In case of no rows and multiindex columns we have to set index_names to + # list of Nones GH#38292 + if not columns: + return [None] * len(index_col), columns, index_col + + cp_cols = list(columns) + index_names: list[str | int | None] = [] + + # don't mutate + index_col = list(index_col) + + for i, c in enumerate(index_col): + if isinstance(c, str): + index_names.append(c) + for j, name in enumerate(cp_cols): + if name == c: + index_col[i] = j + columns.remove(name) + break + else: + name = cp_cols[c] + columns.remove(name) + index_names.append(name) + + # Only clean index names that were placeholders. + for i, name in enumerate(index_names): + if isinstance(name, str) and name in self.unnamed_cols: + index_names[i] = None + + return index_names, columns, index_col + + @final + def _get_empty_meta(self, columns, dtype: DtypeArg | None = None): + columns = list(columns) + + index_col = self.index_col + index_names = self.index_names + + # Convert `dtype` to a defaultdict of some kind. + # This will enable us to write `dtype[col_name]` + # without worrying about KeyError issues later on. + dtype_dict: defaultdict[Hashable, Any] + if not is_dict_like(dtype): + # if dtype == None, default will be object. + default_dtype = dtype or object + dtype_dict = defaultdict(lambda: default_dtype) + else: + dtype = cast(dict, dtype) + dtype_dict = defaultdict( + lambda: object, + {columns[k] if is_integer(k) else k: v for k, v in dtype.items()}, + ) + + # Even though we have no data, the "index" of the empty DataFrame + # could for example still be an empty MultiIndex. Thus, we need to + # check whether we have any index columns specified, via either: + # + # 1) index_col (column indices) + # 2) index_names (column names) + # + # Both must be non-null to ensure a successful construction. Otherwise, + # we have to create a generic empty Index. + index: Index + if (index_col is None or index_col is False) or index_names is None: + index = default_index(0) + else: + data = [Series([], dtype=dtype_dict[name]) for name in index_names] + index = ensure_index_from_sequences(data, names=index_names) + index_col.sort() + + for i, n in enumerate(index_col): + columns.pop(n - i) + + col_dict = { + col_name: Series([], dtype=dtype_dict[col_name]) for col_name in columns + } + + return index, columns, col_dict + + +def _make_date_converter( + date_parser=lib.no_default, + dayfirst: bool = False, + cache_dates: bool = True, + date_format: dict[Hashable, str] | str | None = None, +): + if date_parser is not lib.no_default: + warnings.warn( + "The argument 'date_parser' is deprecated and will " + "be removed in a future version. " + "Please use 'date_format' instead, or read your data in as 'object' dtype " + "and then call 'to_datetime'.", + FutureWarning, + stacklevel=find_stack_level(), + ) + if date_parser is not lib.no_default and date_format is not None: + raise TypeError("Cannot use both 'date_parser' and 'date_format'") + + def unpack_if_single_element(arg): + # NumPy 1.25 deprecation: https://github.com/numpy/numpy/pull/10615 + if isinstance(arg, np.ndarray) and arg.ndim == 1 and len(arg) == 1: + return arg[0] + return arg + + def converter(*date_cols, col: Hashable): + if len(date_cols) == 1 and date_cols[0].dtype.kind in "Mm": + return date_cols[0] + + if date_parser is lib.no_default: + strs = parsing.concat_date_cols(date_cols) + date_fmt = ( + date_format.get(col) if isinstance(date_format, dict) else date_format + ) + + with warnings.catch_warnings(): + warnings.filterwarnings( + "ignore", + ".*parsing datetimes with mixed time zones will raise an error", + category=FutureWarning, + ) + str_objs = ensure_object(strs) + try: + result = tools.to_datetime( + str_objs, + format=date_fmt, + utc=False, + dayfirst=dayfirst, + cache=cache_dates, + ) + except (ValueError, TypeError): + # test_usecols_with_parse_dates4 + return str_objs + + if isinstance(result, DatetimeIndex): + arr = result.to_numpy() + arr.flags.writeable = True + return arr + return result._values + else: + try: + with warnings.catch_warnings(): + warnings.filterwarnings( + "ignore", + ".*parsing datetimes with mixed time zones " + "will raise an error", + category=FutureWarning, + ) + pre_parsed = date_parser( + *(unpack_if_single_element(arg) for arg in date_cols) + ) + try: + result = tools.to_datetime( + pre_parsed, + cache=cache_dates, + ) + except (ValueError, TypeError): + # test_read_csv_with_custom_date_parser + result = pre_parsed + if isinstance(result, datetime.datetime): + raise Exception("scalar parser") + return result + except Exception: + # e.g. test_datetime_fractional_seconds + with warnings.catch_warnings(): + warnings.filterwarnings( + "ignore", + ".*parsing datetimes with mixed time zones " + "will raise an error", + category=FutureWarning, + ) + pre_parsed = parsing.try_parse_dates( + parsing.concat_date_cols(date_cols), + parser=date_parser, + ) + try: + return tools.to_datetime(pre_parsed) + except (ValueError, TypeError): + # TODO: not reached in tests 2023-10-27; needed? + return pre_parsed + + return converter + + +parser_defaults = { + "delimiter": None, + "escapechar": None, + "quotechar": '"', + "quoting": csv.QUOTE_MINIMAL, + "doublequote": True, + "skipinitialspace": False, + "lineterminator": None, + "header": "infer", + "index_col": None, + "names": None, + "skiprows": None, + "skipfooter": 0, + "nrows": None, + "na_values": None, + "keep_default_na": True, + "true_values": None, + "false_values": None, + "converters": None, + "dtype": None, + "cache_dates": True, + "thousands": None, + "comment": None, + "decimal": ".", + # 'engine': 'c', + "parse_dates": False, + "keep_date_col": False, + "dayfirst": False, + "date_parser": lib.no_default, + "date_format": None, + "usecols": None, + # 'iterator': False, + "chunksize": None, + "verbose": False, + "encoding": None, + "compression": None, + "skip_blank_lines": True, + "encoding_errors": "strict", + "on_bad_lines": ParserBase.BadLineHandleMethod.ERROR, + "dtype_backend": lib.no_default, +} + + +def _process_date_conversion( + data_dict, + converter: Callable, + parse_spec, + index_col, + index_names, + columns, + keep_date_col: bool = False, + dtype_backend=lib.no_default, +): + def _isindex(colspec): + return (isinstance(index_col, list) and colspec in index_col) or ( + isinstance(index_names, list) and colspec in index_names + ) + + new_cols = [] + new_data = {} + + orig_names = columns + columns = list(columns) + + date_cols = set() + + if parse_spec is None or isinstance(parse_spec, bool): + return data_dict, columns + + if isinstance(parse_spec, list): + # list of column lists + for colspec in parse_spec: + if is_scalar(colspec) or isinstance(colspec, tuple): + if isinstance(colspec, int) and colspec not in data_dict: + colspec = orig_names[colspec] + if _isindex(colspec): + continue + elif dtype_backend == "pyarrow": + import pyarrow as pa + + dtype = data_dict[colspec].dtype + if isinstance(dtype, ArrowDtype) and ( + pa.types.is_timestamp(dtype.pyarrow_dtype) + or pa.types.is_date(dtype.pyarrow_dtype) + ): + continue + + # Pyarrow engine returns Series which we need to convert to + # numpy array before converter, its a no-op for other parsers + data_dict[colspec] = converter( + np.asarray(data_dict[colspec]), col=colspec + ) + else: + new_name, col, old_names = _try_convert_dates( + converter, colspec, data_dict, orig_names + ) + if new_name in data_dict: + raise ValueError(f"New date column already in dict {new_name}") + new_data[new_name] = col + new_cols.append(new_name) + date_cols.update(old_names) + + elif isinstance(parse_spec, dict): + # dict of new name to column list + for new_name, colspec in parse_spec.items(): + if new_name in data_dict: + raise ValueError(f"Date column {new_name} already in dict") + + _, col, old_names = _try_convert_dates( + converter, + colspec, + data_dict, + orig_names, + target_name=new_name, + ) + + new_data[new_name] = col + + # If original column can be converted to date we keep the converted values + # This can only happen if values are from single column + if len(colspec) == 1: + new_data[colspec[0]] = col + + new_cols.append(new_name) + date_cols.update(old_names) + + if isinstance(data_dict, DataFrame): + data_dict = concat([DataFrame(new_data), data_dict], axis=1, copy=False) + else: + data_dict.update(new_data) + new_cols.extend(columns) + + if not keep_date_col: + for c in list(date_cols): + data_dict.pop(c) + new_cols.remove(c) + + return data_dict, new_cols + + +def _try_convert_dates( + parser: Callable, colspec, data_dict, columns, target_name: str | None = None +): + colset = set(columns) + colnames = [] + + for c in colspec: + if c in colset: + colnames.append(c) + elif isinstance(c, int) and c not in columns: + colnames.append(columns[c]) + else: + colnames.append(c) + + new_name: tuple | str + if all(isinstance(x, tuple) for x in colnames): + new_name = tuple(map("_".join, zip(*colnames))) + else: + new_name = "_".join([str(x) for x in colnames]) + to_parse = [np.asarray(data_dict[c]) for c in colnames if c in data_dict] + + new_col = parser(*to_parse, col=new_name if target_name is None else target_name) + return new_name, new_col, colnames + + +def _get_na_values(col, na_values, na_fvalues, keep_default_na: bool): + """ + Get the NaN values for a given column. + + Parameters + ---------- + col : str + The name of the column. + na_values : array-like, dict + The object listing the NaN values as strings. + na_fvalues : array-like, dict + The object listing the NaN values as floats. + keep_default_na : bool + If `na_values` is a dict, and the column is not mapped in the + dictionary, whether to return the default NaN values or the empty set. + + Returns + ------- + nan_tuple : A length-two tuple composed of + + 1) na_values : the string NaN values for that column. + 2) na_fvalues : the float NaN values for that column. + """ + if isinstance(na_values, dict): + if col in na_values: + return na_values[col], na_fvalues[col] + else: + if keep_default_na: + return STR_NA_VALUES, set() + + return set(), set() + else: + return na_values, na_fvalues + + +def _validate_parse_dates_arg(parse_dates): + """ + Check whether or not the 'parse_dates' parameter + is a non-boolean scalar. Raises a ValueError if + that is the case. + """ + msg = ( + "Only booleans, lists, and dictionaries are accepted " + "for the 'parse_dates' parameter" + ) + + if not ( + parse_dates is None + or lib.is_bool(parse_dates) + or isinstance(parse_dates, (list, dict)) + ): + raise TypeError(msg) + + return parse_dates + + +def is_index_col(col) -> bool: + return col is not None and col is not False diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/io/parsers/c_parser_wrapper.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/io/parsers/c_parser_wrapper.py new file mode 100644 index 0000000000000000000000000000000000000000..0cd788c5e57399597e3fe4ee1b1bf2af4bffd74b --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/io/parsers/c_parser_wrapper.py @@ -0,0 +1,410 @@ +from __future__ import annotations + +from collections import defaultdict +from typing import TYPE_CHECKING +import warnings + +import numpy as np + +from pandas._libs import ( + lib, + parsers, +) +from pandas.compat._optional import import_optional_dependency +from pandas.errors import DtypeWarning +from pandas.util._exceptions import find_stack_level + +from pandas.core.dtypes.common import pandas_dtype +from pandas.core.dtypes.concat import ( + concat_compat, + union_categoricals, +) +from pandas.core.dtypes.dtypes import CategoricalDtype + +from pandas.core.indexes.api import ensure_index_from_sequences + +from pandas.io.common import ( + dedup_names, + is_potential_multi_index, +) +from pandas.io.parsers.base_parser import ( + ParserBase, + ParserError, + is_index_col, +) + +if TYPE_CHECKING: + from collections.abc import ( + Hashable, + Mapping, + Sequence, + ) + + from pandas._typing import ( + ArrayLike, + DtypeArg, + DtypeObj, + ReadCsvBuffer, + ) + + from pandas import ( + Index, + MultiIndex, + ) + + +class CParserWrapper(ParserBase): + low_memory: bool + _reader: parsers.TextReader + + def __init__(self, src: ReadCsvBuffer[str], **kwds) -> None: + super().__init__(kwds) + self.kwds = kwds + kwds = kwds.copy() + + self.low_memory = kwds.pop("low_memory", False) + + # #2442 + # error: Cannot determine type of 'index_col' + kwds["allow_leading_cols"] = ( + self.index_col is not False # type: ignore[has-type] + ) + + # GH20529, validate usecol arg before TextReader + kwds["usecols"] = self.usecols + + # Have to pass int, would break tests using TextReader directly otherwise :( + kwds["on_bad_lines"] = self.on_bad_lines.value + + for key in ( + "storage_options", + "encoding", + "memory_map", + "compression", + ): + kwds.pop(key, None) + + kwds["dtype"] = ensure_dtype_objs(kwds.get("dtype", None)) + if "dtype_backend" not in kwds or kwds["dtype_backend"] is lib.no_default: + kwds["dtype_backend"] = "numpy" + if kwds["dtype_backend"] == "pyarrow": + # Fail here loudly instead of in cython after reading + import_optional_dependency("pyarrow") + self._reader = parsers.TextReader(src, **kwds) + + self.unnamed_cols = self._reader.unnamed_cols + + # error: Cannot determine type of 'names' + passed_names = self.names is None # type: ignore[has-type] + + if self._reader.header is None: + self.names = None + else: + # error: Cannot determine type of 'names' + # error: Cannot determine type of 'index_names' + ( + self.names, # type: ignore[has-type] + self.index_names, + self.col_names, + passed_names, + ) = self._extract_multi_indexer_columns( + self._reader.header, + self.index_names, # type: ignore[has-type] + passed_names, + ) + + # error: Cannot determine type of 'names' + if self.names is None: # type: ignore[has-type] + self.names = list(range(self._reader.table_width)) + + # gh-9755 + # + # need to set orig_names here first + # so that proper indexing can be done + # with _set_noconvert_columns + # + # once names has been filtered, we will + # then set orig_names again to names + # error: Cannot determine type of 'names' + self.orig_names = self.names[:] # type: ignore[has-type] + + if self.usecols: + usecols = self._evaluate_usecols(self.usecols, self.orig_names) + + # GH 14671 + # assert for mypy, orig_names is List or None, None would error in issubset + assert self.orig_names is not None + if self.usecols_dtype == "string" and not set(usecols).issubset( + self.orig_names + ): + self._validate_usecols_names(usecols, self.orig_names) + + # error: Cannot determine type of 'names' + if len(self.names) > len(usecols): # type: ignore[has-type] + # error: Cannot determine type of 'names' + self.names = [ # type: ignore[has-type] + n + # error: Cannot determine type of 'names' + for i, n in enumerate(self.names) # type: ignore[has-type] + if (i in usecols or n in usecols) + ] + + # error: Cannot determine type of 'names' + if len(self.names) < len(usecols): # type: ignore[has-type] + # error: Cannot determine type of 'names' + self._validate_usecols_names( + usecols, + self.names, # type: ignore[has-type] + ) + + # error: Cannot determine type of 'names' + self._validate_parse_dates_presence(self.names) # type: ignore[has-type] + self._set_noconvert_columns() + + # error: Cannot determine type of 'names' + self.orig_names = self.names # type: ignore[has-type] + + if not self._has_complex_date_col: + # error: Cannot determine type of 'index_col' + if self._reader.leading_cols == 0 and is_index_col( + self.index_col # type: ignore[has-type] + ): + self._name_processed = True + ( + index_names, + # error: Cannot determine type of 'names' + self.names, # type: ignore[has-type] + self.index_col, + ) = self._clean_index_names( + # error: Cannot determine type of 'names' + self.names, # type: ignore[has-type] + # error: Cannot determine type of 'index_col' + self.index_col, # type: ignore[has-type] + ) + + if self.index_names is None: + self.index_names = index_names + + if self._reader.header is None and not passed_names: + assert self.index_names is not None + self.index_names = [None] * len(self.index_names) + + self._implicit_index = self._reader.leading_cols > 0 + + def close(self) -> None: + # close handles opened by C parser + try: + self._reader.close() + except ValueError: + pass + + def _set_noconvert_columns(self) -> None: + """ + Set the columns that should not undergo dtype conversions. + + Currently, any column that is involved with date parsing will not + undergo such conversions. + """ + assert self.orig_names is not None + # error: Cannot determine type of 'names' + + # much faster than using orig_names.index(x) xref GH#44106 + names_dict = {x: i for i, x in enumerate(self.orig_names)} + col_indices = [names_dict[x] for x in self.names] # type: ignore[has-type] + # error: Cannot determine type of 'names' + noconvert_columns = self._set_noconvert_dtype_columns( + col_indices, + self.names, # type: ignore[has-type] + ) + for col in noconvert_columns: + self._reader.set_noconvert(col) + + def read( + self, + nrows: int | None = None, + ) -> tuple[ + Index | MultiIndex | None, + Sequence[Hashable] | MultiIndex, + Mapping[Hashable, ArrayLike], + ]: + index: Index | MultiIndex | None + column_names: Sequence[Hashable] | MultiIndex + try: + if self.low_memory: + chunks = self._reader.read_low_memory(nrows) + # destructive to chunks + data = _concatenate_chunks(chunks) + + else: + data = self._reader.read(nrows) + except StopIteration: + if self._first_chunk: + self._first_chunk = False + names = dedup_names( + self.orig_names, + is_potential_multi_index(self.orig_names, self.index_col), + ) + index, columns, col_dict = self._get_empty_meta( + names, + dtype=self.dtype, + ) + columns = self._maybe_make_multi_index_columns(columns, self.col_names) + + if self.usecols is not None: + columns = self._filter_usecols(columns) + + col_dict = {k: v for k, v in col_dict.items() if k in columns} + + return index, columns, col_dict + + else: + self.close() + raise + + # Done with first read, next time raise StopIteration + self._first_chunk = False + + # error: Cannot determine type of 'names' + names = self.names # type: ignore[has-type] + + if self._reader.leading_cols: + if self._has_complex_date_col: + raise NotImplementedError("file structure not yet supported") + + # implicit index, no index names + arrays = [] + + if self.index_col and self._reader.leading_cols != len(self.index_col): + raise ParserError( + "Could not construct index. Requested to use " + f"{len(self.index_col)} number of columns, but " + f"{self._reader.leading_cols} left to parse." + ) + + for i in range(self._reader.leading_cols): + if self.index_col is None: + values = data.pop(i) + else: + values = data.pop(self.index_col[i]) + + values = self._maybe_parse_dates(values, i, try_parse_dates=True) + arrays.append(values) + + index = ensure_index_from_sequences(arrays) + + if self.usecols is not None: + names = self._filter_usecols(names) + + names = dedup_names(names, is_potential_multi_index(names, self.index_col)) + + # rename dict keys + data_tups = sorted(data.items()) + data = {k: v for k, (i, v) in zip(names, data_tups)} + + column_names, date_data = self._do_date_conversions(names, data) + + # maybe create a mi on the columns + column_names = self._maybe_make_multi_index_columns( + column_names, self.col_names + ) + + else: + # rename dict keys + data_tups = sorted(data.items()) + + # ugh, mutation + + # assert for mypy, orig_names is List or None, None would error in list(...) + assert self.orig_names is not None + names = list(self.orig_names) + names = dedup_names(names, is_potential_multi_index(names, self.index_col)) + + if self.usecols is not None: + names = self._filter_usecols(names) + + # columns as list + alldata = [x[1] for x in data_tups] + if self.usecols is None: + self._check_data_length(names, alldata) + + data = {k: v for k, (i, v) in zip(names, data_tups)} + + names, date_data = self._do_date_conversions(names, data) + index, column_names = self._make_index(date_data, alldata, names) + + return index, column_names, date_data + + def _filter_usecols(self, names: Sequence[Hashable]) -> Sequence[Hashable]: + # hackish + usecols = self._evaluate_usecols(self.usecols, names) + if usecols is not None and len(names) != len(usecols): + names = [ + name for i, name in enumerate(names) if i in usecols or name in usecols + ] + return names + + def _maybe_parse_dates(self, values, index: int, try_parse_dates: bool = True): + if try_parse_dates and self._should_parse_dates(index): + values = self._date_conv( + values, + col=self.index_names[index] if self.index_names is not None else None, + ) + return values + + +def _concatenate_chunks(chunks: list[dict[int, ArrayLike]]) -> dict: + """ + Concatenate chunks of data read with low_memory=True. + + The tricky part is handling Categoricals, where different chunks + may have different inferred categories. + """ + names = list(chunks[0].keys()) + warning_columns = [] + + result: dict = {} + for name in names: + arrs = [chunk.pop(name) for chunk in chunks] + # Check each arr for consistent types. + dtypes = {a.dtype for a in arrs} + non_cat_dtypes = {x for x in dtypes if not isinstance(x, CategoricalDtype)} + + dtype = dtypes.pop() + if isinstance(dtype, CategoricalDtype): + result[name] = union_categoricals(arrs, sort_categories=False) + else: + result[name] = concat_compat(arrs) + if len(non_cat_dtypes) > 1 and result[name].dtype == np.dtype(object): + warning_columns.append(str(name)) + + if warning_columns: + warning_names = ",".join(warning_columns) + warning_message = " ".join( + [ + f"Columns ({warning_names}) have mixed types. " + f"Specify dtype option on import or set low_memory=False." + ] + ) + warnings.warn(warning_message, DtypeWarning, stacklevel=find_stack_level()) + return result + + +def ensure_dtype_objs( + dtype: DtypeArg | dict[Hashable, DtypeArg] | None +) -> DtypeObj | dict[Hashable, DtypeObj] | None: + """ + Ensure we have either None, a dtype object, or a dictionary mapping to + dtype objects. + """ + if isinstance(dtype, defaultdict): + # "None" not callable [misc] + default_dtype = pandas_dtype(dtype.default_factory()) # type: ignore[misc] + dtype_converted: defaultdict = defaultdict(lambda: default_dtype) + for key in dtype.keys(): + dtype_converted[key] = pandas_dtype(dtype[key]) + return dtype_converted + elif isinstance(dtype, dict): + return {k: pandas_dtype(dtype[k]) for k in dtype} + elif dtype is not None: + return pandas_dtype(dtype) + return dtype diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/io/parsers/python_parser.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/io/parsers/python_parser.py new file mode 100644 index 0000000000000000000000000000000000000000..79e7554a5744cf439a65e9fd1e18782a0fa71548 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/io/parsers/python_parser.py @@ -0,0 +1,1387 @@ +from __future__ import annotations + +from collections import ( + abc, + defaultdict, +) +from collections.abc import ( + Hashable, + Iterator, + Mapping, + Sequence, +) +import csv +from io import StringIO +import re +from typing import ( + IO, + TYPE_CHECKING, + DefaultDict, + Literal, + cast, +) +import warnings + +import numpy as np + +from pandas._libs import lib +from pandas.errors import ( + EmptyDataError, + ParserError, + ParserWarning, +) +from pandas.util._decorators import cache_readonly +from pandas.util._exceptions import find_stack_level + +from pandas.core.dtypes.common import ( + is_bool_dtype, + is_integer, + is_numeric_dtype, +) +from pandas.core.dtypes.inference import is_dict_like + +from pandas.io.common import ( + dedup_names, + is_potential_multi_index, +) +from pandas.io.parsers.base_parser import ( + ParserBase, + parser_defaults, +) + +if TYPE_CHECKING: + from pandas._typing import ( + ArrayLike, + ReadCsvBuffer, + Scalar, + ) + + from pandas import ( + Index, + MultiIndex, + ) + +# BOM character (byte order mark) +# This exists at the beginning of a file to indicate endianness +# of a file (stream). Unfortunately, this marker screws up parsing, +# so we need to remove it if we see it. +_BOM = "\ufeff" + + +class PythonParser(ParserBase): + _no_thousands_columns: set[int] + + def __init__(self, f: ReadCsvBuffer[str] | list, **kwds) -> None: + """ + Workhorse function for processing nested list into DataFrame + """ + super().__init__(kwds) + + self.data: Iterator[str] | None = None + self.buf: list = [] + self.pos = 0 + self.line_pos = 0 + + self.skiprows = kwds["skiprows"] + + if callable(self.skiprows): + self.skipfunc = self.skiprows + else: + self.skipfunc = lambda x: x in self.skiprows + + self.skipfooter = _validate_skipfooter_arg(kwds["skipfooter"]) + self.delimiter = kwds["delimiter"] + + self.quotechar = kwds["quotechar"] + if isinstance(self.quotechar, str): + self.quotechar = str(self.quotechar) + + self.escapechar = kwds["escapechar"] + self.doublequote = kwds["doublequote"] + self.skipinitialspace = kwds["skipinitialspace"] + self.lineterminator = kwds["lineterminator"] + self.quoting = kwds["quoting"] + self.skip_blank_lines = kwds["skip_blank_lines"] + + self.has_index_names = False + if "has_index_names" in kwds: + self.has_index_names = kwds["has_index_names"] + + self.verbose = kwds["verbose"] + + self.thousands = kwds["thousands"] + self.decimal = kwds["decimal"] + + self.comment = kwds["comment"] + + # Set self.data to something that can read lines. + if isinstance(f, list): + # read_excel: f is a list + self.data = cast(Iterator[str], f) + else: + assert hasattr(f, "readline") + self.data = self._make_reader(f) + + # Get columns in two steps: infer from data, then + # infer column indices from self.usecols if it is specified. + self._col_indices: list[int] | None = None + columns: list[list[Scalar | None]] + ( + columns, + self.num_original_columns, + self.unnamed_cols, + ) = self._infer_columns() + + # Now self.columns has the set of columns that we will process. + # The original set is stored in self.original_columns. + # error: Cannot determine type of 'index_names' + ( + self.columns, + self.index_names, + self.col_names, + _, + ) = self._extract_multi_indexer_columns( + columns, + self.index_names, # type: ignore[has-type] + ) + + # get popped off for index + self.orig_names: list[Hashable] = list(self.columns) + + # needs to be cleaned/refactored + # multiple date column thing turning into a real spaghetti factory + + if not self._has_complex_date_col: + (index_names, self.orig_names, self.columns) = self._get_index_name() + self._name_processed = True + if self.index_names is None: + self.index_names = index_names + + if self._col_indices is None: + self._col_indices = list(range(len(self.columns))) + + self._parse_date_cols = self._validate_parse_dates_presence(self.columns) + self._no_thousands_columns = self._set_no_thousand_columns() + + if len(self.decimal) != 1: + raise ValueError("Only length-1 decimal markers supported") + + @cache_readonly + def num(self) -> re.Pattern: + decimal = re.escape(self.decimal) + if self.thousands is None: + regex = rf"^[\-\+]?[0-9]*({decimal}[0-9]*)?([0-9]?(E|e)\-?[0-9]+)?$" + else: + thousands = re.escape(self.thousands) + regex = ( + rf"^[\-\+]?([0-9]+{thousands}|[0-9])*({decimal}[0-9]*)?" + rf"([0-9]?(E|e)\-?[0-9]+)?$" + ) + return re.compile(regex) + + def _make_reader(self, f: IO[str] | ReadCsvBuffer[str]): + sep = self.delimiter + + if sep is None or len(sep) == 1: + if self.lineterminator: + raise ValueError( + "Custom line terminators not supported in python parser (yet)" + ) + + class MyDialect(csv.Dialect): + delimiter = self.delimiter + quotechar = self.quotechar + escapechar = self.escapechar + doublequote = self.doublequote + skipinitialspace = self.skipinitialspace + quoting = self.quoting + lineterminator = "\n" + + dia = MyDialect + + if sep is not None: + dia.delimiter = sep + else: + # attempt to sniff the delimiter from the first valid line, + # i.e. no comment line and not in skiprows + line = f.readline() + lines = self._check_comments([[line]])[0] + while self.skipfunc(self.pos) or not lines: + self.pos += 1 + line = f.readline() + lines = self._check_comments([[line]])[0] + lines_str = cast(list[str], lines) + + # since `line` was a string, lines will be a list containing + # only a single string + line = lines_str[0] + + self.pos += 1 + self.line_pos += 1 + sniffed = csv.Sniffer().sniff(line) + dia.delimiter = sniffed.delimiter + + # Note: encoding is irrelevant here + line_rdr = csv.reader(StringIO(line), dialect=dia) + self.buf.extend(list(line_rdr)) + + # Note: encoding is irrelevant here + reader = csv.reader(f, dialect=dia, strict=True) + + else: + + def _read(): + line = f.readline() + pat = re.compile(sep) + + yield pat.split(line.strip()) + + for line in f: + yield pat.split(line.strip()) + + reader = _read() + + return reader + + def read( + self, rows: int | None = None + ) -> tuple[ + Index | None, Sequence[Hashable] | MultiIndex, Mapping[Hashable, ArrayLike] + ]: + try: + content = self._get_lines(rows) + except StopIteration: + if self._first_chunk: + content = [] + else: + self.close() + raise + + # done with first read, next time raise StopIteration + self._first_chunk = False + + columns: Sequence[Hashable] = list(self.orig_names) + if not len(content): # pragma: no cover + # DataFrame with the right metadata, even though it's length 0 + # error: Cannot determine type of 'index_col' + names = dedup_names( + self.orig_names, + is_potential_multi_index( + self.orig_names, + self.index_col, # type: ignore[has-type] + ), + ) + index, columns, col_dict = self._get_empty_meta( + names, + self.dtype, + ) + conv_columns = self._maybe_make_multi_index_columns(columns, self.col_names) + return index, conv_columns, col_dict + + # handle new style for names in index + count_empty_content_vals = count_empty_vals(content[0]) + indexnamerow = None + if self.has_index_names and count_empty_content_vals == len(columns): + indexnamerow = content[0] + content = content[1:] + + alldata = self._rows_to_cols(content) + data, columns = self._exclude_implicit_index(alldata) + + conv_data = self._convert_data(data) + columns, conv_data = self._do_date_conversions(columns, conv_data) + + index, result_columns = self._make_index( + conv_data, alldata, columns, indexnamerow + ) + + return index, result_columns, conv_data + + def _exclude_implicit_index( + self, + alldata: list[np.ndarray], + ) -> tuple[Mapping[Hashable, np.ndarray], Sequence[Hashable]]: + # error: Cannot determine type of 'index_col' + names = dedup_names( + self.orig_names, + is_potential_multi_index( + self.orig_names, + self.index_col, # type: ignore[has-type] + ), + ) + + offset = 0 + if self._implicit_index: + # error: Cannot determine type of 'index_col' + offset = len(self.index_col) # type: ignore[has-type] + + len_alldata = len(alldata) + self._check_data_length(names, alldata) + + return { + name: alldata[i + offset] for i, name in enumerate(names) if i < len_alldata + }, names + + # legacy + def get_chunk( + self, size: int | None = None + ) -> tuple[ + Index | None, Sequence[Hashable] | MultiIndex, Mapping[Hashable, ArrayLike] + ]: + if size is None: + # error: "PythonParser" has no attribute "chunksize" + size = self.chunksize # type: ignore[attr-defined] + return self.read(rows=size) + + def _convert_data( + self, + data: Mapping[Hashable, np.ndarray], + ) -> Mapping[Hashable, ArrayLike]: + # apply converters + clean_conv = self._clean_mapping(self.converters) + clean_dtypes = self._clean_mapping(self.dtype) + + # Apply NA values. + clean_na_values = {} + clean_na_fvalues = {} + + if isinstance(self.na_values, dict): + for col in self.na_values: + na_value = self.na_values[col] + na_fvalue = self.na_fvalues[col] + + if isinstance(col, int) and col not in self.orig_names: + col = self.orig_names[col] + + clean_na_values[col] = na_value + clean_na_fvalues[col] = na_fvalue + else: + clean_na_values = self.na_values + clean_na_fvalues = self.na_fvalues + + return self._convert_to_ndarrays( + data, + clean_na_values, + clean_na_fvalues, + self.verbose, + clean_conv, + clean_dtypes, + ) + + @cache_readonly + def _have_mi_columns(self) -> bool: + if self.header is None: + return False + + header = self.header + if isinstance(header, (list, tuple, np.ndarray)): + return len(header) > 1 + else: + return False + + def _infer_columns( + self, + ) -> tuple[list[list[Scalar | None]], int, set[Scalar | None]]: + names = self.names + num_original_columns = 0 + clear_buffer = True + unnamed_cols: set[Scalar | None] = set() + + if self.header is not None: + header = self.header + have_mi_columns = self._have_mi_columns + + if isinstance(header, (list, tuple, np.ndarray)): + # we have a mi columns, so read an extra line + if have_mi_columns: + header = list(header) + [header[-1] + 1] + else: + header = [header] + + columns: list[list[Scalar | None]] = [] + for level, hr in enumerate(header): + try: + line = self._buffered_line() + + while self.line_pos <= hr: + line = self._next_line() + + except StopIteration as err: + if 0 < self.line_pos <= hr and ( + not have_mi_columns or hr != header[-1] + ): + # If no rows we want to raise a different message and if + # we have mi columns, the last line is not part of the header + joi = list(map(str, header[:-1] if have_mi_columns else header)) + msg = f"[{','.join(joi)}], len of {len(joi)}, " + raise ValueError( + f"Passed header={msg}" + f"but only {self.line_pos} lines in file" + ) from err + + # We have an empty file, so check + # if columns are provided. That will + # serve as the 'line' for parsing + if have_mi_columns and hr > 0: + if clear_buffer: + self._clear_buffer() + columns.append([None] * len(columns[-1])) + return columns, num_original_columns, unnamed_cols + + if not self.names: + raise EmptyDataError("No columns to parse from file") from err + + line = self.names[:] + + this_columns: list[Scalar | None] = [] + this_unnamed_cols = [] + + for i, c in enumerate(line): + if c == "": + if have_mi_columns: + col_name = f"Unnamed: {i}_level_{level}" + else: + col_name = f"Unnamed: {i}" + + this_unnamed_cols.append(i) + this_columns.append(col_name) + else: + this_columns.append(c) + + if not have_mi_columns: + counts: DefaultDict = defaultdict(int) + # Ensure that regular columns are used before unnamed ones + # to keep given names and mangle unnamed columns + col_loop_order = [ + i + for i in range(len(this_columns)) + if i not in this_unnamed_cols + ] + this_unnamed_cols + + # TODO: Use pandas.io.common.dedup_names instead (see #50371) + for i in col_loop_order: + col = this_columns[i] + old_col = col + cur_count = counts[col] + + if cur_count > 0: + while cur_count > 0: + counts[old_col] = cur_count + 1 + col = f"{old_col}.{cur_count}" + if col in this_columns: + cur_count += 1 + else: + cur_count = counts[col] + + if ( + self.dtype is not None + and is_dict_like(self.dtype) + and self.dtype.get(old_col) is not None + and self.dtype.get(col) is None + ): + self.dtype.update({col: self.dtype.get(old_col)}) + this_columns[i] = col + counts[col] = cur_count + 1 + elif have_mi_columns: + # if we have grabbed an extra line, but its not in our + # format so save in the buffer, and create an blank extra + # line for the rest of the parsing code + if hr == header[-1]: + lc = len(this_columns) + # error: Cannot determine type of 'index_col' + sic = self.index_col # type: ignore[has-type] + ic = len(sic) if sic is not None else 0 + unnamed_count = len(this_unnamed_cols) + + # if wrong number of blanks or no index, not our format + if (lc != unnamed_count and lc - ic > unnamed_count) or ic == 0: + clear_buffer = False + this_columns = [None] * lc + self.buf = [self.buf[-1]] + + columns.append(this_columns) + unnamed_cols.update({this_columns[i] for i in this_unnamed_cols}) + + if len(columns) == 1: + num_original_columns = len(this_columns) + + if clear_buffer: + self._clear_buffer() + + first_line: list[Scalar] | None + if names is not None: + # Read first row after header to check if data are longer + try: + first_line = self._next_line() + except StopIteration: + first_line = None + + len_first_data_row = 0 if first_line is None else len(first_line) + + if len(names) > len(columns[0]) and len(names) > len_first_data_row: + raise ValueError( + "Number of passed names did not match " + "number of header fields in the file" + ) + if len(columns) > 1: + raise TypeError("Cannot pass names with multi-index columns") + + if self.usecols is not None: + # Set _use_cols. We don't store columns because they are + # overwritten. + self._handle_usecols(columns, names, num_original_columns) + else: + num_original_columns = len(names) + if self._col_indices is not None and len(names) != len( + self._col_indices + ): + columns = [[names[i] for i in sorted(self._col_indices)]] + else: + columns = [names] + else: + columns = self._handle_usecols( + columns, columns[0], num_original_columns + ) + else: + ncols = len(self._header_line) + num_original_columns = ncols + + if not names: + columns = [list(range(ncols))] + columns = self._handle_usecols(columns, columns[0], ncols) + elif self.usecols is None or len(names) >= ncols: + columns = self._handle_usecols([names], names, ncols) + num_original_columns = len(names) + elif not callable(self.usecols) and len(names) != len(self.usecols): + raise ValueError( + "Number of passed names did not match number of " + "header fields in the file" + ) + else: + # Ignore output but set used columns. + columns = [names] + self._handle_usecols(columns, columns[0], ncols) + + return columns, num_original_columns, unnamed_cols + + @cache_readonly + def _header_line(self): + # Store line for reuse in _get_index_name + if self.header is not None: + return None + + try: + line = self._buffered_line() + except StopIteration as err: + if not self.names: + raise EmptyDataError("No columns to parse from file") from err + + line = self.names[:] + return line + + def _handle_usecols( + self, + columns: list[list[Scalar | None]], + usecols_key: list[Scalar | None], + num_original_columns: int, + ) -> list[list[Scalar | None]]: + """ + Sets self._col_indices + + usecols_key is used if there are string usecols. + """ + col_indices: set[int] | list[int] + if self.usecols is not None: + if callable(self.usecols): + col_indices = self._evaluate_usecols(self.usecols, usecols_key) + elif any(isinstance(u, str) for u in self.usecols): + if len(columns) > 1: + raise ValueError( + "If using multiple headers, usecols must be integers." + ) + col_indices = [] + + for col in self.usecols: + if isinstance(col, str): + try: + col_indices.append(usecols_key.index(col)) + except ValueError: + self._validate_usecols_names(self.usecols, usecols_key) + else: + col_indices.append(col) + else: + missing_usecols = [ + col for col in self.usecols if col >= num_original_columns + ] + if missing_usecols: + raise ParserError( + "Defining usecols with out-of-bounds indices is not allowed. " + f"{missing_usecols} are out-of-bounds.", + ) + col_indices = self.usecols + + columns = [ + [n for i, n in enumerate(column) if i in col_indices] + for column in columns + ] + self._col_indices = sorted(col_indices) + return columns + + def _buffered_line(self) -> list[Scalar]: + """ + Return a line from buffer, filling buffer if required. + """ + if len(self.buf) > 0: + return self.buf[0] + else: + return self._next_line() + + def _check_for_bom(self, first_row: list[Scalar]) -> list[Scalar]: + """ + Checks whether the file begins with the BOM character. + If it does, remove it. In addition, if there is quoting + in the field subsequent to the BOM, remove it as well + because it technically takes place at the beginning of + the name, not the middle of it. + """ + # first_row will be a list, so we need to check + # that that list is not empty before proceeding. + if not first_row: + return first_row + + # The first element of this row is the one that could have the + # BOM that we want to remove. Check that the first element is a + # string before proceeding. + if not isinstance(first_row[0], str): + return first_row + + # Check that the string is not empty, as that would + # obviously not have a BOM at the start of it. + if not first_row[0]: + return first_row + + # Since the string is non-empty, check that it does + # in fact begin with a BOM. + first_elt = first_row[0][0] + if first_elt != _BOM: + return first_row + + first_row_bom = first_row[0] + new_row: str + + if len(first_row_bom) > 1 and first_row_bom[1] == self.quotechar: + start = 2 + quote = first_row_bom[1] + end = first_row_bom[2:].index(quote) + 2 + + # Extract the data between the quotation marks + new_row = first_row_bom[start:end] + + # Extract any remaining data after the second + # quotation mark. + if len(first_row_bom) > end + 1: + new_row += first_row_bom[end + 1 :] + + else: + # No quotation so just remove BOM from first element + new_row = first_row_bom[1:] + + new_row_list: list[Scalar] = [new_row] + return new_row_list + first_row[1:] + + def _is_line_empty(self, line: list[Scalar]) -> bool: + """ + Check if a line is empty or not. + + Parameters + ---------- + line : str, array-like + The line of data to check. + + Returns + ------- + boolean : Whether or not the line is empty. + """ + return not line or all(not x for x in line) + + def _next_line(self) -> list[Scalar]: + if isinstance(self.data, list): + while self.skipfunc(self.pos): + if self.pos >= len(self.data): + break + self.pos += 1 + + while True: + try: + line = self._check_comments([self.data[self.pos]])[0] + self.pos += 1 + # either uncommented or blank to begin with + if not self.skip_blank_lines and ( + self._is_line_empty(self.data[self.pos - 1]) or line + ): + break + if self.skip_blank_lines: + ret = self._remove_empty_lines([line]) + if ret: + line = ret[0] + break + except IndexError: + raise StopIteration + else: + while self.skipfunc(self.pos): + self.pos += 1 + # assert for mypy, data is Iterator[str] or None, would error in next + assert self.data is not None + next(self.data) + + while True: + orig_line = self._next_iter_line(row_num=self.pos + 1) + self.pos += 1 + + if orig_line is not None: + line = self._check_comments([orig_line])[0] + + if self.skip_blank_lines: + ret = self._remove_empty_lines([line]) + + if ret: + line = ret[0] + break + elif self._is_line_empty(orig_line) or line: + break + + # This was the first line of the file, + # which could contain the BOM at the + # beginning of it. + if self.pos == 1: + line = self._check_for_bom(line) + + self.line_pos += 1 + self.buf.append(line) + return line + + def _alert_malformed(self, msg: str, row_num: int) -> None: + """ + Alert a user about a malformed row, depending on value of + `self.on_bad_lines` enum. + + If `self.on_bad_lines` is ERROR, the alert will be `ParserError`. + If `self.on_bad_lines` is WARN, the alert will be printed out. + + Parameters + ---------- + msg: str + The error message to display. + row_num: int + The row number where the parsing error occurred. + Because this row number is displayed, we 1-index, + even though we 0-index internally. + """ + if self.on_bad_lines == self.BadLineHandleMethod.ERROR: + raise ParserError(msg) + if self.on_bad_lines == self.BadLineHandleMethod.WARN: + warnings.warn( + f"Skipping line {row_num}: {msg}\n", + ParserWarning, + stacklevel=find_stack_level(), + ) + + def _next_iter_line(self, row_num: int) -> list[Scalar] | None: + """ + Wrapper around iterating through `self.data` (CSV source). + + When a CSV error is raised, we check for specific + error messages that allow us to customize the + error message displayed to the user. + + Parameters + ---------- + row_num: int + The row number of the line being parsed. + """ + try: + # assert for mypy, data is Iterator[str] or None, would error in next + assert self.data is not None + line = next(self.data) + # for mypy + assert isinstance(line, list) + return line + except csv.Error as e: + if self.on_bad_lines in ( + self.BadLineHandleMethod.ERROR, + self.BadLineHandleMethod.WARN, + ): + msg = str(e) + + if "NULL byte" in msg or "line contains NUL" in msg: + msg = ( + "NULL byte detected. This byte " + "cannot be processed in Python's " + "native csv library at the moment, " + "so please pass in engine='c' instead" + ) + + if self.skipfooter > 0: + reason = ( + "Error could possibly be due to " + "parsing errors in the skipped footer rows " + "(the skipfooter keyword is only applied " + "after Python's csv library has parsed " + "all rows)." + ) + msg += ". " + reason + + self._alert_malformed(msg, row_num) + return None + + def _check_comments(self, lines: list[list[Scalar]]) -> list[list[Scalar]]: + if self.comment is None: + return lines + ret = [] + for line in lines: + rl = [] + for x in line: + if ( + not isinstance(x, str) + or self.comment not in x + or x in self.na_values + ): + rl.append(x) + else: + x = x[: x.find(self.comment)] + if len(x) > 0: + rl.append(x) + break + ret.append(rl) + return ret + + def _remove_empty_lines(self, lines: list[list[Scalar]]) -> list[list[Scalar]]: + """ + Iterate through the lines and remove any that are + either empty or contain only one whitespace value + + Parameters + ---------- + lines : list of list of Scalars + The array of lines that we are to filter. + + Returns + ------- + filtered_lines : list of list of Scalars + The same array of lines with the "empty" ones removed. + """ + # Remove empty lines and lines with only one whitespace value + ret = [ + line + for line in lines + if ( + len(line) > 1 + or len(line) == 1 + and (not isinstance(line[0], str) or line[0].strip()) + ) + ] + return ret + + def _check_thousands(self, lines: list[list[Scalar]]) -> list[list[Scalar]]: + if self.thousands is None: + return lines + + return self._search_replace_num_columns( + lines=lines, search=self.thousands, replace="" + ) + + def _search_replace_num_columns( + self, lines: list[list[Scalar]], search: str, replace: str + ) -> list[list[Scalar]]: + ret = [] + for line in lines: + rl = [] + for i, x in enumerate(line): + if ( + not isinstance(x, str) + or search not in x + or i in self._no_thousands_columns + or not self.num.search(x.strip()) + ): + rl.append(x) + else: + rl.append(x.replace(search, replace)) + ret.append(rl) + return ret + + def _check_decimal(self, lines: list[list[Scalar]]) -> list[list[Scalar]]: + if self.decimal == parser_defaults["decimal"]: + return lines + + return self._search_replace_num_columns( + lines=lines, search=self.decimal, replace="." + ) + + def _clear_buffer(self) -> None: + self.buf = [] + + def _get_index_name( + self, + ) -> tuple[Sequence[Hashable] | None, list[Hashable], list[Hashable]]: + """ + Try several cases to get lines: + + 0) There are headers on row 0 and row 1 and their + total summed lengths equals the length of the next line. + Treat row 0 as columns and row 1 as indices + 1) Look for implicit index: there are more columns + on row 1 than row 0. If this is true, assume that row + 1 lists index columns and row 0 lists normal columns. + 2) Get index from the columns if it was listed. + """ + columns: Sequence[Hashable] = self.orig_names + orig_names = list(columns) + columns = list(columns) + + line: list[Scalar] | None + if self._header_line is not None: + line = self._header_line + else: + try: + line = self._next_line() + except StopIteration: + line = None + + next_line: list[Scalar] | None + try: + next_line = self._next_line() + except StopIteration: + next_line = None + + # implicitly index_col=0 b/c 1 fewer column names + implicit_first_cols = 0 + if line is not None: + # leave it 0, #2442 + # Case 1 + # error: Cannot determine type of 'index_col' + index_col = self.index_col # type: ignore[has-type] + if index_col is not False: + implicit_first_cols = len(line) - self.num_original_columns + + # Case 0 + if ( + next_line is not None + and self.header is not None + and index_col is not False + ): + if len(next_line) == len(line) + self.num_original_columns: + # column and index names on diff rows + self.index_col = list(range(len(line))) + self.buf = self.buf[1:] + + for c in reversed(line): + columns.insert(0, c) + + # Update list of original names to include all indices. + orig_names = list(columns) + self.num_original_columns = len(columns) + return line, orig_names, columns + + if implicit_first_cols > 0: + # Case 1 + self._implicit_index = True + if self.index_col is None: + self.index_col = list(range(implicit_first_cols)) + + index_name = None + + else: + # Case 2 + (index_name, _, self.index_col) = self._clean_index_names( + columns, self.index_col + ) + + return index_name, orig_names, columns + + def _rows_to_cols(self, content: list[list[Scalar]]) -> list[np.ndarray]: + col_len = self.num_original_columns + + if self._implicit_index: + col_len += len(self.index_col) + + max_len = max(len(row) for row in content) + + # Check that there are no rows with too many + # elements in their row (rows with too few + # elements are padded with NaN). + # error: Non-overlapping identity check (left operand type: "List[int]", + # right operand type: "Literal[False]") + if ( + max_len > col_len + and self.index_col is not False # type: ignore[comparison-overlap] + and self.usecols is None + ): + footers = self.skipfooter if self.skipfooter else 0 + bad_lines = [] + + iter_content = enumerate(content) + content_len = len(content) + content = [] + + for i, _content in iter_content: + actual_len = len(_content) + + if actual_len > col_len: + if callable(self.on_bad_lines): + new_l = self.on_bad_lines(_content) + if new_l is not None: + content.append(new_l) + elif self.on_bad_lines in ( + self.BadLineHandleMethod.ERROR, + self.BadLineHandleMethod.WARN, + ): + row_num = self.pos - (content_len - i + footers) + bad_lines.append((row_num, actual_len)) + + if self.on_bad_lines == self.BadLineHandleMethod.ERROR: + break + else: + content.append(_content) + + for row_num, actual_len in bad_lines: + msg = ( + f"Expected {col_len} fields in line {row_num + 1}, saw " + f"{actual_len}" + ) + if ( + self.delimiter + and len(self.delimiter) > 1 + and self.quoting != csv.QUOTE_NONE + ): + # see gh-13374 + reason = ( + "Error could possibly be due to quotes being " + "ignored when a multi-char delimiter is used." + ) + msg += ". " + reason + + self._alert_malformed(msg, row_num + 1) + + # see gh-13320 + zipped_content = list(lib.to_object_array(content, min_width=col_len).T) + + if self.usecols: + assert self._col_indices is not None + col_indices = self._col_indices + + if self._implicit_index: + zipped_content = [ + a + for i, a in enumerate(zipped_content) + if ( + i < len(self.index_col) + or i - len(self.index_col) in col_indices + ) + ] + else: + zipped_content = [ + a for i, a in enumerate(zipped_content) if i in col_indices + ] + return zipped_content + + def _get_lines(self, rows: int | None = None) -> list[list[Scalar]]: + lines = self.buf + new_rows = None + + # already fetched some number + if rows is not None: + # we already have the lines in the buffer + if len(self.buf) >= rows: + new_rows, self.buf = self.buf[:rows], self.buf[rows:] + + # need some lines + else: + rows -= len(self.buf) + + if new_rows is None: + if isinstance(self.data, list): + if self.pos > len(self.data): + raise StopIteration + if rows is None: + new_rows = self.data[self.pos :] + new_pos = len(self.data) + else: + new_rows = self.data[self.pos : self.pos + rows] + new_pos = self.pos + rows + + new_rows = self._remove_skipped_rows(new_rows) + lines.extend(new_rows) + self.pos = new_pos + + else: + new_rows = [] + try: + if rows is not None: + row_index = 0 + row_ct = 0 + offset = self.pos if self.pos is not None else 0 + while row_ct < rows: + # assert for mypy, data is Iterator[str] or None, would + # error in next + assert self.data is not None + new_row = next(self.data) + if not self.skipfunc(offset + row_index): + row_ct += 1 + row_index += 1 + new_rows.append(new_row) + + len_new_rows = len(new_rows) + new_rows = self._remove_skipped_rows(new_rows) + lines.extend(new_rows) + else: + rows = 0 + + while True: + next_row = self._next_iter_line(row_num=self.pos + rows + 1) + rows += 1 + + if next_row is not None: + new_rows.append(next_row) + len_new_rows = len(new_rows) + + except StopIteration: + len_new_rows = len(new_rows) + new_rows = self._remove_skipped_rows(new_rows) + lines.extend(new_rows) + if len(lines) == 0: + raise + self.pos += len_new_rows + + self.buf = [] + else: + lines = new_rows + + if self.skipfooter: + lines = lines[: -self.skipfooter] + + lines = self._check_comments(lines) + if self.skip_blank_lines: + lines = self._remove_empty_lines(lines) + lines = self._check_thousands(lines) + return self._check_decimal(lines) + + def _remove_skipped_rows(self, new_rows: list[list[Scalar]]) -> list[list[Scalar]]: + if self.skiprows: + return [ + row for i, row in enumerate(new_rows) if not self.skipfunc(i + self.pos) + ] + return new_rows + + def _set_no_thousand_columns(self) -> set[int]: + no_thousands_columns: set[int] = set() + if self.columns and self.parse_dates: + assert self._col_indices is not None + no_thousands_columns = self._set_noconvert_dtype_columns( + self._col_indices, self.columns + ) + if self.columns and self.dtype: + assert self._col_indices is not None + for i, col in zip(self._col_indices, self.columns): + if not isinstance(self.dtype, dict) and not is_numeric_dtype( + self.dtype + ): + no_thousands_columns.add(i) + if ( + isinstance(self.dtype, dict) + and col in self.dtype + and ( + not is_numeric_dtype(self.dtype[col]) + or is_bool_dtype(self.dtype[col]) + ) + ): + no_thousands_columns.add(i) + return no_thousands_columns + + +class FixedWidthReader(abc.Iterator): + """ + A reader of fixed-width lines. + """ + + def __init__( + self, + f: IO[str] | ReadCsvBuffer[str], + colspecs: list[tuple[int, int]] | Literal["infer"], + delimiter: str | None, + comment: str | None, + skiprows: set[int] | None = None, + infer_nrows: int = 100, + ) -> None: + self.f = f + self.buffer: Iterator | None = None + self.delimiter = "\r\n" + delimiter if delimiter else "\n\r\t " + self.comment = comment + if colspecs == "infer": + self.colspecs = self.detect_colspecs( + infer_nrows=infer_nrows, skiprows=skiprows + ) + else: + self.colspecs = colspecs + + if not isinstance(self.colspecs, (tuple, list)): + raise TypeError( + "column specifications must be a list or tuple, " + f"input was a {type(colspecs).__name__}" + ) + + for colspec in self.colspecs: + if not ( + isinstance(colspec, (tuple, list)) + and len(colspec) == 2 + and isinstance(colspec[0], (int, np.integer, type(None))) + and isinstance(colspec[1], (int, np.integer, type(None))) + ): + raise TypeError( + "Each column specification must be " + "2 element tuple or list of integers" + ) + + def get_rows(self, infer_nrows: int, skiprows: set[int] | None = None) -> list[str]: + """ + Read rows from self.f, skipping as specified. + + We distinguish buffer_rows (the first <= infer_nrows + lines) from the rows returned to detect_colspecs + because it's simpler to leave the other locations + with skiprows logic alone than to modify them to + deal with the fact we skipped some rows here as + well. + + Parameters + ---------- + infer_nrows : int + Number of rows to read from self.f, not counting + rows that are skipped. + skiprows: set, optional + Indices of rows to skip. + + Returns + ------- + detect_rows : list of str + A list containing the rows to read. + + """ + if skiprows is None: + skiprows = set() + buffer_rows = [] + detect_rows = [] + for i, row in enumerate(self.f): + if i not in skiprows: + detect_rows.append(row) + buffer_rows.append(row) + if len(detect_rows) >= infer_nrows: + break + self.buffer = iter(buffer_rows) + return detect_rows + + def detect_colspecs( + self, infer_nrows: int = 100, skiprows: set[int] | None = None + ) -> list[tuple[int, int]]: + # Regex escape the delimiters + delimiters = "".join([rf"\{x}" for x in self.delimiter]) + pattern = re.compile(f"([^{delimiters}]+)") + rows = self.get_rows(infer_nrows, skiprows) + if not rows: + raise EmptyDataError("No rows from which to infer column width") + max_len = max(map(len, rows)) + mask = np.zeros(max_len + 1, dtype=int) + if self.comment is not None: + rows = [row.partition(self.comment)[0] for row in rows] + for row in rows: + for m in pattern.finditer(row): + mask[m.start() : m.end()] = 1 + shifted = np.roll(mask, 1) + shifted[0] = 0 + edges = np.where((mask ^ shifted) == 1)[0] + edge_pairs = list(zip(edges[::2], edges[1::2])) + return edge_pairs + + def __next__(self) -> list[str]: + # Argument 1 to "next" has incompatible type "Union[IO[str], + # ReadCsvBuffer[str]]"; expected "SupportsNext[str]" + if self.buffer is not None: + try: + line = next(self.buffer) + except StopIteration: + self.buffer = None + line = next(self.f) # type: ignore[arg-type] + else: + line = next(self.f) # type: ignore[arg-type] + # Note: 'colspecs' is a sequence of half-open intervals. + return [line[from_:to].strip(self.delimiter) for (from_, to) in self.colspecs] + + +class FixedWidthFieldParser(PythonParser): + """ + Specialization that Converts fixed-width fields into DataFrames. + See PythonParser for details. + """ + + def __init__(self, f: ReadCsvBuffer[str], **kwds) -> None: + # Support iterators, convert to a list. + self.colspecs = kwds.pop("colspecs") + self.infer_nrows = kwds.pop("infer_nrows") + PythonParser.__init__(self, f, **kwds) + + def _make_reader(self, f: IO[str] | ReadCsvBuffer[str]) -> FixedWidthReader: + return FixedWidthReader( + f, + self.colspecs, + self.delimiter, + self.comment, + self.skiprows, + self.infer_nrows, + ) + + def _remove_empty_lines(self, lines: list[list[Scalar]]) -> list[list[Scalar]]: + """ + Returns the list of lines without the empty ones. With fixed-width + fields, empty lines become arrays of empty strings. + + See PythonParser._remove_empty_lines. + """ + return [ + line + for line in lines + if any(not isinstance(e, str) or e.strip() for e in line) + ] + + +def count_empty_vals(vals) -> int: + return sum(1 for v in vals if v == "" or v is None) + + +def _validate_skipfooter_arg(skipfooter: int) -> int: + """ + Validate the 'skipfooter' parameter. + + Checks whether 'skipfooter' is a non-negative integer. + Raises a ValueError if that is not the case. + + Parameters + ---------- + skipfooter : non-negative integer + The number of rows to skip at the end of the file. + + Returns + ------- + validated_skipfooter : non-negative integer + The original input if the validation succeeds. + + Raises + ------ + ValueError : 'skipfooter' was not a non-negative integer. + """ + if not is_integer(skipfooter): + raise ValueError("skipfooter must be an integer") + + if skipfooter < 0: + raise ValueError("skipfooter cannot be negative") + + # Incompatible return value type (got "Union[int, integer[Any]]", expected "int") + return skipfooter # type: ignore[return-value] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/io/parsers/readers.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/io/parsers/readers.py new file mode 100644 index 0000000000000000000000000000000000000000..e04f27b56061030d19081d87439f0461fa53cc76 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/io/parsers/readers.py @@ -0,0 +1,2383 @@ +""" +Module contains tools for processing files into DataFrames or other objects + +GH#48849 provides a convenient way of deprecating keyword arguments +""" +from __future__ import annotations + +from collections import ( + abc, + defaultdict, +) +import csv +import sys +from textwrap import fill +from typing import ( + IO, + TYPE_CHECKING, + Any, + Callable, + Literal, + NamedTuple, + TypedDict, + overload, +) +import warnings + +import numpy as np + +from pandas._config import using_copy_on_write + +from pandas._libs import lib +from pandas._libs.parsers import STR_NA_VALUES +from pandas.errors import ( + AbstractMethodError, + ParserWarning, +) +from pandas.util._decorators import Appender +from pandas.util._exceptions import find_stack_level +from pandas.util._validators import check_dtype_backend + +from pandas.core.dtypes.common import ( + is_file_like, + is_float, + is_hashable, + is_integer, + is_list_like, + pandas_dtype, +) + +from pandas import Series +from pandas.core.frame import DataFrame +from pandas.core.indexes.api import RangeIndex +from pandas.core.shared_docs import _shared_docs + +from pandas.io.common import ( + IOHandles, + get_handle, + stringify_path, + validate_header_arg, +) +from pandas.io.parsers.arrow_parser_wrapper import ArrowParserWrapper +from pandas.io.parsers.base_parser import ( + ParserBase, + is_index_col, + parser_defaults, +) +from pandas.io.parsers.c_parser_wrapper import CParserWrapper +from pandas.io.parsers.python_parser import ( + FixedWidthFieldParser, + PythonParser, +) + +if TYPE_CHECKING: + from collections.abc import ( + Hashable, + Iterable, + Mapping, + Sequence, + ) + from types import TracebackType + + from pandas._typing import ( + CompressionOptions, + CSVEngine, + DtypeArg, + DtypeBackend, + FilePath, + IndexLabel, + ReadCsvBuffer, + Self, + StorageOptions, + UsecolsArgType, + ) +_doc_read_csv_and_table = ( + r""" +{summary} + +Also supports optionally iterating or breaking of the file +into chunks. + +Additional help can be found in the online docs for +`IO Tools `_. + +Parameters +---------- +filepath_or_buffer : str, path object or file-like object + Any valid string path is acceptable. The string could be a URL. Valid + URL schemes include http, ftp, s3, gs, and file. For file URLs, a host is + expected. A local file could be: file://localhost/path/to/table.csv. + + If you want to pass in a path object, pandas accepts any ``os.PathLike``. + + By file-like object, we refer to objects with a ``read()`` method, such as + a file handle (e.g. via builtin ``open`` function) or ``StringIO``. +sep : str, default {_default_sep} + Character or regex pattern to treat as the delimiter. If ``sep=None``, the + C engine cannot automatically detect + the separator, but the Python parsing engine can, meaning the latter will + be used and automatically detect the separator from only the first valid + row of the file by Python's builtin sniffer tool, ``csv.Sniffer``. + In addition, separators longer than 1 character and different from + ``'\s+'`` will be interpreted as regular expressions and will also force + the use of the Python parsing engine. Note that regex delimiters are prone + to ignoring quoted data. Regex example: ``'\r\t'``. +delimiter : str, optional + Alias for ``sep``. +header : int, Sequence of int, 'infer' or None, default 'infer' + Row number(s) containing column labels and marking the start of the + data (zero-indexed). Default behavior is to infer the column names: if no ``names`` + are passed the behavior is identical to ``header=0`` and column + names are inferred from the first line of the file, if column + names are passed explicitly to ``names`` then the behavior is identical to + ``header=None``. Explicitly pass ``header=0`` to be able to + replace existing names. The header can be a list of integers that + specify row locations for a :class:`~pandas.MultiIndex` on the columns + e.g. ``[0, 1, 3]``. Intervening rows that are not specified will be + skipped (e.g. 2 in this example is skipped). Note that this + parameter ignores commented lines and empty lines if + ``skip_blank_lines=True``, so ``header=0`` denotes the first line of + data rather than the first line of the file. +names : Sequence of Hashable, optional + Sequence of column labels to apply. If the file contains a header row, + then you should explicitly pass ``header=0`` to override the column names. + Duplicates in this list are not allowed. +index_col : Hashable, Sequence of Hashable or False, optional + Column(s) to use as row label(s), denoted either by column labels or column + indices. If a sequence of labels or indices is given, :class:`~pandas.MultiIndex` + will be formed for the row labels. + + Note: ``index_col=False`` can be used to force pandas to *not* use the first + column as the index, e.g., when you have a malformed file with delimiters at + the end of each line. +usecols : Sequence of Hashable or Callable, optional + Subset of columns to select, denoted either by column labels or column indices. + If list-like, all elements must either + be positional (i.e. integer indices into the document columns) or strings + that correspond to column names provided either by the user in ``names`` or + inferred from the document header row(s). If ``names`` are given, the document + header row(s) are not taken into account. For example, a valid list-like + ``usecols`` parameter would be ``[0, 1, 2]`` or ``['foo', 'bar', 'baz']``. + Element order is ignored, so ``usecols=[0, 1]`` is the same as ``[1, 0]``. + To instantiate a :class:`~pandas.DataFrame` from ``data`` with element order + preserved use ``pd.read_csv(data, usecols=['foo', 'bar'])[['foo', 'bar']]`` + for columns in ``['foo', 'bar']`` order or + ``pd.read_csv(data, usecols=['foo', 'bar'])[['bar', 'foo']]`` + for ``['bar', 'foo']`` order. + + If callable, the callable function will be evaluated against the column + names, returning names where the callable function evaluates to ``True``. An + example of a valid callable argument would be ``lambda x: x.upper() in + ['AAA', 'BBB', 'DDD']``. Using this parameter results in much faster + parsing time and lower memory usage. +dtype : dtype or dict of {{Hashable : dtype}}, optional + Data type(s) to apply to either the whole dataset or individual columns. + E.g., ``{{'a': np.float64, 'b': np.int32, 'c': 'Int64'}}`` + Use ``str`` or ``object`` together with suitable ``na_values`` settings + to preserve and not interpret ``dtype``. + If ``converters`` are specified, they will be applied INSTEAD + of ``dtype`` conversion. + + .. versionadded:: 1.5.0 + + Support for ``defaultdict`` was added. Specify a ``defaultdict`` as input where + the default determines the ``dtype`` of the columns which are not explicitly + listed. +engine : {{'c', 'python', 'pyarrow'}}, optional + Parser engine to use. The C and pyarrow engines are faster, while the python engine + is currently more feature-complete. Multithreading is currently only supported by + the pyarrow engine. + + .. versionadded:: 1.4.0 + + The 'pyarrow' engine was added as an *experimental* engine, and some features + are unsupported, or may not work correctly, with this engine. +converters : dict of {{Hashable : Callable}}, optional + Functions for converting values in specified columns. Keys can either + be column labels or column indices. +true_values : list, optional + Values to consider as ``True`` in addition to case-insensitive variants of 'True'. +false_values : list, optional + Values to consider as ``False`` in addition to case-insensitive variants of 'False'. +skipinitialspace : bool, default False + Skip spaces after delimiter. +skiprows : int, list of int or Callable, optional + Line numbers to skip (0-indexed) or number of lines to skip (``int``) + at the start of the file. + + If callable, the callable function will be evaluated against the row + indices, returning ``True`` if the row should be skipped and ``False`` otherwise. + An example of a valid callable argument would be ``lambda x: x in [0, 2]``. +skipfooter : int, default 0 + Number of lines at bottom of file to skip (Unsupported with ``engine='c'``). +nrows : int, optional + Number of rows of file to read. Useful for reading pieces of large files. +na_values : Hashable, Iterable of Hashable or dict of {{Hashable : Iterable}}, optional + Additional strings to recognize as ``NA``/``NaN``. If ``dict`` passed, specific + per-column ``NA`` values. By default the following values are interpreted as + ``NaN``: " """ + + fill('", "'.join(sorted(STR_NA_VALUES)), 70, subsequent_indent=" ") + + """ ". + +keep_default_na : bool, default True + Whether or not to include the default ``NaN`` values when parsing the data. + Depending on whether ``na_values`` is passed in, the behavior is as follows: + + * If ``keep_default_na`` is ``True``, and ``na_values`` are specified, ``na_values`` + is appended to the default ``NaN`` values used for parsing. + * If ``keep_default_na`` is ``True``, and ``na_values`` are not specified, only + the default ``NaN`` values are used for parsing. + * If ``keep_default_na`` is ``False``, and ``na_values`` are specified, only + the ``NaN`` values specified ``na_values`` are used for parsing. + * If ``keep_default_na`` is ``False``, and ``na_values`` are not specified, no + strings will be parsed as ``NaN``. + + Note that if ``na_filter`` is passed in as ``False``, the ``keep_default_na`` and + ``na_values`` parameters will be ignored. +na_filter : bool, default True + Detect missing value markers (empty strings and the value of ``na_values``). In + data without any ``NA`` values, passing ``na_filter=False`` can improve the + performance of reading a large file. +verbose : bool, default False + Indicate number of ``NA`` values placed in non-numeric columns. + + .. deprecated:: 2.2.0 +skip_blank_lines : bool, default True + If ``True``, skip over blank lines rather than interpreting as ``NaN`` values. +parse_dates : bool, list of Hashable, list of lists or dict of {{Hashable : list}}, \ +default False + The behavior is as follows: + + * ``bool``. If ``True`` -> try parsing the index. Note: Automatically set to + ``True`` if ``date_format`` or ``date_parser`` arguments have been passed. + * ``list`` of ``int`` or names. e.g. If ``[1, 2, 3]`` -> try parsing columns 1, 2, 3 + each as a separate date column. + * ``list`` of ``list``. e.g. If ``[[1, 3]]`` -> combine columns 1 and 3 and parse + as a single date column. Values are joined with a space before parsing. + * ``dict``, e.g. ``{{'foo' : [1, 3]}}`` -> parse columns 1, 3 as date and call + result 'foo'. Values are joined with a space before parsing. + + If a column or index cannot be represented as an array of ``datetime``, + say because of an unparsable value or a mixture of timezones, the column + or index will be returned unaltered as an ``object`` data type. For + non-standard ``datetime`` parsing, use :func:`~pandas.to_datetime` after + :func:`~pandas.read_csv`. + + Note: A fast-path exists for iso8601-formatted dates. +infer_datetime_format : bool, default False + If ``True`` and ``parse_dates`` is enabled, pandas will attempt to infer the + format of the ``datetime`` strings in the columns, and if it can be inferred, + switch to a faster method of parsing them. In some cases this can increase + the parsing speed by 5-10x. + + .. deprecated:: 2.0.0 + A strict version of this argument is now the default, passing it has no effect. + +keep_date_col : bool, default False + If ``True`` and ``parse_dates`` specifies combining multiple columns then + keep the original columns. +date_parser : Callable, optional + Function to use for converting a sequence of string columns to an array of + ``datetime`` instances. The default uses ``dateutil.parser.parser`` to do the + conversion. pandas will try to call ``date_parser`` in three different ways, + advancing to the next if an exception occurs: 1) Pass one or more arrays + (as defined by ``parse_dates``) as arguments; 2) concatenate (row-wise) the + string values from the columns defined by ``parse_dates`` into a single array + and pass that; and 3) call ``date_parser`` once for each row using one or + more strings (corresponding to the columns defined by ``parse_dates``) as + arguments. + + .. deprecated:: 2.0.0 + Use ``date_format`` instead, or read in as ``object`` and then apply + :func:`~pandas.to_datetime` as-needed. +date_format : str or dict of column -> format, optional + Format to use for parsing dates when used in conjunction with ``parse_dates``. + The strftime to parse time, e.g. :const:`"%d/%m/%Y"`. See + `strftime documentation + `_ for more information on choices, though + note that :const:`"%f"` will parse all the way up to nanoseconds. + You can also pass: + + - "ISO8601", to parse any `ISO8601 `_ + time string (not necessarily in exactly the same format); + - "mixed", to infer the format for each element individually. This is risky, + and you should probably use it along with `dayfirst`. + + .. versionadded:: 2.0.0 +dayfirst : bool, default False + DD/MM format dates, international and European format. +cache_dates : bool, default True + If ``True``, use a cache of unique, converted dates to apply the ``datetime`` + conversion. May produce significant speed-up when parsing duplicate + date strings, especially ones with timezone offsets. + +iterator : bool, default False + Return ``TextFileReader`` object for iteration or getting chunks with + ``get_chunk()``. +chunksize : int, optional + Number of lines to read from the file per chunk. Passing a value will cause the + function to return a ``TextFileReader`` object for iteration. + See the `IO Tools docs + `_ + for more information on ``iterator`` and ``chunksize``. + +{decompression_options} + + .. versionchanged:: 1.4.0 Zstandard support. + +thousands : str (length 1), optional + Character acting as the thousands separator in numerical values. +decimal : str (length 1), default '.' + Character to recognize as decimal point (e.g., use ',' for European data). +lineterminator : str (length 1), optional + Character used to denote a line break. Only valid with C parser. +quotechar : str (length 1), optional + Character used to denote the start and end of a quoted item. Quoted + items can include the ``delimiter`` and it will be ignored. +quoting : {{0 or csv.QUOTE_MINIMAL, 1 or csv.QUOTE_ALL, 2 or csv.QUOTE_NONNUMERIC, \ +3 or csv.QUOTE_NONE}}, default csv.QUOTE_MINIMAL + Control field quoting behavior per ``csv.QUOTE_*`` constants. Default is + ``csv.QUOTE_MINIMAL`` (i.e., 0) which implies that only fields containing special + characters are quoted (e.g., characters defined in ``quotechar``, ``delimiter``, + or ``lineterminator``. +doublequote : bool, default True + When ``quotechar`` is specified and ``quoting`` is not ``QUOTE_NONE``, indicate + whether or not to interpret two consecutive ``quotechar`` elements INSIDE a + field as a single ``quotechar`` element. +escapechar : str (length 1), optional + Character used to escape other characters. +comment : str (length 1), optional + Character indicating that the remainder of line should not be parsed. + If found at the beginning + of a line, the line will be ignored altogether. This parameter must be a + single character. Like empty lines (as long as ``skip_blank_lines=True``), + fully commented lines are ignored by the parameter ``header`` but not by + ``skiprows``. For example, if ``comment='#'``, parsing + ``#empty\\na,b,c\\n1,2,3`` with ``header=0`` will result in ``'a,b,c'`` being + treated as the header. +encoding : str, optional, default 'utf-8' + Encoding to use for UTF when reading/writing (ex. ``'utf-8'``). `List of Python + standard encodings + `_ . + +encoding_errors : str, optional, default 'strict' + How encoding errors are treated. `List of possible values + `_ . + + .. versionadded:: 1.3.0 + +dialect : str or csv.Dialect, optional + If provided, this parameter will override values (default or not) for the + following parameters: ``delimiter``, ``doublequote``, ``escapechar``, + ``skipinitialspace``, ``quotechar``, and ``quoting``. If it is necessary to + override values, a ``ParserWarning`` will be issued. See ``csv.Dialect`` + documentation for more details. +on_bad_lines : {{'error', 'warn', 'skip'}} or Callable, default 'error' + Specifies what to do upon encountering a bad line (a line with too many fields). + Allowed values are : + + - ``'error'``, raise an Exception when a bad line is encountered. + - ``'warn'``, raise a warning when a bad line is encountered and skip that line. + - ``'skip'``, skip bad lines without raising or warning when they are encountered. + + .. versionadded:: 1.3.0 + + .. versionadded:: 1.4.0 + + - Callable, function with signature + ``(bad_line: list[str]) -> list[str] | None`` that will process a single + bad line. ``bad_line`` is a list of strings split by the ``sep``. + If the function returns ``None``, the bad line will be ignored. + If the function returns a new ``list`` of strings with more elements than + expected, a ``ParserWarning`` will be emitted while dropping extra elements. + Only supported when ``engine='python'`` + + .. versionchanged:: 2.2.0 + + - Callable, function with signature + as described in `pyarrow documentation + `_ when ``engine='pyarrow'`` + +delim_whitespace : bool, default False + Specifies whether or not whitespace (e.g. ``' '`` or ``'\\t'``) will be + used as the ``sep`` delimiter. Equivalent to setting ``sep='\\s+'``. If this option + is set to ``True``, nothing should be passed in for the ``delimiter`` + parameter. + + .. deprecated:: 2.2.0 + Use ``sep="\\s+"`` instead. +low_memory : bool, default True + Internally process the file in chunks, resulting in lower memory use + while parsing, but possibly mixed type inference. To ensure no mixed + types either set ``False``, or specify the type with the ``dtype`` parameter. + Note that the entire file is read into a single :class:`~pandas.DataFrame` + regardless, use the ``chunksize`` or ``iterator`` parameter to return the data in + chunks. (Only valid with C parser). +memory_map : bool, default False + If a filepath is provided for ``filepath_or_buffer``, map the file object + directly onto memory and access the data directly from there. Using this + option can improve performance because there is no longer any I/O overhead. +float_precision : {{'high', 'legacy', 'round_trip'}}, optional + Specifies which converter the C engine should use for floating-point + values. The options are ``None`` or ``'high'`` for the ordinary converter, + ``'legacy'`` for the original lower precision pandas converter, and + ``'round_trip'`` for the round-trip converter. + +{storage_options} + +dtype_backend : {{'numpy_nullable', 'pyarrow'}}, default 'numpy_nullable' + Back-end data type applied to the resultant :class:`DataFrame` + (still experimental). Behaviour is as follows: + + * ``"numpy_nullable"``: returns nullable-dtype-backed :class:`DataFrame` + (default). + * ``"pyarrow"``: returns pyarrow-backed nullable :class:`ArrowDtype` + DataFrame. + + .. versionadded:: 2.0 + +Returns +------- +DataFrame or TextFileReader + A comma-separated values (csv) file is returned as two-dimensional + data structure with labeled axes. + +See Also +-------- +DataFrame.to_csv : Write DataFrame to a comma-separated values (csv) file. +{see_also_func_name} : {see_also_func_summary} +read_fwf : Read a table of fixed-width formatted lines into DataFrame. + +Examples +-------- +>>> pd.{func_name}('data.csv') # doctest: +SKIP +""" +) + + +class _C_Parser_Defaults(TypedDict): + delim_whitespace: Literal[False] + na_filter: Literal[True] + low_memory: Literal[True] + memory_map: Literal[False] + float_precision: None + + +_c_parser_defaults: _C_Parser_Defaults = { + "delim_whitespace": False, + "na_filter": True, + "low_memory": True, + "memory_map": False, + "float_precision": None, +} + + +class _Fwf_Defaults(TypedDict): + colspecs: Literal["infer"] + infer_nrows: Literal[100] + widths: None + + +_fwf_defaults: _Fwf_Defaults = {"colspecs": "infer", "infer_nrows": 100, "widths": None} +_c_unsupported = {"skipfooter"} +_python_unsupported = {"low_memory", "float_precision"} +_pyarrow_unsupported = { + "skipfooter", + "float_precision", + "chunksize", + "comment", + "nrows", + "thousands", + "memory_map", + "dialect", + "delim_whitespace", + "quoting", + "lineterminator", + "converters", + "iterator", + "dayfirst", + "verbose", + "skipinitialspace", + "low_memory", +} + + +class _DeprecationConfig(NamedTuple): + default_value: Any + msg: str | None + + +@overload +def validate_integer(name: str, val: None, min_val: int = ...) -> None: + ... + + +@overload +def validate_integer(name: str, val: float, min_val: int = ...) -> int: + ... + + +@overload +def validate_integer(name: str, val: int | None, min_val: int = ...) -> int | None: + ... + + +def validate_integer( + name: str, val: int | float | None, min_val: int = 0 +) -> int | None: + """ + Checks whether the 'name' parameter for parsing is either + an integer OR float that can SAFELY be cast to an integer + without losing accuracy. Raises a ValueError if that is + not the case. + + Parameters + ---------- + name : str + Parameter name (used for error reporting) + val : int or float + The value to check + min_val : int + Minimum allowed value (val < min_val will result in a ValueError) + """ + if val is None: + return val + + msg = f"'{name:s}' must be an integer >={min_val:d}" + if is_float(val): + if int(val) != val: + raise ValueError(msg) + val = int(val) + elif not (is_integer(val) and val >= min_val): + raise ValueError(msg) + + return int(val) + + +def _validate_names(names: Sequence[Hashable] | None) -> None: + """ + Raise ValueError if the `names` parameter contains duplicates or has an + invalid data type. + + Parameters + ---------- + names : array-like or None + An array containing a list of the names used for the output DataFrame. + + Raises + ------ + ValueError + If names are not unique or are not ordered (e.g. set). + """ + if names is not None: + if len(names) != len(set(names)): + raise ValueError("Duplicate names are not allowed.") + if not ( + is_list_like(names, allow_sets=False) or isinstance(names, abc.KeysView) + ): + raise ValueError("Names should be an ordered collection.") + + +def _read( + filepath_or_buffer: FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str], kwds +) -> DataFrame | TextFileReader: + """Generic reader of line files.""" + # if we pass a date_parser and parse_dates=False, we should not parse the + # dates GH#44366 + if kwds.get("parse_dates", None) is None: + if ( + kwds.get("date_parser", lib.no_default) is lib.no_default + and kwds.get("date_format", None) is None + ): + kwds["parse_dates"] = False + else: + kwds["parse_dates"] = True + + # Extract some of the arguments (pass chunksize on). + iterator = kwds.get("iterator", False) + chunksize = kwds.get("chunksize", None) + if kwds.get("engine") == "pyarrow": + if iterator: + raise ValueError( + "The 'iterator' option is not supported with the 'pyarrow' engine" + ) + + if chunksize is not None: + raise ValueError( + "The 'chunksize' option is not supported with the 'pyarrow' engine" + ) + else: + chunksize = validate_integer("chunksize", chunksize, 1) + + nrows = kwds.get("nrows", None) + + # Check for duplicates in names. + _validate_names(kwds.get("names", None)) + + # Create the parser. + parser = TextFileReader(filepath_or_buffer, **kwds) + + if chunksize or iterator: + return parser + + with parser: + return parser.read(nrows) + + +# iterator=True -> TextFileReader +@overload +def read_csv( + filepath_or_buffer: FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str], + *, + sep: str | None | lib.NoDefault = ..., + delimiter: str | None | lib.NoDefault = ..., + header: int | Sequence[int] | None | Literal["infer"] = ..., + names: Sequence[Hashable] | None | lib.NoDefault = ..., + index_col: IndexLabel | Literal[False] | None = ..., + usecols: UsecolsArgType = ..., + dtype: DtypeArg | None = ..., + engine: CSVEngine | None = ..., + converters: Mapping[Hashable, Callable] | None = ..., + true_values: list | None = ..., + false_values: list | None = ..., + skipinitialspace: bool = ..., + skiprows: list[int] | int | Callable[[Hashable], bool] | None = ..., + skipfooter: int = ..., + nrows: int | None = ..., + na_values: Hashable + | Iterable[Hashable] + | Mapping[Hashable, Iterable[Hashable]] + | None = ..., + na_filter: bool = ..., + verbose: bool | lib.NoDefault = ..., + skip_blank_lines: bool = ..., + parse_dates: bool | Sequence[Hashable] | None = ..., + infer_datetime_format: bool | lib.NoDefault = ..., + keep_date_col: bool | lib.NoDefault = ..., + date_parser: Callable | lib.NoDefault = ..., + date_format: str | dict[Hashable, str] | None = ..., + dayfirst: bool = ..., + cache_dates: bool = ..., + iterator: Literal[True], + chunksize: int | None = ..., + compression: CompressionOptions = ..., + thousands: str | None = ..., + decimal: str = ..., + lineterminator: str | None = ..., + quotechar: str = ..., + quoting: int = ..., + doublequote: bool = ..., + escapechar: str | None = ..., + comment: str | None = ..., + encoding: str | None = ..., + encoding_errors: str | None = ..., + dialect: str | csv.Dialect | None = ..., + on_bad_lines=..., + delim_whitespace: bool | lib.NoDefault = ..., + low_memory: bool = ..., + memory_map: bool = ..., + float_precision: Literal["high", "legacy"] | None = ..., + storage_options: StorageOptions = ..., + dtype_backend: DtypeBackend | lib.NoDefault = ..., +) -> TextFileReader: + ... + + +# chunksize=int -> TextFileReader +@overload +def read_csv( + filepath_or_buffer: FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str], + *, + sep: str | None | lib.NoDefault = ..., + delimiter: str | None | lib.NoDefault = ..., + header: int | Sequence[int] | None | Literal["infer"] = ..., + names: Sequence[Hashable] | None | lib.NoDefault = ..., + index_col: IndexLabel | Literal[False] | None = ..., + usecols: UsecolsArgType = ..., + dtype: DtypeArg | None = ..., + engine: CSVEngine | None = ..., + converters: Mapping[Hashable, Callable] | None = ..., + true_values: list | None = ..., + false_values: list | None = ..., + skipinitialspace: bool = ..., + skiprows: list[int] | int | Callable[[Hashable], bool] | None = ..., + skipfooter: int = ..., + nrows: int | None = ..., + na_values: Hashable + | Iterable[Hashable] + | Mapping[Hashable, Iterable[Hashable]] + | None = ..., + keep_default_na: bool = ..., + na_filter: bool = ..., + verbose: bool | lib.NoDefault = ..., + skip_blank_lines: bool = ..., + parse_dates: bool | Sequence[Hashable] | None = ..., + infer_datetime_format: bool | lib.NoDefault = ..., + keep_date_col: bool | lib.NoDefault = ..., + date_parser: Callable | lib.NoDefault = ..., + date_format: str | dict[Hashable, str] | None = ..., + dayfirst: bool = ..., + cache_dates: bool = ..., + iterator: bool = ..., + chunksize: int, + compression: CompressionOptions = ..., + thousands: str | None = ..., + decimal: str = ..., + lineterminator: str | None = ..., + quotechar: str = ..., + quoting: int = ..., + doublequote: bool = ..., + escapechar: str | None = ..., + comment: str | None = ..., + encoding: str | None = ..., + encoding_errors: str | None = ..., + dialect: str | csv.Dialect | None = ..., + on_bad_lines=..., + delim_whitespace: bool | lib.NoDefault = ..., + low_memory: bool = ..., + memory_map: bool = ..., + float_precision: Literal["high", "legacy"] | None = ..., + storage_options: StorageOptions = ..., + dtype_backend: DtypeBackend | lib.NoDefault = ..., +) -> TextFileReader: + ... + + +# default case -> DataFrame +@overload +def read_csv( + filepath_or_buffer: FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str], + *, + sep: str | None | lib.NoDefault = ..., + delimiter: str | None | lib.NoDefault = ..., + header: int | Sequence[int] | None | Literal["infer"] = ..., + names: Sequence[Hashable] | None | lib.NoDefault = ..., + index_col: IndexLabel | Literal[False] | None = ..., + usecols: UsecolsArgType = ..., + dtype: DtypeArg | None = ..., + engine: CSVEngine | None = ..., + converters: Mapping[Hashable, Callable] | None = ..., + true_values: list | None = ..., + false_values: list | None = ..., + skipinitialspace: bool = ..., + skiprows: list[int] | int | Callable[[Hashable], bool] | None = ..., + skipfooter: int = ..., + nrows: int | None = ..., + na_values: Hashable + | Iterable[Hashable] + | Mapping[Hashable, Iterable[Hashable]] + | None = ..., + keep_default_na: bool = ..., + na_filter: bool = ..., + verbose: bool | lib.NoDefault = ..., + skip_blank_lines: bool = ..., + parse_dates: bool | Sequence[Hashable] | None = ..., + infer_datetime_format: bool | lib.NoDefault = ..., + keep_date_col: bool | lib.NoDefault = ..., + date_parser: Callable | lib.NoDefault = ..., + date_format: str | dict[Hashable, str] | None = ..., + dayfirst: bool = ..., + cache_dates: bool = ..., + iterator: Literal[False] = ..., + chunksize: None = ..., + compression: CompressionOptions = ..., + thousands: str | None = ..., + decimal: str = ..., + lineterminator: str | None = ..., + quotechar: str = ..., + quoting: int = ..., + doublequote: bool = ..., + escapechar: str | None = ..., + comment: str | None = ..., + encoding: str | None = ..., + encoding_errors: str | None = ..., + dialect: str | csv.Dialect | None = ..., + on_bad_lines=..., + delim_whitespace: bool | lib.NoDefault = ..., + low_memory: bool = ..., + memory_map: bool = ..., + float_precision: Literal["high", "legacy"] | None = ..., + storage_options: StorageOptions = ..., + dtype_backend: DtypeBackend | lib.NoDefault = ..., +) -> DataFrame: + ... + + +# Unions -> DataFrame | TextFileReader +@overload +def read_csv( + filepath_or_buffer: FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str], + *, + sep: str | None | lib.NoDefault = ..., + delimiter: str | None | lib.NoDefault = ..., + header: int | Sequence[int] | None | Literal["infer"] = ..., + names: Sequence[Hashable] | None | lib.NoDefault = ..., + index_col: IndexLabel | Literal[False] | None = ..., + usecols: UsecolsArgType = ..., + dtype: DtypeArg | None = ..., + engine: CSVEngine | None = ..., + converters: Mapping[Hashable, Callable] | None = ..., + true_values: list | None = ..., + false_values: list | None = ..., + skipinitialspace: bool = ..., + skiprows: list[int] | int | Callable[[Hashable], bool] | None = ..., + skipfooter: int = ..., + nrows: int | None = ..., + na_values: Hashable + | Iterable[Hashable] + | Mapping[Hashable, Iterable[Hashable]] + | None = ..., + keep_default_na: bool = ..., + na_filter: bool = ..., + verbose: bool | lib.NoDefault = ..., + skip_blank_lines: bool = ..., + parse_dates: bool | Sequence[Hashable] | None = ..., + infer_datetime_format: bool | lib.NoDefault = ..., + keep_date_col: bool | lib.NoDefault = ..., + date_parser: Callable | lib.NoDefault = ..., + date_format: str | dict[Hashable, str] | None = ..., + dayfirst: bool = ..., + cache_dates: bool = ..., + iterator: bool = ..., + chunksize: int | None = ..., + compression: CompressionOptions = ..., + thousands: str | None = ..., + decimal: str = ..., + lineterminator: str | None = ..., + quotechar: str = ..., + quoting: int = ..., + doublequote: bool = ..., + escapechar: str | None = ..., + comment: str | None = ..., + encoding: str | None = ..., + encoding_errors: str | None = ..., + dialect: str | csv.Dialect | None = ..., + on_bad_lines=..., + delim_whitespace: bool | lib.NoDefault = ..., + low_memory: bool = ..., + memory_map: bool = ..., + float_precision: Literal["high", "legacy"] | None = ..., + storage_options: StorageOptions = ..., + dtype_backend: DtypeBackend | lib.NoDefault = ..., +) -> DataFrame | TextFileReader: + ... + + +@Appender( + _doc_read_csv_and_table.format( + func_name="read_csv", + summary="Read a comma-separated values (csv) file into DataFrame.", + see_also_func_name="read_table", + see_also_func_summary="Read general delimited file into DataFrame.", + _default_sep="','", + storage_options=_shared_docs["storage_options"], + decompression_options=_shared_docs["decompression_options"] + % "filepath_or_buffer", + ) +) +def read_csv( + filepath_or_buffer: FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str], + *, + sep: str | None | lib.NoDefault = lib.no_default, + delimiter: str | None | lib.NoDefault = None, + # Column and Index Locations and Names + header: int | Sequence[int] | None | Literal["infer"] = "infer", + names: Sequence[Hashable] | None | lib.NoDefault = lib.no_default, + index_col: IndexLabel | Literal[False] | None = None, + usecols: UsecolsArgType = None, + # General Parsing Configuration + dtype: DtypeArg | None = None, + engine: CSVEngine | None = None, + converters: Mapping[Hashable, Callable] | None = None, + true_values: list | None = None, + false_values: list | None = None, + skipinitialspace: bool = False, + skiprows: list[int] | int | Callable[[Hashable], bool] | None = None, + skipfooter: int = 0, + nrows: int | None = None, + # NA and Missing Data Handling + na_values: Hashable + | Iterable[Hashable] + | Mapping[Hashable, Iterable[Hashable]] + | None = None, + keep_default_na: bool = True, + na_filter: bool = True, + verbose: bool | lib.NoDefault = lib.no_default, + skip_blank_lines: bool = True, + # Datetime Handling + parse_dates: bool | Sequence[Hashable] | None = None, + infer_datetime_format: bool | lib.NoDefault = lib.no_default, + keep_date_col: bool | lib.NoDefault = lib.no_default, + date_parser: Callable | lib.NoDefault = lib.no_default, + date_format: str | dict[Hashable, str] | None = None, + dayfirst: bool = False, + cache_dates: bool = True, + # Iteration + iterator: bool = False, + chunksize: int | None = None, + # Quoting, Compression, and File Format + compression: CompressionOptions = "infer", + thousands: str | None = None, + decimal: str = ".", + lineterminator: str | None = None, + quotechar: str = '"', + quoting: int = csv.QUOTE_MINIMAL, + doublequote: bool = True, + escapechar: str | None = None, + comment: str | None = None, + encoding: str | None = None, + encoding_errors: str | None = "strict", + dialect: str | csv.Dialect | None = None, + # Error Handling + on_bad_lines: str = "error", + # Internal + delim_whitespace: bool | lib.NoDefault = lib.no_default, + low_memory: bool = _c_parser_defaults["low_memory"], + memory_map: bool = False, + float_precision: Literal["high", "legacy"] | None = None, + storage_options: StorageOptions | None = None, + dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default, +) -> DataFrame | TextFileReader: + if keep_date_col is not lib.no_default: + # GH#55569 + warnings.warn( + "The 'keep_date_col' keyword in pd.read_csv is deprecated and " + "will be removed in a future version. Explicitly remove unwanted " + "columns after parsing instead.", + FutureWarning, + stacklevel=find_stack_level(), + ) + else: + keep_date_col = False + + if lib.is_list_like(parse_dates): + # GH#55569 + depr = False + # error: Item "bool" of "bool | Sequence[Hashable] | None" has no + # attribute "__iter__" (not iterable) + if not all(is_hashable(x) for x in parse_dates): # type: ignore[union-attr] + depr = True + elif isinstance(parse_dates, dict) and any( + lib.is_list_like(x) for x in parse_dates.values() + ): + depr = True + if depr: + warnings.warn( + "Support for nested sequences for 'parse_dates' in pd.read_csv " + "is deprecated. Combine the desired columns with pd.to_datetime " + "after parsing instead.", + FutureWarning, + stacklevel=find_stack_level(), + ) + + if infer_datetime_format is not lib.no_default: + warnings.warn( + "The argument 'infer_datetime_format' is deprecated and will " + "be removed in a future version. " + "A strict version of it is now the default, see " + "https://pandas.pydata.org/pdeps/0004-consistent-to-datetime-parsing.html. " + "You can safely remove this argument.", + FutureWarning, + stacklevel=find_stack_level(), + ) + + if delim_whitespace is not lib.no_default: + # GH#55569 + warnings.warn( + "The 'delim_whitespace' keyword in pd.read_csv is deprecated and " + "will be removed in a future version. Use ``sep='\\s+'`` instead", + FutureWarning, + stacklevel=find_stack_level(), + ) + else: + delim_whitespace = False + + if verbose is not lib.no_default: + # GH#55569 + warnings.warn( + "The 'verbose' keyword in pd.read_csv is deprecated and " + "will be removed in a future version.", + FutureWarning, + stacklevel=find_stack_level(), + ) + else: + verbose = False + + # locals() should never be modified + kwds = locals().copy() + del kwds["filepath_or_buffer"] + del kwds["sep"] + + kwds_defaults = _refine_defaults_read( + dialect, + delimiter, + delim_whitespace, + engine, + sep, + on_bad_lines, + names, + defaults={"delimiter": ","}, + dtype_backend=dtype_backend, + ) + kwds.update(kwds_defaults) + + return _read(filepath_or_buffer, kwds) + + +# iterator=True -> TextFileReader +@overload +def read_table( + filepath_or_buffer: FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str], + *, + sep: str | None | lib.NoDefault = ..., + delimiter: str | None | lib.NoDefault = ..., + header: int | Sequence[int] | None | Literal["infer"] = ..., + names: Sequence[Hashable] | None | lib.NoDefault = ..., + index_col: IndexLabel | Literal[False] | None = ..., + usecols: UsecolsArgType = ..., + dtype: DtypeArg | None = ..., + engine: CSVEngine | None = ..., + converters: Mapping[Hashable, Callable] | None = ..., + true_values: list | None = ..., + false_values: list | None = ..., + skipinitialspace: bool = ..., + skiprows: list[int] | int | Callable[[Hashable], bool] | None = ..., + skipfooter: int = ..., + nrows: int | None = ..., + na_values: Sequence[str] | Mapping[str, Sequence[str]] | None = ..., + keep_default_na: bool = ..., + na_filter: bool = ..., + verbose: bool | lib.NoDefault = ..., + skip_blank_lines: bool = ..., + parse_dates: bool | Sequence[Hashable] = ..., + infer_datetime_format: bool | lib.NoDefault = ..., + keep_date_col: bool | lib.NoDefault = ..., + date_parser: Callable | lib.NoDefault = ..., + date_format: str | dict[Hashable, str] | None = ..., + dayfirst: bool = ..., + cache_dates: bool = ..., + iterator: Literal[True], + chunksize: int | None = ..., + compression: CompressionOptions = ..., + thousands: str | None = ..., + decimal: str = ..., + lineterminator: str | None = ..., + quotechar: str = ..., + quoting: int = ..., + doublequote: bool = ..., + escapechar: str | None = ..., + comment: str | None = ..., + encoding: str | None = ..., + encoding_errors: str | None = ..., + dialect: str | csv.Dialect | None = ..., + on_bad_lines=..., + delim_whitespace: bool = ..., + low_memory: bool = ..., + memory_map: bool = ..., + float_precision: str | None = ..., + storage_options: StorageOptions = ..., + dtype_backend: DtypeBackend | lib.NoDefault = ..., +) -> TextFileReader: + ... + + +# chunksize=int -> TextFileReader +@overload +def read_table( + filepath_or_buffer: FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str], + *, + sep: str | None | lib.NoDefault = ..., + delimiter: str | None | lib.NoDefault = ..., + header: int | Sequence[int] | None | Literal["infer"] = ..., + names: Sequence[Hashable] | None | lib.NoDefault = ..., + index_col: IndexLabel | Literal[False] | None = ..., + usecols: UsecolsArgType = ..., + dtype: DtypeArg | None = ..., + engine: CSVEngine | None = ..., + converters: Mapping[Hashable, Callable] | None = ..., + true_values: list | None = ..., + false_values: list | None = ..., + skipinitialspace: bool = ..., + skiprows: list[int] | int | Callable[[Hashable], bool] | None = ..., + skipfooter: int = ..., + nrows: int | None = ..., + na_values: Sequence[str] | Mapping[str, Sequence[str]] | None = ..., + keep_default_na: bool = ..., + na_filter: bool = ..., + verbose: bool | lib.NoDefault = ..., + skip_blank_lines: bool = ..., + parse_dates: bool | Sequence[Hashable] = ..., + infer_datetime_format: bool | lib.NoDefault = ..., + keep_date_col: bool | lib.NoDefault = ..., + date_parser: Callable | lib.NoDefault = ..., + date_format: str | dict[Hashable, str] | None = ..., + dayfirst: bool = ..., + cache_dates: bool = ..., + iterator: bool = ..., + chunksize: int, + compression: CompressionOptions = ..., + thousands: str | None = ..., + decimal: str = ..., + lineterminator: str | None = ..., + quotechar: str = ..., + quoting: int = ..., + doublequote: bool = ..., + escapechar: str | None = ..., + comment: str | None = ..., + encoding: str | None = ..., + encoding_errors: str | None = ..., + dialect: str | csv.Dialect | None = ..., + on_bad_lines=..., + delim_whitespace: bool = ..., + low_memory: bool = ..., + memory_map: bool = ..., + float_precision: str | None = ..., + storage_options: StorageOptions = ..., + dtype_backend: DtypeBackend | lib.NoDefault = ..., +) -> TextFileReader: + ... + + +# default -> DataFrame +@overload +def read_table( + filepath_or_buffer: FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str], + *, + sep: str | None | lib.NoDefault = ..., + delimiter: str | None | lib.NoDefault = ..., + header: int | Sequence[int] | None | Literal["infer"] = ..., + names: Sequence[Hashable] | None | lib.NoDefault = ..., + index_col: IndexLabel | Literal[False] | None = ..., + usecols: UsecolsArgType = ..., + dtype: DtypeArg | None = ..., + engine: CSVEngine | None = ..., + converters: Mapping[Hashable, Callable] | None = ..., + true_values: list | None = ..., + false_values: list | None = ..., + skipinitialspace: bool = ..., + skiprows: list[int] | int | Callable[[Hashable], bool] | None = ..., + skipfooter: int = ..., + nrows: int | None = ..., + na_values: Sequence[str] | Mapping[str, Sequence[str]] | None = ..., + keep_default_na: bool = ..., + na_filter: bool = ..., + verbose: bool | lib.NoDefault = ..., + skip_blank_lines: bool = ..., + parse_dates: bool | Sequence[Hashable] = ..., + infer_datetime_format: bool | lib.NoDefault = ..., + keep_date_col: bool | lib.NoDefault = ..., + date_parser: Callable | lib.NoDefault = ..., + date_format: str | dict[Hashable, str] | None = ..., + dayfirst: bool = ..., + cache_dates: bool = ..., + iterator: Literal[False] = ..., + chunksize: None = ..., + compression: CompressionOptions = ..., + thousands: str | None = ..., + decimal: str = ..., + lineterminator: str | None = ..., + quotechar: str = ..., + quoting: int = ..., + doublequote: bool = ..., + escapechar: str | None = ..., + comment: str | None = ..., + encoding: str | None = ..., + encoding_errors: str | None = ..., + dialect: str | csv.Dialect | None = ..., + on_bad_lines=..., + delim_whitespace: bool = ..., + low_memory: bool = ..., + memory_map: bool = ..., + float_precision: str | None = ..., + storage_options: StorageOptions = ..., + dtype_backend: DtypeBackend | lib.NoDefault = ..., +) -> DataFrame: + ... + + +# Unions -> DataFrame | TextFileReader +@overload +def read_table( + filepath_or_buffer: FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str], + *, + sep: str | None | lib.NoDefault = ..., + delimiter: str | None | lib.NoDefault = ..., + header: int | Sequence[int] | None | Literal["infer"] = ..., + names: Sequence[Hashable] | None | lib.NoDefault = ..., + index_col: IndexLabel | Literal[False] | None = ..., + usecols: UsecolsArgType = ..., + dtype: DtypeArg | None = ..., + engine: CSVEngine | None = ..., + converters: Mapping[Hashable, Callable] | None = ..., + true_values: list | None = ..., + false_values: list | None = ..., + skipinitialspace: bool = ..., + skiprows: list[int] | int | Callable[[Hashable], bool] | None = ..., + skipfooter: int = ..., + nrows: int | None = ..., + na_values: Sequence[str] | Mapping[str, Sequence[str]] | None = ..., + keep_default_na: bool = ..., + na_filter: bool = ..., + verbose: bool | lib.NoDefault = ..., + skip_blank_lines: bool = ..., + parse_dates: bool | Sequence[Hashable] = ..., + infer_datetime_format: bool | lib.NoDefault = ..., + keep_date_col: bool | lib.NoDefault = ..., + date_parser: Callable | lib.NoDefault = ..., + date_format: str | dict[Hashable, str] | None = ..., + dayfirst: bool = ..., + cache_dates: bool = ..., + iterator: bool = ..., + chunksize: int | None = ..., + compression: CompressionOptions = ..., + thousands: str | None = ..., + decimal: str = ..., + lineterminator: str | None = ..., + quotechar: str = ..., + quoting: int = ..., + doublequote: bool = ..., + escapechar: str | None = ..., + comment: str | None = ..., + encoding: str | None = ..., + encoding_errors: str | None = ..., + dialect: str | csv.Dialect | None = ..., + on_bad_lines=..., + delim_whitespace: bool = ..., + low_memory: bool = ..., + memory_map: bool = ..., + float_precision: str | None = ..., + storage_options: StorageOptions = ..., + dtype_backend: DtypeBackend | lib.NoDefault = ..., +) -> DataFrame | TextFileReader: + ... + + +@Appender( + _doc_read_csv_and_table.format( + func_name="read_table", + summary="Read general delimited file into DataFrame.", + see_also_func_name="read_csv", + see_also_func_summary=( + "Read a comma-separated values (csv) file into DataFrame." + ), + _default_sep=r"'\\t' (tab-stop)", + storage_options=_shared_docs["storage_options"], + decompression_options=_shared_docs["decompression_options"] + % "filepath_or_buffer", + ) +) +def read_table( + filepath_or_buffer: FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str], + *, + sep: str | None | lib.NoDefault = lib.no_default, + delimiter: str | None | lib.NoDefault = None, + # Column and Index Locations and Names + header: int | Sequence[int] | None | Literal["infer"] = "infer", + names: Sequence[Hashable] | None | lib.NoDefault = lib.no_default, + index_col: IndexLabel | Literal[False] | None = None, + usecols: UsecolsArgType = None, + # General Parsing Configuration + dtype: DtypeArg | None = None, + engine: CSVEngine | None = None, + converters: Mapping[Hashable, Callable] | None = None, + true_values: list | None = None, + false_values: list | None = None, + skipinitialspace: bool = False, + skiprows: list[int] | int | Callable[[Hashable], bool] | None = None, + skipfooter: int = 0, + nrows: int | None = None, + # NA and Missing Data Handling + na_values: Sequence[str] | Mapping[str, Sequence[str]] | None = None, + keep_default_na: bool = True, + na_filter: bool = True, + verbose: bool | lib.NoDefault = lib.no_default, + skip_blank_lines: bool = True, + # Datetime Handling + parse_dates: bool | Sequence[Hashable] = False, + infer_datetime_format: bool | lib.NoDefault = lib.no_default, + keep_date_col: bool | lib.NoDefault = lib.no_default, + date_parser: Callable | lib.NoDefault = lib.no_default, + date_format: str | dict[Hashable, str] | None = None, + dayfirst: bool = False, + cache_dates: bool = True, + # Iteration + iterator: bool = False, + chunksize: int | None = None, + # Quoting, Compression, and File Format + compression: CompressionOptions = "infer", + thousands: str | None = None, + decimal: str = ".", + lineterminator: str | None = None, + quotechar: str = '"', + quoting: int = csv.QUOTE_MINIMAL, + doublequote: bool = True, + escapechar: str | None = None, + comment: str | None = None, + encoding: str | None = None, + encoding_errors: str | None = "strict", + dialect: str | csv.Dialect | None = None, + # Error Handling + on_bad_lines: str = "error", + # Internal + delim_whitespace: bool | lib.NoDefault = lib.no_default, + low_memory: bool = _c_parser_defaults["low_memory"], + memory_map: bool = False, + float_precision: str | None = None, + storage_options: StorageOptions | None = None, + dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default, +) -> DataFrame | TextFileReader: + if keep_date_col is not lib.no_default: + # GH#55569 + warnings.warn( + "The 'keep_date_col' keyword in pd.read_table is deprecated and " + "will be removed in a future version. Explicitly remove unwanted " + "columns after parsing instead.", + FutureWarning, + stacklevel=find_stack_level(), + ) + else: + keep_date_col = False + + # error: Item "bool" of "bool | Sequence[Hashable]" has no attribute "__iter__" + if lib.is_list_like(parse_dates) and not all(is_hashable(x) for x in parse_dates): # type: ignore[union-attr] + # GH#55569 + warnings.warn( + "Support for nested sequences for 'parse_dates' in pd.read_table " + "is deprecated. Combine the desired columns with pd.to_datetime " + "after parsing instead.", + FutureWarning, + stacklevel=find_stack_level(), + ) + + if infer_datetime_format is not lib.no_default: + warnings.warn( + "The argument 'infer_datetime_format' is deprecated and will " + "be removed in a future version. " + "A strict version of it is now the default, see " + "https://pandas.pydata.org/pdeps/0004-consistent-to-datetime-parsing.html. " + "You can safely remove this argument.", + FutureWarning, + stacklevel=find_stack_level(), + ) + + if delim_whitespace is not lib.no_default: + # GH#55569 + warnings.warn( + "The 'delim_whitespace' keyword in pd.read_table is deprecated and " + "will be removed in a future version. Use ``sep='\\s+'`` instead", + FutureWarning, + stacklevel=find_stack_level(), + ) + else: + delim_whitespace = False + + if verbose is not lib.no_default: + # GH#55569 + warnings.warn( + "The 'verbose' keyword in pd.read_table is deprecated and " + "will be removed in a future version.", + FutureWarning, + stacklevel=find_stack_level(), + ) + else: + verbose = False + + # locals() should never be modified + kwds = locals().copy() + del kwds["filepath_or_buffer"] + del kwds["sep"] + + kwds_defaults = _refine_defaults_read( + dialect, + delimiter, + delim_whitespace, + engine, + sep, + on_bad_lines, + names, + defaults={"delimiter": "\t"}, + dtype_backend=dtype_backend, + ) + kwds.update(kwds_defaults) + + return _read(filepath_or_buffer, kwds) + + +@overload +def read_fwf( + filepath_or_buffer: FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str], + *, + colspecs: Sequence[tuple[int, int]] | str | None = ..., + widths: Sequence[int] | None = ..., + infer_nrows: int = ..., + dtype_backend: DtypeBackend | lib.NoDefault = ..., + iterator: Literal[True], + chunksize: int | None = ..., + **kwds, +) -> TextFileReader: + ... + + +@overload +def read_fwf( + filepath_or_buffer: FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str], + *, + colspecs: Sequence[tuple[int, int]] | str | None = ..., + widths: Sequence[int] | None = ..., + infer_nrows: int = ..., + dtype_backend: DtypeBackend | lib.NoDefault = ..., + iterator: bool = ..., + chunksize: int, + **kwds, +) -> TextFileReader: + ... + + +@overload +def read_fwf( + filepath_or_buffer: FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str], + *, + colspecs: Sequence[tuple[int, int]] | str | None = ..., + widths: Sequence[int] | None = ..., + infer_nrows: int = ..., + dtype_backend: DtypeBackend | lib.NoDefault = ..., + iterator: Literal[False] = ..., + chunksize: None = ..., + **kwds, +) -> DataFrame: + ... + + +def read_fwf( + filepath_or_buffer: FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str], + *, + colspecs: Sequence[tuple[int, int]] | str | None = "infer", + widths: Sequence[int] | None = None, + infer_nrows: int = 100, + dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default, + iterator: bool = False, + chunksize: int | None = None, + **kwds, +) -> DataFrame | TextFileReader: + r""" + Read a table of fixed-width formatted lines into DataFrame. + + Also supports optionally iterating or breaking of the file + into chunks. + + Additional help can be found in the `online docs for IO Tools + `_. + + Parameters + ---------- + filepath_or_buffer : str, path object, or file-like object + String, path object (implementing ``os.PathLike[str]``), or file-like + object implementing a text ``read()`` function.The string could be a URL. + Valid URL schemes include http, ftp, s3, and file. For file URLs, a host is + expected. A local file could be: + ``file://localhost/path/to/table.csv``. + colspecs : list of tuple (int, int) or 'infer'. optional + A list of tuples giving the extents of the fixed-width + fields of each line as half-open intervals (i.e., [from, to[ ). + String value 'infer' can be used to instruct the parser to try + detecting the column specifications from the first 100 rows of + the data which are not being skipped via skiprows (default='infer'). + widths : list of int, optional + A list of field widths which can be used instead of 'colspecs' if + the intervals are contiguous. + infer_nrows : int, default 100 + The number of rows to consider when letting the parser determine the + `colspecs`. + dtype_backend : {'numpy_nullable', 'pyarrow'}, default 'numpy_nullable' + Back-end data type applied to the resultant :class:`DataFrame` + (still experimental). Behaviour is as follows: + + * ``"numpy_nullable"``: returns nullable-dtype-backed :class:`DataFrame` + (default). + * ``"pyarrow"``: returns pyarrow-backed nullable :class:`ArrowDtype` + DataFrame. + + .. versionadded:: 2.0 + + **kwds : optional + Optional keyword arguments can be passed to ``TextFileReader``. + + Returns + ------- + DataFrame or TextFileReader + A comma-separated values (csv) file is returned as two-dimensional + data structure with labeled axes. + + See Also + -------- + DataFrame.to_csv : Write DataFrame to a comma-separated values (csv) file. + read_csv : Read a comma-separated values (csv) file into DataFrame. + + Examples + -------- + >>> pd.read_fwf('data.csv') # doctest: +SKIP + """ + # Check input arguments. + if colspecs is None and widths is None: + raise ValueError("Must specify either colspecs or widths") + if colspecs not in (None, "infer") and widths is not None: + raise ValueError("You must specify only one of 'widths' and 'colspecs'") + + # Compute 'colspecs' from 'widths', if specified. + if widths is not None: + colspecs, col = [], 0 + for w in widths: + colspecs.append((col, col + w)) + col += w + + # for mypy + assert colspecs is not None + + # GH#40830 + # Ensure length of `colspecs` matches length of `names` + names = kwds.get("names") + if names is not None: + if len(names) != len(colspecs) and colspecs != "infer": + # need to check len(index_col) as it might contain + # unnamed indices, in which case it's name is not required + len_index = 0 + if kwds.get("index_col") is not None: + index_col: Any = kwds.get("index_col") + if index_col is not False: + if not is_list_like(index_col): + len_index = 1 + else: + len_index = len(index_col) + if kwds.get("usecols") is None and len(names) + len_index != len(colspecs): + # If usecols is used colspec may be longer than names + raise ValueError("Length of colspecs must match length of names") + + kwds["colspecs"] = colspecs + kwds["infer_nrows"] = infer_nrows + kwds["engine"] = "python-fwf" + kwds["iterator"] = iterator + kwds["chunksize"] = chunksize + + check_dtype_backend(dtype_backend) + kwds["dtype_backend"] = dtype_backend + return _read(filepath_or_buffer, kwds) + + +class TextFileReader(abc.Iterator): + """ + + Passed dialect overrides any of the related parser options + + """ + + def __init__( + self, + f: FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str] | list, + engine: CSVEngine | None = None, + **kwds, + ) -> None: + if engine is not None: + engine_specified = True + else: + engine = "python" + engine_specified = False + self.engine = engine + self._engine_specified = kwds.get("engine_specified", engine_specified) + + _validate_skipfooter(kwds) + + dialect = _extract_dialect(kwds) + if dialect is not None: + if engine == "pyarrow": + raise ValueError( + "The 'dialect' option is not supported with the 'pyarrow' engine" + ) + kwds = _merge_with_dialect_properties(dialect, kwds) + + if kwds.get("header", "infer") == "infer": + kwds["header"] = 0 if kwds.get("names") is None else None + + self.orig_options = kwds + + # miscellanea + self._currow = 0 + + options = self._get_options_with_defaults(engine) + options["storage_options"] = kwds.get("storage_options", None) + + self.chunksize = options.pop("chunksize", None) + self.nrows = options.pop("nrows", None) + + self._check_file_or_buffer(f, engine) + self.options, self.engine = self._clean_options(options, engine) + + if "has_index_names" in kwds: + self.options["has_index_names"] = kwds["has_index_names"] + + self.handles: IOHandles | None = None + self._engine = self._make_engine(f, self.engine) + + def close(self) -> None: + if self.handles is not None: + self.handles.close() + self._engine.close() + + def _get_options_with_defaults(self, engine: CSVEngine) -> dict[str, Any]: + kwds = self.orig_options + + options = {} + default: object | None + + for argname, default in parser_defaults.items(): + value = kwds.get(argname, default) + + # see gh-12935 + if ( + engine == "pyarrow" + and argname in _pyarrow_unsupported + and value != default + and value != getattr(value, "value", default) + ): + raise ValueError( + f"The {repr(argname)} option is not supported with the " + f"'pyarrow' engine" + ) + options[argname] = value + + for argname, default in _c_parser_defaults.items(): + if argname in kwds: + value = kwds[argname] + + if engine != "c" and value != default: + # TODO: Refactor this logic, its pretty convoluted + if "python" in engine and argname not in _python_unsupported: + pass + elif "pyarrow" in engine and argname not in _pyarrow_unsupported: + pass + else: + raise ValueError( + f"The {repr(argname)} option is not supported with the " + f"{repr(engine)} engine" + ) + else: + value = default + options[argname] = value + + if engine == "python-fwf": + for argname, default in _fwf_defaults.items(): + options[argname] = kwds.get(argname, default) + + return options + + def _check_file_or_buffer(self, f, engine: CSVEngine) -> None: + # see gh-16530 + if is_file_like(f) and engine != "c" and not hasattr(f, "__iter__"): + # The C engine doesn't need the file-like to have the "__iter__" + # attribute. However, the Python engine needs "__iter__(...)" + # when iterating through such an object, meaning it + # needs to have that attribute + raise ValueError( + "The 'python' engine cannot iterate through this file buffer." + ) + + def _clean_options( + self, options: dict[str, Any], engine: CSVEngine + ) -> tuple[dict[str, Any], CSVEngine]: + result = options.copy() + + fallback_reason = None + + # C engine not supported yet + if engine == "c": + if options["skipfooter"] > 0: + fallback_reason = "the 'c' engine does not support skipfooter" + engine = "python" + + sep = options["delimiter"] + delim_whitespace = options["delim_whitespace"] + + if sep is None and not delim_whitespace: + if engine in ("c", "pyarrow"): + fallback_reason = ( + f"the '{engine}' engine does not support " + "sep=None with delim_whitespace=False" + ) + engine = "python" + elif sep is not None and len(sep) > 1: + if engine == "c" and sep == r"\s+": + result["delim_whitespace"] = True + del result["delimiter"] + elif engine not in ("python", "python-fwf"): + # wait until regex engine integrated + fallback_reason = ( + f"the '{engine}' engine does not support " + "regex separators (separators > 1 char and " + r"different from '\s+' are interpreted as regex)" + ) + engine = "python" + elif delim_whitespace: + if "python" in engine: + result["delimiter"] = r"\s+" + elif sep is not None: + encodeable = True + encoding = sys.getfilesystemencoding() or "utf-8" + try: + if len(sep.encode(encoding)) > 1: + encodeable = False + except UnicodeDecodeError: + encodeable = False + if not encodeable and engine not in ("python", "python-fwf"): + fallback_reason = ( + f"the separator encoded in {encoding} " + f"is > 1 char long, and the '{engine}' engine " + "does not support such separators" + ) + engine = "python" + + quotechar = options["quotechar"] + if quotechar is not None and isinstance(quotechar, (str, bytes)): + if ( + len(quotechar) == 1 + and ord(quotechar) > 127 + and engine not in ("python", "python-fwf") + ): + fallback_reason = ( + "ord(quotechar) > 127, meaning the " + "quotechar is larger than one byte, " + f"and the '{engine}' engine does not support such quotechars" + ) + engine = "python" + + if fallback_reason and self._engine_specified: + raise ValueError(fallback_reason) + + if engine == "c": + for arg in _c_unsupported: + del result[arg] + + if "python" in engine: + for arg in _python_unsupported: + if fallback_reason and result[arg] != _c_parser_defaults.get(arg): + raise ValueError( + "Falling back to the 'python' engine because " + f"{fallback_reason}, but this causes {repr(arg)} to be " + "ignored as it is not supported by the 'python' engine." + ) + del result[arg] + + if fallback_reason: + warnings.warn( + ( + "Falling back to the 'python' engine because " + f"{fallback_reason}; you can avoid this warning by specifying " + "engine='python'." + ), + ParserWarning, + stacklevel=find_stack_level(), + ) + + index_col = options["index_col"] + names = options["names"] + converters = options["converters"] + na_values = options["na_values"] + skiprows = options["skiprows"] + + validate_header_arg(options["header"]) + + if index_col is True: + raise ValueError("The value of index_col couldn't be 'True'") + if is_index_col(index_col): + if not isinstance(index_col, (list, tuple, np.ndarray)): + index_col = [index_col] + result["index_col"] = index_col + + names = list(names) if names is not None else names + + # type conversion-related + if converters is not None: + if not isinstance(converters, dict): + raise TypeError( + "Type converters must be a dict or subclass, " + f"input was a {type(converters).__name__}" + ) + else: + converters = {} + + # Converting values to NA + keep_default_na = options["keep_default_na"] + floatify = engine != "pyarrow" + na_values, na_fvalues = _clean_na_values( + na_values, keep_default_na, floatify=floatify + ) + + # handle skiprows; this is internally handled by the + # c-engine, so only need for python and pyarrow parsers + if engine == "pyarrow": + if not is_integer(skiprows) and skiprows is not None: + # pyarrow expects skiprows to be passed as an integer + raise ValueError( + "skiprows argument must be an integer when using " + "engine='pyarrow'" + ) + else: + if is_integer(skiprows): + skiprows = list(range(skiprows)) + if skiprows is None: + skiprows = set() + elif not callable(skiprows): + skiprows = set(skiprows) + + # put stuff back + result["names"] = names + result["converters"] = converters + result["na_values"] = na_values + result["na_fvalues"] = na_fvalues + result["skiprows"] = skiprows + + return result, engine + + def __next__(self) -> DataFrame: + try: + return self.get_chunk() + except StopIteration: + self.close() + raise + + def _make_engine( + self, + f: FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str] | list | IO, + engine: CSVEngine = "c", + ) -> ParserBase: + mapping: dict[str, type[ParserBase]] = { + "c": CParserWrapper, + "python": PythonParser, + "pyarrow": ArrowParserWrapper, + "python-fwf": FixedWidthFieldParser, + } + if engine not in mapping: + raise ValueError( + f"Unknown engine: {engine} (valid options are {mapping.keys()})" + ) + if not isinstance(f, list): + # open file here + is_text = True + mode = "r" + if engine == "pyarrow": + is_text = False + mode = "rb" + elif ( + engine == "c" + and self.options.get("encoding", "utf-8") == "utf-8" + and isinstance(stringify_path(f), str) + ): + # c engine can decode utf-8 bytes, adding TextIOWrapper makes + # the c-engine especially for memory_map=True far slower + is_text = False + if "b" not in mode: + mode += "b" + self.handles = get_handle( + f, + mode, + encoding=self.options.get("encoding", None), + compression=self.options.get("compression", None), + memory_map=self.options.get("memory_map", False), + is_text=is_text, + errors=self.options.get("encoding_errors", "strict"), + storage_options=self.options.get("storage_options", None), + ) + assert self.handles is not None + f = self.handles.handle + + elif engine != "python": + msg = f"Invalid file path or buffer object type: {type(f)}" + raise ValueError(msg) + + try: + return mapping[engine](f, **self.options) + except Exception: + if self.handles is not None: + self.handles.close() + raise + + def _failover_to_python(self) -> None: + raise AbstractMethodError(self) + + def read(self, nrows: int | None = None) -> DataFrame: + if self.engine == "pyarrow": + try: + # error: "ParserBase" has no attribute "read" + df = self._engine.read() # type: ignore[attr-defined] + except Exception: + self.close() + raise + else: + nrows = validate_integer("nrows", nrows) + try: + # error: "ParserBase" has no attribute "read" + ( + index, + columns, + col_dict, + ) = self._engine.read( # type: ignore[attr-defined] + nrows + ) + except Exception: + self.close() + raise + + if index is None: + if col_dict: + # Any column is actually fine: + new_rows = len(next(iter(col_dict.values()))) + index = RangeIndex(self._currow, self._currow + new_rows) + else: + new_rows = 0 + else: + new_rows = len(index) + + if hasattr(self, "orig_options"): + dtype_arg = self.orig_options.get("dtype", None) + else: + dtype_arg = None + + if isinstance(dtype_arg, dict): + dtype = defaultdict(lambda: None) # type: ignore[var-annotated] + dtype.update(dtype_arg) + elif dtype_arg is not None and pandas_dtype(dtype_arg) in ( + np.str_, + np.object_, + ): + dtype = defaultdict(lambda: dtype_arg) + else: + dtype = None + + if dtype is not None: + new_col_dict = {} + for k, v in col_dict.items(): + d = ( + dtype[k] + if pandas_dtype(dtype[k]) in (np.str_, np.object_) + else None + ) + new_col_dict[k] = Series(v, index=index, dtype=d, copy=False) + else: + new_col_dict = col_dict + + df = DataFrame( + new_col_dict, + columns=columns, + index=index, + copy=not using_copy_on_write(), + ) + + self._currow += new_rows + return df + + def get_chunk(self, size: int | None = None) -> DataFrame: + if size is None: + size = self.chunksize + if self.nrows is not None: + if self._currow >= self.nrows: + raise StopIteration + size = min(size, self.nrows - self._currow) + return self.read(nrows=size) + + def __enter__(self) -> Self: + return self + + def __exit__( + self, + exc_type: type[BaseException] | None, + exc_value: BaseException | None, + traceback: TracebackType | None, + ) -> None: + self.close() + + +def TextParser(*args, **kwds) -> TextFileReader: + """ + Converts lists of lists/tuples into DataFrames with proper type inference + and optional (e.g. string to datetime) conversion. Also enables iterating + lazily over chunks of large files + + Parameters + ---------- + data : file-like object or list + delimiter : separator character to use + dialect : str or csv.Dialect instance, optional + Ignored if delimiter is longer than 1 character + names : sequence, default + header : int, default 0 + Row to use to parse column labels. Defaults to the first row. Prior + rows will be discarded + index_col : int or list, optional + Column or columns to use as the (possibly hierarchical) index + has_index_names: bool, default False + True if the cols defined in index_col have an index name and are + not in the header. + na_values : scalar, str, list-like, or dict, optional + Additional strings to recognize as NA/NaN. + keep_default_na : bool, default True + thousands : str, optional + Thousands separator + comment : str, optional + Comment out remainder of line + parse_dates : bool, default False + keep_date_col : bool, default False + date_parser : function, optional + + .. deprecated:: 2.0.0 + date_format : str or dict of column -> format, default ``None`` + + .. versionadded:: 2.0.0 + skiprows : list of integers + Row numbers to skip + skipfooter : int + Number of line at bottom of file to skip + converters : dict, optional + Dict of functions for converting values in certain columns. Keys can + either be integers or column labels, values are functions that take one + input argument, the cell (not column) content, and return the + transformed content. + encoding : str, optional + Encoding to use for UTF when reading/writing (ex. 'utf-8') + float_precision : str, optional + Specifies which converter the C engine should use for floating-point + values. The options are `None` or `high` for the ordinary converter, + `legacy` for the original lower precision pandas converter, and + `round_trip` for the round-trip converter. + """ + kwds["engine"] = "python" + return TextFileReader(*args, **kwds) + + +def _clean_na_values(na_values, keep_default_na: bool = True, floatify: bool = True): + na_fvalues: set | dict + if na_values is None: + if keep_default_na: + na_values = STR_NA_VALUES + else: + na_values = set() + na_fvalues = set() + elif isinstance(na_values, dict): + old_na_values = na_values.copy() + na_values = {} # Prevent aliasing. + + # Convert the values in the na_values dictionary + # into array-likes for further use. This is also + # where we append the default NaN values, provided + # that `keep_default_na=True`. + for k, v in old_na_values.items(): + if not is_list_like(v): + v = [v] + + if keep_default_na: + v = set(v) | STR_NA_VALUES + + na_values[k] = v + na_fvalues = {k: _floatify_na_values(v) for k, v in na_values.items()} + else: + if not is_list_like(na_values): + na_values = [na_values] + na_values = _stringify_na_values(na_values, floatify) + if keep_default_na: + na_values = na_values | STR_NA_VALUES + + na_fvalues = _floatify_na_values(na_values) + + return na_values, na_fvalues + + +def _floatify_na_values(na_values): + # create float versions of the na_values + result = set() + for v in na_values: + try: + v = float(v) + if not np.isnan(v): + result.add(v) + except (TypeError, ValueError, OverflowError): + pass + return result + + +def _stringify_na_values(na_values, floatify: bool): + """return a stringified and numeric for these values""" + result: list[str | float] = [] + for x in na_values: + result.append(str(x)) + result.append(x) + try: + v = float(x) + + # we are like 999 here + if v == int(v): + v = int(v) + result.append(f"{v}.0") + result.append(str(v)) + + if floatify: + result.append(v) + except (TypeError, ValueError, OverflowError): + pass + if floatify: + try: + result.append(int(x)) + except (TypeError, ValueError, OverflowError): + pass + return set(result) + + +def _refine_defaults_read( + dialect: str | csv.Dialect | None, + delimiter: str | None | lib.NoDefault, + delim_whitespace: bool, + engine: CSVEngine | None, + sep: str | None | lib.NoDefault, + on_bad_lines: str | Callable, + names: Sequence[Hashable] | None | lib.NoDefault, + defaults: dict[str, Any], + dtype_backend: DtypeBackend | lib.NoDefault, +): + """Validate/refine default values of input parameters of read_csv, read_table. + + Parameters + ---------- + dialect : str or csv.Dialect + If provided, this parameter will override values (default or not) for the + following parameters: `delimiter`, `doublequote`, `escapechar`, + `skipinitialspace`, `quotechar`, and `quoting`. If it is necessary to + override values, a ParserWarning will be issued. See csv.Dialect + documentation for more details. + delimiter : str or object + Alias for sep. + delim_whitespace : bool + Specifies whether or not whitespace (e.g. ``' '`` or ``'\t'``) will be + used as the sep. Equivalent to setting ``sep='\\s+'``. If this option + is set to True, nothing should be passed in for the ``delimiter`` + parameter. + + .. deprecated:: 2.2.0 + Use ``sep="\\s+"`` instead. + engine : {{'c', 'python'}} + Parser engine to use. The C engine is faster while the python engine is + currently more feature-complete. + sep : str or object + A delimiter provided by the user (str) or a sentinel value, i.e. + pandas._libs.lib.no_default. + on_bad_lines : str, callable + An option for handling bad lines or a sentinel value(None). + names : array-like, optional + List of column names to use. If the file contains a header row, + then you should explicitly pass ``header=0`` to override the column names. + Duplicates in this list are not allowed. + defaults: dict + Default values of input parameters. + + Returns + ------- + kwds : dict + Input parameters with correct values. + + Raises + ------ + ValueError : + If a delimiter was specified with ``sep`` (or ``delimiter``) and + ``delim_whitespace=True``. + """ + # fix types for sep, delimiter to Union(str, Any) + delim_default = defaults["delimiter"] + kwds: dict[str, Any] = {} + # gh-23761 + # + # When a dialect is passed, it overrides any of the overlapping + # parameters passed in directly. We don't want to warn if the + # default parameters were passed in (since it probably means + # that the user didn't pass them in explicitly in the first place). + # + # "delimiter" is the annoying corner case because we alias it to + # "sep" before doing comparison to the dialect values later on. + # Thus, we need a flag to indicate that we need to "override" + # the comparison to dialect values by checking if default values + # for BOTH "delimiter" and "sep" were provided. + if dialect is not None: + kwds["sep_override"] = delimiter is None and ( + sep is lib.no_default or sep == delim_default + ) + + if delimiter and (sep is not lib.no_default): + raise ValueError("Specified a sep and a delimiter; you can only specify one.") + + kwds["names"] = None if names is lib.no_default else names + + # Alias sep -> delimiter. + if delimiter is None: + delimiter = sep + + if delim_whitespace and (delimiter is not lib.no_default): + raise ValueError( + "Specified a delimiter with both sep and " + "delim_whitespace=True; you can only specify one." + ) + + if delimiter == "\n": + raise ValueError( + r"Specified \n as separator or delimiter. This forces the python engine " + "which does not accept a line terminator. Hence it is not allowed to use " + "the line terminator as separator.", + ) + + if delimiter is lib.no_default: + # assign default separator value + kwds["delimiter"] = delim_default + else: + kwds["delimiter"] = delimiter + + if engine is not None: + kwds["engine_specified"] = True + else: + kwds["engine"] = "c" + kwds["engine_specified"] = False + + if on_bad_lines == "error": + kwds["on_bad_lines"] = ParserBase.BadLineHandleMethod.ERROR + elif on_bad_lines == "warn": + kwds["on_bad_lines"] = ParserBase.BadLineHandleMethod.WARN + elif on_bad_lines == "skip": + kwds["on_bad_lines"] = ParserBase.BadLineHandleMethod.SKIP + elif callable(on_bad_lines): + if engine not in ["python", "pyarrow"]: + raise ValueError( + "on_bad_line can only be a callable function " + "if engine='python' or 'pyarrow'" + ) + kwds["on_bad_lines"] = on_bad_lines + else: + raise ValueError(f"Argument {on_bad_lines} is invalid for on_bad_lines") + + check_dtype_backend(dtype_backend) + + kwds["dtype_backend"] = dtype_backend + + return kwds + + +def _extract_dialect(kwds: dict[str, Any]) -> csv.Dialect | None: + """ + Extract concrete csv dialect instance. + + Returns + ------- + csv.Dialect or None + """ + if kwds.get("dialect") is None: + return None + + dialect = kwds["dialect"] + if dialect in csv.list_dialects(): + dialect = csv.get_dialect(dialect) + + _validate_dialect(dialect) + + return dialect + + +MANDATORY_DIALECT_ATTRS = ( + "delimiter", + "doublequote", + "escapechar", + "skipinitialspace", + "quotechar", + "quoting", +) + + +def _validate_dialect(dialect: csv.Dialect) -> None: + """ + Validate csv dialect instance. + + Raises + ------ + ValueError + If incorrect dialect is provided. + """ + for param in MANDATORY_DIALECT_ATTRS: + if not hasattr(dialect, param): + raise ValueError(f"Invalid dialect {dialect} provided") + + +def _merge_with_dialect_properties( + dialect: csv.Dialect, + defaults: dict[str, Any], +) -> dict[str, Any]: + """ + Merge default kwargs in TextFileReader with dialect parameters. + + Parameters + ---------- + dialect : csv.Dialect + Concrete csv dialect. See csv.Dialect documentation for more details. + defaults : dict + Keyword arguments passed to TextFileReader. + + Returns + ------- + kwds : dict + Updated keyword arguments, merged with dialect parameters. + """ + kwds = defaults.copy() + + for param in MANDATORY_DIALECT_ATTRS: + dialect_val = getattr(dialect, param) + + parser_default = parser_defaults[param] + provided = kwds.get(param, parser_default) + + # Messages for conflicting values between the dialect + # instance and the actual parameters provided. + conflict_msgs = [] + + # Don't warn if the default parameter was passed in, + # even if it conflicts with the dialect (gh-23761). + if provided not in (parser_default, dialect_val): + msg = ( + f"Conflicting values for '{param}': '{provided}' was " + f"provided, but the dialect specifies '{dialect_val}'. " + "Using the dialect-specified value." + ) + + # Annoying corner case for not warning about + # conflicts between dialect and delimiter parameter. + # Refer to the outer "_read_" function for more info. + if not (param == "delimiter" and kwds.pop("sep_override", False)): + conflict_msgs.append(msg) + + if conflict_msgs: + warnings.warn( + "\n\n".join(conflict_msgs), ParserWarning, stacklevel=find_stack_level() + ) + kwds[param] = dialect_val + return kwds + + +def _validate_skipfooter(kwds: dict[str, Any]) -> None: + """ + Check whether skipfooter is compatible with other kwargs in TextFileReader. + + Parameters + ---------- + kwds : dict + Keyword arguments passed to TextFileReader. + + Raises + ------ + ValueError + If skipfooter is not compatible with other parameters. + """ + if kwds.get("skipfooter"): + if kwds.get("iterator") or kwds.get("chunksize"): + raise ValueError("'skipfooter' not supported for iteration") + if kwds.get("nrows"): + raise ValueError("'skipfooter' not supported with 'nrows'") diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/io/sas/__init__.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/io/sas/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..317730745b6e3a0278a48b7bb810cf43e718e787 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/io/sas/__init__.py @@ -0,0 +1,3 @@ +from pandas.io.sas.sasreader import read_sas + +__all__ = ["read_sas"] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/io/sas/sas7bdat.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/io/sas/sas7bdat.py new file mode 100644 index 0000000000000000000000000000000000000000..c5bdfb554178816f9d668157f87514776f277eb9 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/io/sas/sas7bdat.py @@ -0,0 +1,756 @@ +""" +Read SAS7BDAT files + +Based on code written by Jared Hobbs: + https://bitbucket.org/jaredhobbs/sas7bdat + +See also: + https://github.com/BioStatMatt/sas7bdat + +Partial documentation of the file format: + https://cran.r-project.org/package=sas7bdat/vignettes/sas7bdat.pdf + +Reference for binary data compression: + http://collaboration.cmc.ec.gc.ca/science/rpn/biblio/ddj/Website/articles/CUJ/1992/9210/ross/ross.htm +""" +from __future__ import annotations + +from collections import abc +from datetime import ( + datetime, + timedelta, +) +import sys +from typing import TYPE_CHECKING + +import numpy as np + +from pandas._libs.byteswap import ( + read_double_with_byteswap, + read_float_with_byteswap, + read_uint16_with_byteswap, + read_uint32_with_byteswap, + read_uint64_with_byteswap, +) +from pandas._libs.sas import ( + Parser, + get_subheader_index, +) +from pandas._libs.tslibs.conversion import cast_from_unit_vectorized +from pandas.errors import EmptyDataError + +import pandas as pd +from pandas import ( + DataFrame, + Timestamp, + isna, +) + +from pandas.io.common import get_handle +import pandas.io.sas.sas_constants as const +from pandas.io.sas.sasreader import ReaderBase + +if TYPE_CHECKING: + from pandas._typing import ( + CompressionOptions, + FilePath, + ReadBuffer, + ) + + +_unix_origin = Timestamp("1970-01-01") +_sas_origin = Timestamp("1960-01-01") + + +def _parse_datetime(sas_datetime: float, unit: str): + if isna(sas_datetime): + return pd.NaT + + if unit == "s": + return datetime(1960, 1, 1) + timedelta(seconds=sas_datetime) + + elif unit == "d": + return datetime(1960, 1, 1) + timedelta(days=sas_datetime) + + else: + raise ValueError("unit must be 'd' or 's'") + + +def _convert_datetimes(sas_datetimes: pd.Series, unit: str) -> pd.Series: + """ + Convert to Timestamp if possible, otherwise to datetime.datetime. + SAS float64 lacks precision for more than ms resolution so the fit + to datetime.datetime is ok. + + Parameters + ---------- + sas_datetimes : {Series, Sequence[float]} + Dates or datetimes in SAS + unit : {'d', 's'} + "d" if the floats represent dates, "s" for datetimes + + Returns + ------- + Series + Series of datetime64 dtype or datetime.datetime. + """ + td = (_sas_origin - _unix_origin).as_unit("s") + if unit == "s": + millis = cast_from_unit_vectorized( + sas_datetimes._values, unit="s", out_unit="ms" + ) + dt64ms = millis.view("M8[ms]") + td + return pd.Series(dt64ms, index=sas_datetimes.index, copy=False) + else: + vals = np.array(sas_datetimes, dtype="M8[D]") + td + return pd.Series(vals, dtype="M8[s]", index=sas_datetimes.index, copy=False) + + +class _Column: + col_id: int + name: str | bytes + label: str | bytes + format: str | bytes + ctype: bytes + length: int + + def __init__( + self, + col_id: int, + # These can be bytes when convert_header_text is False + name: str | bytes, + label: str | bytes, + format: str | bytes, + ctype: bytes, + length: int, + ) -> None: + self.col_id = col_id + self.name = name + self.label = label + self.format = format + self.ctype = ctype + self.length = length + + +# SAS7BDAT represents a SAS data file in SAS7BDAT format. +class SAS7BDATReader(ReaderBase, abc.Iterator): + """ + Read SAS files in SAS7BDAT format. + + Parameters + ---------- + path_or_buf : path name or buffer + Name of SAS file or file-like object pointing to SAS file + contents. + index : column identifier, defaults to None + Column to use as index. + convert_dates : bool, defaults to True + Attempt to convert dates to Pandas datetime values. Note that + some rarely used SAS date formats may be unsupported. + blank_missing : bool, defaults to True + Convert empty strings to missing values (SAS uses blanks to + indicate missing character variables). + chunksize : int, defaults to None + Return SAS7BDATReader object for iterations, returns chunks + with given number of lines. + encoding : str, 'infer', defaults to None + String encoding acc. to Python standard encodings, + encoding='infer' tries to detect the encoding from the file header, + encoding=None will leave the data in binary format. + convert_text : bool, defaults to True + If False, text variables are left as raw bytes. + convert_header_text : bool, defaults to True + If False, header text, including column names, are left as raw + bytes. + """ + + _int_length: int + _cached_page: bytes | None + + def __init__( + self, + path_or_buf: FilePath | ReadBuffer[bytes], + index=None, + convert_dates: bool = True, + blank_missing: bool = True, + chunksize: int | None = None, + encoding: str | None = None, + convert_text: bool = True, + convert_header_text: bool = True, + compression: CompressionOptions = "infer", + ) -> None: + self.index = index + self.convert_dates = convert_dates + self.blank_missing = blank_missing + self.chunksize = chunksize + self.encoding = encoding + self.convert_text = convert_text + self.convert_header_text = convert_header_text + + self.default_encoding = "latin-1" + self.compression = b"" + self.column_names_raw: list[bytes] = [] + self.column_names: list[str | bytes] = [] + self.column_formats: list[str | bytes] = [] + self.columns: list[_Column] = [] + + self._current_page_data_subheader_pointers: list[tuple[int, int]] = [] + self._cached_page = None + self._column_data_lengths: list[int] = [] + self._column_data_offsets: list[int] = [] + self._column_types: list[bytes] = [] + + self._current_row_in_file_index = 0 + self._current_row_on_page_index = 0 + self._current_row_in_file_index = 0 + + self.handles = get_handle( + path_or_buf, "rb", is_text=False, compression=compression + ) + + self._path_or_buf = self.handles.handle + + # Same order as const.SASIndex + self._subheader_processors = [ + self._process_rowsize_subheader, + self._process_columnsize_subheader, + self._process_subheader_counts, + self._process_columntext_subheader, + self._process_columnname_subheader, + self._process_columnattributes_subheader, + self._process_format_subheader, + self._process_columnlist_subheader, + None, # Data + ] + + try: + self._get_properties() + self._parse_metadata() + except Exception: + self.close() + raise + + def column_data_lengths(self) -> np.ndarray: + """Return a numpy int64 array of the column data lengths""" + return np.asarray(self._column_data_lengths, dtype=np.int64) + + def column_data_offsets(self) -> np.ndarray: + """Return a numpy int64 array of the column offsets""" + return np.asarray(self._column_data_offsets, dtype=np.int64) + + def column_types(self) -> np.ndarray: + """ + Returns a numpy character array of the column types: + s (string) or d (double) + """ + return np.asarray(self._column_types, dtype=np.dtype("S1")) + + def close(self) -> None: + self.handles.close() + + def _get_properties(self) -> None: + # Check magic number + self._path_or_buf.seek(0) + self._cached_page = self._path_or_buf.read(288) + if self._cached_page[0 : len(const.magic)] != const.magic: + raise ValueError("magic number mismatch (not a SAS file?)") + + # Get alignment information + buf = self._read_bytes(const.align_1_offset, const.align_1_length) + if buf == const.u64_byte_checker_value: + self.U64 = True + self._int_length = 8 + self._page_bit_offset = const.page_bit_offset_x64 + self._subheader_pointer_length = const.subheader_pointer_length_x64 + else: + self.U64 = False + self._page_bit_offset = const.page_bit_offset_x86 + self._subheader_pointer_length = const.subheader_pointer_length_x86 + self._int_length = 4 + buf = self._read_bytes(const.align_2_offset, const.align_2_length) + if buf == const.align_1_checker_value: + align1 = const.align_2_value + else: + align1 = 0 + + # Get endianness information + buf = self._read_bytes(const.endianness_offset, const.endianness_length) + if buf == b"\x01": + self.byte_order = "<" + self.need_byteswap = sys.byteorder == "big" + else: + self.byte_order = ">" + self.need_byteswap = sys.byteorder == "little" + + # Get encoding information + buf = self._read_bytes(const.encoding_offset, const.encoding_length)[0] + if buf in const.encoding_names: + self.inferred_encoding = const.encoding_names[buf] + if self.encoding == "infer": + self.encoding = self.inferred_encoding + else: + self.inferred_encoding = f"unknown (code={buf})" + + # Timestamp is epoch 01/01/1960 + epoch = datetime(1960, 1, 1) + x = self._read_float( + const.date_created_offset + align1, const.date_created_length + ) + self.date_created = epoch + pd.to_timedelta(x, unit="s") + x = self._read_float( + const.date_modified_offset + align1, const.date_modified_length + ) + self.date_modified = epoch + pd.to_timedelta(x, unit="s") + + self.header_length = self._read_uint( + const.header_size_offset + align1, const.header_size_length + ) + + # Read the rest of the header into cached_page. + buf = self._path_or_buf.read(self.header_length - 288) + self._cached_page += buf + # error: Argument 1 to "len" has incompatible type "Optional[bytes]"; + # expected "Sized" + if len(self._cached_page) != self.header_length: # type: ignore[arg-type] + raise ValueError("The SAS7BDAT file appears to be truncated.") + + self._page_length = self._read_uint( + const.page_size_offset + align1, const.page_size_length + ) + + def __next__(self) -> DataFrame: + da = self.read(nrows=self.chunksize or 1) + if da.empty: + self.close() + raise StopIteration + return da + + # Read a single float of the given width (4 or 8). + def _read_float(self, offset: int, width: int): + assert self._cached_page is not None + if width == 4: + return read_float_with_byteswap( + self._cached_page, offset, self.need_byteswap + ) + elif width == 8: + return read_double_with_byteswap( + self._cached_page, offset, self.need_byteswap + ) + else: + self.close() + raise ValueError("invalid float width") + + # Read a single unsigned integer of the given width (1, 2, 4 or 8). + def _read_uint(self, offset: int, width: int) -> int: + assert self._cached_page is not None + if width == 1: + return self._read_bytes(offset, 1)[0] + elif width == 2: + return read_uint16_with_byteswap( + self._cached_page, offset, self.need_byteswap + ) + elif width == 4: + return read_uint32_with_byteswap( + self._cached_page, offset, self.need_byteswap + ) + elif width == 8: + return read_uint64_with_byteswap( + self._cached_page, offset, self.need_byteswap + ) + else: + self.close() + raise ValueError("invalid int width") + + def _read_bytes(self, offset: int, length: int): + assert self._cached_page is not None + if offset + length > len(self._cached_page): + self.close() + raise ValueError("The cached page is too small.") + return self._cached_page[offset : offset + length] + + def _read_and_convert_header_text(self, offset: int, length: int) -> str | bytes: + return self._convert_header_text( + self._read_bytes(offset, length).rstrip(b"\x00 ") + ) + + def _parse_metadata(self) -> None: + done = False + while not done: + self._cached_page = self._path_or_buf.read(self._page_length) + if len(self._cached_page) <= 0: + break + if len(self._cached_page) != self._page_length: + raise ValueError("Failed to read a meta data page from the SAS file.") + done = self._process_page_meta() + + def _process_page_meta(self) -> bool: + self._read_page_header() + pt = const.page_meta_types + [const.page_amd_type, const.page_mix_type] + if self._current_page_type in pt: + self._process_page_metadata() + is_data_page = self._current_page_type == const.page_data_type + is_mix_page = self._current_page_type == const.page_mix_type + return bool( + is_data_page + or is_mix_page + or self._current_page_data_subheader_pointers != [] + ) + + def _read_page_header(self) -> None: + bit_offset = self._page_bit_offset + tx = const.page_type_offset + bit_offset + self._current_page_type = ( + self._read_uint(tx, const.page_type_length) & const.page_type_mask2 + ) + tx = const.block_count_offset + bit_offset + self._current_page_block_count = self._read_uint(tx, const.block_count_length) + tx = const.subheader_count_offset + bit_offset + self._current_page_subheaders_count = self._read_uint( + tx, const.subheader_count_length + ) + + def _process_page_metadata(self) -> None: + bit_offset = self._page_bit_offset + + for i in range(self._current_page_subheaders_count): + offset = const.subheader_pointers_offset + bit_offset + total_offset = offset + self._subheader_pointer_length * i + + subheader_offset = self._read_uint(total_offset, self._int_length) + total_offset += self._int_length + + subheader_length = self._read_uint(total_offset, self._int_length) + total_offset += self._int_length + + subheader_compression = self._read_uint(total_offset, 1) + total_offset += 1 + + subheader_type = self._read_uint(total_offset, 1) + + if ( + subheader_length == 0 + or subheader_compression == const.truncated_subheader_id + ): + continue + + subheader_signature = self._read_bytes(subheader_offset, self._int_length) + subheader_index = get_subheader_index(subheader_signature) + subheader_processor = self._subheader_processors[subheader_index] + + if subheader_processor is None: + f1 = subheader_compression in (const.compressed_subheader_id, 0) + f2 = subheader_type == const.compressed_subheader_type + if self.compression and f1 and f2: + self._current_page_data_subheader_pointers.append( + (subheader_offset, subheader_length) + ) + else: + self.close() + raise ValueError( + f"Unknown subheader signature {subheader_signature}" + ) + else: + subheader_processor(subheader_offset, subheader_length) + + def _process_rowsize_subheader(self, offset: int, length: int) -> None: + int_len = self._int_length + lcs_offset = offset + lcp_offset = offset + if self.U64: + lcs_offset += 682 + lcp_offset += 706 + else: + lcs_offset += 354 + lcp_offset += 378 + + self.row_length = self._read_uint( + offset + const.row_length_offset_multiplier * int_len, + int_len, + ) + self.row_count = self._read_uint( + offset + const.row_count_offset_multiplier * int_len, + int_len, + ) + self.col_count_p1 = self._read_uint( + offset + const.col_count_p1_multiplier * int_len, int_len + ) + self.col_count_p2 = self._read_uint( + offset + const.col_count_p2_multiplier * int_len, int_len + ) + mx = const.row_count_on_mix_page_offset_multiplier * int_len + self._mix_page_row_count = self._read_uint(offset + mx, int_len) + self._lcs = self._read_uint(lcs_offset, 2) + self._lcp = self._read_uint(lcp_offset, 2) + + def _process_columnsize_subheader(self, offset: int, length: int) -> None: + int_len = self._int_length + offset += int_len + self.column_count = self._read_uint(offset, int_len) + if self.col_count_p1 + self.col_count_p2 != self.column_count: + print( + f"Warning: column count mismatch ({self.col_count_p1} + " + f"{self.col_count_p2} != {self.column_count})\n" + ) + + # Unknown purpose + def _process_subheader_counts(self, offset: int, length: int) -> None: + pass + + def _process_columntext_subheader(self, offset: int, length: int) -> None: + offset += self._int_length + text_block_size = self._read_uint(offset, const.text_block_size_length) + + buf = self._read_bytes(offset, text_block_size) + cname_raw = buf[0:text_block_size].rstrip(b"\x00 ") + self.column_names_raw.append(cname_raw) + + if len(self.column_names_raw) == 1: + compression_literal = b"" + for cl in const.compression_literals: + if cl in cname_raw: + compression_literal = cl + self.compression = compression_literal + offset -= self._int_length + + offset1 = offset + 16 + if self.U64: + offset1 += 4 + + buf = self._read_bytes(offset1, self._lcp) + compression_literal = buf.rstrip(b"\x00") + if compression_literal == b"": + self._lcs = 0 + offset1 = offset + 32 + if self.U64: + offset1 += 4 + buf = self._read_bytes(offset1, self._lcp) + self.creator_proc = buf[0 : self._lcp] + elif compression_literal == const.rle_compression: + offset1 = offset + 40 + if self.U64: + offset1 += 4 + buf = self._read_bytes(offset1, self._lcp) + self.creator_proc = buf[0 : self._lcp] + elif self._lcs > 0: + self._lcp = 0 + offset1 = offset + 16 + if self.U64: + offset1 += 4 + buf = self._read_bytes(offset1, self._lcs) + self.creator_proc = buf[0 : self._lcp] + if hasattr(self, "creator_proc"): + self.creator_proc = self._convert_header_text(self.creator_proc) + + def _process_columnname_subheader(self, offset: int, length: int) -> None: + int_len = self._int_length + offset += int_len + column_name_pointers_count = (length - 2 * int_len - 12) // 8 + for i in range(column_name_pointers_count): + text_subheader = ( + offset + + const.column_name_pointer_length * (i + 1) + + const.column_name_text_subheader_offset + ) + col_name_offset = ( + offset + + const.column_name_pointer_length * (i + 1) + + const.column_name_offset_offset + ) + col_name_length = ( + offset + + const.column_name_pointer_length * (i + 1) + + const.column_name_length_offset + ) + + idx = self._read_uint( + text_subheader, const.column_name_text_subheader_length + ) + col_offset = self._read_uint( + col_name_offset, const.column_name_offset_length + ) + col_len = self._read_uint(col_name_length, const.column_name_length_length) + + name_raw = self.column_names_raw[idx] + cname = name_raw[col_offset : col_offset + col_len] + self.column_names.append(self._convert_header_text(cname)) + + def _process_columnattributes_subheader(self, offset: int, length: int) -> None: + int_len = self._int_length + column_attributes_vectors_count = (length - 2 * int_len - 12) // (int_len + 8) + for i in range(column_attributes_vectors_count): + col_data_offset = ( + offset + int_len + const.column_data_offset_offset + i * (int_len + 8) + ) + col_data_len = ( + offset + + 2 * int_len + + const.column_data_length_offset + + i * (int_len + 8) + ) + col_types = ( + offset + 2 * int_len + const.column_type_offset + i * (int_len + 8) + ) + + x = self._read_uint(col_data_offset, int_len) + self._column_data_offsets.append(x) + + x = self._read_uint(col_data_len, const.column_data_length_length) + self._column_data_lengths.append(x) + + x = self._read_uint(col_types, const.column_type_length) + self._column_types.append(b"d" if x == 1 else b"s") + + def _process_columnlist_subheader(self, offset: int, length: int) -> None: + # unknown purpose + pass + + def _process_format_subheader(self, offset: int, length: int) -> None: + int_len = self._int_length + text_subheader_format = ( + offset + const.column_format_text_subheader_index_offset + 3 * int_len + ) + col_format_offset = offset + const.column_format_offset_offset + 3 * int_len + col_format_len = offset + const.column_format_length_offset + 3 * int_len + text_subheader_label = ( + offset + const.column_label_text_subheader_index_offset + 3 * int_len + ) + col_label_offset = offset + const.column_label_offset_offset + 3 * int_len + col_label_len = offset + const.column_label_length_offset + 3 * int_len + + x = self._read_uint( + text_subheader_format, const.column_format_text_subheader_index_length + ) + format_idx = min(x, len(self.column_names_raw) - 1) + + format_start = self._read_uint( + col_format_offset, const.column_format_offset_length + ) + format_len = self._read_uint(col_format_len, const.column_format_length_length) + + label_idx = self._read_uint( + text_subheader_label, const.column_label_text_subheader_index_length + ) + label_idx = min(label_idx, len(self.column_names_raw) - 1) + + label_start = self._read_uint( + col_label_offset, const.column_label_offset_length + ) + label_len = self._read_uint(col_label_len, const.column_label_length_length) + + label_names = self.column_names_raw[label_idx] + column_label = self._convert_header_text( + label_names[label_start : label_start + label_len] + ) + format_names = self.column_names_raw[format_idx] + column_format = self._convert_header_text( + format_names[format_start : format_start + format_len] + ) + current_column_number = len(self.columns) + + col = _Column( + current_column_number, + self.column_names[current_column_number], + column_label, + column_format, + self._column_types[current_column_number], + self._column_data_lengths[current_column_number], + ) + + self.column_formats.append(column_format) + self.columns.append(col) + + def read(self, nrows: int | None = None) -> DataFrame: + if (nrows is None) and (self.chunksize is not None): + nrows = self.chunksize + elif nrows is None: + nrows = self.row_count + + if len(self._column_types) == 0: + self.close() + raise EmptyDataError("No columns to parse from file") + + if nrows > 0 and self._current_row_in_file_index >= self.row_count: + return DataFrame() + + nrows = min(nrows, self.row_count - self._current_row_in_file_index) + + nd = self._column_types.count(b"d") + ns = self._column_types.count(b"s") + + self._string_chunk = np.empty((ns, nrows), dtype=object) + self._byte_chunk = np.zeros((nd, 8 * nrows), dtype=np.uint8) + + self._current_row_in_chunk_index = 0 + p = Parser(self) + p.read(nrows) + + rslt = self._chunk_to_dataframe() + if self.index is not None: + rslt = rslt.set_index(self.index) + + return rslt + + def _read_next_page(self): + self._current_page_data_subheader_pointers = [] + self._cached_page = self._path_or_buf.read(self._page_length) + if len(self._cached_page) <= 0: + return True + elif len(self._cached_page) != self._page_length: + self.close() + msg = ( + "failed to read complete page from file (read " + f"{len(self._cached_page):d} of {self._page_length:d} bytes)" + ) + raise ValueError(msg) + + self._read_page_header() + if self._current_page_type in const.page_meta_types: + self._process_page_metadata() + + if self._current_page_type not in const.page_meta_types + [ + const.page_data_type, + const.page_mix_type, + ]: + return self._read_next_page() + + return False + + def _chunk_to_dataframe(self) -> DataFrame: + n = self._current_row_in_chunk_index + m = self._current_row_in_file_index + ix = range(m - n, m) + rslt = {} + + js, jb = 0, 0 + for j in range(self.column_count): + name = self.column_names[j] + + if self._column_types[j] == b"d": + col_arr = self._byte_chunk[jb, :].view(dtype=self.byte_order + "d") + rslt[name] = pd.Series(col_arr, dtype=np.float64, index=ix, copy=False) + if self.convert_dates: + if self.column_formats[j] in const.sas_date_formats: + rslt[name] = _convert_datetimes(rslt[name], "d") + elif self.column_formats[j] in const.sas_datetime_formats: + rslt[name] = _convert_datetimes(rslt[name], "s") + jb += 1 + elif self._column_types[j] == b"s": + rslt[name] = pd.Series(self._string_chunk[js, :], index=ix, copy=False) + if self.convert_text and (self.encoding is not None): + rslt[name] = self._decode_string(rslt[name].str) + js += 1 + else: + self.close() + raise ValueError(f"unknown column type {repr(self._column_types[j])}") + + df = DataFrame(rslt, columns=self.column_names, index=ix, copy=False) + return df + + def _decode_string(self, b): + return b.decode(self.encoding or self.default_encoding) + + def _convert_header_text(self, b: bytes) -> str | bytes: + if self.convert_header_text: + return self._decode_string(b) + else: + return b diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/io/sas/sas_constants.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/io/sas/sas_constants.py new file mode 100644 index 0000000000000000000000000000000000000000..62c17bd03927e5f852af708e6b9ef6cf7e74d57c --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/io/sas/sas_constants.py @@ -0,0 +1,310 @@ +from __future__ import annotations + +from typing import Final + +magic: Final = ( + b"\x00\x00\x00\x00\x00\x00\x00\x00" + b"\x00\x00\x00\x00\xc2\xea\x81\x60" + b"\xb3\x14\x11\xcf\xbd\x92\x08\x00" + b"\x09\xc7\x31\x8c\x18\x1f\x10\x11" +) + +align_1_checker_value: Final = b"3" +align_1_offset: Final = 32 +align_1_length: Final = 1 +align_1_value: Final = 4 +u64_byte_checker_value: Final = b"3" +align_2_offset: Final = 35 +align_2_length: Final = 1 +align_2_value: Final = 4 +endianness_offset: Final = 37 +endianness_length: Final = 1 +platform_offset: Final = 39 +platform_length: Final = 1 +encoding_offset: Final = 70 +encoding_length: Final = 1 +dataset_offset: Final = 92 +dataset_length: Final = 64 +file_type_offset: Final = 156 +file_type_length: Final = 8 +date_created_offset: Final = 164 +date_created_length: Final = 8 +date_modified_offset: Final = 172 +date_modified_length: Final = 8 +header_size_offset: Final = 196 +header_size_length: Final = 4 +page_size_offset: Final = 200 +page_size_length: Final = 4 +page_count_offset: Final = 204 +page_count_length: Final = 4 +sas_release_offset: Final = 216 +sas_release_length: Final = 8 +sas_server_type_offset: Final = 224 +sas_server_type_length: Final = 16 +os_version_number_offset: Final = 240 +os_version_number_length: Final = 16 +os_maker_offset: Final = 256 +os_maker_length: Final = 16 +os_name_offset: Final = 272 +os_name_length: Final = 16 +page_bit_offset_x86: Final = 16 +page_bit_offset_x64: Final = 32 +subheader_pointer_length_x86: Final = 12 +subheader_pointer_length_x64: Final = 24 +page_type_offset: Final = 0 +page_type_length: Final = 2 +block_count_offset: Final = 2 +block_count_length: Final = 2 +subheader_count_offset: Final = 4 +subheader_count_length: Final = 2 +page_type_mask: Final = 0x0F00 +# Keep "page_comp_type" bits +page_type_mask2: Final = 0xF000 | page_type_mask +page_meta_type: Final = 0x0000 +page_data_type: Final = 0x0100 +page_mix_type: Final = 0x0200 +page_amd_type: Final = 0x0400 +page_meta2_type: Final = 0x4000 +page_comp_type: Final = 0x9000 +page_meta_types: Final = [page_meta_type, page_meta2_type] +subheader_pointers_offset: Final = 8 +truncated_subheader_id: Final = 1 +compressed_subheader_id: Final = 4 +compressed_subheader_type: Final = 1 +text_block_size_length: Final = 2 +row_length_offset_multiplier: Final = 5 +row_count_offset_multiplier: Final = 6 +col_count_p1_multiplier: Final = 9 +col_count_p2_multiplier: Final = 10 +row_count_on_mix_page_offset_multiplier: Final = 15 +column_name_pointer_length: Final = 8 +column_name_text_subheader_offset: Final = 0 +column_name_text_subheader_length: Final = 2 +column_name_offset_offset: Final = 2 +column_name_offset_length: Final = 2 +column_name_length_offset: Final = 4 +column_name_length_length: Final = 2 +column_data_offset_offset: Final = 8 +column_data_length_offset: Final = 8 +column_data_length_length: Final = 4 +column_type_offset: Final = 14 +column_type_length: Final = 1 +column_format_text_subheader_index_offset: Final = 22 +column_format_text_subheader_index_length: Final = 2 +column_format_offset_offset: Final = 24 +column_format_offset_length: Final = 2 +column_format_length_offset: Final = 26 +column_format_length_length: Final = 2 +column_label_text_subheader_index_offset: Final = 28 +column_label_text_subheader_index_length: Final = 2 +column_label_offset_offset: Final = 30 +column_label_offset_length: Final = 2 +column_label_length_offset: Final = 32 +column_label_length_length: Final = 2 +rle_compression: Final = b"SASYZCRL" +rdc_compression: Final = b"SASYZCR2" + +compression_literals: Final = [rle_compression, rdc_compression] + +# Incomplete list of encodings, using SAS nomenclature: +# https://support.sas.com/documentation/onlinedoc/dfdmstudio/2.6/dmpdmsug/Content/dfU_Encodings_SAS.html +# corresponding to the Python documentation of standard encodings +# https://docs.python.org/3/library/codecs.html#standard-encodings +encoding_names: Final = { + 20: "utf-8", + 29: "latin1", + 30: "latin2", + 31: "latin3", + 32: "latin4", + 33: "cyrillic", + 34: "arabic", + 35: "greek", + 36: "hebrew", + 37: "latin5", + 38: "latin6", + 39: "cp874", + 40: "latin9", + 41: "cp437", + 42: "cp850", + 43: "cp852", + 44: "cp857", + 45: "cp858", + 46: "cp862", + 47: "cp864", + 48: "cp865", + 49: "cp866", + 50: "cp869", + 51: "cp874", + # 52: "", # not found + # 53: "", # not found + # 54: "", # not found + 55: "cp720", + 56: "cp737", + 57: "cp775", + 58: "cp860", + 59: "cp863", + 60: "cp1250", + 61: "cp1251", + 62: "cp1252", + 63: "cp1253", + 64: "cp1254", + 65: "cp1255", + 66: "cp1256", + 67: "cp1257", + 68: "cp1258", + 118: "cp950", + # 119: "", # not found + 123: "big5", + 125: "gb2312", + 126: "cp936", + 134: "euc_jp", + 136: "cp932", + 138: "shift_jis", + 140: "euc-kr", + 141: "cp949", + 227: "latin8", + # 228: "", # not found + # 229: "" # not found +} + + +class SASIndex: + row_size_index: Final = 0 + column_size_index: Final = 1 + subheader_counts_index: Final = 2 + column_text_index: Final = 3 + column_name_index: Final = 4 + column_attributes_index: Final = 5 + format_and_label_index: Final = 6 + column_list_index: Final = 7 + data_subheader_index: Final = 8 + + +subheader_signature_to_index: Final = { + b"\xF7\xF7\xF7\xF7": SASIndex.row_size_index, + b"\x00\x00\x00\x00\xF7\xF7\xF7\xF7": SASIndex.row_size_index, + b"\xF7\xF7\xF7\xF7\x00\x00\x00\x00": SASIndex.row_size_index, + b"\xF7\xF7\xF7\xF7\xFF\xFF\xFB\xFE": SASIndex.row_size_index, + b"\xF6\xF6\xF6\xF6": SASIndex.column_size_index, + b"\x00\x00\x00\x00\xF6\xF6\xF6\xF6": SASIndex.column_size_index, + b"\xF6\xF6\xF6\xF6\x00\x00\x00\x00": SASIndex.column_size_index, + b"\xF6\xF6\xF6\xF6\xFF\xFF\xFB\xFE": SASIndex.column_size_index, + b"\x00\xFC\xFF\xFF": SASIndex.subheader_counts_index, + b"\xFF\xFF\xFC\x00": SASIndex.subheader_counts_index, + b"\x00\xFC\xFF\xFF\xFF\xFF\xFF\xFF": SASIndex.subheader_counts_index, + b"\xFF\xFF\xFF\xFF\xFF\xFF\xFC\x00": SASIndex.subheader_counts_index, + b"\xFD\xFF\xFF\xFF": SASIndex.column_text_index, + b"\xFF\xFF\xFF\xFD": SASIndex.column_text_index, + b"\xFD\xFF\xFF\xFF\xFF\xFF\xFF\xFF": SASIndex.column_text_index, + b"\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFD": SASIndex.column_text_index, + b"\xFF\xFF\xFF\xFF": SASIndex.column_name_index, + b"\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF": SASIndex.column_name_index, + b"\xFC\xFF\xFF\xFF": SASIndex.column_attributes_index, + b"\xFF\xFF\xFF\xFC": SASIndex.column_attributes_index, + b"\xFC\xFF\xFF\xFF\xFF\xFF\xFF\xFF": SASIndex.column_attributes_index, + b"\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFC": SASIndex.column_attributes_index, + b"\xFE\xFB\xFF\xFF": SASIndex.format_and_label_index, + b"\xFF\xFF\xFB\xFE": SASIndex.format_and_label_index, + b"\xFE\xFB\xFF\xFF\xFF\xFF\xFF\xFF": SASIndex.format_and_label_index, + b"\xFF\xFF\xFF\xFF\xFF\xFF\xFB\xFE": SASIndex.format_and_label_index, + b"\xFE\xFF\xFF\xFF": SASIndex.column_list_index, + b"\xFF\xFF\xFF\xFE": SASIndex.column_list_index, + b"\xFE\xFF\xFF\xFF\xFF\xFF\xFF\xFF": SASIndex.column_list_index, + b"\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFE": SASIndex.column_list_index, +} + + +# List of frequently used SAS date and datetime formats +# http://support.sas.com/documentation/cdl/en/etsug/60372/HTML/default/viewer.htm#etsug_intervals_sect009.htm +# https://github.com/epam/parso/blob/master/src/main/java/com/epam/parso/impl/SasFileConstants.java +sas_date_formats: Final = ( + "DATE", + "DAY", + "DDMMYY", + "DOWNAME", + "JULDAY", + "JULIAN", + "MMDDYY", + "MMYY", + "MMYYC", + "MMYYD", + "MMYYP", + "MMYYS", + "MMYYN", + "MONNAME", + "MONTH", + "MONYY", + "QTR", + "QTRR", + "NENGO", + "WEEKDATE", + "WEEKDATX", + "WEEKDAY", + "WEEKV", + "WORDDATE", + "WORDDATX", + "YEAR", + "YYMM", + "YYMMC", + "YYMMD", + "YYMMP", + "YYMMS", + "YYMMN", + "YYMON", + "YYMMDD", + "YYQ", + "YYQC", + "YYQD", + "YYQP", + "YYQS", + "YYQN", + "YYQR", + "YYQRC", + "YYQRD", + "YYQRP", + "YYQRS", + "YYQRN", + "YYMMDDP", + "YYMMDDC", + "E8601DA", + "YYMMDDN", + "MMDDYYC", + "MMDDYYS", + "MMDDYYD", + "YYMMDDS", + "B8601DA", + "DDMMYYN", + "YYMMDDD", + "DDMMYYB", + "DDMMYYP", + "MMDDYYP", + "YYMMDDB", + "MMDDYYN", + "DDMMYYC", + "DDMMYYD", + "DDMMYYS", + "MINGUO", +) + +sas_datetime_formats: Final = ( + "DATETIME", + "DTWKDATX", + "B8601DN", + "B8601DT", + "B8601DX", + "B8601DZ", + "B8601LX", + "E8601DN", + "E8601DT", + "E8601DX", + "E8601DZ", + "E8601LX", + "DATEAMPM", + "DTDATE", + "DTMONYY", + "DTMONYY", + "DTWKDATX", + "DTYEAR", + "TOD", + "MDYAMPM", +) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/io/sas/sas_xport.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/io/sas/sas_xport.py new file mode 100644 index 0000000000000000000000000000000000000000..11b2ed0ee73168ba82e3b8d312f96bcea9398e49 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/io/sas/sas_xport.py @@ -0,0 +1,508 @@ +""" +Read a SAS XPort format file into a Pandas DataFrame. + +Based on code from Jack Cushman (github.com/jcushman/xport). + +The file format is defined here: + +https://support.sas.com/content/dam/SAS/support/en/technical-papers/record-layout-of-a-sas-version-5-or-6-data-set-in-sas-transport-xport-format.pdf +""" +from __future__ import annotations + +from collections import abc +from datetime import datetime +import struct +from typing import TYPE_CHECKING +import warnings + +import numpy as np + +from pandas.util._decorators import Appender +from pandas.util._exceptions import find_stack_level + +import pandas as pd + +from pandas.io.common import get_handle +from pandas.io.sas.sasreader import ReaderBase + +if TYPE_CHECKING: + from pandas._typing import ( + CompressionOptions, + DatetimeNaTType, + FilePath, + ReadBuffer, + ) +_correct_line1 = ( + "HEADER RECORD*******LIBRARY HEADER RECORD!!!!!!!" + "000000000000000000000000000000 " +) +_correct_header1 = ( + "HEADER RECORD*******MEMBER HEADER RECORD!!!!!!!000000000000000001600000000" +) +_correct_header2 = ( + "HEADER RECORD*******DSCRPTR HEADER RECORD!!!!!!!" + "000000000000000000000000000000 " +) +_correct_obs_header = ( + "HEADER RECORD*******OBS HEADER RECORD!!!!!!!" + "000000000000000000000000000000 " +) +_fieldkeys = [ + "ntype", + "nhfun", + "field_length", + "nvar0", + "name", + "label", + "nform", + "nfl", + "num_decimals", + "nfj", + "nfill", + "niform", + "nifl", + "nifd", + "npos", + "_", +] + + +_base_params_doc = """\ +Parameters +---------- +filepath_or_buffer : str or file-like object + Path to SAS file or object implementing binary read method.""" + +_params2_doc = """\ +index : identifier of index column + Identifier of column that should be used as index of the DataFrame. +encoding : str + Encoding for text data. +chunksize : int + Read file `chunksize` lines at a time, returns iterator.""" + +_format_params_doc = """\ +format : str + File format, only `xport` is currently supported.""" + +_iterator_doc = """\ +iterator : bool, default False + Return XportReader object for reading file incrementally.""" + + +_read_sas_doc = f"""Read a SAS file into a DataFrame. + +{_base_params_doc} +{_format_params_doc} +{_params2_doc} +{_iterator_doc} + +Returns +------- +DataFrame or XportReader + +Examples +-------- +Read a SAS Xport file: + +>>> df = pd.read_sas('filename.XPT') + +Read a Xport file in 10,000 line chunks: + +>>> itr = pd.read_sas('filename.XPT', chunksize=10000) +>>> for chunk in itr: +>>> do_something(chunk) + +""" + +_xport_reader_doc = f"""\ +Class for reading SAS Xport files. + +{_base_params_doc} +{_params2_doc} + +Attributes +---------- +member_info : list + Contains information about the file +fields : list + Contains information about the variables in the file +""" + +_read_method_doc = """\ +Read observations from SAS Xport file, returning as data frame. + +Parameters +---------- +nrows : int + Number of rows to read from data file; if None, read whole + file. + +Returns +------- +A DataFrame. +""" + + +def _parse_date(datestr: str) -> DatetimeNaTType: + """Given a date in xport format, return Python date.""" + try: + # e.g. "16FEB11:10:07:55" + return datetime.strptime(datestr, "%d%b%y:%H:%M:%S") + except ValueError: + return pd.NaT + + +def _split_line(s: str, parts): + """ + Parameters + ---------- + s: str + Fixed-length string to split + parts: list of (name, length) pairs + Used to break up string, name '_' will be filtered from output. + + Returns + ------- + Dict of name:contents of string at given location. + """ + out = {} + start = 0 + for name, length in parts: + out[name] = s[start : start + length].strip() + start += length + del out["_"] + return out + + +def _handle_truncated_float_vec(vec, nbytes): + # This feature is not well documented, but some SAS XPORT files + # have 2-7 byte "truncated" floats. To read these truncated + # floats, pad them with zeros on the right to make 8 byte floats. + # + # References: + # https://github.com/jcushman/xport/pull/3 + # The R "foreign" library + + if nbytes != 8: + vec1 = np.zeros(len(vec), np.dtype("S8")) + dtype = np.dtype(f"S{nbytes},S{8 - nbytes}") + vec2 = vec1.view(dtype=dtype) + vec2["f0"] = vec + return vec2 + + return vec + + +def _parse_float_vec(vec): + """ + Parse a vector of float values representing IBM 8 byte floats into + native 8 byte floats. + """ + dtype = np.dtype(">u4,>u4") + vec1 = vec.view(dtype=dtype) + xport1 = vec1["f0"] + xport2 = vec1["f1"] + + # Start by setting first half of ieee number to first half of IBM + # number sans exponent + ieee1 = xport1 & 0x00FFFFFF + + # The fraction bit to the left of the binary point in the ieee + # format was set and the number was shifted 0, 1, 2, or 3 + # places. This will tell us how to adjust the ibm exponent to be a + # power of 2 ieee exponent and how to shift the fraction bits to + # restore the correct magnitude. + shift = np.zeros(len(vec), dtype=np.uint8) + shift[np.where(xport1 & 0x00200000)] = 1 + shift[np.where(xport1 & 0x00400000)] = 2 + shift[np.where(xport1 & 0x00800000)] = 3 + + # shift the ieee number down the correct number of places then + # set the second half of the ieee number to be the second half + # of the ibm number shifted appropriately, ored with the bits + # from the first half that would have been shifted in if we + # could shift a double. All we are worried about are the low + # order 3 bits of the first half since we're only shifting by + # 1, 2, or 3. + ieee1 >>= shift + ieee2 = (xport2 >> shift) | ((xport1 & 0x00000007) << (29 + (3 - shift))) + + # clear the 1 bit to the left of the binary point + ieee1 &= 0xFFEFFFFF + + # set the exponent of the ieee number to be the actual exponent + # plus the shift count + 1023. Or this into the first half of the + # ieee number. The ibm exponent is excess 64 but is adjusted by 65 + # since during conversion to ibm format the exponent is + # incremented by 1 and the fraction bits left 4 positions to the + # right of the radix point. (had to add >> 24 because C treats & + # 0x7f as 0x7f000000 and Python doesn't) + ieee1 |= ((((((xport1 >> 24) & 0x7F) - 65) << 2) + shift + 1023) << 20) | ( + xport1 & 0x80000000 + ) + + ieee = np.empty((len(ieee1),), dtype=">u4,>u4") + ieee["f0"] = ieee1 + ieee["f1"] = ieee2 + ieee = ieee.view(dtype=">f8") + ieee = ieee.astype("f8") + + return ieee + + +class XportReader(ReaderBase, abc.Iterator): + __doc__ = _xport_reader_doc + + def __init__( + self, + filepath_or_buffer: FilePath | ReadBuffer[bytes], + index=None, + encoding: str | None = "ISO-8859-1", + chunksize: int | None = None, + compression: CompressionOptions = "infer", + ) -> None: + self._encoding = encoding + self._lines_read = 0 + self._index = index + self._chunksize = chunksize + + self.handles = get_handle( + filepath_or_buffer, + "rb", + encoding=encoding, + is_text=False, + compression=compression, + ) + self.filepath_or_buffer = self.handles.handle + + try: + self._read_header() + except Exception: + self.close() + raise + + def close(self) -> None: + self.handles.close() + + def _get_row(self): + return self.filepath_or_buffer.read(80).decode() + + def _read_header(self) -> None: + self.filepath_or_buffer.seek(0) + + # read file header + line1 = self._get_row() + if line1 != _correct_line1: + if "**COMPRESSED**" in line1: + # this was created with the PROC CPORT method and can't be read + # https://documentation.sas.com/doc/en/pgmsascdc/9.4_3.5/movefile/p1bm6aqp3fw4uin1hucwh718f6kp.htm + raise ValueError( + "Header record indicates a CPORT file, which is not readable." + ) + raise ValueError("Header record is not an XPORT file.") + + line2 = self._get_row() + fif = [["prefix", 24], ["version", 8], ["OS", 8], ["_", 24], ["created", 16]] + file_info = _split_line(line2, fif) + if file_info["prefix"] != "SAS SAS SASLIB": + raise ValueError("Header record has invalid prefix.") + file_info["created"] = _parse_date(file_info["created"]) + self.file_info = file_info + + line3 = self._get_row() + file_info["modified"] = _parse_date(line3[:16]) + + # read member header + header1 = self._get_row() + header2 = self._get_row() + headflag1 = header1.startswith(_correct_header1) + headflag2 = header2 == _correct_header2 + if not (headflag1 and headflag2): + raise ValueError("Member header not found") + # usually 140, could be 135 + fieldnamelength = int(header1[-5:-2]) + + # member info + mem = [ + ["prefix", 8], + ["set_name", 8], + ["sasdata", 8], + ["version", 8], + ["OS", 8], + ["_", 24], + ["created", 16], + ] + member_info = _split_line(self._get_row(), mem) + mem = [["modified", 16], ["_", 16], ["label", 40], ["type", 8]] + member_info.update(_split_line(self._get_row(), mem)) + member_info["modified"] = _parse_date(member_info["modified"]) + member_info["created"] = _parse_date(member_info["created"]) + self.member_info = member_info + + # read field names + types = {1: "numeric", 2: "char"} + fieldcount = int(self._get_row()[54:58]) + datalength = fieldnamelength * fieldcount + # round up to nearest 80 + if datalength % 80: + datalength += 80 - datalength % 80 + fielddata = self.filepath_or_buffer.read(datalength) + fields = [] + obs_length = 0 + while len(fielddata) >= fieldnamelength: + # pull data for one field + fieldbytes, fielddata = ( + fielddata[:fieldnamelength], + fielddata[fieldnamelength:], + ) + + # rest at end gets ignored, so if field is short, pad out + # to match struct pattern below + fieldbytes = fieldbytes.ljust(140) + + fieldstruct = struct.unpack(">hhhh8s40s8shhh2s8shhl52s", fieldbytes) + field = dict(zip(_fieldkeys, fieldstruct)) + del field["_"] + field["ntype"] = types[field["ntype"]] + fl = field["field_length"] + if field["ntype"] == "numeric" and ((fl < 2) or (fl > 8)): + msg = f"Floating field width {fl} is not between 2 and 8." + raise TypeError(msg) + + for k, v in field.items(): + try: + field[k] = v.strip() + except AttributeError: + pass + + obs_length += field["field_length"] + fields += [field] + + header = self._get_row() + if not header == _correct_obs_header: + raise ValueError("Observation header not found.") + + self.fields = fields + self.record_length = obs_length + self.record_start = self.filepath_or_buffer.tell() + + self.nobs = self._record_count() + self.columns = [x["name"].decode() for x in self.fields] + + # Setup the dtype. + dtypel = [ + ("s" + str(i), "S" + str(field["field_length"])) + for i, field in enumerate(self.fields) + ] + dtype = np.dtype(dtypel) + self._dtype = dtype + + def __next__(self) -> pd.DataFrame: + return self.read(nrows=self._chunksize or 1) + + def _record_count(self) -> int: + """ + Get number of records in file. + + This is maybe suboptimal because we have to seek to the end of + the file. + + Side effect: returns file position to record_start. + """ + self.filepath_or_buffer.seek(0, 2) + total_records_length = self.filepath_or_buffer.tell() - self.record_start + + if total_records_length % 80 != 0: + warnings.warn( + "xport file may be corrupted.", + stacklevel=find_stack_level(), + ) + + if self.record_length > 80: + self.filepath_or_buffer.seek(self.record_start) + return total_records_length // self.record_length + + self.filepath_or_buffer.seek(-80, 2) + last_card_bytes = self.filepath_or_buffer.read(80) + last_card = np.frombuffer(last_card_bytes, dtype=np.uint64) + + # 8 byte blank + ix = np.flatnonzero(last_card == 2314885530818453536) + + if len(ix) == 0: + tail_pad = 0 + else: + tail_pad = 8 * len(ix) + + self.filepath_or_buffer.seek(self.record_start) + + return (total_records_length - tail_pad) // self.record_length + + def get_chunk(self, size: int | None = None) -> pd.DataFrame: + """ + Reads lines from Xport file and returns as dataframe + + Parameters + ---------- + size : int, defaults to None + Number of lines to read. If None, reads whole file. + + Returns + ------- + DataFrame + """ + if size is None: + size = self._chunksize + return self.read(nrows=size) + + def _missing_double(self, vec): + v = vec.view(dtype="u1,u1,u2,u4") + miss = (v["f1"] == 0) & (v["f2"] == 0) & (v["f3"] == 0) + miss1 = ( + ((v["f0"] >= 0x41) & (v["f0"] <= 0x5A)) + | (v["f0"] == 0x5F) + | (v["f0"] == 0x2E) + ) + miss &= miss1 + return miss + + @Appender(_read_method_doc) + def read(self, nrows: int | None = None) -> pd.DataFrame: + if nrows is None: + nrows = self.nobs + + read_lines = min(nrows, self.nobs - self._lines_read) + read_len = read_lines * self.record_length + if read_len <= 0: + self.close() + raise StopIteration + raw = self.filepath_or_buffer.read(read_len) + data = np.frombuffer(raw, dtype=self._dtype, count=read_lines) + + df_data = {} + for j, x in enumerate(self.columns): + vec = data["s" + str(j)] + ntype = self.fields[j]["ntype"] + if ntype == "numeric": + vec = _handle_truncated_float_vec(vec, self.fields[j]["field_length"]) + miss = self._missing_double(vec) + v = _parse_float_vec(vec) + v[miss] = np.nan + elif self.fields[j]["ntype"] == "char": + v = [y.rstrip() for y in vec] + + if self._encoding is not None: + v = [y.decode(self._encoding) for y in v] + + df_data.update({x: v}) + df = pd.DataFrame(df_data) + + if self._index is None: + df.index = pd.Index(range(self._lines_read, self._lines_read + read_lines)) + else: + df = df.set_index(self._index) + + self._lines_read += read_lines + + return df diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/io/sas/sasreader.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/io/sas/sasreader.py new file mode 100644 index 0000000000000000000000000000000000000000..c39313d5dc6548fcc014f7a886988a2b9d9001ed --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/io/sas/sasreader.py @@ -0,0 +1,178 @@ +""" +Read SAS sas7bdat or xport files. +""" +from __future__ import annotations + +from abc import ( + ABC, + abstractmethod, +) +from typing import ( + TYPE_CHECKING, + overload, +) + +from pandas.util._decorators import doc + +from pandas.core.shared_docs import _shared_docs + +from pandas.io.common import stringify_path + +if TYPE_CHECKING: + from collections.abc import Hashable + from types import TracebackType + + from pandas._typing import ( + CompressionOptions, + FilePath, + ReadBuffer, + Self, + ) + + from pandas import DataFrame + + +class ReaderBase(ABC): + """ + Protocol for XportReader and SAS7BDATReader classes. + """ + + @abstractmethod + def read(self, nrows: int | None = None) -> DataFrame: + ... + + @abstractmethod + def close(self) -> None: + ... + + def __enter__(self) -> Self: + return self + + def __exit__( + self, + exc_type: type[BaseException] | None, + exc_value: BaseException | None, + traceback: TracebackType | None, + ) -> None: + self.close() + + +@overload +def read_sas( + filepath_or_buffer: FilePath | ReadBuffer[bytes], + *, + format: str | None = ..., + index: Hashable | None = ..., + encoding: str | None = ..., + chunksize: int = ..., + iterator: bool = ..., + compression: CompressionOptions = ..., +) -> ReaderBase: + ... + + +@overload +def read_sas( + filepath_or_buffer: FilePath | ReadBuffer[bytes], + *, + format: str | None = ..., + index: Hashable | None = ..., + encoding: str | None = ..., + chunksize: None = ..., + iterator: bool = ..., + compression: CompressionOptions = ..., +) -> DataFrame | ReaderBase: + ... + + +@doc(decompression_options=_shared_docs["decompression_options"] % "filepath_or_buffer") +def read_sas( + filepath_or_buffer: FilePath | ReadBuffer[bytes], + *, + format: str | None = None, + index: Hashable | None = None, + encoding: str | None = None, + chunksize: int | None = None, + iterator: bool = False, + compression: CompressionOptions = "infer", +) -> DataFrame | ReaderBase: + """ + Read SAS files stored as either XPORT or SAS7BDAT format files. + + Parameters + ---------- + filepath_or_buffer : str, path object, or file-like object + String, path object (implementing ``os.PathLike[str]``), or file-like + object implementing a binary ``read()`` function. The string could be a URL. + Valid URL schemes include http, ftp, s3, and file. For file URLs, a host is + expected. A local file could be: + ``file://localhost/path/to/table.sas7bdat``. + format : str {{'xport', 'sas7bdat'}} or None + If None, file format is inferred from file extension. If 'xport' or + 'sas7bdat', uses the corresponding format. + index : identifier of index column, defaults to None + Identifier of column that should be used as index of the DataFrame. + encoding : str, default is None + Encoding for text data. If None, text data are stored as raw bytes. + chunksize : int + Read file `chunksize` lines at a time, returns iterator. + iterator : bool, defaults to False + If True, returns an iterator for reading the file incrementally. + {decompression_options} + + Returns + ------- + DataFrame if iterator=False and chunksize=None, else SAS7BDATReader + or XportReader + + Examples + -------- + >>> df = pd.read_sas("sas_data.sas7bdat") # doctest: +SKIP + """ + if format is None: + buffer_error_msg = ( + "If this is a buffer object rather " + "than a string name, you must specify a format string" + ) + filepath_or_buffer = stringify_path(filepath_or_buffer) + if not isinstance(filepath_or_buffer, str): + raise ValueError(buffer_error_msg) + fname = filepath_or_buffer.lower() + if ".xpt" in fname: + format = "xport" + elif ".sas7bdat" in fname: + format = "sas7bdat" + else: + raise ValueError( + f"unable to infer format of SAS file from filename: {repr(fname)}" + ) + + reader: ReaderBase + if format.lower() == "xport": + from pandas.io.sas.sas_xport import XportReader + + reader = XportReader( + filepath_or_buffer, + index=index, + encoding=encoding, + chunksize=chunksize, + compression=compression, + ) + elif format.lower() == "sas7bdat": + from pandas.io.sas.sas7bdat import SAS7BDATReader + + reader = SAS7BDATReader( + filepath_or_buffer, + index=index, + encoding=encoding, + chunksize=chunksize, + compression=compression, + ) + else: + raise ValueError("unknown SAS format") + + if iterator or chunksize: + return reader + + with reader: + return reader.read() diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/plotting/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/plotting/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a2cd98944ff508f8a0a8faeac854816e86bb9f73 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/plotting/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/plotting/__pycache__/_core.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/plotting/__pycache__/_core.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d2f2cc4a92f15bdf335e41fb5514404cea609d23 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/plotting/__pycache__/_core.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/plotting/__pycache__/_misc.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/plotting/__pycache__/_misc.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..94dc1fc563b75a9485275ee8abbf9768660f8016 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/plotting/__pycache__/_misc.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/plotting/_matplotlib/__init__.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/plotting/_matplotlib/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..75c61da03795af0d4f60cd4d4a8b8e0dd45e3d5e --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/plotting/_matplotlib/__init__.py @@ -0,0 +1,93 @@ +from __future__ import annotations + +from typing import TYPE_CHECKING + +from pandas.plotting._matplotlib.boxplot import ( + BoxPlot, + boxplot, + boxplot_frame, + boxplot_frame_groupby, +) +from pandas.plotting._matplotlib.converter import ( + deregister, + register, +) +from pandas.plotting._matplotlib.core import ( + AreaPlot, + BarhPlot, + BarPlot, + HexBinPlot, + LinePlot, + PiePlot, + ScatterPlot, +) +from pandas.plotting._matplotlib.hist import ( + HistPlot, + KdePlot, + hist_frame, + hist_series, +) +from pandas.plotting._matplotlib.misc import ( + andrews_curves, + autocorrelation_plot, + bootstrap_plot, + lag_plot, + parallel_coordinates, + radviz, + scatter_matrix, +) +from pandas.plotting._matplotlib.tools import table + +if TYPE_CHECKING: + from pandas.plotting._matplotlib.core import MPLPlot + +PLOT_CLASSES: dict[str, type[MPLPlot]] = { + "line": LinePlot, + "bar": BarPlot, + "barh": BarhPlot, + "box": BoxPlot, + "hist": HistPlot, + "kde": KdePlot, + "area": AreaPlot, + "pie": PiePlot, + "scatter": ScatterPlot, + "hexbin": HexBinPlot, +} + + +def plot(data, kind, **kwargs): + # Importing pyplot at the top of the file (before the converters are + # registered) causes problems in matplotlib 2 (converters seem to not + # work) + import matplotlib.pyplot as plt + + if kwargs.pop("reuse_plot", False): + ax = kwargs.get("ax") + if ax is None and len(plt.get_fignums()) > 0: + with plt.rc_context(): + ax = plt.gca() + kwargs["ax"] = getattr(ax, "left_ax", ax) + plot_obj = PLOT_CLASSES[kind](data, **kwargs) + plot_obj.generate() + plot_obj.draw() + return plot_obj.result + + +__all__ = [ + "plot", + "hist_series", + "hist_frame", + "boxplot", + "boxplot_frame", + "boxplot_frame_groupby", + "table", + "andrews_curves", + "autocorrelation_plot", + "bootstrap_plot", + "lag_plot", + "parallel_coordinates", + "radviz", + "scatter_matrix", + "register", + "deregister", +] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/plotting/_matplotlib/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/plotting/_matplotlib/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..8419aff2560e8b496c96be4f3f74bcda79a033f1 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/plotting/_matplotlib/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/plotting/_matplotlib/__pycache__/boxplot.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/plotting/_matplotlib/__pycache__/boxplot.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..4ad7910fbcd7848546002220c2e12d14fd7aa15b Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/plotting/_matplotlib/__pycache__/boxplot.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/plotting/_matplotlib/__pycache__/converter.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/plotting/_matplotlib/__pycache__/converter.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..0c109e0660c3f6631e942158bc00486fcc0c2d33 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/plotting/_matplotlib/__pycache__/converter.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/plotting/_matplotlib/__pycache__/core.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/plotting/_matplotlib/__pycache__/core.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..201fc7dc7ba152256ae8e2d6bc365da7f83ebb77 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/plotting/_matplotlib/__pycache__/core.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/plotting/_matplotlib/__pycache__/groupby.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/plotting/_matplotlib/__pycache__/groupby.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..2acfb45cdf1d95707bbe2e6225ac60c7e8ee8c30 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/plotting/_matplotlib/__pycache__/groupby.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/plotting/_matplotlib/__pycache__/hist.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/plotting/_matplotlib/__pycache__/hist.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..9daf43bbc475cdbeb62c674195eb3b557d554433 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/plotting/_matplotlib/__pycache__/hist.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/plotting/_matplotlib/__pycache__/misc.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/plotting/_matplotlib/__pycache__/misc.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..7dd4cea727c2993906a3b0818141c0e1e11f75f9 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/plotting/_matplotlib/__pycache__/misc.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/plotting/_matplotlib/__pycache__/style.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/plotting/_matplotlib/__pycache__/style.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e2f269d2bbddbbadeae4b2734a8c945606b10e49 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/plotting/_matplotlib/__pycache__/style.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/plotting/_matplotlib/__pycache__/timeseries.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/plotting/_matplotlib/__pycache__/timeseries.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..7a9910b57cdaf6af50b68e0b094236e08ede5c12 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/plotting/_matplotlib/__pycache__/timeseries.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/plotting/_matplotlib/__pycache__/tools.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/plotting/_matplotlib/__pycache__/tools.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ae5b4e46c80a8bda5894610319be1f48bf15f6c6 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/plotting/_matplotlib/__pycache__/tools.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/plotting/_matplotlib/boxplot.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/plotting/_matplotlib/boxplot.py new file mode 100644 index 0000000000000000000000000000000000000000..d2b76decaa75d3e7adc9792763db4e276e514ff1 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/plotting/_matplotlib/boxplot.py @@ -0,0 +1,572 @@ +from __future__ import annotations + +from typing import ( + TYPE_CHECKING, + Literal, + NamedTuple, +) +import warnings + +from matplotlib.artist import setp +import numpy as np + +from pandas._libs import lib +from pandas.util._decorators import cache_readonly +from pandas.util._exceptions import find_stack_level + +from pandas.core.dtypes.common import is_dict_like +from pandas.core.dtypes.generic import ABCSeries +from pandas.core.dtypes.missing import remove_na_arraylike + +import pandas as pd +import pandas.core.common as com + +from pandas.io.formats.printing import pprint_thing +from pandas.plotting._matplotlib.core import ( + LinePlot, + MPLPlot, +) +from pandas.plotting._matplotlib.groupby import create_iter_data_given_by +from pandas.plotting._matplotlib.style import get_standard_colors +from pandas.plotting._matplotlib.tools import ( + create_subplots, + flatten_axes, + maybe_adjust_figure, +) + +if TYPE_CHECKING: + from collections.abc import Collection + + from matplotlib.axes import Axes + from matplotlib.figure import Figure + from matplotlib.lines import Line2D + + from pandas._typing import MatplotlibColor + + +def _set_ticklabels(ax: Axes, labels: list[str], is_vertical: bool, **kwargs) -> None: + """Set the tick labels of a given axis. + + Due to https://github.com/matplotlib/matplotlib/pull/17266, we need to handle the + case of repeated ticks (due to `FixedLocator`) and thus we duplicate the number of + labels. + """ + ticks = ax.get_xticks() if is_vertical else ax.get_yticks() + if len(ticks) != len(labels): + i, remainder = divmod(len(ticks), len(labels)) + assert remainder == 0, remainder + labels *= i + if is_vertical: + ax.set_xticklabels(labels, **kwargs) + else: + ax.set_yticklabels(labels, **kwargs) + + +class BoxPlot(LinePlot): + @property + def _kind(self) -> Literal["box"]: + return "box" + + _layout_type = "horizontal" + + _valid_return_types = (None, "axes", "dict", "both") + + class BP(NamedTuple): + # namedtuple to hold results + ax: Axes + lines: dict[str, list[Line2D]] + + def __init__(self, data, return_type: str = "axes", **kwargs) -> None: + if return_type not in self._valid_return_types: + raise ValueError("return_type must be {None, 'axes', 'dict', 'both'}") + + self.return_type = return_type + # Do not call LinePlot.__init__ which may fill nan + MPLPlot.__init__(self, data, **kwargs) # pylint: disable=non-parent-init-called + + if self.subplots: + # Disable label ax sharing. Otherwise, all subplots shows last + # column label + if self.orientation == "vertical": + self.sharex = False + else: + self.sharey = False + + # error: Signature of "_plot" incompatible with supertype "MPLPlot" + @classmethod + def _plot( # type: ignore[override] + cls, ax: Axes, y: np.ndarray, column_num=None, return_type: str = "axes", **kwds + ): + ys: np.ndarray | list[np.ndarray] + if y.ndim == 2: + ys = [remove_na_arraylike(v) for v in y] + # Boxplot fails with empty arrays, so need to add a NaN + # if any cols are empty + # GH 8181 + ys = [v if v.size > 0 else np.array([np.nan]) for v in ys] + else: + ys = remove_na_arraylike(y) + bp = ax.boxplot(ys, **kwds) + + if return_type == "dict": + return bp, bp + elif return_type == "both": + return cls.BP(ax=ax, lines=bp), bp + else: + return ax, bp + + def _validate_color_args(self, color, colormap): + if color is lib.no_default: + return None + + if colormap is not None: + warnings.warn( + "'color' and 'colormap' cannot be used " + "simultaneously. Using 'color'", + stacklevel=find_stack_level(), + ) + + if isinstance(color, dict): + valid_keys = ["boxes", "whiskers", "medians", "caps"] + for key in color: + if key not in valid_keys: + raise ValueError( + f"color dict contains invalid key '{key}'. " + f"The key must be either {valid_keys}" + ) + return color + + @cache_readonly + def _color_attrs(self): + # get standard colors for default + # use 2 colors by default, for box/whisker and median + # flier colors isn't needed here + # because it can be specified by ``sym`` kw + return get_standard_colors(num_colors=3, colormap=self.colormap, color=None) + + @cache_readonly + def _boxes_c(self): + return self._color_attrs[0] + + @cache_readonly + def _whiskers_c(self): + return self._color_attrs[0] + + @cache_readonly + def _medians_c(self): + return self._color_attrs[2] + + @cache_readonly + def _caps_c(self): + return self._color_attrs[0] + + def _get_colors( + self, + num_colors=None, + color_kwds: dict[str, MatplotlibColor] + | MatplotlibColor + | Collection[MatplotlibColor] + | None = "color", + ) -> None: + pass + + def maybe_color_bp(self, bp) -> None: + if isinstance(self.color, dict): + boxes = self.color.get("boxes", self._boxes_c) + whiskers = self.color.get("whiskers", self._whiskers_c) + medians = self.color.get("medians", self._medians_c) + caps = self.color.get("caps", self._caps_c) + else: + # Other types are forwarded to matplotlib + # If None, use default colors + boxes = self.color or self._boxes_c + whiskers = self.color or self._whiskers_c + medians = self.color or self._medians_c + caps = self.color or self._caps_c + + color_tup = (boxes, whiskers, medians, caps) + maybe_color_bp(bp, color_tup=color_tup, **self.kwds) + + def _make_plot(self, fig: Figure) -> None: + if self.subplots: + self._return_obj = pd.Series(dtype=object) + + # Re-create iterated data if `by` is assigned by users + data = ( + create_iter_data_given_by(self.data, self._kind) + if self.by is not None + else self.data + ) + + # error: Argument "data" to "_iter_data" of "MPLPlot" has + # incompatible type "object"; expected "DataFrame | + # dict[Hashable, Series | DataFrame]" + for i, (label, y) in enumerate(self._iter_data(data=data)): # type: ignore[arg-type] + ax = self._get_ax(i) + kwds = self.kwds.copy() + + # When by is applied, show title for subplots to know which group it is + # just like df.boxplot, and need to apply T on y to provide right input + if self.by is not None: + y = y.T + ax.set_title(pprint_thing(label)) + + # When `by` is assigned, the ticklabels will become unique grouped + # values, instead of label which is used as subtitle in this case. + # error: "Index" has no attribute "levels"; maybe "nlevels"? + levels = self.data.columns.levels # type: ignore[attr-defined] + ticklabels = [pprint_thing(col) for col in levels[0]] + else: + ticklabels = [pprint_thing(label)] + + ret, bp = self._plot( + ax, y, column_num=i, return_type=self.return_type, **kwds + ) + self.maybe_color_bp(bp) + self._return_obj[label] = ret + _set_ticklabels( + ax=ax, labels=ticklabels, is_vertical=self.orientation == "vertical" + ) + else: + y = self.data.values.T + ax = self._get_ax(0) + kwds = self.kwds.copy() + + ret, bp = self._plot( + ax, y, column_num=0, return_type=self.return_type, **kwds + ) + self.maybe_color_bp(bp) + self._return_obj = ret + + labels = [pprint_thing(left) for left in self.data.columns] + if not self.use_index: + labels = [pprint_thing(key) for key in range(len(labels))] + _set_ticklabels( + ax=ax, labels=labels, is_vertical=self.orientation == "vertical" + ) + + def _make_legend(self) -> None: + pass + + def _post_plot_logic(self, ax: Axes, data) -> None: + # GH 45465: make sure that the boxplot doesn't ignore xlabel/ylabel + if self.xlabel: + ax.set_xlabel(pprint_thing(self.xlabel)) + if self.ylabel: + ax.set_ylabel(pprint_thing(self.ylabel)) + + @property + def orientation(self) -> Literal["horizontal", "vertical"]: + if self.kwds.get("vert", True): + return "vertical" + else: + return "horizontal" + + @property + def result(self): + if self.return_type is None: + return super().result + else: + return self._return_obj + + +def maybe_color_bp(bp, color_tup, **kwds) -> None: + # GH#30346, when users specifying those arguments explicitly, our defaults + # for these four kwargs should be overridden; if not, use Pandas settings + if not kwds.get("boxprops"): + setp(bp["boxes"], color=color_tup[0], alpha=1) + if not kwds.get("whiskerprops"): + setp(bp["whiskers"], color=color_tup[1], alpha=1) + if not kwds.get("medianprops"): + setp(bp["medians"], color=color_tup[2], alpha=1) + if not kwds.get("capprops"): + setp(bp["caps"], color=color_tup[3], alpha=1) + + +def _grouped_plot_by_column( + plotf, + data, + columns=None, + by=None, + numeric_only: bool = True, + grid: bool = False, + figsize: tuple[float, float] | None = None, + ax=None, + layout=None, + return_type=None, + **kwargs, +): + grouped = data.groupby(by, observed=False) + if columns is None: + if not isinstance(by, (list, tuple)): + by = [by] + columns = data._get_numeric_data().columns.difference(by) + naxes = len(columns) + fig, axes = create_subplots( + naxes=naxes, + sharex=kwargs.pop("sharex", True), + sharey=kwargs.pop("sharey", True), + figsize=figsize, + ax=ax, + layout=layout, + ) + + _axes = flatten_axes(axes) + + # GH 45465: move the "by" label based on "vert" + xlabel, ylabel = kwargs.pop("xlabel", None), kwargs.pop("ylabel", None) + if kwargs.get("vert", True): + xlabel = xlabel or by + else: + ylabel = ylabel or by + + ax_values = [] + + for i, col in enumerate(columns): + ax = _axes[i] + gp_col = grouped[col] + keys, values = zip(*gp_col) + re_plotf = plotf(keys, values, ax, xlabel=xlabel, ylabel=ylabel, **kwargs) + ax.set_title(col) + ax_values.append(re_plotf) + ax.grid(grid) + + result = pd.Series(ax_values, index=columns, copy=False) + + # Return axes in multiplot case, maybe revisit later # 985 + if return_type is None: + result = axes + + byline = by[0] if len(by) == 1 else by + fig.suptitle(f"Boxplot grouped by {byline}") + maybe_adjust_figure(fig, bottom=0.15, top=0.9, left=0.1, right=0.9, wspace=0.2) + + return result + + +def boxplot( + data, + column=None, + by=None, + ax=None, + fontsize: int | None = None, + rot: int = 0, + grid: bool = True, + figsize: tuple[float, float] | None = None, + layout=None, + return_type=None, + **kwds, +): + import matplotlib.pyplot as plt + + # validate return_type: + if return_type not in BoxPlot._valid_return_types: + raise ValueError("return_type must be {'axes', 'dict', 'both'}") + + if isinstance(data, ABCSeries): + data = data.to_frame("x") + column = "x" + + def _get_colors(): + # num_colors=3 is required as method maybe_color_bp takes the colors + # in positions 0 and 2. + # if colors not provided, use same defaults as DataFrame.plot.box + result = get_standard_colors(num_colors=3) + result = np.take(result, [0, 0, 2]) + result = np.append(result, "k") + + colors = kwds.pop("color", None) + if colors: + if is_dict_like(colors): + # replace colors in result array with user-specified colors + # taken from the colors dict parameter + # "boxes" value placed in position 0, "whiskers" in 1, etc. + valid_keys = ["boxes", "whiskers", "medians", "caps"] + key_to_index = dict(zip(valid_keys, range(4))) + for key, value in colors.items(): + if key in valid_keys: + result[key_to_index[key]] = value + else: + raise ValueError( + f"color dict contains invalid key '{key}'. " + f"The key must be either {valid_keys}" + ) + else: + result.fill(colors) + + return result + + def plot_group(keys, values, ax: Axes, **kwds): + # GH 45465: xlabel/ylabel need to be popped out before plotting happens + xlabel, ylabel = kwds.pop("xlabel", None), kwds.pop("ylabel", None) + if xlabel: + ax.set_xlabel(pprint_thing(xlabel)) + if ylabel: + ax.set_ylabel(pprint_thing(ylabel)) + + keys = [pprint_thing(x) for x in keys] + values = [np.asarray(remove_na_arraylike(v), dtype=object) for v in values] + bp = ax.boxplot(values, **kwds) + if fontsize is not None: + ax.tick_params(axis="both", labelsize=fontsize) + + # GH 45465: x/y are flipped when "vert" changes + _set_ticklabels( + ax=ax, labels=keys, is_vertical=kwds.get("vert", True), rotation=rot + ) + maybe_color_bp(bp, color_tup=colors, **kwds) + + # Return axes in multiplot case, maybe revisit later # 985 + if return_type == "dict": + return bp + elif return_type == "both": + return BoxPlot.BP(ax=ax, lines=bp) + else: + return ax + + colors = _get_colors() + if column is None: + columns = None + elif isinstance(column, (list, tuple)): + columns = column + else: + columns = [column] + + if by is not None: + # Prefer array return type for 2-D plots to match the subplot layout + # https://github.com/pandas-dev/pandas/pull/12216#issuecomment-241175580 + result = _grouped_plot_by_column( + plot_group, + data, + columns=columns, + by=by, + grid=grid, + figsize=figsize, + ax=ax, + layout=layout, + return_type=return_type, + **kwds, + ) + else: + if return_type is None: + return_type = "axes" + if layout is not None: + raise ValueError("The 'layout' keyword is not supported when 'by' is None") + + if ax is None: + rc = {"figure.figsize": figsize} if figsize is not None else {} + with plt.rc_context(rc): + ax = plt.gca() + data = data._get_numeric_data() + naxes = len(data.columns) + if naxes == 0: + raise ValueError( + "boxplot method requires numerical columns, nothing to plot." + ) + if columns is None: + columns = data.columns + else: + data = data[columns] + + result = plot_group(columns, data.values.T, ax, **kwds) + ax.grid(grid) + + return result + + +def boxplot_frame( + self, + column=None, + by=None, + ax=None, + fontsize: int | None = None, + rot: int = 0, + grid: bool = True, + figsize: tuple[float, float] | None = None, + layout=None, + return_type=None, + **kwds, +): + import matplotlib.pyplot as plt + + ax = boxplot( + self, + column=column, + by=by, + ax=ax, + fontsize=fontsize, + grid=grid, + rot=rot, + figsize=figsize, + layout=layout, + return_type=return_type, + **kwds, + ) + plt.draw_if_interactive() + return ax + + +def boxplot_frame_groupby( + grouped, + subplots: bool = True, + column=None, + fontsize: int | None = None, + rot: int = 0, + grid: bool = True, + ax=None, + figsize: tuple[float, float] | None = None, + layout=None, + sharex: bool = False, + sharey: bool = True, + **kwds, +): + if subplots is True: + naxes = len(grouped) + fig, axes = create_subplots( + naxes=naxes, + squeeze=False, + ax=ax, + sharex=sharex, + sharey=sharey, + figsize=figsize, + layout=layout, + ) + axes = flatten_axes(axes) + + ret = pd.Series(dtype=object) + + for (key, group), ax in zip(grouped, axes): + d = group.boxplot( + ax=ax, column=column, fontsize=fontsize, rot=rot, grid=grid, **kwds + ) + ax.set_title(pprint_thing(key)) + ret.loc[key] = d + maybe_adjust_figure(fig, bottom=0.15, top=0.9, left=0.1, right=0.9, wspace=0.2) + else: + keys, frames = zip(*grouped) + if grouped.axis == 0: + df = pd.concat(frames, keys=keys, axis=1) + elif len(frames) > 1: + df = frames[0].join(frames[1::]) + else: + df = frames[0] + + # GH 16748, DataFrameGroupby fails when subplots=False and `column` argument + # is assigned, and in this case, since `df` here becomes MI after groupby, + # so we need to couple the keys (grouped values) and column (original df + # column) together to search for subset to plot + if column is not None: + column = com.convert_to_list_like(column) + multi_key = pd.MultiIndex.from_product([keys, column]) + column = list(multi_key.values) + ret = df.boxplot( + column=column, + fontsize=fontsize, + rot=rot, + grid=grid, + ax=ax, + figsize=figsize, + layout=layout, + **kwds, + ) + return ret diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/plotting/_matplotlib/converter.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/plotting/_matplotlib/converter.py new file mode 100644 index 0000000000000000000000000000000000000000..9acb93ce69a9ca25962139891e6bb1e5e163add8 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/plotting/_matplotlib/converter.py @@ -0,0 +1,1139 @@ +from __future__ import annotations + +import contextlib +import datetime as pydt +from datetime import ( + datetime, + timedelta, + tzinfo, +) +import functools +from typing import ( + TYPE_CHECKING, + Any, + cast, +) +import warnings + +import matplotlib.dates as mdates +from matplotlib.ticker import ( + AutoLocator, + Formatter, + Locator, +) +from matplotlib.transforms import nonsingular +import matplotlib.units as munits +import numpy as np + +from pandas._libs import lib +from pandas._libs.tslibs import ( + Timestamp, + to_offset, +) +from pandas._libs.tslibs.dtypes import ( + FreqGroup, + periods_per_day, +) +from pandas._typing import ( + F, + npt, +) + +from pandas.core.dtypes.common import ( + is_float, + is_float_dtype, + is_integer, + is_integer_dtype, + is_nested_list_like, +) + +from pandas import ( + Index, + Series, + get_option, +) +import pandas.core.common as com +from pandas.core.indexes.datetimes import date_range +from pandas.core.indexes.period import ( + Period, + PeriodIndex, + period_range, +) +import pandas.core.tools.datetimes as tools + +if TYPE_CHECKING: + from collections.abc import Generator + + from matplotlib.axis import Axis + + from pandas._libs.tslibs.offsets import BaseOffset + + +_mpl_units = {} # Cache for units overwritten by us + + +def get_pairs(): + pairs = [ + (Timestamp, DatetimeConverter), + (Period, PeriodConverter), + (pydt.datetime, DatetimeConverter), + (pydt.date, DatetimeConverter), + (pydt.time, TimeConverter), + (np.datetime64, DatetimeConverter), + ] + return pairs + + +def register_pandas_matplotlib_converters(func: F) -> F: + """ + Decorator applying pandas_converters. + """ + + @functools.wraps(func) + def wrapper(*args, **kwargs): + with pandas_converters(): + return func(*args, **kwargs) + + return cast(F, wrapper) + + +@contextlib.contextmanager +def pandas_converters() -> Generator[None, None, None]: + """ + Context manager registering pandas' converters for a plot. + + See Also + -------- + register_pandas_matplotlib_converters : Decorator that applies this. + """ + value = get_option("plotting.matplotlib.register_converters") + + if value: + # register for True or "auto" + register() + try: + yield + finally: + if value == "auto": + # only deregister for "auto" + deregister() + + +def register() -> None: + pairs = get_pairs() + for type_, cls in pairs: + # Cache previous converter if present + if type_ in munits.registry and not isinstance(munits.registry[type_], cls): + previous = munits.registry[type_] + _mpl_units[type_] = previous + # Replace with pandas converter + munits.registry[type_] = cls() + + +def deregister() -> None: + # Renamed in pandas.plotting.__init__ + for type_, cls in get_pairs(): + # We use type to catch our classes directly, no inheritance + if type(munits.registry.get(type_)) is cls: + munits.registry.pop(type_) + + # restore the old keys + for unit, formatter in _mpl_units.items(): + if type(formatter) not in {DatetimeConverter, PeriodConverter, TimeConverter}: + # make it idempotent by excluding ours. + munits.registry[unit] = formatter + + +def _to_ordinalf(tm: pydt.time) -> float: + tot_sec = tm.hour * 3600 + tm.minute * 60 + tm.second + tm.microsecond / 10**6 + return tot_sec + + +def time2num(d): + if isinstance(d, str): + parsed = Timestamp(d) + return _to_ordinalf(parsed.time()) + if isinstance(d, pydt.time): + return _to_ordinalf(d) + return d + + +class TimeConverter(munits.ConversionInterface): + @staticmethod + def convert(value, unit, axis): + valid_types = (str, pydt.time) + if isinstance(value, valid_types) or is_integer(value) or is_float(value): + return time2num(value) + if isinstance(value, Index): + return value.map(time2num) + if isinstance(value, (list, tuple, np.ndarray, Index)): + return [time2num(x) for x in value] + return value + + @staticmethod + def axisinfo(unit, axis) -> munits.AxisInfo | None: + if unit != "time": + return None + + majloc = AutoLocator() + majfmt = TimeFormatter(majloc) + return munits.AxisInfo(majloc=majloc, majfmt=majfmt, label="time") + + @staticmethod + def default_units(x, axis) -> str: + return "time" + + +# time formatter +class TimeFormatter(Formatter): + def __init__(self, locs) -> None: + self.locs = locs + + def __call__(self, x, pos: int | None = 0) -> str: + """ + Return the time of day as a formatted string. + + Parameters + ---------- + x : float + The time of day specified as seconds since 00:00 (midnight), + with up to microsecond precision. + pos + Unused + + Returns + ------- + str + A string in HH:MM:SS.mmmuuu format. Microseconds, + milliseconds and seconds are only displayed if non-zero. + """ + fmt = "%H:%M:%S.%f" + s = int(x) + msus = round((x - s) * 10**6) + ms = msus // 1000 + us = msus % 1000 + m, s = divmod(s, 60) + h, m = divmod(m, 60) + _, h = divmod(h, 24) + if us != 0: + return pydt.time(h, m, s, msus).strftime(fmt) + elif ms != 0: + return pydt.time(h, m, s, msus).strftime(fmt)[:-3] + elif s != 0: + return pydt.time(h, m, s).strftime("%H:%M:%S") + + return pydt.time(h, m).strftime("%H:%M") + + +# Period Conversion + + +class PeriodConverter(mdates.DateConverter): + @staticmethod + def convert(values, units, axis): + if is_nested_list_like(values): + values = [PeriodConverter._convert_1d(v, units, axis) for v in values] + else: + values = PeriodConverter._convert_1d(values, units, axis) + return values + + @staticmethod + def _convert_1d(values, units, axis): + if not hasattr(axis, "freq"): + raise TypeError("Axis must have `freq` set to convert to Periods") + valid_types = (str, datetime, Period, pydt.date, pydt.time, np.datetime64) + with warnings.catch_warnings(): + warnings.filterwarnings( + "ignore", "Period with BDay freq is deprecated", category=FutureWarning + ) + warnings.filterwarnings( + "ignore", r"PeriodDtype\[B\] is deprecated", category=FutureWarning + ) + if ( + isinstance(values, valid_types) + or is_integer(values) + or is_float(values) + ): + return get_datevalue(values, axis.freq) + elif isinstance(values, PeriodIndex): + return values.asfreq(axis.freq).asi8 + elif isinstance(values, Index): + return values.map(lambda x: get_datevalue(x, axis.freq)) + elif lib.infer_dtype(values, skipna=False) == "period": + # https://github.com/pandas-dev/pandas/issues/24304 + # convert ndarray[period] -> PeriodIndex + return PeriodIndex(values, freq=axis.freq).asi8 + elif isinstance(values, (list, tuple, np.ndarray, Index)): + return [get_datevalue(x, axis.freq) for x in values] + return values + + +def get_datevalue(date, freq): + if isinstance(date, Period): + return date.asfreq(freq).ordinal + elif isinstance(date, (str, datetime, pydt.date, pydt.time, np.datetime64)): + return Period(date, freq).ordinal + elif ( + is_integer(date) + or is_float(date) + or (isinstance(date, (np.ndarray, Index)) and (date.size == 1)) + ): + return date + elif date is None: + return None + raise ValueError(f"Unrecognizable date '{date}'") + + +# Datetime Conversion +class DatetimeConverter(mdates.DateConverter): + @staticmethod + def convert(values, unit, axis): + # values might be a 1-d array, or a list-like of arrays. + if is_nested_list_like(values): + values = [DatetimeConverter._convert_1d(v, unit, axis) for v in values] + else: + values = DatetimeConverter._convert_1d(values, unit, axis) + return values + + @staticmethod + def _convert_1d(values, unit, axis): + def try_parse(values): + try: + return mdates.date2num(tools.to_datetime(values)) + except Exception: + return values + + if isinstance(values, (datetime, pydt.date, np.datetime64, pydt.time)): + return mdates.date2num(values) + elif is_integer(values) or is_float(values): + return values + elif isinstance(values, str): + return try_parse(values) + elif isinstance(values, (list, tuple, np.ndarray, Index, Series)): + if isinstance(values, Series): + # https://github.com/matplotlib/matplotlib/issues/11391 + # Series was skipped. Convert to DatetimeIndex to get asi8 + values = Index(values) + if isinstance(values, Index): + values = values.values + if not isinstance(values, np.ndarray): + values = com.asarray_tuplesafe(values) + + if is_integer_dtype(values) or is_float_dtype(values): + return values + + try: + values = tools.to_datetime(values) + except Exception: + pass + + values = mdates.date2num(values) + + return values + + @staticmethod + def axisinfo(unit: tzinfo | None, axis) -> munits.AxisInfo: + """ + Return the :class:`~matplotlib.units.AxisInfo` for *unit*. + + *unit* is a tzinfo instance or None. + The *axis* argument is required but not used. + """ + tz = unit + + majloc = PandasAutoDateLocator(tz=tz) + majfmt = PandasAutoDateFormatter(majloc, tz=tz) + datemin = pydt.date(2000, 1, 1) + datemax = pydt.date(2010, 1, 1) + + return munits.AxisInfo( + majloc=majloc, majfmt=majfmt, label="", default_limits=(datemin, datemax) + ) + + +class PandasAutoDateFormatter(mdates.AutoDateFormatter): + def __init__(self, locator, tz=None, defaultfmt: str = "%Y-%m-%d") -> None: + mdates.AutoDateFormatter.__init__(self, locator, tz, defaultfmt) + + +class PandasAutoDateLocator(mdates.AutoDateLocator): + def get_locator(self, dmin, dmax): + """Pick the best locator based on a distance.""" + tot_sec = (dmax - dmin).total_seconds() + + if abs(tot_sec) < self.minticks: + self._freq = -1 + locator = MilliSecondLocator(self.tz) + locator.set_axis(self.axis) + + # error: Item "None" of "Axis | _DummyAxis | _AxisWrapper | None" + # has no attribute "get_data_interval" + locator.axis.set_view_interval( # type: ignore[union-attr] + *self.axis.get_view_interval() # type: ignore[union-attr] + ) + locator.axis.set_data_interval( # type: ignore[union-attr] + *self.axis.get_data_interval() # type: ignore[union-attr] + ) + return locator + + return mdates.AutoDateLocator.get_locator(self, dmin, dmax) + + def _get_unit(self): + return MilliSecondLocator.get_unit_generic(self._freq) + + +class MilliSecondLocator(mdates.DateLocator): + UNIT = 1.0 / (24 * 3600 * 1000) + + def __init__(self, tz) -> None: + mdates.DateLocator.__init__(self, tz) + self._interval = 1.0 + + def _get_unit(self): + return self.get_unit_generic(-1) + + @staticmethod + def get_unit_generic(freq): + unit = mdates.RRuleLocator.get_unit_generic(freq) + if unit < 0: + return MilliSecondLocator.UNIT + return unit + + def __call__(self): + # if no data have been set, this will tank with a ValueError + try: + dmin, dmax = self.viewlim_to_dt() + except ValueError: + return [] + + # We need to cap at the endpoints of valid datetime + nmax, nmin = mdates.date2num((dmax, dmin)) + + num = (nmax - nmin) * 86400 * 1000 + max_millis_ticks = 6 + for interval in [1, 10, 50, 100, 200, 500]: + if num <= interval * (max_millis_ticks - 1): + self._interval = interval + break + # We went through the whole loop without breaking, default to 1 + self._interval = 1000.0 + + estimate = (nmax - nmin) / (self._get_unit() * self._get_interval()) + + if estimate > self.MAXTICKS * 2: + raise RuntimeError( + "MillisecondLocator estimated to generate " + f"{estimate:d} ticks from {dmin} to {dmax}: exceeds Locator.MAXTICKS" + f"* 2 ({self.MAXTICKS * 2:d}) " + ) + + interval = self._get_interval() + freq = f"{interval}ms" + tz = self.tz.tzname(None) + st = dmin.replace(tzinfo=None) + ed = dmin.replace(tzinfo=None) + all_dates = date_range(start=st, end=ed, freq=freq, tz=tz).astype(object) + + try: + if len(all_dates) > 0: + locs = self.raise_if_exceeds(mdates.date2num(all_dates)) + return locs + except Exception: # pragma: no cover + pass + + lims = mdates.date2num([dmin, dmax]) + return lims + + def _get_interval(self): + return self._interval + + def autoscale(self): + """ + Set the view limits to include the data range. + """ + # We need to cap at the endpoints of valid datetime + dmin, dmax = self.datalim_to_dt() + + vmin = mdates.date2num(dmin) + vmax = mdates.date2num(dmax) + + return self.nonsingular(vmin, vmax) + + +def _from_ordinal(x, tz: tzinfo | None = None) -> datetime: + ix = int(x) + dt = datetime.fromordinal(ix) + remainder = float(x) - ix + hour, remainder = divmod(24 * remainder, 1) + minute, remainder = divmod(60 * remainder, 1) + second, remainder = divmod(60 * remainder, 1) + microsecond = int(1_000_000 * remainder) + if microsecond < 10: + microsecond = 0 # compensate for rounding errors + dt = datetime( + dt.year, dt.month, dt.day, int(hour), int(minute), int(second), microsecond + ) + if tz is not None: + dt = dt.astimezone(tz) + + if microsecond > 999990: # compensate for rounding errors + dt += timedelta(microseconds=1_000_000 - microsecond) + + return dt + + +# Fixed frequency dynamic tick locators and formatters + +# ------------------------------------------------------------------------- +# --- Locators --- +# ------------------------------------------------------------------------- + + +def _get_default_annual_spacing(nyears) -> tuple[int, int]: + """ + Returns a default spacing between consecutive ticks for annual data. + """ + if nyears < 11: + (min_spacing, maj_spacing) = (1, 1) + elif nyears < 20: + (min_spacing, maj_spacing) = (1, 2) + elif nyears < 50: + (min_spacing, maj_spacing) = (1, 5) + elif nyears < 100: + (min_spacing, maj_spacing) = (5, 10) + elif nyears < 200: + (min_spacing, maj_spacing) = (5, 25) + elif nyears < 600: + (min_spacing, maj_spacing) = (10, 50) + else: + factor = nyears // 1000 + 1 + (min_spacing, maj_spacing) = (factor * 20, factor * 100) + return (min_spacing, maj_spacing) + + +def _period_break(dates: PeriodIndex, period: str) -> npt.NDArray[np.intp]: + """ + Returns the indices where the given period changes. + + Parameters + ---------- + dates : PeriodIndex + Array of intervals to monitor. + period : str + Name of the period to monitor. + """ + mask = _period_break_mask(dates, period) + return np.nonzero(mask)[0] + + +def _period_break_mask(dates: PeriodIndex, period: str) -> npt.NDArray[np.bool_]: + current = getattr(dates, period) + previous = getattr(dates - 1 * dates.freq, period) + return current != previous + + +def has_level_label(label_flags: npt.NDArray[np.intp], vmin: float) -> bool: + """ + Returns true if the ``label_flags`` indicate there is at least one label + for this level. + + if the minimum view limit is not an exact integer, then the first tick + label won't be shown, so we must adjust for that. + """ + if label_flags.size == 0 or ( + label_flags.size == 1 and label_flags[0] == 0 and vmin % 1 > 0.0 + ): + return False + else: + return True + + +def _get_periods_per_ymd(freq: BaseOffset) -> tuple[int, int, int]: + # error: "BaseOffset" has no attribute "_period_dtype_code" + dtype_code = freq._period_dtype_code # type: ignore[attr-defined] + freq_group = FreqGroup.from_period_dtype_code(dtype_code) + + ppd = -1 # placeholder for above-day freqs + + if dtype_code >= FreqGroup.FR_HR.value: + # error: "BaseOffset" has no attribute "_creso" + ppd = periods_per_day(freq._creso) # type: ignore[attr-defined] + ppm = 28 * ppd + ppy = 365 * ppd + elif freq_group == FreqGroup.FR_BUS: + ppm = 19 + ppy = 261 + elif freq_group == FreqGroup.FR_DAY: + ppm = 28 + ppy = 365 + elif freq_group == FreqGroup.FR_WK: + ppm = 3 + ppy = 52 + elif freq_group == FreqGroup.FR_MTH: + ppm = 1 + ppy = 12 + elif freq_group == FreqGroup.FR_QTR: + ppm = -1 # placerholder + ppy = 4 + elif freq_group == FreqGroup.FR_ANN: + ppm = -1 # placeholder + ppy = 1 + else: + raise NotImplementedError(f"Unsupported frequency: {dtype_code}") + + return ppd, ppm, ppy + + +@functools.cache +def _daily_finder(vmin: float, vmax: float, freq: BaseOffset) -> np.ndarray: + # error: "BaseOffset" has no attribute "_period_dtype_code" + dtype_code = freq._period_dtype_code # type: ignore[attr-defined] + + periodsperday, periodspermonth, periodsperyear = _get_periods_per_ymd(freq) + + # save this for later usage + vmin_orig = vmin + (vmin, vmax) = (int(vmin), int(vmax)) + span = vmax - vmin + 1 + + with warnings.catch_warnings(): + warnings.filterwarnings( + "ignore", "Period with BDay freq is deprecated", category=FutureWarning + ) + warnings.filterwarnings( + "ignore", r"PeriodDtype\[B\] is deprecated", category=FutureWarning + ) + dates_ = period_range( + start=Period(ordinal=vmin, freq=freq), + end=Period(ordinal=vmax, freq=freq), + freq=freq, + ) + + # Initialize the output + info = np.zeros( + span, dtype=[("val", np.int64), ("maj", bool), ("min", bool), ("fmt", "|S20")] + ) + info["val"][:] = dates_.asi8 + info["fmt"][:] = "" + info["maj"][[0, -1]] = True + # .. and set some shortcuts + info_maj = info["maj"] + info_min = info["min"] + info_fmt = info["fmt"] + + def first_label(label_flags): + if (label_flags[0] == 0) and (label_flags.size > 1) and ((vmin_orig % 1) > 0.0): + return label_flags[1] + else: + return label_flags[0] + + # Case 1. Less than a month + if span <= periodspermonth: + day_start = _period_break(dates_, "day") + month_start = _period_break(dates_, "month") + year_start = _period_break(dates_, "year") + + def _hour_finder(label_interval: int, force_year_start: bool) -> None: + target = dates_.hour + mask = _period_break_mask(dates_, "hour") + info_maj[day_start] = True + info_min[mask & (target % label_interval == 0)] = True + info_fmt[mask & (target % label_interval == 0)] = "%H:%M" + info_fmt[day_start] = "%H:%M\n%d-%b" + info_fmt[year_start] = "%H:%M\n%d-%b\n%Y" + if force_year_start and not has_level_label(year_start, vmin_orig): + info_fmt[first_label(day_start)] = "%H:%M\n%d-%b\n%Y" + + def _minute_finder(label_interval: int) -> None: + target = dates_.minute + hour_start = _period_break(dates_, "hour") + mask = _period_break_mask(dates_, "minute") + info_maj[hour_start] = True + info_min[mask & (target % label_interval == 0)] = True + info_fmt[mask & (target % label_interval == 0)] = "%H:%M" + info_fmt[day_start] = "%H:%M\n%d-%b" + info_fmt[year_start] = "%H:%M\n%d-%b\n%Y" + + def _second_finder(label_interval: int) -> None: + target = dates_.second + minute_start = _period_break(dates_, "minute") + mask = _period_break_mask(dates_, "second") + info_maj[minute_start] = True + info_min[mask & (target % label_interval == 0)] = True + info_fmt[mask & (target % label_interval == 0)] = "%H:%M:%S" + info_fmt[day_start] = "%H:%M:%S\n%d-%b" + info_fmt[year_start] = "%H:%M:%S\n%d-%b\n%Y" + + if span < periodsperday / 12000: + _second_finder(1) + elif span < periodsperday / 6000: + _second_finder(2) + elif span < periodsperday / 2400: + _second_finder(5) + elif span < periodsperday / 1200: + _second_finder(10) + elif span < periodsperday / 800: + _second_finder(15) + elif span < periodsperday / 400: + _second_finder(30) + elif span < periodsperday / 150: + _minute_finder(1) + elif span < periodsperday / 70: + _minute_finder(2) + elif span < periodsperday / 24: + _minute_finder(5) + elif span < periodsperday / 12: + _minute_finder(15) + elif span < periodsperday / 6: + _minute_finder(30) + elif span < periodsperday / 2.5: + _hour_finder(1, False) + elif span < periodsperday / 1.5: + _hour_finder(2, False) + elif span < periodsperday * 1.25: + _hour_finder(3, False) + elif span < periodsperday * 2.5: + _hour_finder(6, True) + elif span < periodsperday * 4: + _hour_finder(12, True) + else: + info_maj[month_start] = True + info_min[day_start] = True + info_fmt[day_start] = "%d" + info_fmt[month_start] = "%d\n%b" + info_fmt[year_start] = "%d\n%b\n%Y" + if not has_level_label(year_start, vmin_orig): + if not has_level_label(month_start, vmin_orig): + info_fmt[first_label(day_start)] = "%d\n%b\n%Y" + else: + info_fmt[first_label(month_start)] = "%d\n%b\n%Y" + + # Case 2. Less than three months + elif span <= periodsperyear // 4: + month_start = _period_break(dates_, "month") + info_maj[month_start] = True + if dtype_code < FreqGroup.FR_HR.value: + info["min"] = True + else: + day_start = _period_break(dates_, "day") + info["min"][day_start] = True + week_start = _period_break(dates_, "week") + year_start = _period_break(dates_, "year") + info_fmt[week_start] = "%d" + info_fmt[month_start] = "\n\n%b" + info_fmt[year_start] = "\n\n%b\n%Y" + if not has_level_label(year_start, vmin_orig): + if not has_level_label(month_start, vmin_orig): + info_fmt[first_label(week_start)] = "\n\n%b\n%Y" + else: + info_fmt[first_label(month_start)] = "\n\n%b\n%Y" + # Case 3. Less than 14 months ............... + elif span <= 1.15 * periodsperyear: + year_start = _period_break(dates_, "year") + month_start = _period_break(dates_, "month") + week_start = _period_break(dates_, "week") + info_maj[month_start] = True + info_min[week_start] = True + info_min[year_start] = False + info_min[month_start] = False + info_fmt[month_start] = "%b" + info_fmt[year_start] = "%b\n%Y" + if not has_level_label(year_start, vmin_orig): + info_fmt[first_label(month_start)] = "%b\n%Y" + # Case 4. Less than 2.5 years ............... + elif span <= 2.5 * periodsperyear: + year_start = _period_break(dates_, "year") + quarter_start = _period_break(dates_, "quarter") + month_start = _period_break(dates_, "month") + info_maj[quarter_start] = True + info_min[month_start] = True + info_fmt[quarter_start] = "%b" + info_fmt[year_start] = "%b\n%Y" + # Case 4. Less than 4 years ................. + elif span <= 4 * periodsperyear: + year_start = _period_break(dates_, "year") + month_start = _period_break(dates_, "month") + info_maj[year_start] = True + info_min[month_start] = True + info_min[year_start] = False + + month_break = dates_[month_start].month + jan_or_jul = month_start[(month_break == 1) | (month_break == 7)] + info_fmt[jan_or_jul] = "%b" + info_fmt[year_start] = "%b\n%Y" + # Case 5. Less than 11 years ................ + elif span <= 11 * periodsperyear: + year_start = _period_break(dates_, "year") + quarter_start = _period_break(dates_, "quarter") + info_maj[year_start] = True + info_min[quarter_start] = True + info_min[year_start] = False + info_fmt[year_start] = "%Y" + # Case 6. More than 12 years ................ + else: + year_start = _period_break(dates_, "year") + year_break = dates_[year_start].year + nyears = span / periodsperyear + (min_anndef, maj_anndef) = _get_default_annual_spacing(nyears) + major_idx = year_start[(year_break % maj_anndef == 0)] + info_maj[major_idx] = True + minor_idx = year_start[(year_break % min_anndef == 0)] + info_min[minor_idx] = True + info_fmt[major_idx] = "%Y" + + return info + + +@functools.cache +def _monthly_finder(vmin: float, vmax: float, freq: BaseOffset) -> np.ndarray: + _, _, periodsperyear = _get_periods_per_ymd(freq) + + vmin_orig = vmin + (vmin, vmax) = (int(vmin), int(vmax)) + span = vmax - vmin + 1 + + # Initialize the output + info = np.zeros( + span, dtype=[("val", int), ("maj", bool), ("min", bool), ("fmt", "|S8")] + ) + info["val"] = np.arange(vmin, vmax + 1) + dates_ = info["val"] + info["fmt"] = "" + year_start = (dates_ % 12 == 0).nonzero()[0] + info_maj = info["maj"] + info_fmt = info["fmt"] + + if span <= 1.15 * periodsperyear: + info_maj[year_start] = True + info["min"] = True + + info_fmt[:] = "%b" + info_fmt[year_start] = "%b\n%Y" + + if not has_level_label(year_start, vmin_orig): + if dates_.size > 1: + idx = 1 + else: + idx = 0 + info_fmt[idx] = "%b\n%Y" + + elif span <= 2.5 * periodsperyear: + quarter_start = (dates_ % 3 == 0).nonzero() + info_maj[year_start] = True + # TODO: Check the following : is it really info['fmt'] ? + # 2023-09-15 this is reached in test_finder_monthly + info["fmt"][quarter_start] = True + info["min"] = True + + info_fmt[quarter_start] = "%b" + info_fmt[year_start] = "%b\n%Y" + + elif span <= 4 * periodsperyear: + info_maj[year_start] = True + info["min"] = True + + jan_or_jul = (dates_ % 12 == 0) | (dates_ % 12 == 6) + info_fmt[jan_or_jul] = "%b" + info_fmt[year_start] = "%b\n%Y" + + elif span <= 11 * periodsperyear: + quarter_start = (dates_ % 3 == 0).nonzero() + info_maj[year_start] = True + info["min"][quarter_start] = True + + info_fmt[year_start] = "%Y" + + else: + nyears = span / periodsperyear + (min_anndef, maj_anndef) = _get_default_annual_spacing(nyears) + years = dates_[year_start] // 12 + 1 + major_idx = year_start[(years % maj_anndef == 0)] + info_maj[major_idx] = True + info["min"][year_start[(years % min_anndef == 0)]] = True + + info_fmt[major_idx] = "%Y" + + return info + + +@functools.cache +def _quarterly_finder(vmin: float, vmax: float, freq: BaseOffset) -> np.ndarray: + _, _, periodsperyear = _get_periods_per_ymd(freq) + vmin_orig = vmin + (vmin, vmax) = (int(vmin), int(vmax)) + span = vmax - vmin + 1 + + info = np.zeros( + span, dtype=[("val", int), ("maj", bool), ("min", bool), ("fmt", "|S8")] + ) + info["val"] = np.arange(vmin, vmax + 1) + info["fmt"] = "" + dates_ = info["val"] + info_maj = info["maj"] + info_fmt = info["fmt"] + year_start = (dates_ % 4 == 0).nonzero()[0] + + if span <= 3.5 * periodsperyear: + info_maj[year_start] = True + info["min"] = True + + info_fmt[:] = "Q%q" + info_fmt[year_start] = "Q%q\n%F" + if not has_level_label(year_start, vmin_orig): + if dates_.size > 1: + idx = 1 + else: + idx = 0 + info_fmt[idx] = "Q%q\n%F" + + elif span <= 11 * periodsperyear: + info_maj[year_start] = True + info["min"] = True + info_fmt[year_start] = "%F" + + else: + # https://github.com/pandas-dev/pandas/pull/47602 + years = dates_[year_start] // 4 + 1970 + nyears = span / periodsperyear + (min_anndef, maj_anndef) = _get_default_annual_spacing(nyears) + major_idx = year_start[(years % maj_anndef == 0)] + info_maj[major_idx] = True + info["min"][year_start[(years % min_anndef == 0)]] = True + info_fmt[major_idx] = "%F" + + return info + + +@functools.cache +def _annual_finder(vmin: float, vmax: float, freq: BaseOffset) -> np.ndarray: + # Note: small difference here vs other finders in adding 1 to vmax + (vmin, vmax) = (int(vmin), int(vmax + 1)) + span = vmax - vmin + 1 + + info = np.zeros( + span, dtype=[("val", int), ("maj", bool), ("min", bool), ("fmt", "|S8")] + ) + info["val"] = np.arange(vmin, vmax + 1) + info["fmt"] = "" + dates_ = info["val"] + + (min_anndef, maj_anndef) = _get_default_annual_spacing(span) + major_idx = dates_ % maj_anndef == 0 + minor_idx = dates_ % min_anndef == 0 + info["maj"][major_idx] = True + info["min"][minor_idx] = True + info["fmt"][major_idx] = "%Y" + + return info + + +def get_finder(freq: BaseOffset): + # error: "BaseOffset" has no attribute "_period_dtype_code" + dtype_code = freq._period_dtype_code # type: ignore[attr-defined] + fgroup = FreqGroup.from_period_dtype_code(dtype_code) + + if fgroup == FreqGroup.FR_ANN: + return _annual_finder + elif fgroup == FreqGroup.FR_QTR: + return _quarterly_finder + elif fgroup == FreqGroup.FR_MTH: + return _monthly_finder + elif (dtype_code >= FreqGroup.FR_BUS.value) or fgroup == FreqGroup.FR_WK: + return _daily_finder + else: # pragma: no cover + raise NotImplementedError(f"Unsupported frequency: {dtype_code}") + + +class TimeSeries_DateLocator(Locator): + """ + Locates the ticks along an axis controlled by a :class:`Series`. + + Parameters + ---------- + freq : BaseOffset + Valid frequency specifier. + minor_locator : {False, True}, optional + Whether the locator is for minor ticks (True) or not. + dynamic_mode : {True, False}, optional + Whether the locator should work in dynamic mode. + base : {int}, optional + quarter : {int}, optional + month : {int}, optional + day : {int}, optional + """ + + axis: Axis + + def __init__( + self, + freq: BaseOffset, + minor_locator: bool = False, + dynamic_mode: bool = True, + base: int = 1, + quarter: int = 1, + month: int = 1, + day: int = 1, + plot_obj=None, + ) -> None: + freq = to_offset(freq, is_period=True) + self.freq = freq + self.base = base + (self.quarter, self.month, self.day) = (quarter, month, day) + self.isminor = minor_locator + self.isdynamic = dynamic_mode + self.offset = 0 + self.plot_obj = plot_obj + self.finder = get_finder(freq) + + def _get_default_locs(self, vmin, vmax): + """Returns the default locations of ticks.""" + locator = self.finder(vmin, vmax, self.freq) + + if self.isminor: + return np.compress(locator["min"], locator["val"]) + return np.compress(locator["maj"], locator["val"]) + + def __call__(self): + """Return the locations of the ticks.""" + # axis calls Locator.set_axis inside set_m_formatter + + vi = tuple(self.axis.get_view_interval()) + vmin, vmax = vi + if vmax < vmin: + vmin, vmax = vmax, vmin + if self.isdynamic: + locs = self._get_default_locs(vmin, vmax) + else: # pragma: no cover + base = self.base + (d, m) = divmod(vmin, base) + vmin = (d + 1) * base + # error: No overload variant of "range" matches argument types "float", + # "float", "int" + locs = list(range(vmin, vmax + 1, base)) # type: ignore[call-overload] + return locs + + def autoscale(self): + """ + Sets the view limits to the nearest multiples of base that contain the + data. + """ + # requires matplotlib >= 0.98.0 + (vmin, vmax) = self.axis.get_data_interval() + + locs = self._get_default_locs(vmin, vmax) + (vmin, vmax) = locs[[0, -1]] + if vmin == vmax: + vmin -= 1 + vmax += 1 + return nonsingular(vmin, vmax) + + +# ------------------------------------------------------------------------- +# --- Formatter --- +# ------------------------------------------------------------------------- + + +class TimeSeries_DateFormatter(Formatter): + """ + Formats the ticks along an axis controlled by a :class:`PeriodIndex`. + + Parameters + ---------- + freq : BaseOffset + Valid frequency specifier. + minor_locator : bool, default False + Whether the current formatter should apply to minor ticks (True) or + major ticks (False). + dynamic_mode : bool, default True + Whether the formatter works in dynamic mode or not. + """ + + axis: Axis + + def __init__( + self, + freq: BaseOffset, + minor_locator: bool = False, + dynamic_mode: bool = True, + plot_obj=None, + ) -> None: + freq = to_offset(freq, is_period=True) + self.format = None + self.freq = freq + self.locs: list[Any] = [] # unused, for matplotlib compat + self.formatdict: dict[Any, Any] | None = None + self.isminor = minor_locator + self.isdynamic = dynamic_mode + self.offset = 0 + self.plot_obj = plot_obj + self.finder = get_finder(freq) + + def _set_default_format(self, vmin, vmax): + """Returns the default ticks spacing.""" + info = self.finder(vmin, vmax, self.freq) + + if self.isminor: + format = np.compress(info["min"] & np.logical_not(info["maj"]), info) + else: + format = np.compress(info["maj"], info) + self.formatdict = {x: f for (x, _, _, f) in format} + return self.formatdict + + def set_locs(self, locs) -> None: + """Sets the locations of the ticks""" + # don't actually use the locs. This is just needed to work with + # matplotlib. Force to use vmin, vmax + + self.locs = locs + + (vmin, vmax) = tuple(self.axis.get_view_interval()) + if vmax < vmin: + (vmin, vmax) = (vmax, vmin) + self._set_default_format(vmin, vmax) + + def __call__(self, x, pos: int | None = 0) -> str: + if self.formatdict is None: + return "" + else: + fmt = self.formatdict.pop(x, "") + if isinstance(fmt, np.bytes_): + fmt = fmt.decode("utf-8") + with warnings.catch_warnings(): + warnings.filterwarnings( + "ignore", + "Period with BDay freq is deprecated", + category=FutureWarning, + ) + period = Period(ordinal=int(x), freq=self.freq) + assert isinstance(period, Period) + return period.strftime(fmt) + + +class TimeSeries_TimedeltaFormatter(Formatter): + """ + Formats the ticks along an axis controlled by a :class:`TimedeltaIndex`. + """ + + axis: Axis + + @staticmethod + def format_timedelta_ticks(x, pos, n_decimals: int) -> str: + """ + Convert seconds to 'D days HH:MM:SS.F' + """ + s, ns = divmod(x, 10**9) # TODO(non-nano): this looks like it assumes ns + m, s = divmod(s, 60) + h, m = divmod(m, 60) + d, h = divmod(h, 24) + decimals = int(ns * 10 ** (n_decimals - 9)) + s = f"{int(h):02d}:{int(m):02d}:{int(s):02d}" + if n_decimals > 0: + s += f".{decimals:0{n_decimals}d}" + if d != 0: + s = f"{int(d):d} days {s}" + return s + + def __call__(self, x, pos: int | None = 0) -> str: + (vmin, vmax) = tuple(self.axis.get_view_interval()) + n_decimals = min(int(np.ceil(np.log10(100 * 10**9 / abs(vmax - vmin)))), 9) + return self.format_timedelta_ticks(x, pos, n_decimals) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/plotting/_matplotlib/core.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/plotting/_matplotlib/core.py new file mode 100644 index 0000000000000000000000000000000000000000..3a1e589c2279bdadb736ce85312bc2c84f5793eb --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/plotting/_matplotlib/core.py @@ -0,0 +1,2125 @@ +from __future__ import annotations + +from abc import ( + ABC, + abstractmethod, +) +from collections.abc import ( + Hashable, + Iterable, + Iterator, + Sequence, +) +from typing import ( + TYPE_CHECKING, + Any, + Literal, + cast, + final, +) +import warnings + +import matplotlib as mpl +import numpy as np + +from pandas._libs import lib +from pandas.errors import AbstractMethodError +from pandas.util._decorators import cache_readonly +from pandas.util._exceptions import find_stack_level + +from pandas.core.dtypes.common import ( + is_any_real_numeric_dtype, + is_bool, + is_float, + is_float_dtype, + is_hashable, + is_integer, + is_integer_dtype, + is_iterator, + is_list_like, + is_number, + is_numeric_dtype, +) +from pandas.core.dtypes.dtypes import ( + CategoricalDtype, + ExtensionDtype, +) +from pandas.core.dtypes.generic import ( + ABCDataFrame, + ABCDatetimeIndex, + ABCIndex, + ABCMultiIndex, + ABCPeriodIndex, + ABCSeries, +) +from pandas.core.dtypes.missing import isna + +import pandas.core.common as com +from pandas.core.frame import DataFrame +from pandas.util.version import Version + +from pandas.io.formats.printing import pprint_thing +from pandas.plotting._matplotlib import tools +from pandas.plotting._matplotlib.converter import register_pandas_matplotlib_converters +from pandas.plotting._matplotlib.groupby import reconstruct_data_with_by +from pandas.plotting._matplotlib.misc import unpack_single_str_list +from pandas.plotting._matplotlib.style import get_standard_colors +from pandas.plotting._matplotlib.timeseries import ( + decorate_axes, + format_dateaxis, + maybe_convert_index, + maybe_resample, + use_dynamic_x, +) +from pandas.plotting._matplotlib.tools import ( + create_subplots, + flatten_axes, + format_date_labels, + get_all_lines, + get_xlim, + handle_shared_axes, +) + +if TYPE_CHECKING: + from matplotlib.artist import Artist + from matplotlib.axes import Axes + from matplotlib.axis import Axis + from matplotlib.figure import Figure + + from pandas._typing import ( + IndexLabel, + NDFrameT, + PlottingOrientation, + npt, + ) + + from pandas import Series + + +def _color_in_style(style: str) -> bool: + """ + Check if there is a color letter in the style string. + """ + from matplotlib.colors import BASE_COLORS + + return not set(BASE_COLORS).isdisjoint(style) + + +class MPLPlot(ABC): + """ + Base class for assembling a pandas plot using matplotlib + + Parameters + ---------- + data : + + """ + + @property + @abstractmethod + def _kind(self) -> str: + """Specify kind str. Must be overridden in child class""" + raise NotImplementedError + + _layout_type = "vertical" + _default_rot = 0 + + @property + def orientation(self) -> str | None: + return None + + data: DataFrame + + def __init__( + self, + data, + kind=None, + by: IndexLabel | None = None, + subplots: bool | Sequence[Sequence[str]] = False, + sharex: bool | None = None, + sharey: bool = False, + use_index: bool = True, + figsize: tuple[float, float] | None = None, + grid=None, + legend: bool | str = True, + rot=None, + ax=None, + fig=None, + title=None, + xlim=None, + ylim=None, + xticks=None, + yticks=None, + xlabel: Hashable | None = None, + ylabel: Hashable | None = None, + fontsize: int | None = None, + secondary_y: bool | tuple | list | np.ndarray = False, + colormap=None, + table: bool = False, + layout=None, + include_bool: bool = False, + column: IndexLabel | None = None, + *, + logx: bool | None | Literal["sym"] = False, + logy: bool | None | Literal["sym"] = False, + loglog: bool | None | Literal["sym"] = False, + mark_right: bool = True, + stacked: bool = False, + label: Hashable | None = None, + style=None, + **kwds, + ) -> None: + import matplotlib.pyplot as plt + + # if users assign an empty list or tuple, raise `ValueError` + # similar to current `df.box` and `df.hist` APIs. + if by in ([], ()): + raise ValueError("No group keys passed!") + self.by = com.maybe_make_list(by) + + # Assign the rest of columns into self.columns if by is explicitly defined + # while column is not, only need `columns` in hist/box plot when it's DF + # TODO: Might deprecate `column` argument in future PR (#28373) + if isinstance(data, DataFrame): + if column: + self.columns = com.maybe_make_list(column) + elif self.by is None: + self.columns = [ + col for col in data.columns if is_numeric_dtype(data[col]) + ] + else: + self.columns = [ + col + for col in data.columns + if col not in self.by and is_numeric_dtype(data[col]) + ] + + # For `hist` plot, need to get grouped original data before `self.data` is + # updated later + if self.by is not None and self._kind == "hist": + self._grouped = data.groupby(unpack_single_str_list(self.by)) + + self.kind = kind + + self.subplots = type(self)._validate_subplots_kwarg( + subplots, data, kind=self._kind + ) + + self.sharex = type(self)._validate_sharex(sharex, ax, by) + self.sharey = sharey + self.figsize = figsize + self.layout = layout + + self.xticks = xticks + self.yticks = yticks + self.xlim = xlim + self.ylim = ylim + self.title = title + self.use_index = use_index + self.xlabel = xlabel + self.ylabel = ylabel + + self.fontsize = fontsize + + if rot is not None: + self.rot = rot + # need to know for format_date_labels since it's rotated to 30 by + # default + self._rot_set = True + else: + self._rot_set = False + self.rot = self._default_rot + + if grid is None: + grid = False if secondary_y else plt.rcParams["axes.grid"] + + self.grid = grid + self.legend = legend + self.legend_handles: list[Artist] = [] + self.legend_labels: list[Hashable] = [] + + self.logx = type(self)._validate_log_kwd("logx", logx) + self.logy = type(self)._validate_log_kwd("logy", logy) + self.loglog = type(self)._validate_log_kwd("loglog", loglog) + self.label = label + self.style = style + self.mark_right = mark_right + self.stacked = stacked + + # ax may be an Axes object or (if self.subplots) an ndarray of + # Axes objects + self.ax = ax + # TODO: deprecate fig keyword as it is ignored, not passed in tests + # as of 2023-11-05 + + # parse errorbar input if given + xerr = kwds.pop("xerr", None) + yerr = kwds.pop("yerr", None) + nseries = self._get_nseries(data) + xerr, data = type(self)._parse_errorbars("xerr", xerr, data, nseries) + yerr, data = type(self)._parse_errorbars("yerr", yerr, data, nseries) + self.errors = {"xerr": xerr, "yerr": yerr} + self.data = data + + if not isinstance(secondary_y, (bool, tuple, list, np.ndarray, ABCIndex)): + secondary_y = [secondary_y] + self.secondary_y = secondary_y + + # ugly TypeError if user passes matplotlib's `cmap` name. + # Probably better to accept either. + if "cmap" in kwds and colormap: + raise TypeError("Only specify one of `cmap` and `colormap`.") + if "cmap" in kwds: + self.colormap = kwds.pop("cmap") + else: + self.colormap = colormap + + self.table = table + self.include_bool = include_bool + + self.kwds = kwds + + color = kwds.pop("color", lib.no_default) + self.color = self._validate_color_args(color, self.colormap) + assert "color" not in self.kwds + + self.data = self._ensure_frame(self.data) + + @final + @staticmethod + def _validate_sharex(sharex: bool | None, ax, by) -> bool: + if sharex is None: + # if by is defined, subplots are used and sharex should be False + if ax is None and by is None: # pylint: disable=simplifiable-if-statement + sharex = True + else: + # if we get an axis, the users should do the visibility + # setting... + sharex = False + elif not is_bool(sharex): + raise TypeError("sharex must be a bool or None") + return bool(sharex) + + @classmethod + def _validate_log_kwd( + cls, + kwd: str, + value: bool | None | Literal["sym"], + ) -> bool | None | Literal["sym"]: + if ( + value is None + or isinstance(value, bool) + or (isinstance(value, str) and value == "sym") + ): + return value + raise ValueError( + f"keyword '{kwd}' should be bool, None, or 'sym', not '{value}'" + ) + + @final + @staticmethod + def _validate_subplots_kwarg( + subplots: bool | Sequence[Sequence[str]], data: Series | DataFrame, kind: str + ) -> bool | list[tuple[int, ...]]: + """ + Validate the subplots parameter + + - check type and content + - check for duplicate columns + - check for invalid column names + - convert column names into indices + - add missing columns in a group of their own + See comments in code below for more details. + + Parameters + ---------- + subplots : subplots parameters as passed to PlotAccessor + + Returns + ------- + validated subplots : a bool or a list of tuples of column indices. Columns + in the same tuple will be grouped together in the resulting plot. + """ + + if isinstance(subplots, bool): + return subplots + elif not isinstance(subplots, Iterable): + raise ValueError("subplots should be a bool or an iterable") + + supported_kinds = ( + "line", + "bar", + "barh", + "hist", + "kde", + "density", + "area", + "pie", + ) + if kind not in supported_kinds: + raise ValueError( + "When subplots is an iterable, kind must be " + f"one of {', '.join(supported_kinds)}. Got {kind}." + ) + + if isinstance(data, ABCSeries): + raise NotImplementedError( + "An iterable subplots for a Series is not supported." + ) + + columns = data.columns + if isinstance(columns, ABCMultiIndex): + raise NotImplementedError( + "An iterable subplots for a DataFrame with a MultiIndex column " + "is not supported." + ) + + if columns.nunique() != len(columns): + raise NotImplementedError( + "An iterable subplots for a DataFrame with non-unique column " + "labels is not supported." + ) + + # subplots is a list of tuples where each tuple is a group of + # columns to be grouped together (one ax per group). + # we consolidate the subplots list such that: + # - the tuples contain indices instead of column names + # - the columns that aren't yet in the list are added in a group + # of their own. + # For example with columns from a to g, and + # subplots = [(a, c), (b, f, e)], + # we end up with [(ai, ci), (bi, fi, ei), (di,), (gi,)] + # This way, we can handle self.subplots in a homogeneous manner + # later. + # TODO: also accept indices instead of just names? + + out = [] + seen_columns: set[Hashable] = set() + for group in subplots: + if not is_list_like(group): + raise ValueError( + "When subplots is an iterable, each entry " + "should be a list/tuple of column names." + ) + idx_locs = columns.get_indexer_for(group) + if (idx_locs == -1).any(): + bad_labels = np.extract(idx_locs == -1, group) + raise ValueError( + f"Column label(s) {list(bad_labels)} not found in the DataFrame." + ) + unique_columns = set(group) + duplicates = seen_columns.intersection(unique_columns) + if duplicates: + raise ValueError( + "Each column should be in only one subplot. " + f"Columns {duplicates} were found in multiple subplots." + ) + seen_columns = seen_columns.union(unique_columns) + out.append(tuple(idx_locs)) + + unseen_columns = columns.difference(seen_columns) + for column in unseen_columns: + idx_loc = columns.get_loc(column) + out.append((idx_loc,)) + return out + + def _validate_color_args(self, color, colormap): + if color is lib.no_default: + # It was not provided by the user + if "colors" in self.kwds and colormap is not None: + warnings.warn( + "'color' and 'colormap' cannot be used simultaneously. " + "Using 'color'", + stacklevel=find_stack_level(), + ) + return None + if self.nseries == 1 and color is not None and not is_list_like(color): + # support series.plot(color='green') + color = [color] + + if isinstance(color, tuple) and self.nseries == 1 and len(color) in (3, 4): + # support RGB and RGBA tuples in series plot + color = [color] + + if colormap is not None: + warnings.warn( + "'color' and 'colormap' cannot be used simultaneously. Using 'color'", + stacklevel=find_stack_level(), + ) + + if self.style is not None: + if is_list_like(self.style): + styles = self.style + else: + styles = [self.style] + # need only a single match + for s in styles: + if _color_in_style(s): + raise ValueError( + "Cannot pass 'style' string with a color symbol and " + "'color' keyword argument. Please use one or the " + "other or pass 'style' without a color symbol" + ) + return color + + @final + @staticmethod + def _iter_data( + data: DataFrame | dict[Hashable, Series | DataFrame] + ) -> Iterator[tuple[Hashable, np.ndarray]]: + for col, values in data.items(): + # This was originally written to use values.values before EAs + # were implemented; adding np.asarray(...) to keep consistent + # typing. + yield col, np.asarray(values.values) + + def _get_nseries(self, data: Series | DataFrame) -> int: + # When `by` is explicitly assigned, grouped data size will be defined, and + # this will determine number of subplots to have, aka `self.nseries` + if data.ndim == 1: + return 1 + elif self.by is not None and self._kind == "hist": + return len(self._grouped) + elif self.by is not None and self._kind == "box": + return len(self.columns) + else: + return data.shape[1] + + @final + @property + def nseries(self) -> int: + return self._get_nseries(self.data) + + @final + def draw(self) -> None: + self.plt.draw_if_interactive() + + @final + def generate(self) -> None: + self._compute_plot_data() + fig = self.fig + self._make_plot(fig) + self._add_table() + self._make_legend() + self._adorn_subplots(fig) + + for ax in self.axes: + self._post_plot_logic_common(ax) + self._post_plot_logic(ax, self.data) + + @final + @staticmethod + def _has_plotted_object(ax: Axes) -> bool: + """check whether ax has data""" + return len(ax.lines) != 0 or len(ax.artists) != 0 or len(ax.containers) != 0 + + @final + def _maybe_right_yaxis(self, ax: Axes, axes_num: int) -> Axes: + if not self.on_right(axes_num): + # secondary axes may be passed via ax kw + return self._get_ax_layer(ax) + + if hasattr(ax, "right_ax"): + # if it has right_ax property, ``ax`` must be left axes + return ax.right_ax + elif hasattr(ax, "left_ax"): + # if it has left_ax property, ``ax`` must be right axes + return ax + else: + # otherwise, create twin axes + orig_ax, new_ax = ax, ax.twinx() + # TODO: use Matplotlib public API when available + new_ax._get_lines = orig_ax._get_lines # type: ignore[attr-defined] + # TODO #54485 + new_ax._get_patches_for_fill = ( # type: ignore[attr-defined] + orig_ax._get_patches_for_fill # type: ignore[attr-defined] + ) + # TODO #54485 + orig_ax.right_ax, new_ax.left_ax = ( # type: ignore[attr-defined] + new_ax, + orig_ax, + ) + + if not self._has_plotted_object(orig_ax): # no data on left y + orig_ax.get_yaxis().set_visible(False) + + if self.logy is True or self.loglog is True: + new_ax.set_yscale("log") + elif self.logy == "sym" or self.loglog == "sym": + new_ax.set_yscale("symlog") + return new_ax + + @final + @cache_readonly + def fig(self) -> Figure: + return self._axes_and_fig[1] + + @final + @cache_readonly + # TODO: can we annotate this as both a Sequence[Axes] and ndarray[object]? + def axes(self) -> Sequence[Axes]: + return self._axes_and_fig[0] + + @final + @cache_readonly + def _axes_and_fig(self) -> tuple[Sequence[Axes], Figure]: + if self.subplots: + naxes = ( + self.nseries if isinstance(self.subplots, bool) else len(self.subplots) + ) + fig, axes = create_subplots( + naxes=naxes, + sharex=self.sharex, + sharey=self.sharey, + figsize=self.figsize, + ax=self.ax, + layout=self.layout, + layout_type=self._layout_type, + ) + elif self.ax is None: + fig = self.plt.figure(figsize=self.figsize) + axes = fig.add_subplot(111) + else: + fig = self.ax.get_figure() + if self.figsize is not None: + fig.set_size_inches(self.figsize) + axes = self.ax + + axes = flatten_axes(axes) + + if self.logx is True or self.loglog is True: + [a.set_xscale("log") for a in axes] + elif self.logx == "sym" or self.loglog == "sym": + [a.set_xscale("symlog") for a in axes] + + if self.logy is True or self.loglog is True: + [a.set_yscale("log") for a in axes] + elif self.logy == "sym" or self.loglog == "sym": + [a.set_yscale("symlog") for a in axes] + + axes_seq = cast(Sequence["Axes"], axes) + return axes_seq, fig + + @property + def result(self): + """ + Return result axes + """ + if self.subplots: + if self.layout is not None and not is_list_like(self.ax): + # error: "Sequence[Any]" has no attribute "reshape" + return self.axes.reshape(*self.layout) # type: ignore[attr-defined] + else: + return self.axes + else: + sec_true = isinstance(self.secondary_y, bool) and self.secondary_y + # error: Argument 1 to "len" has incompatible type "Union[bool, + # Tuple[Any, ...], List[Any], ndarray[Any, Any]]"; expected "Sized" + all_sec = ( + is_list_like(self.secondary_y) + and len(self.secondary_y) == self.nseries # type: ignore[arg-type] + ) + if sec_true or all_sec: + # if all data is plotted on secondary, return right axes + return self._get_ax_layer(self.axes[0], primary=False) + else: + return self.axes[0] + + @final + @staticmethod + def _convert_to_ndarray(data): + # GH31357: categorical columns are processed separately + if isinstance(data.dtype, CategoricalDtype): + return data + + # GH32073: cast to float if values contain nulled integers + if (is_integer_dtype(data.dtype) or is_float_dtype(data.dtype)) and isinstance( + data.dtype, ExtensionDtype + ): + return data.to_numpy(dtype="float", na_value=np.nan) + + # GH25587: cast ExtensionArray of pandas (IntegerArray, etc.) to + # np.ndarray before plot. + if len(data) > 0: + return np.asarray(data) + + return data + + @final + def _ensure_frame(self, data) -> DataFrame: + if isinstance(data, ABCSeries): + label = self.label + if label is None and data.name is None: + label = "" + if label is None: + # We'll end up with columns of [0] instead of [None] + data = data.to_frame() + else: + data = data.to_frame(name=label) + elif self._kind in ("hist", "box"): + cols = self.columns if self.by is None else self.columns + self.by + data = data.loc[:, cols] + return data + + @final + def _compute_plot_data(self) -> None: + data = self.data + + # GH15079 reconstruct data if by is defined + if self.by is not None: + self.subplots = True + data = reconstruct_data_with_by(self.data, by=self.by, cols=self.columns) + + # GH16953, infer_objects is needed as fallback, for ``Series`` + # with ``dtype == object`` + data = data.infer_objects(copy=False) + include_type = [np.number, "datetime", "datetimetz", "timedelta"] + + # GH23719, allow plotting boolean + if self.include_bool is True: + include_type.append(np.bool_) + + # GH22799, exclude datetime-like type for boxplot + exclude_type = None + if self._kind == "box": + # TODO: change after solving issue 27881 + include_type = [np.number] + exclude_type = ["timedelta"] + + # GH 18755, include object and category type for scatter plot + if self._kind == "scatter": + include_type.extend(["object", "category", "string"]) + + numeric_data = data.select_dtypes(include=include_type, exclude=exclude_type) + + is_empty = numeric_data.shape[-1] == 0 + # no non-numeric frames or series allowed + if is_empty: + raise TypeError("no numeric data to plot") + + self.data = numeric_data.apply(type(self)._convert_to_ndarray) + + def _make_plot(self, fig: Figure) -> None: + raise AbstractMethodError(self) + + @final + def _add_table(self) -> None: + if self.table is False: + return + elif self.table is True: + data = self.data.transpose() + else: + data = self.table + ax = self._get_ax(0) + tools.table(ax, data) + + @final + def _post_plot_logic_common(self, ax: Axes) -> None: + """Common post process for each axes""" + if self.orientation == "vertical" or self.orientation is None: + type(self)._apply_axis_properties( + ax.xaxis, rot=self.rot, fontsize=self.fontsize + ) + type(self)._apply_axis_properties(ax.yaxis, fontsize=self.fontsize) + + if hasattr(ax, "right_ax"): + type(self)._apply_axis_properties( + ax.right_ax.yaxis, fontsize=self.fontsize + ) + + elif self.orientation == "horizontal": + type(self)._apply_axis_properties( + ax.yaxis, rot=self.rot, fontsize=self.fontsize + ) + type(self)._apply_axis_properties(ax.xaxis, fontsize=self.fontsize) + + if hasattr(ax, "right_ax"): + type(self)._apply_axis_properties( + ax.right_ax.yaxis, fontsize=self.fontsize + ) + else: # pragma no cover + raise ValueError + + @abstractmethod + def _post_plot_logic(self, ax: Axes, data) -> None: + """Post process for each axes. Overridden in child classes""" + + @final + def _adorn_subplots(self, fig: Figure) -> None: + """Common post process unrelated to data""" + if len(self.axes) > 0: + all_axes = self._get_subplots(fig) + nrows, ncols = self._get_axes_layout(fig) + handle_shared_axes( + axarr=all_axes, + nplots=len(all_axes), + naxes=nrows * ncols, + nrows=nrows, + ncols=ncols, + sharex=self.sharex, + sharey=self.sharey, + ) + + for ax in self.axes: + ax = getattr(ax, "right_ax", ax) + if self.yticks is not None: + ax.set_yticks(self.yticks) + + if self.xticks is not None: + ax.set_xticks(self.xticks) + + if self.ylim is not None: + ax.set_ylim(self.ylim) + + if self.xlim is not None: + ax.set_xlim(self.xlim) + + # GH9093, currently Pandas does not show ylabel, so if users provide + # ylabel will set it as ylabel in the plot. + if self.ylabel is not None: + ax.set_ylabel(pprint_thing(self.ylabel)) + + ax.grid(self.grid) + + if self.title: + if self.subplots: + if is_list_like(self.title): + if len(self.title) != self.nseries: + raise ValueError( + "The length of `title` must equal the number " + "of columns if using `title` of type `list` " + "and `subplots=True`.\n" + f"length of title = {len(self.title)}\n" + f"number of columns = {self.nseries}" + ) + + for ax, title in zip(self.axes, self.title): + ax.set_title(title) + else: + fig.suptitle(self.title) + else: + if is_list_like(self.title): + msg = ( + "Using `title` of type `list` is not supported " + "unless `subplots=True` is passed" + ) + raise ValueError(msg) + self.axes[0].set_title(self.title) + + @final + @staticmethod + def _apply_axis_properties( + axis: Axis, rot=None, fontsize: int | None = None + ) -> None: + """ + Tick creation within matplotlib is reasonably expensive and is + internally deferred until accessed as Ticks are created/destroyed + multiple times per draw. It's therefore beneficial for us to avoid + accessing unless we will act on the Tick. + """ + if rot is not None or fontsize is not None: + # rot=0 is a valid setting, hence the explicit None check + labels = axis.get_majorticklabels() + axis.get_minorticklabels() + for label in labels: + if rot is not None: + label.set_rotation(rot) + if fontsize is not None: + label.set_fontsize(fontsize) + + @final + @property + def legend_title(self) -> str | None: + if not isinstance(self.data.columns, ABCMultiIndex): + name = self.data.columns.name + if name is not None: + name = pprint_thing(name) + return name + else: + stringified = map(pprint_thing, self.data.columns.names) + return ",".join(stringified) + + @final + def _mark_right_label(self, label: str, index: int) -> str: + """ + Append ``(right)`` to the label of a line if it's plotted on the right axis. + + Note that ``(right)`` is only appended when ``subplots=False``. + """ + if not self.subplots and self.mark_right and self.on_right(index): + label += " (right)" + return label + + @final + def _append_legend_handles_labels(self, handle: Artist, label: str) -> None: + """ + Append current handle and label to ``legend_handles`` and ``legend_labels``. + + These will be used to make the legend. + """ + self.legend_handles.append(handle) + self.legend_labels.append(label) + + def _make_legend(self) -> None: + ax, leg = self._get_ax_legend(self.axes[0]) + + handles = [] + labels = [] + title = "" + + if not self.subplots: + if leg is not None: + title = leg.get_title().get_text() + # Replace leg.legend_handles because it misses marker info + if Version(mpl.__version__) < Version("3.7"): + handles = leg.legendHandles + else: + handles = leg.legend_handles + labels = [x.get_text() for x in leg.get_texts()] + + if self.legend: + if self.legend == "reverse": + handles += reversed(self.legend_handles) + labels += reversed(self.legend_labels) + else: + handles += self.legend_handles + labels += self.legend_labels + + if self.legend_title is not None: + title = self.legend_title + + if len(handles) > 0: + ax.legend(handles, labels, loc="best", title=title) + + elif self.subplots and self.legend: + for ax in self.axes: + if ax.get_visible(): + with warnings.catch_warnings(): + warnings.filterwarnings( + "ignore", + "No artists with labels found to put in legend.", + UserWarning, + ) + ax.legend(loc="best") + + @final + @staticmethod + def _get_ax_legend(ax: Axes): + """ + Take in axes and return ax and legend under different scenarios + """ + leg = ax.get_legend() + + other_ax = getattr(ax, "left_ax", None) or getattr(ax, "right_ax", None) + other_leg = None + if other_ax is not None: + other_leg = other_ax.get_legend() + if leg is None and other_leg is not None: + leg = other_leg + ax = other_ax + return ax, leg + + @final + @cache_readonly + def plt(self): + import matplotlib.pyplot as plt + + return plt + + _need_to_set_index = False + + @final + def _get_xticks(self): + index = self.data.index + is_datetype = index.inferred_type in ("datetime", "date", "datetime64", "time") + + # TODO: be stricter about x? + x: list[int] | np.ndarray + if self.use_index: + if isinstance(index, ABCPeriodIndex): + # test_mixed_freq_irreg_period + x = index.to_timestamp()._mpl_repr() + # TODO: why do we need to do to_timestamp() here but not other + # places where we call mpl_repr? + elif is_any_real_numeric_dtype(index.dtype): + # Matplotlib supports numeric values or datetime objects as + # xaxis values. Taking LBYL approach here, by the time + # matplotlib raises exception when using non numeric/datetime + # values for xaxis, several actions are already taken by plt. + x = index._mpl_repr() + elif isinstance(index, ABCDatetimeIndex) or is_datetype: + x = index._mpl_repr() + else: + self._need_to_set_index = True + x = list(range(len(index))) + else: + x = list(range(len(index))) + + return x + + @classmethod + @register_pandas_matplotlib_converters + def _plot( + cls, ax: Axes, x, y: np.ndarray, style=None, is_errorbar: bool = False, **kwds + ): + mask = isna(y) + if mask.any(): + y = np.ma.array(y) + y = np.ma.masked_where(mask, y) + + if isinstance(x, ABCIndex): + x = x._mpl_repr() + + if is_errorbar: + if "xerr" in kwds: + kwds["xerr"] = np.array(kwds.get("xerr")) + if "yerr" in kwds: + kwds["yerr"] = np.array(kwds.get("yerr")) + return ax.errorbar(x, y, **kwds) + else: + # prevent style kwarg from going to errorbar, where it is unsupported + args = (x, y, style) if style is not None else (x, y) + return ax.plot(*args, **kwds) + + def _get_custom_index_name(self): + """Specify whether xlabel/ylabel should be used to override index name""" + return self.xlabel + + @final + def _get_index_name(self) -> str | None: + if isinstance(self.data.index, ABCMultiIndex): + name = self.data.index.names + if com.any_not_none(*name): + name = ",".join([pprint_thing(x) for x in name]) + else: + name = None + else: + name = self.data.index.name + if name is not None: + name = pprint_thing(name) + + # GH 45145, override the default axis label if one is provided. + index_name = self._get_custom_index_name() + if index_name is not None: + name = pprint_thing(index_name) + + return name + + @final + @classmethod + def _get_ax_layer(cls, ax, primary: bool = True): + """get left (primary) or right (secondary) axes""" + if primary: + return getattr(ax, "left_ax", ax) + else: + return getattr(ax, "right_ax", ax) + + @final + def _col_idx_to_axis_idx(self, col_idx: int) -> int: + """Return the index of the axis where the column at col_idx should be plotted""" + if isinstance(self.subplots, list): + # Subplots is a list: some columns will be grouped together in the same ax + return next( + group_idx + for (group_idx, group) in enumerate(self.subplots) + if col_idx in group + ) + else: + # subplots is True: one ax per column + return col_idx + + @final + def _get_ax(self, i: int): + # get the twinx ax if appropriate + if self.subplots: + i = self._col_idx_to_axis_idx(i) + ax = self.axes[i] + ax = self._maybe_right_yaxis(ax, i) + # error: Unsupported target for indexed assignment ("Sequence[Any]") + self.axes[i] = ax # type: ignore[index] + else: + ax = self.axes[0] + ax = self._maybe_right_yaxis(ax, i) + + ax.get_yaxis().set_visible(True) + return ax + + @final + def on_right(self, i: int): + if isinstance(self.secondary_y, bool): + return self.secondary_y + + if isinstance(self.secondary_y, (tuple, list, np.ndarray, ABCIndex)): + return self.data.columns[i] in self.secondary_y + + @final + def _apply_style_colors( + self, colors, kwds: dict[str, Any], col_num: int, label: str + ): + """ + Manage style and color based on column number and its label. + Returns tuple of appropriate style and kwds which "color" may be added. + """ + style = None + if self.style is not None: + if isinstance(self.style, list): + try: + style = self.style[col_num] + except IndexError: + pass + elif isinstance(self.style, dict): + style = self.style.get(label, style) + else: + style = self.style + + has_color = "color" in kwds or self.colormap is not None + nocolor_style = style is None or not _color_in_style(style) + if (has_color or self.subplots) and nocolor_style: + if isinstance(colors, dict): + kwds["color"] = colors[label] + else: + kwds["color"] = colors[col_num % len(colors)] + return style, kwds + + def _get_colors( + self, + num_colors: int | None = None, + color_kwds: str = "color", + ): + if num_colors is None: + num_colors = self.nseries + if color_kwds == "color": + color = self.color + else: + color = self.kwds.get(color_kwds) + return get_standard_colors( + num_colors=num_colors, + colormap=self.colormap, + color=color, + ) + + # TODO: tighter typing for first return? + @final + @staticmethod + def _parse_errorbars( + label: str, err, data: NDFrameT, nseries: int + ) -> tuple[Any, NDFrameT]: + """ + Look for error keyword arguments and return the actual errorbar data + or return the error DataFrame/dict + + Error bars can be specified in several ways: + Series: the user provides a pandas.Series object of the same + length as the data + ndarray: provides a np.ndarray of the same length as the data + DataFrame/dict: error values are paired with keys matching the + key in the plotted DataFrame + str: the name of the column within the plotted DataFrame + + Asymmetrical error bars are also supported, however raw error values + must be provided in this case. For a ``N`` length :class:`Series`, a + ``2xN`` array should be provided indicating lower and upper (or left + and right) errors. For a ``MxN`` :class:`DataFrame`, asymmetrical errors + should be in a ``Mx2xN`` array. + """ + if err is None: + return None, data + + def match_labels(data, e): + e = e.reindex(data.index) + return e + + # key-matched DataFrame + if isinstance(err, ABCDataFrame): + err = match_labels(data, err) + # key-matched dict + elif isinstance(err, dict): + pass + + # Series of error values + elif isinstance(err, ABCSeries): + # broadcast error series across data + err = match_labels(data, err) + err = np.atleast_2d(err) + err = np.tile(err, (nseries, 1)) + + # errors are a column in the dataframe + elif isinstance(err, str): + evalues = data[err].values + data = data[data.columns.drop(err)] + err = np.atleast_2d(evalues) + err = np.tile(err, (nseries, 1)) + + elif is_list_like(err): + if is_iterator(err): + err = np.atleast_2d(list(err)) + else: + # raw error values + err = np.atleast_2d(err) + + err_shape = err.shape + + # asymmetrical error bars + if isinstance(data, ABCSeries) and err_shape[0] == 2: + err = np.expand_dims(err, 0) + err_shape = err.shape + if err_shape[2] != len(data): + raise ValueError( + "Asymmetrical error bars should be provided " + f"with the shape (2, {len(data)})" + ) + elif isinstance(data, ABCDataFrame) and err.ndim == 3: + if ( + (err_shape[0] != nseries) + or (err_shape[1] != 2) + or (err_shape[2] != len(data)) + ): + raise ValueError( + "Asymmetrical error bars should be provided " + f"with the shape ({nseries}, 2, {len(data)})" + ) + + # broadcast errors to each data series + if len(err) == 1: + err = np.tile(err, (nseries, 1)) + + elif is_number(err): + err = np.tile( + [err], + (nseries, len(data)), + ) + + else: + msg = f"No valid {label} detected" + raise ValueError(msg) + + return err, data + + @final + def _get_errorbars( + self, label=None, index=None, xerr: bool = True, yerr: bool = True + ) -> dict[str, Any]: + errors = {} + + for kw, flag in zip(["xerr", "yerr"], [xerr, yerr]): + if flag: + err = self.errors[kw] + # user provided label-matched dataframe of errors + if isinstance(err, (ABCDataFrame, dict)): + if label is not None and label in err.keys(): + err = err[label] + else: + err = None + elif index is not None and err is not None: + err = err[index] + + if err is not None: + errors[kw] = err + return errors + + @final + def _get_subplots(self, fig: Figure): + if Version(mpl.__version__) < Version("3.8"): + from matplotlib.axes import Subplot as Klass + else: + from matplotlib.axes import Axes as Klass + + return [ + ax + for ax in fig.get_axes() + if (isinstance(ax, Klass) and ax.get_subplotspec() is not None) + ] + + @final + def _get_axes_layout(self, fig: Figure) -> tuple[int, int]: + axes = self._get_subplots(fig) + x_set = set() + y_set = set() + for ax in axes: + # check axes coordinates to estimate layout + points = ax.get_position().get_points() + x_set.add(points[0][0]) + y_set.add(points[0][1]) + return (len(y_set), len(x_set)) + + +class PlanePlot(MPLPlot, ABC): + """ + Abstract class for plotting on plane, currently scatter and hexbin. + """ + + _layout_type = "single" + + def __init__(self, data, x, y, **kwargs) -> None: + MPLPlot.__init__(self, data, **kwargs) + if x is None or y is None: + raise ValueError(self._kind + " requires an x and y column") + if is_integer(x) and not self.data.columns._holds_integer(): + x = self.data.columns[x] + if is_integer(y) and not self.data.columns._holds_integer(): + y = self.data.columns[y] + + self.x = x + self.y = y + + @final + def _get_nseries(self, data: Series | DataFrame) -> int: + return 1 + + @final + def _post_plot_logic(self, ax: Axes, data) -> None: + x, y = self.x, self.y + xlabel = self.xlabel if self.xlabel is not None else pprint_thing(x) + ylabel = self.ylabel if self.ylabel is not None else pprint_thing(y) + # error: Argument 1 to "set_xlabel" of "_AxesBase" has incompatible + # type "Hashable"; expected "str" + ax.set_xlabel(xlabel) # type: ignore[arg-type] + ax.set_ylabel(ylabel) # type: ignore[arg-type] + + @final + def _plot_colorbar(self, ax: Axes, *, fig: Figure, **kwds): + # Addresses issues #10611 and #10678: + # When plotting scatterplots and hexbinplots in IPython + # inline backend the colorbar axis height tends not to + # exactly match the parent axis height. + # The difference is due to small fractional differences + # in floating points with similar representation. + # To deal with this, this method forces the colorbar + # height to take the height of the parent axes. + # For a more detailed description of the issue + # see the following link: + # https://github.com/ipython/ipython/issues/11215 + + # GH33389, if ax is used multiple times, we should always + # use the last one which contains the latest information + # about the ax + img = ax.collections[-1] + return fig.colorbar(img, ax=ax, **kwds) + + +class ScatterPlot(PlanePlot): + @property + def _kind(self) -> Literal["scatter"]: + return "scatter" + + def __init__( + self, + data, + x, + y, + s=None, + c=None, + *, + colorbar: bool | lib.NoDefault = lib.no_default, + norm=None, + **kwargs, + ) -> None: + if s is None: + # hide the matplotlib default for size, in case we want to change + # the handling of this argument later + s = 20 + elif is_hashable(s) and s in data.columns: + s = data[s] + self.s = s + + self.colorbar = colorbar + self.norm = norm + + super().__init__(data, x, y, **kwargs) + if is_integer(c) and not self.data.columns._holds_integer(): + c = self.data.columns[c] + self.c = c + + def _make_plot(self, fig: Figure) -> None: + x, y, c, data = self.x, self.y, self.c, self.data + ax = self.axes[0] + + c_is_column = is_hashable(c) and c in self.data.columns + + color_by_categorical = c_is_column and isinstance( + self.data[c].dtype, CategoricalDtype + ) + + color = self.color + c_values = self._get_c_values(color, color_by_categorical, c_is_column) + norm, cmap = self._get_norm_and_cmap(c_values, color_by_categorical) + cb = self._get_colorbar(c_values, c_is_column) + + if self.legend: + label = self.label + else: + label = None + scatter = ax.scatter( + data[x].values, + data[y].values, + c=c_values, + label=label, + cmap=cmap, + norm=norm, + s=self.s, + **self.kwds, + ) + if cb: + cbar_label = c if c_is_column else "" + cbar = self._plot_colorbar(ax, fig=fig, label=cbar_label) + if color_by_categorical: + n_cats = len(self.data[c].cat.categories) + cbar.set_ticks(np.linspace(0.5, n_cats - 0.5, n_cats)) + cbar.ax.set_yticklabels(self.data[c].cat.categories) + + if label is not None: + self._append_legend_handles_labels( + # error: Argument 2 to "_append_legend_handles_labels" of + # "MPLPlot" has incompatible type "Hashable"; expected "str" + scatter, + label, # type: ignore[arg-type] + ) + + errors_x = self._get_errorbars(label=x, index=0, yerr=False) + errors_y = self._get_errorbars(label=y, index=0, xerr=False) + if len(errors_x) > 0 or len(errors_y) > 0: + err_kwds = dict(errors_x, **errors_y) + err_kwds["ecolor"] = scatter.get_facecolor()[0] + ax.errorbar(data[x].values, data[y].values, linestyle="none", **err_kwds) + + def _get_c_values(self, color, color_by_categorical: bool, c_is_column: bool): + c = self.c + if c is not None and color is not None: + raise TypeError("Specify exactly one of `c` and `color`") + if c is None and color is None: + c_values = self.plt.rcParams["patch.facecolor"] + elif color is not None: + c_values = color + elif color_by_categorical: + c_values = self.data[c].cat.codes + elif c_is_column: + c_values = self.data[c].values + else: + c_values = c + return c_values + + def _get_norm_and_cmap(self, c_values, color_by_categorical: bool): + c = self.c + if self.colormap is not None: + cmap = mpl.colormaps.get_cmap(self.colormap) + # cmap is only used if c_values are integers, otherwise UserWarning. + # GH-53908: additionally call isinstance() because is_integer_dtype + # returns True for "b" (meaning "blue" and not int8 in this context) + elif not isinstance(c_values, str) and is_integer_dtype(c_values): + # pandas uses colormap, matplotlib uses cmap. + cmap = mpl.colormaps["Greys"] + else: + cmap = None + + if color_by_categorical and cmap is not None: + from matplotlib import colors + + n_cats = len(self.data[c].cat.categories) + cmap = colors.ListedColormap([cmap(i) for i in range(cmap.N)]) + bounds = np.linspace(0, n_cats, n_cats + 1) + norm = colors.BoundaryNorm(bounds, cmap.N) + # TODO: warn that we are ignoring self.norm if user specified it? + # Doesn't happen in any tests 2023-11-09 + else: + norm = self.norm + return norm, cmap + + def _get_colorbar(self, c_values, c_is_column: bool) -> bool: + # plot colorbar if + # 1. colormap is assigned, and + # 2.`c` is a column containing only numeric values + plot_colorbar = self.colormap or c_is_column + cb = self.colorbar + if cb is lib.no_default: + return is_numeric_dtype(c_values) and plot_colorbar + return cb + + +class HexBinPlot(PlanePlot): + @property + def _kind(self) -> Literal["hexbin"]: + return "hexbin" + + def __init__(self, data, x, y, C=None, *, colorbar: bool = True, **kwargs) -> None: + super().__init__(data, x, y, **kwargs) + if is_integer(C) and not self.data.columns._holds_integer(): + C = self.data.columns[C] + self.C = C + + self.colorbar = colorbar + + # Scatter plot allows to plot objects data + if len(self.data[self.x]._get_numeric_data()) == 0: + raise ValueError(self._kind + " requires x column to be numeric") + if len(self.data[self.y]._get_numeric_data()) == 0: + raise ValueError(self._kind + " requires y column to be numeric") + + def _make_plot(self, fig: Figure) -> None: + x, y, data, C = self.x, self.y, self.data, self.C + ax = self.axes[0] + # pandas uses colormap, matplotlib uses cmap. + cmap = self.colormap or "BuGn" + cmap = mpl.colormaps.get_cmap(cmap) + cb = self.colorbar + + if C is None: + c_values = None + else: + c_values = data[C].values + + ax.hexbin(data[x].values, data[y].values, C=c_values, cmap=cmap, **self.kwds) + if cb: + self._plot_colorbar(ax, fig=fig) + + def _make_legend(self) -> None: + pass + + +class LinePlot(MPLPlot): + _default_rot = 0 + + @property + def orientation(self) -> PlottingOrientation: + return "vertical" + + @property + def _kind(self) -> Literal["line", "area", "hist", "kde", "box"]: + return "line" + + def __init__(self, data, **kwargs) -> None: + from pandas.plotting import plot_params + + MPLPlot.__init__(self, data, **kwargs) + if self.stacked: + self.data = self.data.fillna(value=0) + self.x_compat = plot_params["x_compat"] + if "x_compat" in self.kwds: + self.x_compat = bool(self.kwds.pop("x_compat")) + + @final + def _is_ts_plot(self) -> bool: + # this is slightly deceptive + return not self.x_compat and self.use_index and self._use_dynamic_x() + + @final + def _use_dynamic_x(self) -> bool: + return use_dynamic_x(self._get_ax(0), self.data) + + def _make_plot(self, fig: Figure) -> None: + if self._is_ts_plot(): + data = maybe_convert_index(self._get_ax(0), self.data) + + x = data.index # dummy, not used + plotf = self._ts_plot + it = data.items() + else: + x = self._get_xticks() + # error: Incompatible types in assignment (expression has type + # "Callable[[Any, Any, Any, Any, Any, Any, KwArg(Any)], Any]", variable has + # type "Callable[[Any, Any, Any, Any, KwArg(Any)], Any]") + plotf = self._plot # type: ignore[assignment] + # error: Incompatible types in assignment (expression has type + # "Iterator[tuple[Hashable, ndarray[Any, Any]]]", variable has + # type "Iterable[tuple[Hashable, Series]]") + it = self._iter_data(data=self.data) # type: ignore[assignment] + + stacking_id = self._get_stacking_id() + is_errorbar = com.any_not_none(*self.errors.values()) + + colors = self._get_colors() + for i, (label, y) in enumerate(it): + ax = self._get_ax(i) + kwds = self.kwds.copy() + if self.color is not None: + kwds["color"] = self.color + style, kwds = self._apply_style_colors( + colors, + kwds, + i, + # error: Argument 4 to "_apply_style_colors" of "MPLPlot" has + # incompatible type "Hashable"; expected "str" + label, # type: ignore[arg-type] + ) + + errors = self._get_errorbars(label=label, index=i) + kwds = dict(kwds, **errors) + + label = pprint_thing(label) + label = self._mark_right_label(label, index=i) + kwds["label"] = label + + newlines = plotf( + ax, + x, + y, + style=style, + column_num=i, + stacking_id=stacking_id, + is_errorbar=is_errorbar, + **kwds, + ) + self._append_legend_handles_labels(newlines[0], label) + + if self._is_ts_plot(): + # reset of xlim should be used for ts data + # TODO: GH28021, should find a way to change view limit on xaxis + lines = get_all_lines(ax) + left, right = get_xlim(lines) + ax.set_xlim(left, right) + + # error: Signature of "_plot" incompatible with supertype "MPLPlot" + @classmethod + def _plot( # type: ignore[override] + cls, + ax: Axes, + x, + y: np.ndarray, + style=None, + column_num=None, + stacking_id=None, + **kwds, + ): + # column_num is used to get the target column from plotf in line and + # area plots + if column_num == 0: + cls._initialize_stacker(ax, stacking_id, len(y)) + y_values = cls._get_stacked_values(ax, stacking_id, y, kwds["label"]) + lines = MPLPlot._plot(ax, x, y_values, style=style, **kwds) + cls._update_stacker(ax, stacking_id, y) + return lines + + @final + def _ts_plot(self, ax: Axes, x, data: Series, style=None, **kwds): + # accept x to be consistent with normal plot func, + # x is not passed to tsplot as it uses data.index as x coordinate + # column_num must be in kwds for stacking purpose + freq, data = maybe_resample(data, ax, kwds) + + # Set ax with freq info + decorate_axes(ax, freq) + # digging deeper + if hasattr(ax, "left_ax"): + decorate_axes(ax.left_ax, freq) + if hasattr(ax, "right_ax"): + decorate_axes(ax.right_ax, freq) + # TODO #54485 + ax._plot_data.append((data, self._kind, kwds)) # type: ignore[attr-defined] + + lines = self._plot(ax, data.index, np.asarray(data.values), style=style, **kwds) + # set date formatter, locators and rescale limits + # TODO #54485 + format_dateaxis(ax, ax.freq, data.index) # type: ignore[arg-type, attr-defined] + return lines + + @final + def _get_stacking_id(self) -> int | None: + if self.stacked: + return id(self.data) + else: + return None + + @final + @classmethod + def _initialize_stacker(cls, ax: Axes, stacking_id, n: int) -> None: + if stacking_id is None: + return + if not hasattr(ax, "_stacker_pos_prior"): + # TODO #54485 + ax._stacker_pos_prior = {} # type: ignore[attr-defined] + if not hasattr(ax, "_stacker_neg_prior"): + # TODO #54485 + ax._stacker_neg_prior = {} # type: ignore[attr-defined] + # TODO #54485 + ax._stacker_pos_prior[stacking_id] = np.zeros(n) # type: ignore[attr-defined] + # TODO #54485 + ax._stacker_neg_prior[stacking_id] = np.zeros(n) # type: ignore[attr-defined] + + @final + @classmethod + def _get_stacked_values( + cls, ax: Axes, stacking_id: int | None, values: np.ndarray, label + ) -> np.ndarray: + if stacking_id is None: + return values + if not hasattr(ax, "_stacker_pos_prior"): + # stacker may not be initialized for subplots + cls._initialize_stacker(ax, stacking_id, len(values)) + + if (values >= 0).all(): + # TODO #54485 + return ( + ax._stacker_pos_prior[stacking_id] # type: ignore[attr-defined] + + values + ) + elif (values <= 0).all(): + # TODO #54485 + return ( + ax._stacker_neg_prior[stacking_id] # type: ignore[attr-defined] + + values + ) + + raise ValueError( + "When stacked is True, each column must be either " + "all positive or all negative. " + f"Column '{label}' contains both positive and negative values" + ) + + @final + @classmethod + def _update_stacker(cls, ax: Axes, stacking_id: int | None, values) -> None: + if stacking_id is None: + return + if (values >= 0).all(): + # TODO #54485 + ax._stacker_pos_prior[stacking_id] += values # type: ignore[attr-defined] + elif (values <= 0).all(): + # TODO #54485 + ax._stacker_neg_prior[stacking_id] += values # type: ignore[attr-defined] + + def _post_plot_logic(self, ax: Axes, data) -> None: + from matplotlib.ticker import FixedLocator + + def get_label(i): + if is_float(i) and i.is_integer(): + i = int(i) + try: + return pprint_thing(data.index[i]) + except Exception: + return "" + + if self._need_to_set_index: + xticks = ax.get_xticks() + xticklabels = [get_label(x) for x in xticks] + # error: Argument 1 to "FixedLocator" has incompatible type "ndarray[Any, + # Any]"; expected "Sequence[float]" + ax.xaxis.set_major_locator(FixedLocator(xticks)) # type: ignore[arg-type] + ax.set_xticklabels(xticklabels) + + # If the index is an irregular time series, then by default + # we rotate the tick labels. The exception is if there are + # subplots which don't share their x-axes, in which we case + # we don't rotate the ticklabels as by default the subplots + # would be too close together. + condition = ( + not self._use_dynamic_x() + and (data.index._is_all_dates and self.use_index) + and (not self.subplots or (self.subplots and self.sharex)) + ) + + index_name = self._get_index_name() + + if condition: + # irregular TS rotated 30 deg. by default + # probably a better place to check / set this. + if not self._rot_set: + self.rot = 30 + format_date_labels(ax, rot=self.rot) + + if index_name is not None and self.use_index: + ax.set_xlabel(index_name) + + +class AreaPlot(LinePlot): + @property + def _kind(self) -> Literal["area"]: + return "area" + + def __init__(self, data, **kwargs) -> None: + kwargs.setdefault("stacked", True) + with warnings.catch_warnings(): + warnings.filterwarnings( + "ignore", + "Downcasting object dtype arrays", + category=FutureWarning, + ) + data = data.fillna(value=0) + LinePlot.__init__(self, data, **kwargs) + + if not self.stacked: + # use smaller alpha to distinguish overlap + self.kwds.setdefault("alpha", 0.5) + + if self.logy or self.loglog: + raise ValueError("Log-y scales are not supported in area plot") + + # error: Signature of "_plot" incompatible with supertype "MPLPlot" + @classmethod + def _plot( # type: ignore[override] + cls, + ax: Axes, + x, + y: np.ndarray, + style=None, + column_num=None, + stacking_id=None, + is_errorbar: bool = False, + **kwds, + ): + if column_num == 0: + cls._initialize_stacker(ax, stacking_id, len(y)) + y_values = cls._get_stacked_values(ax, stacking_id, y, kwds["label"]) + + # need to remove label, because subplots uses mpl legend as it is + line_kwds = kwds.copy() + line_kwds.pop("label") + lines = MPLPlot._plot(ax, x, y_values, style=style, **line_kwds) + + # get data from the line to get coordinates for fill_between + xdata, y_values = lines[0].get_data(orig=False) + + # unable to use ``_get_stacked_values`` here to get starting point + if stacking_id is None: + start = np.zeros(len(y)) + elif (y >= 0).all(): + # TODO #54485 + start = ax._stacker_pos_prior[stacking_id] # type: ignore[attr-defined] + elif (y <= 0).all(): + # TODO #54485 + start = ax._stacker_neg_prior[stacking_id] # type: ignore[attr-defined] + else: + start = np.zeros(len(y)) + + if "color" not in kwds: + kwds["color"] = lines[0].get_color() + + rect = ax.fill_between(xdata, start, y_values, **kwds) + cls._update_stacker(ax, stacking_id, y) + + # LinePlot expects list of artists + res = [rect] + return res + + def _post_plot_logic(self, ax: Axes, data) -> None: + LinePlot._post_plot_logic(self, ax, data) + + is_shared_y = len(list(ax.get_shared_y_axes())) > 0 + # do not override the default axis behaviour in case of shared y axes + if self.ylim is None and not is_shared_y: + if (data >= 0).all().all(): + ax.set_ylim(0, None) + elif (data <= 0).all().all(): + ax.set_ylim(None, 0) + + +class BarPlot(MPLPlot): + @property + def _kind(self) -> Literal["bar", "barh"]: + return "bar" + + _default_rot = 90 + + @property + def orientation(self) -> PlottingOrientation: + return "vertical" + + def __init__( + self, + data, + *, + align="center", + bottom=0, + left=0, + width=0.5, + position=0.5, + log=False, + **kwargs, + ) -> None: + # we have to treat a series differently than a + # 1-column DataFrame w.r.t. color handling + self._is_series = isinstance(data, ABCSeries) + self.bar_width = width + self._align = align + self._position = position + self.tick_pos = np.arange(len(data)) + + if is_list_like(bottom): + bottom = np.array(bottom) + if is_list_like(left): + left = np.array(left) + self.bottom = bottom + self.left = left + + self.log = log + + MPLPlot.__init__(self, data, **kwargs) + + @cache_readonly + def ax_pos(self) -> np.ndarray: + return self.tick_pos - self.tickoffset + + @cache_readonly + def tickoffset(self): + if self.stacked or self.subplots: + return self.bar_width * self._position + elif self._align == "edge": + w = self.bar_width / self.nseries + return self.bar_width * (self._position - 0.5) + w * 0.5 + else: + return self.bar_width * self._position + + @cache_readonly + def lim_offset(self): + if self.stacked or self.subplots: + if self._align == "edge": + return self.bar_width / 2 + else: + return 0 + elif self._align == "edge": + w = self.bar_width / self.nseries + return w * 0.5 + else: + return 0 + + # error: Signature of "_plot" incompatible with supertype "MPLPlot" + @classmethod + def _plot( # type: ignore[override] + cls, + ax: Axes, + x, + y: np.ndarray, + w, + start: int | npt.NDArray[np.intp] = 0, + log: bool = False, + **kwds, + ): + return ax.bar(x, y, w, bottom=start, log=log, **kwds) + + @property + def _start_base(self): + return self.bottom + + def _make_plot(self, fig: Figure) -> None: + colors = self._get_colors() + ncolors = len(colors) + + pos_prior = neg_prior = np.zeros(len(self.data)) + K = self.nseries + + data = self.data.fillna(0) + for i, (label, y) in enumerate(self._iter_data(data=data)): + ax = self._get_ax(i) + kwds = self.kwds.copy() + if self._is_series: + kwds["color"] = colors + elif isinstance(colors, dict): + kwds["color"] = colors[label] + else: + kwds["color"] = colors[i % ncolors] + + errors = self._get_errorbars(label=label, index=i) + kwds = dict(kwds, **errors) + + label = pprint_thing(label) + label = self._mark_right_label(label, index=i) + + if (("yerr" in kwds) or ("xerr" in kwds)) and (kwds.get("ecolor") is None): + kwds["ecolor"] = mpl.rcParams["xtick.color"] + + start = 0 + if self.log and (y >= 1).all(): + start = 1 + start = start + self._start_base + + kwds["align"] = self._align + if self.subplots: + w = self.bar_width / 2 + rect = self._plot( + ax, + self.ax_pos + w, + y, + self.bar_width, + start=start, + label=label, + log=self.log, + **kwds, + ) + ax.set_title(label) + elif self.stacked: + mask = y > 0 + start = np.where(mask, pos_prior, neg_prior) + self._start_base + w = self.bar_width / 2 + rect = self._plot( + ax, + self.ax_pos + w, + y, + self.bar_width, + start=start, + label=label, + log=self.log, + **kwds, + ) + pos_prior = pos_prior + np.where(mask, y, 0) + neg_prior = neg_prior + np.where(mask, 0, y) + else: + w = self.bar_width / K + rect = self._plot( + ax, + self.ax_pos + (i + 0.5) * w, + y, + w, + start=start, + label=label, + log=self.log, + **kwds, + ) + self._append_legend_handles_labels(rect, label) + + def _post_plot_logic(self, ax: Axes, data) -> None: + if self.use_index: + str_index = [pprint_thing(key) for key in data.index] + else: + str_index = [pprint_thing(key) for key in range(data.shape[0])] + + s_edge = self.ax_pos[0] - 0.25 + self.lim_offset + e_edge = self.ax_pos[-1] + 0.25 + self.bar_width + self.lim_offset + + self._decorate_ticks(ax, self._get_index_name(), str_index, s_edge, e_edge) + + def _decorate_ticks( + self, + ax: Axes, + name: str | None, + ticklabels: list[str], + start_edge: float, + end_edge: float, + ) -> None: + ax.set_xlim((start_edge, end_edge)) + + if self.xticks is not None: + ax.set_xticks(np.array(self.xticks)) + else: + ax.set_xticks(self.tick_pos) + ax.set_xticklabels(ticklabels) + + if name is not None and self.use_index: + ax.set_xlabel(name) + + +class BarhPlot(BarPlot): + @property + def _kind(self) -> Literal["barh"]: + return "barh" + + _default_rot = 0 + + @property + def orientation(self) -> Literal["horizontal"]: + return "horizontal" + + @property + def _start_base(self): + return self.left + + # error: Signature of "_plot" incompatible with supertype "MPLPlot" + @classmethod + def _plot( # type: ignore[override] + cls, + ax: Axes, + x, + y: np.ndarray, + w, + start: int | npt.NDArray[np.intp] = 0, + log: bool = False, + **kwds, + ): + return ax.barh(x, y, w, left=start, log=log, **kwds) + + def _get_custom_index_name(self): + return self.ylabel + + def _decorate_ticks( + self, + ax: Axes, + name: str | None, + ticklabels: list[str], + start_edge: float, + end_edge: float, + ) -> None: + # horizontal bars + ax.set_ylim((start_edge, end_edge)) + ax.set_yticks(self.tick_pos) + ax.set_yticklabels(ticklabels) + if name is not None and self.use_index: + ax.set_ylabel(name) + # error: Argument 1 to "set_xlabel" of "_AxesBase" has incompatible type + # "Hashable | None"; expected "str" + ax.set_xlabel(self.xlabel) # type: ignore[arg-type] + + +class PiePlot(MPLPlot): + @property + def _kind(self) -> Literal["pie"]: + return "pie" + + _layout_type = "horizontal" + + def __init__(self, data, kind=None, **kwargs) -> None: + data = data.fillna(value=0) + if (data < 0).any().any(): + raise ValueError(f"{self._kind} plot doesn't allow negative values") + MPLPlot.__init__(self, data, kind=kind, **kwargs) + + @classmethod + def _validate_log_kwd( + cls, + kwd: str, + value: bool | None | Literal["sym"], + ) -> bool | None | Literal["sym"]: + super()._validate_log_kwd(kwd=kwd, value=value) + if value is not False: + warnings.warn( + f"PiePlot ignores the '{kwd}' keyword", + UserWarning, + stacklevel=find_stack_level(), + ) + return False + + def _validate_color_args(self, color, colormap) -> None: + # TODO: warn if color is passed and ignored? + return None + + def _make_plot(self, fig: Figure) -> None: + colors = self._get_colors(num_colors=len(self.data), color_kwds="colors") + self.kwds.setdefault("colors", colors) + + for i, (label, y) in enumerate(self._iter_data(data=self.data)): + ax = self._get_ax(i) + if label is not None: + label = pprint_thing(label) + ax.set_ylabel(label) + + kwds = self.kwds.copy() + + def blank_labeler(label, value): + if value == 0: + return "" + else: + return label + + idx = [pprint_thing(v) for v in self.data.index] + labels = kwds.pop("labels", idx) + # labels is used for each wedge's labels + # Blank out labels for values of 0 so they don't overlap + # with nonzero wedges + if labels is not None: + blabels = [blank_labeler(left, value) for left, value in zip(labels, y)] + else: + blabels = None + results = ax.pie(y, labels=blabels, **kwds) + + if kwds.get("autopct", None) is not None: + patches, texts, autotexts = results + else: + patches, texts = results + autotexts = [] + + if self.fontsize is not None: + for t in texts + autotexts: + t.set_fontsize(self.fontsize) + + # leglabels is used for legend labels + leglabels = labels if labels is not None else idx + for _patch, _leglabel in zip(patches, leglabels): + self._append_legend_handles_labels(_patch, _leglabel) + + def _post_plot_logic(self, ax: Axes, data) -> None: + pass diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/plotting/_matplotlib/groupby.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/plotting/_matplotlib/groupby.py new file mode 100644 index 0000000000000000000000000000000000000000..cbb66065a8039c63b7181619aea3aa74277da4a5 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/plotting/_matplotlib/groupby.py @@ -0,0 +1,142 @@ +from __future__ import annotations + +from typing import TYPE_CHECKING + +import numpy as np + +from pandas.core.dtypes.missing import remove_na_arraylike + +from pandas import ( + MultiIndex, + concat, +) + +from pandas.plotting._matplotlib.misc import unpack_single_str_list + +if TYPE_CHECKING: + from collections.abc import Hashable + + from pandas._typing import IndexLabel + + from pandas import ( + DataFrame, + Series, + ) + + +def create_iter_data_given_by( + data: DataFrame, kind: str = "hist" +) -> dict[Hashable, DataFrame | Series]: + """ + Create data for iteration given `by` is assigned or not, and it is only + used in both hist and boxplot. + + If `by` is assigned, return a dictionary of DataFrames in which the key of + dictionary is the values in groups. + If `by` is not assigned, return input as is, and this preserves current + status of iter_data. + + Parameters + ---------- + data : reformatted grouped data from `_compute_plot_data` method. + kind : str, plot kind. This function is only used for `hist` and `box` plots. + + Returns + ------- + iter_data : DataFrame or Dictionary of DataFrames + + Examples + -------- + If `by` is assigned: + + >>> import numpy as np + >>> tuples = [('h1', 'a'), ('h1', 'b'), ('h2', 'a'), ('h2', 'b')] + >>> mi = pd.MultiIndex.from_tuples(tuples) + >>> value = [[1, 3, np.nan, np.nan], + ... [3, 4, np.nan, np.nan], [np.nan, np.nan, 5, 6]] + >>> data = pd.DataFrame(value, columns=mi) + >>> create_iter_data_given_by(data) + {'h1': h1 + a b + 0 1.0 3.0 + 1 3.0 4.0 + 2 NaN NaN, 'h2': h2 + a b + 0 NaN NaN + 1 NaN NaN + 2 5.0 6.0} + """ + + # For `hist` plot, before transformation, the values in level 0 are values + # in groups and subplot titles, and later used for column subselection and + # iteration; For `box` plot, values in level 1 are column names to show, + # and are used for iteration and as subplots titles. + if kind == "hist": + level = 0 + else: + level = 1 + + # Select sub-columns based on the value of level of MI, and if `by` is + # assigned, data must be a MI DataFrame + assert isinstance(data.columns, MultiIndex) + return { + col: data.loc[:, data.columns.get_level_values(level) == col] + for col in data.columns.levels[level] + } + + +def reconstruct_data_with_by( + data: DataFrame, by: IndexLabel, cols: IndexLabel +) -> DataFrame: + """ + Internal function to group data, and reassign multiindex column names onto the + result in order to let grouped data be used in _compute_plot_data method. + + Parameters + ---------- + data : Original DataFrame to plot + by : grouped `by` parameter selected by users + cols : columns of data set (excluding columns used in `by`) + + Returns + ------- + Output is the reconstructed DataFrame with MultiIndex columns. The first level + of MI is unique values of groups, and second level of MI is the columns + selected by users. + + Examples + -------- + >>> d = {'h': ['h1', 'h1', 'h2'], 'a': [1, 3, 5], 'b': [3, 4, 6]} + >>> df = pd.DataFrame(d) + >>> reconstruct_data_with_by(df, by='h', cols=['a', 'b']) + h1 h2 + a b a b + 0 1.0 3.0 NaN NaN + 1 3.0 4.0 NaN NaN + 2 NaN NaN 5.0 6.0 + """ + by_modified = unpack_single_str_list(by) + grouped = data.groupby(by_modified) + + data_list = [] + for key, group in grouped: + # error: List item 1 has incompatible type "Union[Hashable, + # Sequence[Hashable]]"; expected "Iterable[Hashable]" + columns = MultiIndex.from_product([[key], cols]) # type: ignore[list-item] + sub_group = group[cols] + sub_group.columns = columns + data_list.append(sub_group) + + data = concat(data_list, axis=1) + return data + + +def reformat_hist_y_given_by(y: np.ndarray, by: IndexLabel | None) -> np.ndarray: + """Internal function to reformat y given `by` is applied or not for hist plot. + + If by is None, input y is 1-d with NaN removed; and if by is not None, groupby + will take place and input y is multi-dimensional array. + """ + if by is not None and len(y.shape) > 1: + return np.array([remove_na_arraylike(col) for col in y.T]).T + return remove_na_arraylike(y) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/plotting/_matplotlib/hist.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/plotting/_matplotlib/hist.py new file mode 100644 index 0000000000000000000000000000000000000000..e610f1adb602c46ffd7affa50c0f857ad7d030e4 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/plotting/_matplotlib/hist.py @@ -0,0 +1,581 @@ +from __future__ import annotations + +from typing import ( + TYPE_CHECKING, + Any, + Literal, + final, +) + +import numpy as np + +from pandas.core.dtypes.common import ( + is_integer, + is_list_like, +) +from pandas.core.dtypes.generic import ( + ABCDataFrame, + ABCIndex, +) +from pandas.core.dtypes.missing import ( + isna, + remove_na_arraylike, +) + +from pandas.io.formats.printing import pprint_thing +from pandas.plotting._matplotlib.core import ( + LinePlot, + MPLPlot, +) +from pandas.plotting._matplotlib.groupby import ( + create_iter_data_given_by, + reformat_hist_y_given_by, +) +from pandas.plotting._matplotlib.misc import unpack_single_str_list +from pandas.plotting._matplotlib.tools import ( + create_subplots, + flatten_axes, + maybe_adjust_figure, + set_ticks_props, +) + +if TYPE_CHECKING: + from matplotlib.axes import Axes + from matplotlib.figure import Figure + + from pandas._typing import PlottingOrientation + + from pandas import ( + DataFrame, + Series, + ) + + +class HistPlot(LinePlot): + @property + def _kind(self) -> Literal["hist", "kde"]: + return "hist" + + def __init__( + self, + data, + bins: int | np.ndarray | list[np.ndarray] = 10, + bottom: int | np.ndarray = 0, + *, + range=None, + weights=None, + **kwargs, + ) -> None: + if is_list_like(bottom): + bottom = np.array(bottom) + self.bottom = bottom + + self._bin_range = range + self.weights = weights + + self.xlabel = kwargs.get("xlabel") + self.ylabel = kwargs.get("ylabel") + # Do not call LinePlot.__init__ which may fill nan + MPLPlot.__init__(self, data, **kwargs) # pylint: disable=non-parent-init-called + + self.bins = self._adjust_bins(bins) + + def _adjust_bins(self, bins: int | np.ndarray | list[np.ndarray]): + if is_integer(bins): + if self.by is not None: + by_modified = unpack_single_str_list(self.by) + grouped = self.data.groupby(by_modified)[self.columns] + bins = [self._calculate_bins(group, bins) for key, group in grouped] + else: + bins = self._calculate_bins(self.data, bins) + return bins + + def _calculate_bins(self, data: Series | DataFrame, bins) -> np.ndarray: + """Calculate bins given data""" + nd_values = data.infer_objects(copy=False)._get_numeric_data() + values = np.ravel(nd_values) + values = values[~isna(values)] + + hist, bins = np.histogram(values, bins=bins, range=self._bin_range) + return bins + + # error: Signature of "_plot" incompatible with supertype "LinePlot" + @classmethod + def _plot( # type: ignore[override] + cls, + ax: Axes, + y: np.ndarray, + style=None, + bottom: int | np.ndarray = 0, + column_num: int = 0, + stacking_id=None, + *, + bins, + **kwds, + ): + if column_num == 0: + cls._initialize_stacker(ax, stacking_id, len(bins) - 1) + + base = np.zeros(len(bins) - 1) + bottom = bottom + cls._get_stacked_values(ax, stacking_id, base, kwds["label"]) + # ignore style + n, bins, patches = ax.hist(y, bins=bins, bottom=bottom, **kwds) + cls._update_stacker(ax, stacking_id, n) + return patches + + def _make_plot(self, fig: Figure) -> None: + colors = self._get_colors() + stacking_id = self._get_stacking_id() + + # Re-create iterated data if `by` is assigned by users + data = ( + create_iter_data_given_by(self.data, self._kind) + if self.by is not None + else self.data + ) + + # error: Argument "data" to "_iter_data" of "MPLPlot" has incompatible + # type "object"; expected "DataFrame | dict[Hashable, Series | DataFrame]" + for i, (label, y) in enumerate(self._iter_data(data=data)): # type: ignore[arg-type] + ax = self._get_ax(i) + + kwds = self.kwds.copy() + if self.color is not None: + kwds["color"] = self.color + + label = pprint_thing(label) + label = self._mark_right_label(label, index=i) + kwds["label"] = label + + style, kwds = self._apply_style_colors(colors, kwds, i, label) + if style is not None: + kwds["style"] = style + + self._make_plot_keywords(kwds, y) + + # the bins is multi-dimension array now and each plot need only 1-d and + # when by is applied, label should be columns that are grouped + if self.by is not None: + kwds["bins"] = kwds["bins"][i] + kwds["label"] = self.columns + kwds.pop("color") + + if self.weights is not None: + kwds["weights"] = type(self)._get_column_weights(self.weights, i, y) + + y = reformat_hist_y_given_by(y, self.by) + + artists = self._plot(ax, y, column_num=i, stacking_id=stacking_id, **kwds) + + # when by is applied, show title for subplots to know which group it is + if self.by is not None: + ax.set_title(pprint_thing(label)) + + self._append_legend_handles_labels(artists[0], label) + + def _make_plot_keywords(self, kwds: dict[str, Any], y: np.ndarray) -> None: + """merge BoxPlot/KdePlot properties to passed kwds""" + # y is required for KdePlot + kwds["bottom"] = self.bottom + kwds["bins"] = self.bins + + @final + @staticmethod + def _get_column_weights(weights, i: int, y): + # We allow weights to be a multi-dimensional array, e.g. a (10, 2) array, + # and each sub-array (10,) will be called in each iteration. If users only + # provide 1D array, we assume the same weights is used for all iterations + if weights is not None: + if np.ndim(weights) != 1 and np.shape(weights)[-1] != 1: + try: + weights = weights[:, i] + except IndexError as err: + raise ValueError( + "weights must have the same shape as data, " + "or be a single column" + ) from err + weights = weights[~isna(y)] + return weights + + def _post_plot_logic(self, ax: Axes, data) -> None: + if self.orientation == "horizontal": + # error: Argument 1 to "set_xlabel" of "_AxesBase" has incompatible + # type "Hashable"; expected "str" + ax.set_xlabel( + "Frequency" + if self.xlabel is None + else self.xlabel # type: ignore[arg-type] + ) + ax.set_ylabel(self.ylabel) # type: ignore[arg-type] + else: + ax.set_xlabel(self.xlabel) # type: ignore[arg-type] + ax.set_ylabel( + "Frequency" + if self.ylabel is None + else self.ylabel # type: ignore[arg-type] + ) + + @property + def orientation(self) -> PlottingOrientation: + if self.kwds.get("orientation", None) == "horizontal": + return "horizontal" + else: + return "vertical" + + +class KdePlot(HistPlot): + @property + def _kind(self) -> Literal["kde"]: + return "kde" + + @property + def orientation(self) -> Literal["vertical"]: + return "vertical" + + def __init__( + self, data, bw_method=None, ind=None, *, weights=None, **kwargs + ) -> None: + # Do not call LinePlot.__init__ which may fill nan + MPLPlot.__init__(self, data, **kwargs) # pylint: disable=non-parent-init-called + self.bw_method = bw_method + self.ind = ind + self.weights = weights + + @staticmethod + def _get_ind(y: np.ndarray, ind): + if ind is None: + # np.nanmax() and np.nanmin() ignores the missing values + sample_range = np.nanmax(y) - np.nanmin(y) + ind = np.linspace( + np.nanmin(y) - 0.5 * sample_range, + np.nanmax(y) + 0.5 * sample_range, + 1000, + ) + elif is_integer(ind): + sample_range = np.nanmax(y) - np.nanmin(y) + ind = np.linspace( + np.nanmin(y) - 0.5 * sample_range, + np.nanmax(y) + 0.5 * sample_range, + ind, + ) + return ind + + @classmethod + # error: Signature of "_plot" incompatible with supertype "MPLPlot" + def _plot( # type: ignore[override] + cls, + ax: Axes, + y: np.ndarray, + style=None, + bw_method=None, + ind=None, + column_num=None, + stacking_id: int | None = None, + **kwds, + ): + from scipy.stats import gaussian_kde + + y = remove_na_arraylike(y) + gkde = gaussian_kde(y, bw_method=bw_method) + + y = gkde.evaluate(ind) + lines = MPLPlot._plot(ax, ind, y, style=style, **kwds) + return lines + + def _make_plot_keywords(self, kwds: dict[str, Any], y: np.ndarray) -> None: + kwds["bw_method"] = self.bw_method + kwds["ind"] = type(self)._get_ind(y, ind=self.ind) + + def _post_plot_logic(self, ax: Axes, data) -> None: + ax.set_ylabel("Density") + + +def _grouped_plot( + plotf, + data: Series | DataFrame, + column=None, + by=None, + numeric_only: bool = True, + figsize: tuple[float, float] | None = None, + sharex: bool = True, + sharey: bool = True, + layout=None, + rot: float = 0, + ax=None, + **kwargs, +): + # error: Non-overlapping equality check (left operand type: "Optional[Tuple[float, + # float]]", right operand type: "Literal['default']") + if figsize == "default": # type: ignore[comparison-overlap] + # allowed to specify mpl default with 'default' + raise ValueError( + "figsize='default' is no longer supported. " + "Specify figure size by tuple instead" + ) + + grouped = data.groupby(by) + if column is not None: + grouped = grouped[column] + + naxes = len(grouped) + fig, axes = create_subplots( + naxes=naxes, figsize=figsize, sharex=sharex, sharey=sharey, ax=ax, layout=layout + ) + + _axes = flatten_axes(axes) + + for i, (key, group) in enumerate(grouped): + ax = _axes[i] + if numeric_only and isinstance(group, ABCDataFrame): + group = group._get_numeric_data() + plotf(group, ax, **kwargs) + ax.set_title(pprint_thing(key)) + + return fig, axes + + +def _grouped_hist( + data: Series | DataFrame, + column=None, + by=None, + ax=None, + bins: int = 50, + figsize: tuple[float, float] | None = None, + layout=None, + sharex: bool = False, + sharey: bool = False, + rot: float = 90, + grid: bool = True, + xlabelsize: int | None = None, + xrot=None, + ylabelsize: int | None = None, + yrot=None, + legend: bool = False, + **kwargs, +): + """ + Grouped histogram + + Parameters + ---------- + data : Series/DataFrame + column : object, optional + by : object, optional + ax : axes, optional + bins : int, default 50 + figsize : tuple, optional + layout : optional + sharex : bool, default False + sharey : bool, default False + rot : float, default 90 + grid : bool, default True + legend: : bool, default False + kwargs : dict, keyword arguments passed to matplotlib.Axes.hist + + Returns + ------- + collection of Matplotlib Axes + """ + if legend: + assert "label" not in kwargs + if data.ndim == 1: + kwargs["label"] = data.name + elif column is None: + kwargs["label"] = data.columns + else: + kwargs["label"] = column + + def plot_group(group, ax) -> None: + ax.hist(group.dropna().values, bins=bins, **kwargs) + if legend: + ax.legend() + + if xrot is None: + xrot = rot + + fig, axes = _grouped_plot( + plot_group, + data, + column=column, + by=by, + sharex=sharex, + sharey=sharey, + ax=ax, + figsize=figsize, + layout=layout, + rot=rot, + ) + + set_ticks_props( + axes, xlabelsize=xlabelsize, xrot=xrot, ylabelsize=ylabelsize, yrot=yrot + ) + + maybe_adjust_figure( + fig, bottom=0.15, top=0.9, left=0.1, right=0.9, hspace=0.5, wspace=0.3 + ) + return axes + + +def hist_series( + self: Series, + by=None, + ax=None, + grid: bool = True, + xlabelsize: int | None = None, + xrot=None, + ylabelsize: int | None = None, + yrot=None, + figsize: tuple[float, float] | None = None, + bins: int = 10, + legend: bool = False, + **kwds, +): + import matplotlib.pyplot as plt + + if legend and "label" in kwds: + raise ValueError("Cannot use both legend and label") + + if by is None: + if kwds.get("layout", None) is not None: + raise ValueError("The 'layout' keyword is not supported when 'by' is None") + # hack until the plotting interface is a bit more unified + fig = kwds.pop( + "figure", plt.gcf() if plt.get_fignums() else plt.figure(figsize=figsize) + ) + if figsize is not None and tuple(figsize) != tuple(fig.get_size_inches()): + fig.set_size_inches(*figsize, forward=True) + if ax is None: + ax = fig.gca() + elif ax.get_figure() != fig: + raise AssertionError("passed axis not bound to passed figure") + values = self.dropna().values + if legend: + kwds["label"] = self.name + ax.hist(values, bins=bins, **kwds) + if legend: + ax.legend() + ax.grid(grid) + axes = np.array([ax]) + + # error: Argument 1 to "set_ticks_props" has incompatible type "ndarray[Any, + # dtype[Any]]"; expected "Axes | Sequence[Axes]" + set_ticks_props( + axes, # type: ignore[arg-type] + xlabelsize=xlabelsize, + xrot=xrot, + ylabelsize=ylabelsize, + yrot=yrot, + ) + + else: + if "figure" in kwds: + raise ValueError( + "Cannot pass 'figure' when using the " + "'by' argument, since a new 'Figure' instance will be created" + ) + axes = _grouped_hist( + self, + by=by, + ax=ax, + grid=grid, + figsize=figsize, + bins=bins, + xlabelsize=xlabelsize, + xrot=xrot, + ylabelsize=ylabelsize, + yrot=yrot, + legend=legend, + **kwds, + ) + + if hasattr(axes, "ndim"): + if axes.ndim == 1 and len(axes) == 1: + return axes[0] + return axes + + +def hist_frame( + data: DataFrame, + column=None, + by=None, + grid: bool = True, + xlabelsize: int | None = None, + xrot=None, + ylabelsize: int | None = None, + yrot=None, + ax=None, + sharex: bool = False, + sharey: bool = False, + figsize: tuple[float, float] | None = None, + layout=None, + bins: int = 10, + legend: bool = False, + **kwds, +): + if legend and "label" in kwds: + raise ValueError("Cannot use both legend and label") + if by is not None: + axes = _grouped_hist( + data, + column=column, + by=by, + ax=ax, + grid=grid, + figsize=figsize, + sharex=sharex, + sharey=sharey, + layout=layout, + bins=bins, + xlabelsize=xlabelsize, + xrot=xrot, + ylabelsize=ylabelsize, + yrot=yrot, + legend=legend, + **kwds, + ) + return axes + + if column is not None: + if not isinstance(column, (list, np.ndarray, ABCIndex)): + column = [column] + data = data[column] + # GH32590 + data = data.select_dtypes( + include=(np.number, "datetime64", "datetimetz"), exclude="timedelta" + ) + naxes = len(data.columns) + + if naxes == 0: + raise ValueError( + "hist method requires numerical or datetime columns, nothing to plot." + ) + + fig, axes = create_subplots( + naxes=naxes, + ax=ax, + squeeze=False, + sharex=sharex, + sharey=sharey, + figsize=figsize, + layout=layout, + ) + _axes = flatten_axes(axes) + + can_set_label = "label" not in kwds + + for i, col in enumerate(data.columns): + ax = _axes[i] + if legend and can_set_label: + kwds["label"] = col + ax.hist(data[col].dropna().values, bins=bins, **kwds) + ax.set_title(col) + ax.grid(grid) + if legend: + ax.legend() + + set_ticks_props( + axes, xlabelsize=xlabelsize, xrot=xrot, ylabelsize=ylabelsize, yrot=yrot + ) + maybe_adjust_figure(fig, wspace=0.3, hspace=0.3) + + return axes diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/plotting/_matplotlib/misc.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/plotting/_matplotlib/misc.py new file mode 100644 index 0000000000000000000000000000000000000000..1f9212587e05e2e3689b680ff01ae7780230657e --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/plotting/_matplotlib/misc.py @@ -0,0 +1,481 @@ +from __future__ import annotations + +import random +from typing import TYPE_CHECKING + +from matplotlib import patches +import matplotlib.lines as mlines +import numpy as np + +from pandas.core.dtypes.missing import notna + +from pandas.io.formats.printing import pprint_thing +from pandas.plotting._matplotlib.style import get_standard_colors +from pandas.plotting._matplotlib.tools import ( + create_subplots, + do_adjust_figure, + maybe_adjust_figure, + set_ticks_props, +) + +if TYPE_CHECKING: + from collections.abc import Hashable + + from matplotlib.axes import Axes + from matplotlib.figure import Figure + + from pandas import ( + DataFrame, + Index, + Series, + ) + + +def scatter_matrix( + frame: DataFrame, + alpha: float = 0.5, + figsize: tuple[float, float] | None = None, + ax=None, + grid: bool = False, + diagonal: str = "hist", + marker: str = ".", + density_kwds=None, + hist_kwds=None, + range_padding: float = 0.05, + **kwds, +): + df = frame._get_numeric_data() + n = df.columns.size + naxes = n * n + fig, axes = create_subplots(naxes=naxes, figsize=figsize, ax=ax, squeeze=False) + + # no gaps between subplots + maybe_adjust_figure(fig, wspace=0, hspace=0) + + mask = notna(df) + + marker = _get_marker_compat(marker) + + hist_kwds = hist_kwds or {} + density_kwds = density_kwds or {} + + # GH 14855 + kwds.setdefault("edgecolors", "none") + + boundaries_list = [] + for a in df.columns: + values = df[a].values[mask[a].values] + rmin_, rmax_ = np.min(values), np.max(values) + rdelta_ext = (rmax_ - rmin_) * range_padding / 2 + boundaries_list.append((rmin_ - rdelta_ext, rmax_ + rdelta_ext)) + + for i, a in enumerate(df.columns): + for j, b in enumerate(df.columns): + ax = axes[i, j] + + if i == j: + values = df[a].values[mask[a].values] + + # Deal with the diagonal by drawing a histogram there. + if diagonal == "hist": + ax.hist(values, **hist_kwds) + + elif diagonal in ("kde", "density"): + from scipy.stats import gaussian_kde + + y = values + gkde = gaussian_kde(y) + ind = np.linspace(y.min(), y.max(), 1000) + ax.plot(ind, gkde.evaluate(ind), **density_kwds) + + ax.set_xlim(boundaries_list[i]) + + else: + common = (mask[a] & mask[b]).values + + ax.scatter( + df[b][common], df[a][common], marker=marker, alpha=alpha, **kwds + ) + + ax.set_xlim(boundaries_list[j]) + ax.set_ylim(boundaries_list[i]) + + ax.set_xlabel(b) + ax.set_ylabel(a) + + if j != 0: + ax.yaxis.set_visible(False) + if i != n - 1: + ax.xaxis.set_visible(False) + + if len(df.columns) > 1: + lim1 = boundaries_list[0] + locs = axes[0][1].yaxis.get_majorticklocs() + locs = locs[(lim1[0] <= locs) & (locs <= lim1[1])] + adj = (locs - lim1[0]) / (lim1[1] - lim1[0]) + + lim0 = axes[0][0].get_ylim() + adj = adj * (lim0[1] - lim0[0]) + lim0[0] + axes[0][0].yaxis.set_ticks(adj) + + if np.all(locs == locs.astype(int)): + # if all ticks are int + locs = locs.astype(int) + axes[0][0].yaxis.set_ticklabels(locs) + + set_ticks_props(axes, xlabelsize=8, xrot=90, ylabelsize=8, yrot=0) + + return axes + + +def _get_marker_compat(marker): + if marker not in mlines.lineMarkers: + return "o" + return marker + + +def radviz( + frame: DataFrame, + class_column, + ax: Axes | None = None, + color=None, + colormap=None, + **kwds, +) -> Axes: + import matplotlib.pyplot as plt + + def normalize(series): + a = min(series) + b = max(series) + return (series - a) / (b - a) + + n = len(frame) + classes = frame[class_column].drop_duplicates() + class_col = frame[class_column] + df = frame.drop(class_column, axis=1).apply(normalize) + + if ax is None: + ax = plt.gca() + ax.set_xlim(-1, 1) + ax.set_ylim(-1, 1) + + to_plot: dict[Hashable, list[list]] = {} + colors = get_standard_colors( + num_colors=len(classes), colormap=colormap, color_type="random", color=color + ) + + for kls in classes: + to_plot[kls] = [[], []] + + m = len(frame.columns) - 1 + s = np.array( + [(np.cos(t), np.sin(t)) for t in [2 * np.pi * (i / m) for i in range(m)]] + ) + + for i in range(n): + row = df.iloc[i].values + row_ = np.repeat(np.expand_dims(row, axis=1), 2, axis=1) + y = (s * row_).sum(axis=0) / row.sum() + kls = class_col.iat[i] + to_plot[kls][0].append(y[0]) + to_plot[kls][1].append(y[1]) + + for i, kls in enumerate(classes): + ax.scatter( + to_plot[kls][0], + to_plot[kls][1], + color=colors[i], + label=pprint_thing(kls), + **kwds, + ) + ax.legend() + + ax.add_patch(patches.Circle((0.0, 0.0), radius=1.0, facecolor="none")) + + for xy, name in zip(s, df.columns): + ax.add_patch(patches.Circle(xy, radius=0.025, facecolor="gray")) + + if xy[0] < 0.0 and xy[1] < 0.0: + ax.text( + xy[0] - 0.025, xy[1] - 0.025, name, ha="right", va="top", size="small" + ) + elif xy[0] < 0.0 <= xy[1]: + ax.text( + xy[0] - 0.025, + xy[1] + 0.025, + name, + ha="right", + va="bottom", + size="small", + ) + elif xy[1] < 0.0 <= xy[0]: + ax.text( + xy[0] + 0.025, xy[1] - 0.025, name, ha="left", va="top", size="small" + ) + elif xy[0] >= 0.0 and xy[1] >= 0.0: + ax.text( + xy[0] + 0.025, xy[1] + 0.025, name, ha="left", va="bottom", size="small" + ) + + ax.axis("equal") + return ax + + +def andrews_curves( + frame: DataFrame, + class_column, + ax: Axes | None = None, + samples: int = 200, + color=None, + colormap=None, + **kwds, +) -> Axes: + import matplotlib.pyplot as plt + + def function(amplitudes): + def f(t): + x1 = amplitudes[0] + result = x1 / np.sqrt(2.0) + + # Take the rest of the coefficients and resize them + # appropriately. Take a copy of amplitudes as otherwise numpy + # deletes the element from amplitudes itself. + coeffs = np.delete(np.copy(amplitudes), 0) + coeffs = np.resize(coeffs, (int((coeffs.size + 1) / 2), 2)) + + # Generate the harmonics and arguments for the sin and cos + # functions. + harmonics = np.arange(0, coeffs.shape[0]) + 1 + trig_args = np.outer(harmonics, t) + + result += np.sum( + coeffs[:, 0, np.newaxis] * np.sin(trig_args) + + coeffs[:, 1, np.newaxis] * np.cos(trig_args), + axis=0, + ) + return result + + return f + + n = len(frame) + class_col = frame[class_column] + classes = frame[class_column].drop_duplicates() + df = frame.drop(class_column, axis=1) + t = np.linspace(-np.pi, np.pi, samples) + used_legends: set[str] = set() + + color_values = get_standard_colors( + num_colors=len(classes), colormap=colormap, color_type="random", color=color + ) + colors = dict(zip(classes, color_values)) + if ax is None: + ax = plt.gca() + ax.set_xlim(-np.pi, np.pi) + for i in range(n): + row = df.iloc[i].values + f = function(row) + y = f(t) + kls = class_col.iat[i] + label = pprint_thing(kls) + if label not in used_legends: + used_legends.add(label) + ax.plot(t, y, color=colors[kls], label=label, **kwds) + else: + ax.plot(t, y, color=colors[kls], **kwds) + + ax.legend(loc="upper right") + ax.grid() + return ax + + +def bootstrap_plot( + series: Series, + fig: Figure | None = None, + size: int = 50, + samples: int = 500, + **kwds, +) -> Figure: + import matplotlib.pyplot as plt + + # TODO: is the failure mentioned below still relevant? + # random.sample(ndarray, int) fails on python 3.3, sigh + data = list(series.values) + samplings = [random.sample(data, size) for _ in range(samples)] + + means = np.array([np.mean(sampling) for sampling in samplings]) + medians = np.array([np.median(sampling) for sampling in samplings]) + midranges = np.array( + [(min(sampling) + max(sampling)) * 0.5 for sampling in samplings] + ) + if fig is None: + fig = plt.figure() + x = list(range(samples)) + axes = [] + ax1 = fig.add_subplot(2, 3, 1) + ax1.set_xlabel("Sample") + axes.append(ax1) + ax1.plot(x, means, **kwds) + ax2 = fig.add_subplot(2, 3, 2) + ax2.set_xlabel("Sample") + axes.append(ax2) + ax2.plot(x, medians, **kwds) + ax3 = fig.add_subplot(2, 3, 3) + ax3.set_xlabel("Sample") + axes.append(ax3) + ax3.plot(x, midranges, **kwds) + ax4 = fig.add_subplot(2, 3, 4) + ax4.set_xlabel("Mean") + axes.append(ax4) + ax4.hist(means, **kwds) + ax5 = fig.add_subplot(2, 3, 5) + ax5.set_xlabel("Median") + axes.append(ax5) + ax5.hist(medians, **kwds) + ax6 = fig.add_subplot(2, 3, 6) + ax6.set_xlabel("Midrange") + axes.append(ax6) + ax6.hist(midranges, **kwds) + for axis in axes: + plt.setp(axis.get_xticklabels(), fontsize=8) + plt.setp(axis.get_yticklabels(), fontsize=8) + if do_adjust_figure(fig): + plt.tight_layout() + return fig + + +def parallel_coordinates( + frame: DataFrame, + class_column, + cols=None, + ax: Axes | None = None, + color=None, + use_columns: bool = False, + xticks=None, + colormap=None, + axvlines: bool = True, + axvlines_kwds=None, + sort_labels: bool = False, + **kwds, +) -> Axes: + import matplotlib.pyplot as plt + + if axvlines_kwds is None: + axvlines_kwds = {"linewidth": 1, "color": "black"} + + n = len(frame) + classes = frame[class_column].drop_duplicates() + class_col = frame[class_column] + + if cols is None: + df = frame.drop(class_column, axis=1) + else: + df = frame[cols] + + used_legends: set[str] = set() + + ncols = len(df.columns) + + # determine values to use for xticks + x: list[int] | Index + if use_columns is True: + if not np.all(np.isreal(list(df.columns))): + raise ValueError("Columns must be numeric to be used as xticks") + x = df.columns + elif xticks is not None: + if not np.all(np.isreal(xticks)): + raise ValueError("xticks specified must be numeric") + if len(xticks) != ncols: + raise ValueError("Length of xticks must match number of columns") + x = xticks + else: + x = list(range(ncols)) + + if ax is None: + ax = plt.gca() + + color_values = get_standard_colors( + num_colors=len(classes), colormap=colormap, color_type="random", color=color + ) + + if sort_labels: + classes = sorted(classes) + color_values = sorted(color_values) + colors = dict(zip(classes, color_values)) + + for i in range(n): + y = df.iloc[i].values + kls = class_col.iat[i] + label = pprint_thing(kls) + if label not in used_legends: + used_legends.add(label) + ax.plot(x, y, color=colors[kls], label=label, **kwds) + else: + ax.plot(x, y, color=colors[kls], **kwds) + + if axvlines: + for i in x: + ax.axvline(i, **axvlines_kwds) + + ax.set_xticks(x) + ax.set_xticklabels(df.columns) + ax.set_xlim(x[0], x[-1]) + ax.legend(loc="upper right") + ax.grid() + return ax + + +def lag_plot(series: Series, lag: int = 1, ax: Axes | None = None, **kwds) -> Axes: + # workaround because `c='b'` is hardcoded in matplotlib's scatter method + import matplotlib.pyplot as plt + + kwds.setdefault("c", plt.rcParams["patch.facecolor"]) + + data = series.values + y1 = data[:-lag] + y2 = data[lag:] + if ax is None: + ax = plt.gca() + ax.set_xlabel("y(t)") + ax.set_ylabel(f"y(t + {lag})") + ax.scatter(y1, y2, **kwds) + return ax + + +def autocorrelation_plot(series: Series, ax: Axes | None = None, **kwds) -> Axes: + import matplotlib.pyplot as plt + + n = len(series) + data = np.asarray(series) + if ax is None: + ax = plt.gca() + ax.set_xlim(1, n) + ax.set_ylim(-1.0, 1.0) + mean = np.mean(data) + c0 = np.sum((data - mean) ** 2) / n + + def r(h): + return ((data[: n - h] - mean) * (data[h:] - mean)).sum() / n / c0 + + x = np.arange(n) + 1 + y = [r(loc) for loc in x] + z95 = 1.959963984540054 + z99 = 2.5758293035489004 + ax.axhline(y=z99 / np.sqrt(n), linestyle="--", color="grey") + ax.axhline(y=z95 / np.sqrt(n), color="grey") + ax.axhline(y=0.0, color="black") + ax.axhline(y=-z95 / np.sqrt(n), color="grey") + ax.axhline(y=-z99 / np.sqrt(n), linestyle="--", color="grey") + ax.set_xlabel("Lag") + ax.set_ylabel("Autocorrelation") + ax.plot(x, y, **kwds) + if "label" in kwds: + ax.legend() + ax.grid() + return ax + + +def unpack_single_str_list(keys): + # GH 42795 + if isinstance(keys, list) and len(keys) == 1: + keys = keys[0] + return keys diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/plotting/_matplotlib/style.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/plotting/_matplotlib/style.py new file mode 100644 index 0000000000000000000000000000000000000000..bf4e4be3bfd82e6ce89d526aa0da555f67b9f565 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/plotting/_matplotlib/style.py @@ -0,0 +1,278 @@ +from __future__ import annotations + +from collections.abc import ( + Collection, + Iterator, +) +import itertools +from typing import ( + TYPE_CHECKING, + cast, +) +import warnings + +import matplotlib as mpl +import matplotlib.colors +import numpy as np + +from pandas._typing import MatplotlibColor as Color +from pandas.util._exceptions import find_stack_level + +from pandas.core.dtypes.common import is_list_like + +import pandas.core.common as com + +if TYPE_CHECKING: + from matplotlib.colors import Colormap + + +def get_standard_colors( + num_colors: int, + colormap: Colormap | None = None, + color_type: str = "default", + color: dict[str, Color] | Color | Collection[Color] | None = None, +): + """ + Get standard colors based on `colormap`, `color_type` or `color` inputs. + + Parameters + ---------- + num_colors : int + Minimum number of colors to be returned. + Ignored if `color` is a dictionary. + colormap : :py:class:`matplotlib.colors.Colormap`, optional + Matplotlib colormap. + When provided, the resulting colors will be derived from the colormap. + color_type : {"default", "random"}, optional + Type of colors to derive. Used if provided `color` and `colormap` are None. + Ignored if either `color` or `colormap` are not None. + color : dict or str or sequence, optional + Color(s) to be used for deriving sequence of colors. + Can be either be a dictionary, or a single color (single color string, + or sequence of floats representing a single color), + or a sequence of colors. + + Returns + ------- + dict or list + Standard colors. Can either be a mapping if `color` was a dictionary, + or a list of colors with a length of `num_colors` or more. + + Warns + ----- + UserWarning + If both `colormap` and `color` are provided. + Parameter `color` will override. + """ + if isinstance(color, dict): + return color + + colors = _derive_colors( + color=color, + colormap=colormap, + color_type=color_type, + num_colors=num_colors, + ) + + return list(_cycle_colors(colors, num_colors=num_colors)) + + +def _derive_colors( + *, + color: Color | Collection[Color] | None, + colormap: str | Colormap | None, + color_type: str, + num_colors: int, +) -> list[Color]: + """ + Derive colors from either `colormap`, `color_type` or `color` inputs. + + Get a list of colors either from `colormap`, or from `color`, + or from `color_type` (if both `colormap` and `color` are None). + + Parameters + ---------- + color : str or sequence, optional + Color(s) to be used for deriving sequence of colors. + Can be either be a single color (single color string, or sequence of floats + representing a single color), or a sequence of colors. + colormap : :py:class:`matplotlib.colors.Colormap`, optional + Matplotlib colormap. + When provided, the resulting colors will be derived from the colormap. + color_type : {"default", "random"}, optional + Type of colors to derive. Used if provided `color` and `colormap` are None. + Ignored if either `color` or `colormap`` are not None. + num_colors : int + Number of colors to be extracted. + + Returns + ------- + list + List of colors extracted. + + Warns + ----- + UserWarning + If both `colormap` and `color` are provided. + Parameter `color` will override. + """ + if color is None and colormap is not None: + return _get_colors_from_colormap(colormap, num_colors=num_colors) + elif color is not None: + if colormap is not None: + warnings.warn( + "'color' and 'colormap' cannot be used simultaneously. Using 'color'", + stacklevel=find_stack_level(), + ) + return _get_colors_from_color(color) + else: + return _get_colors_from_color_type(color_type, num_colors=num_colors) + + +def _cycle_colors(colors: list[Color], num_colors: int) -> Iterator[Color]: + """Cycle colors until achieving max of `num_colors` or length of `colors`. + + Extra colors will be ignored by matplotlib if there are more colors + than needed and nothing needs to be done here. + """ + max_colors = max(num_colors, len(colors)) + yield from itertools.islice(itertools.cycle(colors), max_colors) + + +def _get_colors_from_colormap( + colormap: str | Colormap, + num_colors: int, +) -> list[Color]: + """Get colors from colormap.""" + cmap = _get_cmap_instance(colormap) + return [cmap(num) for num in np.linspace(0, 1, num=num_colors)] + + +def _get_cmap_instance(colormap: str | Colormap) -> Colormap: + """Get instance of matplotlib colormap.""" + if isinstance(colormap, str): + cmap = colormap + colormap = mpl.colormaps[colormap] + if colormap is None: + raise ValueError(f"Colormap {cmap} is not recognized") + return colormap + + +def _get_colors_from_color( + color: Color | Collection[Color], +) -> list[Color]: + """Get colors from user input color.""" + if len(color) == 0: + raise ValueError(f"Invalid color argument: {color}") + + if _is_single_color(color): + color = cast(Color, color) + return [color] + + color = cast(Collection[Color], color) + return list(_gen_list_of_colors_from_iterable(color)) + + +def _is_single_color(color: Color | Collection[Color]) -> bool: + """Check if `color` is a single color, not a sequence of colors. + + Single color is of these kinds: + - Named color "red", "C0", "firebrick" + - Alias "g" + - Sequence of floats, such as (0.1, 0.2, 0.3) or (0.1, 0.2, 0.3, 0.4). + + See Also + -------- + _is_single_string_color + """ + if isinstance(color, str) and _is_single_string_color(color): + # GH #36972 + return True + + if _is_floats_color(color): + return True + + return False + + +def _gen_list_of_colors_from_iterable(color: Collection[Color]) -> Iterator[Color]: + """ + Yield colors from string of several letters or from collection of colors. + """ + for x in color: + if _is_single_color(x): + yield x + else: + raise ValueError(f"Invalid color {x}") + + +def _is_floats_color(color: Color | Collection[Color]) -> bool: + """Check if color comprises a sequence of floats representing color.""" + return bool( + is_list_like(color) + and (len(color) == 3 or len(color) == 4) + and all(isinstance(x, (int, float)) for x in color) + ) + + +def _get_colors_from_color_type(color_type: str, num_colors: int) -> list[Color]: + """Get colors from user input color type.""" + if color_type == "default": + return _get_default_colors(num_colors) + elif color_type == "random": + return _get_random_colors(num_colors) + else: + raise ValueError("color_type must be either 'default' or 'random'") + + +def _get_default_colors(num_colors: int) -> list[Color]: + """Get `num_colors` of default colors from matplotlib rc params.""" + import matplotlib.pyplot as plt + + colors = [c["color"] for c in plt.rcParams["axes.prop_cycle"]] + return colors[0:num_colors] + + +def _get_random_colors(num_colors: int) -> list[Color]: + """Get `num_colors` of random colors.""" + return [_random_color(num) for num in range(num_colors)] + + +def _random_color(column: int) -> list[float]: + """Get a random color represented as a list of length 3""" + # GH17525 use common._random_state to avoid resetting the seed + rs = com.random_state(column) + return rs.rand(3).tolist() + + +def _is_single_string_color(color: Color) -> bool: + """Check if `color` is a single string color. + + Examples of single string colors: + - 'r' + - 'g' + - 'red' + - 'green' + - 'C3' + - 'firebrick' + + Parameters + ---------- + color : Color + Color string or sequence of floats. + + Returns + ------- + bool + True if `color` looks like a valid color. + False otherwise. + """ + conv = matplotlib.colors.ColorConverter() + try: + # error: Argument 1 to "to_rgba" of "ColorConverter" has incompatible type + # "str | Sequence[float]"; expected "tuple[float, float, float] | ..." + conv.to_rgba(color) # type: ignore[arg-type] + except ValueError: + return False + else: + return True diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/plotting/_matplotlib/timeseries.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/plotting/_matplotlib/timeseries.py new file mode 100644 index 0000000000000000000000000000000000000000..c7ddfa55d0417f8c8fa4addd82faacedf90394d3 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/plotting/_matplotlib/timeseries.py @@ -0,0 +1,370 @@ +# TODO: Use the fact that axis can have units to simplify the process + +from __future__ import annotations + +import functools +from typing import ( + TYPE_CHECKING, + Any, + cast, +) +import warnings + +import numpy as np + +from pandas._libs.tslibs import ( + BaseOffset, + Period, + to_offset, +) +from pandas._libs.tslibs.dtypes import ( + OFFSET_TO_PERIOD_FREQSTR, + FreqGroup, +) + +from pandas.core.dtypes.generic import ( + ABCDatetimeIndex, + ABCPeriodIndex, + ABCTimedeltaIndex, +) + +from pandas.io.formats.printing import pprint_thing +from pandas.plotting._matplotlib.converter import ( + TimeSeries_DateFormatter, + TimeSeries_DateLocator, + TimeSeries_TimedeltaFormatter, +) +from pandas.tseries.frequencies import ( + get_period_alias, + is_subperiod, + is_superperiod, +) + +if TYPE_CHECKING: + from datetime import timedelta + + from matplotlib.axes import Axes + + from pandas._typing import NDFrameT + + from pandas import ( + DataFrame, + DatetimeIndex, + Index, + PeriodIndex, + Series, + ) + +# --------------------------------------------------------------------- +# Plotting functions and monkey patches + + +def maybe_resample(series: Series, ax: Axes, kwargs: dict[str, Any]): + # resample against axes freq if necessary + + if "how" in kwargs: + raise ValueError( + "'how' is not a valid keyword for plotting functions. If plotting " + "multiple objects on shared axes, resample manually first." + ) + + freq, ax_freq = _get_freq(ax, series) + + if freq is None: # pragma: no cover + raise ValueError("Cannot use dynamic axis without frequency info") + + # Convert DatetimeIndex to PeriodIndex + if isinstance(series.index, ABCDatetimeIndex): + series = series.to_period(freq=freq) + + if ax_freq is not None and freq != ax_freq: + if is_superperiod(freq, ax_freq): # upsample input + series = series.copy() + # error: "Index" has no attribute "asfreq" + series.index = series.index.asfreq( # type: ignore[attr-defined] + ax_freq, how="s" + ) + freq = ax_freq + elif _is_sup(freq, ax_freq): # one is weekly + # Resampling with PeriodDtype is deprecated, so we convert to + # DatetimeIndex, resample, then convert back. + ser_ts = series.to_timestamp() + ser_d = ser_ts.resample("D").last().dropna() + ser_freq = ser_d.resample(ax_freq).last().dropna() + series = ser_freq.to_period(ax_freq) + freq = ax_freq + elif is_subperiod(freq, ax_freq) or _is_sub(freq, ax_freq): + _upsample_others(ax, freq, kwargs) + else: # pragma: no cover + raise ValueError("Incompatible frequency conversion") + return freq, series + + +def _is_sub(f1: str, f2: str) -> bool: + return (f1.startswith("W") and is_subperiod("D", f2)) or ( + f2.startswith("W") and is_subperiod(f1, "D") + ) + + +def _is_sup(f1: str, f2: str) -> bool: + return (f1.startswith("W") and is_superperiod("D", f2)) or ( + f2.startswith("W") and is_superperiod(f1, "D") + ) + + +def _upsample_others(ax: Axes, freq: BaseOffset, kwargs: dict[str, Any]) -> None: + legend = ax.get_legend() + lines, labels = _replot_ax(ax, freq) + _replot_ax(ax, freq) + + other_ax = None + if hasattr(ax, "left_ax"): + other_ax = ax.left_ax + if hasattr(ax, "right_ax"): + other_ax = ax.right_ax + + if other_ax is not None: + rlines, rlabels = _replot_ax(other_ax, freq) + lines.extend(rlines) + labels.extend(rlabels) + + if legend is not None and kwargs.get("legend", True) and len(lines) > 0: + title: str | None = legend.get_title().get_text() + if title == "None": + title = None + ax.legend(lines, labels, loc="best", title=title) + + +def _replot_ax(ax: Axes, freq: BaseOffset): + data = getattr(ax, "_plot_data", None) + + # clear current axes and data + # TODO #54485 + ax._plot_data = [] # type: ignore[attr-defined] + ax.clear() + + decorate_axes(ax, freq) + + lines = [] + labels = [] + if data is not None: + for series, plotf, kwds in data: + series = series.copy() + idx = series.index.asfreq(freq, how="S") + series.index = idx + # TODO #54485 + ax._plot_data.append((series, plotf, kwds)) # type: ignore[attr-defined] + + # for tsplot + if isinstance(plotf, str): + from pandas.plotting._matplotlib import PLOT_CLASSES + + plotf = PLOT_CLASSES[plotf]._plot + + lines.append(plotf(ax, series.index._mpl_repr(), series.values, **kwds)[0]) + labels.append(pprint_thing(series.name)) + + return lines, labels + + +def decorate_axes(ax: Axes, freq: BaseOffset) -> None: + """Initialize axes for time-series plotting""" + if not hasattr(ax, "_plot_data"): + # TODO #54485 + ax._plot_data = [] # type: ignore[attr-defined] + + # TODO #54485 + ax.freq = freq # type: ignore[attr-defined] + xaxis = ax.get_xaxis() + # TODO #54485 + xaxis.freq = freq # type: ignore[attr-defined] + + +def _get_ax_freq(ax: Axes): + """ + Get the freq attribute of the ax object if set. + Also checks shared axes (eg when using secondary yaxis, sharex=True + or twinx) + """ + ax_freq = getattr(ax, "freq", None) + if ax_freq is None: + # check for left/right ax in case of secondary yaxis + if hasattr(ax, "left_ax"): + ax_freq = getattr(ax.left_ax, "freq", None) + elif hasattr(ax, "right_ax"): + ax_freq = getattr(ax.right_ax, "freq", None) + if ax_freq is None: + # check if a shared ax (sharex/twinx) has already freq set + shared_axes = ax.get_shared_x_axes().get_siblings(ax) + if len(shared_axes) > 1: + for shared_ax in shared_axes: + ax_freq = getattr(shared_ax, "freq", None) + if ax_freq is not None: + break + return ax_freq + + +def _get_period_alias(freq: timedelta | BaseOffset | str) -> str | None: + if isinstance(freq, BaseOffset): + freqstr = freq.name + else: + freqstr = to_offset(freq, is_period=True).rule_code + + return get_period_alias(freqstr) + + +def _get_freq(ax: Axes, series: Series): + # get frequency from data + freq = getattr(series.index, "freq", None) + if freq is None: + freq = getattr(series.index, "inferred_freq", None) + freq = to_offset(freq, is_period=True) + + ax_freq = _get_ax_freq(ax) + + # use axes freq if no data freq + if freq is None: + freq = ax_freq + + # get the period frequency + freq = _get_period_alias(freq) + return freq, ax_freq + + +def use_dynamic_x(ax: Axes, data: DataFrame | Series) -> bool: + freq = _get_index_freq(data.index) + ax_freq = _get_ax_freq(ax) + + if freq is None: # convert irregular if axes has freq info + freq = ax_freq + # do not use tsplot if irregular was plotted first + elif (ax_freq is None) and (len(ax.get_lines()) > 0): + return False + + if freq is None: + return False + + freq_str = _get_period_alias(freq) + + if freq_str is None: + return False + + # FIXME: hack this for 0.10.1, creating more technical debt...sigh + if isinstance(data.index, ABCDatetimeIndex): + # error: "BaseOffset" has no attribute "_period_dtype_code" + freq_str = OFFSET_TO_PERIOD_FREQSTR.get(freq_str, freq_str) + base = to_offset( + freq_str, is_period=True + )._period_dtype_code # type: ignore[attr-defined] + x = data.index + if base <= FreqGroup.FR_DAY.value: + return x[:1].is_normalized + period = Period(x[0], freq_str) + assert isinstance(period, Period) + return period.to_timestamp().tz_localize(x.tz) == x[0] + return True + + +def _get_index_freq(index: Index) -> BaseOffset | None: + freq = getattr(index, "freq", None) + if freq is None: + freq = getattr(index, "inferred_freq", None) + if freq == "B": + # error: "Index" has no attribute "dayofweek" + weekdays = np.unique(index.dayofweek) # type: ignore[attr-defined] + if (5 in weekdays) or (6 in weekdays): + freq = None + + freq = to_offset(freq) + return freq + + +def maybe_convert_index(ax: Axes, data: NDFrameT) -> NDFrameT: + # tsplot converts automatically, but don't want to convert index + # over and over for DataFrames + if isinstance(data.index, (ABCDatetimeIndex, ABCPeriodIndex)): + freq: str | BaseOffset | None = data.index.freq + + if freq is None: + # We only get here for DatetimeIndex + data.index = cast("DatetimeIndex", data.index) + freq = data.index.inferred_freq + freq = to_offset(freq) + + if freq is None: + freq = _get_ax_freq(ax) + + if freq is None: + raise ValueError("Could not get frequency alias for plotting") + + freq_str = _get_period_alias(freq) + + with warnings.catch_warnings(): + # suppress Period[B] deprecation warning + # TODO: need to find an alternative to this before the deprecation + # is enforced! + warnings.filterwarnings( + "ignore", + r"PeriodDtype\[B\] is deprecated", + category=FutureWarning, + ) + + if isinstance(data.index, ABCDatetimeIndex): + data = data.tz_localize(None).to_period(freq=freq_str) + elif isinstance(data.index, ABCPeriodIndex): + data.index = data.index.asfreq(freq=freq_str) + return data + + +# Patch methods for subplot. + + +def _format_coord(freq, t, y) -> str: + time_period = Period(ordinal=int(t), freq=freq) + return f"t = {time_period} y = {y:8f}" + + +def format_dateaxis( + subplot, freq: BaseOffset, index: DatetimeIndex | PeriodIndex +) -> None: + """ + Pretty-formats the date axis (x-axis). + + Major and minor ticks are automatically set for the frequency of the + current underlying series. As the dynamic mode is activated by + default, changing the limits of the x axis will intelligently change + the positions of the ticks. + """ + from matplotlib import pylab + + # handle index specific formatting + # Note: DatetimeIndex does not use this + # interface. DatetimeIndex uses matplotlib.date directly + if isinstance(index, ABCPeriodIndex): + majlocator = TimeSeries_DateLocator( + freq, dynamic_mode=True, minor_locator=False, plot_obj=subplot + ) + minlocator = TimeSeries_DateLocator( + freq, dynamic_mode=True, minor_locator=True, plot_obj=subplot + ) + subplot.xaxis.set_major_locator(majlocator) + subplot.xaxis.set_minor_locator(minlocator) + + majformatter = TimeSeries_DateFormatter( + freq, dynamic_mode=True, minor_locator=False, plot_obj=subplot + ) + minformatter = TimeSeries_DateFormatter( + freq, dynamic_mode=True, minor_locator=True, plot_obj=subplot + ) + subplot.xaxis.set_major_formatter(majformatter) + subplot.xaxis.set_minor_formatter(minformatter) + + # x and y coord info + subplot.format_coord = functools.partial(_format_coord, freq) + + elif isinstance(index, ABCTimedeltaIndex): + subplot.xaxis.set_major_formatter(TimeSeries_TimedeltaFormatter()) + else: + raise TypeError("index type not supported") + + pylab.draw_if_interactive() diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/plotting/_matplotlib/tools.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/plotting/_matplotlib/tools.py new file mode 100644 index 0000000000000000000000000000000000000000..898b5b25e7b0171e4b41106bf340e7fbc3bbb735 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/plotting/_matplotlib/tools.py @@ -0,0 +1,492 @@ +# being a bit too dynamic +from __future__ import annotations + +from math import ceil +from typing import TYPE_CHECKING +import warnings + +from matplotlib import ticker +import matplotlib.table +import numpy as np + +from pandas.util._exceptions import find_stack_level + +from pandas.core.dtypes.common import is_list_like +from pandas.core.dtypes.generic import ( + ABCDataFrame, + ABCIndex, + ABCSeries, +) + +if TYPE_CHECKING: + from collections.abc import ( + Iterable, + Sequence, + ) + + from matplotlib.axes import Axes + from matplotlib.axis import Axis + from matplotlib.figure import Figure + from matplotlib.lines import Line2D + from matplotlib.table import Table + + from pandas import ( + DataFrame, + Series, + ) + + +def do_adjust_figure(fig: Figure) -> bool: + """Whether fig has constrained_layout enabled.""" + if not hasattr(fig, "get_constrained_layout"): + return False + return not fig.get_constrained_layout() + + +def maybe_adjust_figure(fig: Figure, *args, **kwargs) -> None: + """Call fig.subplots_adjust unless fig has constrained_layout enabled.""" + if do_adjust_figure(fig): + fig.subplots_adjust(*args, **kwargs) + + +def format_date_labels(ax: Axes, rot) -> None: + # mini version of autofmt_xdate + for label in ax.get_xticklabels(): + label.set_horizontalalignment("right") + label.set_rotation(rot) + fig = ax.get_figure() + if fig is not None: + # should always be a Figure but can technically be None + maybe_adjust_figure(fig, bottom=0.2) + + +def table( + ax, data: DataFrame | Series, rowLabels=None, colLabels=None, **kwargs +) -> Table: + if isinstance(data, ABCSeries): + data = data.to_frame() + elif isinstance(data, ABCDataFrame): + pass + else: + raise ValueError("Input data must be DataFrame or Series") + + if rowLabels is None: + rowLabels = data.index + + if colLabels is None: + colLabels = data.columns + + cellText = data.values + + # error: Argument "cellText" to "table" has incompatible type "ndarray[Any, + # Any]"; expected "Sequence[Sequence[str]] | None" + return matplotlib.table.table( + ax, + cellText=cellText, # type: ignore[arg-type] + rowLabels=rowLabels, + colLabels=colLabels, + **kwargs, + ) + + +def _get_layout( + nplots: int, + layout: tuple[int, int] | None = None, + layout_type: str = "box", +) -> tuple[int, int]: + if layout is not None: + if not isinstance(layout, (tuple, list)) or len(layout) != 2: + raise ValueError("Layout must be a tuple of (rows, columns)") + + nrows, ncols = layout + + if nrows == -1 and ncols > 0: + layout = nrows, ncols = (ceil(nplots / ncols), ncols) + elif ncols == -1 and nrows > 0: + layout = nrows, ncols = (nrows, ceil(nplots / nrows)) + elif ncols <= 0 and nrows <= 0: + msg = "At least one dimension of layout must be positive" + raise ValueError(msg) + + if nrows * ncols < nplots: + raise ValueError( + f"Layout of {nrows}x{ncols} must be larger than required size {nplots}" + ) + + return layout + + if layout_type == "single": + return (1, 1) + elif layout_type == "horizontal": + return (1, nplots) + elif layout_type == "vertical": + return (nplots, 1) + + layouts = {1: (1, 1), 2: (1, 2), 3: (2, 2), 4: (2, 2)} + try: + return layouts[nplots] + except KeyError: + k = 1 + while k**2 < nplots: + k += 1 + + if (k - 1) * k >= nplots: + return k, (k - 1) + else: + return k, k + + +# copied from matplotlib/pyplot.py and modified for pandas.plotting + + +def create_subplots( + naxes: int, + sharex: bool = False, + sharey: bool = False, + squeeze: bool = True, + subplot_kw=None, + ax=None, + layout=None, + layout_type: str = "box", + **fig_kw, +): + """ + Create a figure with a set of subplots already made. + + This utility wrapper makes it convenient to create common layouts of + subplots, including the enclosing figure object, in a single call. + + Parameters + ---------- + naxes : int + Number of required axes. Exceeded axes are set invisible. Default is + nrows * ncols. + + sharex : bool + If True, the X axis will be shared amongst all subplots. + + sharey : bool + If True, the Y axis will be shared amongst all subplots. + + squeeze : bool + + If True, extra dimensions are squeezed out from the returned axis object: + - if only one subplot is constructed (nrows=ncols=1), the resulting + single Axis object is returned as a scalar. + - for Nx1 or 1xN subplots, the returned object is a 1-d numpy object + array of Axis objects are returned as numpy 1-d arrays. + - for NxM subplots with N>1 and M>1 are returned as a 2d array. + + If False, no squeezing is done: the returned axis object is always + a 2-d array containing Axis instances, even if it ends up being 1x1. + + subplot_kw : dict + Dict with keywords passed to the add_subplot() call used to create each + subplots. + + ax : Matplotlib axis object, optional + + layout : tuple + Number of rows and columns of the subplot grid. + If not specified, calculated from naxes and layout_type + + layout_type : {'box', 'horizontal', 'vertical'}, default 'box' + Specify how to layout the subplot grid. + + fig_kw : Other keyword arguments to be passed to the figure() call. + Note that all keywords not recognized above will be + automatically included here. + + Returns + ------- + fig, ax : tuple + - fig is the Matplotlib Figure object + - ax can be either a single axis object or an array of axis objects if + more than one subplot was created. The dimensions of the resulting array + can be controlled with the squeeze keyword, see above. + + Examples + -------- + x = np.linspace(0, 2*np.pi, 400) + y = np.sin(x**2) + + # Just a figure and one subplot + f, ax = plt.subplots() + ax.plot(x, y) + ax.set_title('Simple plot') + + # Two subplots, unpack the output array immediately + f, (ax1, ax2) = plt.subplots(1, 2, sharey=True) + ax1.plot(x, y) + ax1.set_title('Sharing Y axis') + ax2.scatter(x, y) + + # Four polar axes + plt.subplots(2, 2, subplot_kw=dict(polar=True)) + """ + import matplotlib.pyplot as plt + + if subplot_kw is None: + subplot_kw = {} + + if ax is None: + fig = plt.figure(**fig_kw) + else: + if is_list_like(ax): + if squeeze: + ax = flatten_axes(ax) + if layout is not None: + warnings.warn( + "When passing multiple axes, layout keyword is ignored.", + UserWarning, + stacklevel=find_stack_level(), + ) + if sharex or sharey: + warnings.warn( + "When passing multiple axes, sharex and sharey " + "are ignored. These settings must be specified when creating axes.", + UserWarning, + stacklevel=find_stack_level(), + ) + if ax.size == naxes: + fig = ax.flat[0].get_figure() + return fig, ax + else: + raise ValueError( + f"The number of passed axes must be {naxes}, the " + "same as the output plot" + ) + + fig = ax.get_figure() + # if ax is passed and a number of subplots is 1, return ax as it is + if naxes == 1: + if squeeze: + return fig, ax + else: + return fig, flatten_axes(ax) + else: + warnings.warn( + "To output multiple subplots, the figure containing " + "the passed axes is being cleared.", + UserWarning, + stacklevel=find_stack_level(), + ) + fig.clear() + + nrows, ncols = _get_layout(naxes, layout=layout, layout_type=layout_type) + nplots = nrows * ncols + + # Create empty object array to hold all axes. It's easiest to make it 1-d + # so we can just append subplots upon creation, and then + axarr = np.empty(nplots, dtype=object) + + # Create first subplot separately, so we can share it if requested + ax0 = fig.add_subplot(nrows, ncols, 1, **subplot_kw) + + if sharex: + subplot_kw["sharex"] = ax0 + if sharey: + subplot_kw["sharey"] = ax0 + axarr[0] = ax0 + + # Note off-by-one counting because add_subplot uses the MATLAB 1-based + # convention. + for i in range(1, nplots): + kwds = subplot_kw.copy() + # Set sharex and sharey to None for blank/dummy axes, these can + # interfere with proper axis limits on the visible axes if + # they share axes e.g. issue #7528 + if i >= naxes: + kwds["sharex"] = None + kwds["sharey"] = None + ax = fig.add_subplot(nrows, ncols, i + 1, **kwds) + axarr[i] = ax + + if naxes != nplots: + for ax in axarr[naxes:]: + ax.set_visible(False) + + handle_shared_axes(axarr, nplots, naxes, nrows, ncols, sharex, sharey) + + if squeeze: + # Reshape the array to have the final desired dimension (nrow,ncol), + # though discarding unneeded dimensions that equal 1. If we only have + # one subplot, just return it instead of a 1-element array. + if nplots == 1: + axes = axarr[0] + else: + axes = axarr.reshape(nrows, ncols).squeeze() + else: + # returned axis array will be always 2-d, even if nrows=ncols=1 + axes = axarr.reshape(nrows, ncols) + + return fig, axes + + +def _remove_labels_from_axis(axis: Axis) -> None: + for t in axis.get_majorticklabels(): + t.set_visible(False) + + # set_visible will not be effective if + # minor axis has NullLocator and NullFormatter (default) + if isinstance(axis.get_minor_locator(), ticker.NullLocator): + axis.set_minor_locator(ticker.AutoLocator()) + if isinstance(axis.get_minor_formatter(), ticker.NullFormatter): + axis.set_minor_formatter(ticker.FormatStrFormatter("")) + for t in axis.get_minorticklabels(): + t.set_visible(False) + + axis.get_label().set_visible(False) + + +def _has_externally_shared_axis(ax1: Axes, compare_axis: str) -> bool: + """ + Return whether an axis is externally shared. + + Parameters + ---------- + ax1 : matplotlib.axes.Axes + Axis to query. + compare_axis : str + `"x"` or `"y"` according to whether the X-axis or Y-axis is being + compared. + + Returns + ------- + bool + `True` if the axis is externally shared. Otherwise `False`. + + Notes + ----- + If two axes with different positions are sharing an axis, they can be + referred to as *externally* sharing the common axis. + + If two axes sharing an axis also have the same position, they can be + referred to as *internally* sharing the common axis (a.k.a twinning). + + _handle_shared_axes() is only interested in axes externally sharing an + axis, regardless of whether either of the axes is also internally sharing + with a third axis. + """ + if compare_axis == "x": + axes = ax1.get_shared_x_axes() + elif compare_axis == "y": + axes = ax1.get_shared_y_axes() + else: + raise ValueError( + "_has_externally_shared_axis() needs 'x' or 'y' as a second parameter" + ) + + axes_siblings = axes.get_siblings(ax1) + + # Retain ax1 and any of its siblings which aren't in the same position as it + ax1_points = ax1.get_position().get_points() + + for ax2 in axes_siblings: + if not np.array_equal(ax1_points, ax2.get_position().get_points()): + return True + + return False + + +def handle_shared_axes( + axarr: Iterable[Axes], + nplots: int, + naxes: int, + nrows: int, + ncols: int, + sharex: bool, + sharey: bool, +) -> None: + if nplots > 1: + row_num = lambda x: x.get_subplotspec().rowspan.start + col_num = lambda x: x.get_subplotspec().colspan.start + + is_first_col = lambda x: x.get_subplotspec().is_first_col() + + if nrows > 1: + try: + # first find out the ax layout, + # so that we can correctly handle 'gaps" + layout = np.zeros((nrows + 1, ncols + 1), dtype=np.bool_) + for ax in axarr: + layout[row_num(ax), col_num(ax)] = ax.get_visible() + + for ax in axarr: + # only the last row of subplots should get x labels -> all + # other off layout handles the case that the subplot is + # the last in the column, because below is no subplot/gap. + if not layout[row_num(ax) + 1, col_num(ax)]: + continue + if sharex or _has_externally_shared_axis(ax, "x"): + _remove_labels_from_axis(ax.xaxis) + + except IndexError: + # if gridspec is used, ax.rowNum and ax.colNum may different + # from layout shape. in this case, use last_row logic + is_last_row = lambda x: x.get_subplotspec().is_last_row() + for ax in axarr: + if is_last_row(ax): + continue + if sharex or _has_externally_shared_axis(ax, "x"): + _remove_labels_from_axis(ax.xaxis) + + if ncols > 1: + for ax in axarr: + # only the first column should get y labels -> set all other to + # off as we only have labels in the first column and we always + # have a subplot there, we can skip the layout test + if is_first_col(ax): + continue + if sharey or _has_externally_shared_axis(ax, "y"): + _remove_labels_from_axis(ax.yaxis) + + +def flatten_axes(axes: Axes | Sequence[Axes]) -> np.ndarray: + if not is_list_like(axes): + return np.array([axes]) + elif isinstance(axes, (np.ndarray, ABCIndex)): + return np.asarray(axes).ravel() + return np.array(axes) + + +def set_ticks_props( + axes: Axes | Sequence[Axes], + xlabelsize: int | None = None, + xrot=None, + ylabelsize: int | None = None, + yrot=None, +): + import matplotlib.pyplot as plt + + for ax in flatten_axes(axes): + if xlabelsize is not None: + plt.setp(ax.get_xticklabels(), fontsize=xlabelsize) + if xrot is not None: + plt.setp(ax.get_xticklabels(), rotation=xrot) + if ylabelsize is not None: + plt.setp(ax.get_yticklabels(), fontsize=ylabelsize) + if yrot is not None: + plt.setp(ax.get_yticklabels(), rotation=yrot) + return axes + + +def get_all_lines(ax: Axes) -> list[Line2D]: + lines = ax.get_lines() + + if hasattr(ax, "right_ax"): + lines += ax.right_ax.get_lines() + + if hasattr(ax, "left_ax"): + lines += ax.left_ax.get_lines() + + return lines + + +def get_xlim(lines: Iterable[Line2D]) -> tuple[float, float]: + left, right = np.inf, -np.inf + for line in lines: + x = line.get_xdata(orig=False) + left = min(np.nanmin(x), left) + right = max(np.nanmax(x), right) + return left, right diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/boolean/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/boolean/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..852f48b8e265d568091c4f582827e5687cc9f41b Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/boolean/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/boolean/__pycache__/test_arithmetic.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/boolean/__pycache__/test_arithmetic.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..8950225a6f08f97c12c59e24750a55e97355b50e Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/boolean/__pycache__/test_arithmetic.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/boolean/__pycache__/test_astype.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/boolean/__pycache__/test_astype.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ff8fb65be1a8b7b141907aac43fd6ecc95ffd330 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/boolean/__pycache__/test_astype.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/boolean/__pycache__/test_comparison.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/boolean/__pycache__/test_comparison.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..da686dd6011518dcf0914cedc8e2c395d9d65c80 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/boolean/__pycache__/test_comparison.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/boolean/__pycache__/test_construction.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/boolean/__pycache__/test_construction.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d65872c6b09d6292870510875666df3b5a7d3bb7 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/boolean/__pycache__/test_construction.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/boolean/__pycache__/test_function.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/boolean/__pycache__/test_function.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..930fc81753c0b0b5767e5aa4ecad0a6d3781c684 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/boolean/__pycache__/test_function.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/boolean/__pycache__/test_indexing.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/boolean/__pycache__/test_indexing.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..80cfc34765d19fd5395b73396ad19c22bb0668d9 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/boolean/__pycache__/test_indexing.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/boolean/__pycache__/test_logical.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/boolean/__pycache__/test_logical.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e20b87322f5e97c79133b2d33b8850cf05824680 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/boolean/__pycache__/test_logical.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/boolean/__pycache__/test_ops.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/boolean/__pycache__/test_ops.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..0f551ed2ae56ac8f5babd5aa51cf09ced1f754d9 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/boolean/__pycache__/test_ops.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/boolean/__pycache__/test_reduction.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/boolean/__pycache__/test_reduction.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..331fbdd642edd5f2d61905995241e82c2d3616dd Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/boolean/__pycache__/test_reduction.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/boolean/__pycache__/test_repr.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/boolean/__pycache__/test_repr.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..cead408ede782cdc2a65525aa091b98ad229707e Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/boolean/__pycache__/test_repr.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/categorical/__init__.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/categorical/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/categorical/test_algos.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/categorical/test_algos.py new file mode 100644 index 0000000000000000000000000000000000000000..d4c19a4970135cfb1865eaa0fae0845dc7d17971 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/categorical/test_algos.py @@ -0,0 +1,89 @@ +import numpy as np +import pytest + +import pandas as pd +import pandas._testing as tm + + +@pytest.mark.parametrize("ordered", [True, False]) +@pytest.mark.parametrize("categories", [["b", "a", "c"], ["a", "b", "c", "d"]]) +def test_factorize(categories, ordered): + cat = pd.Categorical( + ["b", "b", "a", "c", None], categories=categories, ordered=ordered + ) + codes, uniques = pd.factorize(cat) + expected_codes = np.array([0, 0, 1, 2, -1], dtype=np.intp) + expected_uniques = pd.Categorical( + ["b", "a", "c"], categories=categories, ordered=ordered + ) + + tm.assert_numpy_array_equal(codes, expected_codes) + tm.assert_categorical_equal(uniques, expected_uniques) + + +def test_factorized_sort(): + cat = pd.Categorical(["b", "b", None, "a"]) + codes, uniques = pd.factorize(cat, sort=True) + expected_codes = np.array([1, 1, -1, 0], dtype=np.intp) + expected_uniques = pd.Categorical(["a", "b"]) + + tm.assert_numpy_array_equal(codes, expected_codes) + tm.assert_categorical_equal(uniques, expected_uniques) + + +def test_factorized_sort_ordered(): + cat = pd.Categorical( + ["b", "b", None, "a"], categories=["c", "b", "a"], ordered=True + ) + + codes, uniques = pd.factorize(cat, sort=True) + expected_codes = np.array([0, 0, -1, 1], dtype=np.intp) + expected_uniques = pd.Categorical( + ["b", "a"], categories=["c", "b", "a"], ordered=True + ) + + tm.assert_numpy_array_equal(codes, expected_codes) + tm.assert_categorical_equal(uniques, expected_uniques) + + +def test_isin_cats(): + # GH2003 + cat = pd.Categorical(["a", "b", np.nan]) + + result = cat.isin(["a", np.nan]) + expected = np.array([True, False, True], dtype=bool) + tm.assert_numpy_array_equal(expected, result) + + result = cat.isin(["a", "c"]) + expected = np.array([True, False, False], dtype=bool) + tm.assert_numpy_array_equal(expected, result) + + +@pytest.mark.parametrize("value", [[""], [None, ""], [pd.NaT, ""]]) +def test_isin_cats_corner_cases(value): + # GH36550 + cat = pd.Categorical([""]) + result = cat.isin(value) + expected = np.array([True], dtype=bool) + tm.assert_numpy_array_equal(expected, result) + + +@pytest.mark.parametrize("empty", [[], pd.Series(dtype=object), np.array([])]) +def test_isin_empty(empty): + s = pd.Categorical(["a", "b"]) + expected = np.array([False, False], dtype=bool) + + result = s.isin(empty) + tm.assert_numpy_array_equal(expected, result) + + +def test_diff(): + ser = pd.Series([1, 2, 3], dtype="category") + + msg = "Convert to a suitable dtype" + with pytest.raises(TypeError, match=msg): + ser.diff() + + df = ser.to_frame(name="A") + with pytest.raises(TypeError, match=msg): + df.diff() diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/categorical/test_analytics.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/categorical/test_analytics.py new file mode 100644 index 0000000000000000000000000000000000000000..c2c53fbc4637ed60dc92914f6e2ca74d5e0bdfe9 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/categorical/test_analytics.py @@ -0,0 +1,349 @@ +import re +import sys + +import numpy as np +import pytest + +from pandas.compat import PYPY + +from pandas import ( + Categorical, + CategoricalDtype, + DataFrame, + Index, + NaT, + Series, + date_range, +) +import pandas._testing as tm +from pandas.api.types import is_scalar + + +class TestCategoricalAnalytics: + @pytest.mark.parametrize("aggregation", ["min", "max"]) + def test_min_max_not_ordered_raises(self, aggregation): + # unordered cats have no min/max + cat = Categorical(["a", "b", "c", "d"], ordered=False) + msg = f"Categorical is not ordered for operation {aggregation}" + agg_func = getattr(cat, aggregation) + + with pytest.raises(TypeError, match=msg): + agg_func() + + ufunc = np.minimum if aggregation == "min" else np.maximum + with pytest.raises(TypeError, match=msg): + ufunc.reduce(cat) + + def test_min_max_ordered(self, index_or_series_or_array): + cat = Categorical(["a", "b", "c", "d"], ordered=True) + obj = index_or_series_or_array(cat) + _min = obj.min() + _max = obj.max() + assert _min == "a" + assert _max == "d" + + assert np.minimum.reduce(obj) == "a" + assert np.maximum.reduce(obj) == "d" + # TODO: raises if we pass axis=0 (on Index and Categorical, not Series) + + cat = Categorical( + ["a", "b", "c", "d"], categories=["d", "c", "b", "a"], ordered=True + ) + obj = index_or_series_or_array(cat) + _min = obj.min() + _max = obj.max() + assert _min == "d" + assert _max == "a" + assert np.minimum.reduce(obj) == "d" + assert np.maximum.reduce(obj) == "a" + + def test_min_max_reduce(self): + # GH52788 + cat = Categorical(["a", "b", "c", "d"], ordered=True) + df = DataFrame(cat) + + result_max = df.agg("max") + expected_max = Series(Categorical(["d"], dtype=cat.dtype)) + tm.assert_series_equal(result_max, expected_max) + + result_min = df.agg("min") + expected_min = Series(Categorical(["a"], dtype=cat.dtype)) + tm.assert_series_equal(result_min, expected_min) + + @pytest.mark.parametrize( + "categories,expected", + [ + (list("ABC"), np.nan), + ([1, 2, 3], np.nan), + pytest.param( + Series(date_range("2020-01-01", periods=3), dtype="category"), + NaT, + marks=pytest.mark.xfail( + reason="https://github.com/pandas-dev/pandas/issues/29962" + ), + ), + ], + ) + @pytest.mark.parametrize("aggregation", ["min", "max"]) + def test_min_max_ordered_empty(self, categories, expected, aggregation): + # GH 30227 + cat = Categorical([], categories=categories, ordered=True) + + agg_func = getattr(cat, aggregation) + result = agg_func() + assert result is expected + + @pytest.mark.parametrize( + "values, categories", + [(["a", "b", "c", np.nan], list("cba")), ([1, 2, 3, np.nan], [3, 2, 1])], + ) + @pytest.mark.parametrize("skipna", [True, False]) + @pytest.mark.parametrize("function", ["min", "max"]) + def test_min_max_with_nan(self, values, categories, function, skipna): + # GH 25303 + cat = Categorical(values, categories=categories, ordered=True) + result = getattr(cat, function)(skipna=skipna) + + if skipna is False: + assert result is np.nan + else: + expected = categories[0] if function == "min" else categories[2] + assert result == expected + + @pytest.mark.parametrize("function", ["min", "max"]) + @pytest.mark.parametrize("skipna", [True, False]) + def test_min_max_only_nan(self, function, skipna): + # https://github.com/pandas-dev/pandas/issues/33450 + cat = Categorical([np.nan], categories=[1, 2], ordered=True) + result = getattr(cat, function)(skipna=skipna) + assert result is np.nan + + @pytest.mark.parametrize("method", ["min", "max"]) + def test_numeric_only_min_max_raises(self, method): + # GH 25303 + cat = Categorical( + [np.nan, 1, 2, np.nan], categories=[5, 4, 3, 2, 1], ordered=True + ) + with pytest.raises(TypeError, match=".* got an unexpected keyword"): + getattr(cat, method)(numeric_only=True) + + @pytest.mark.parametrize("method", ["min", "max"]) + def test_numpy_min_max_raises(self, method): + cat = Categorical(["a", "b", "c", "b"], ordered=False) + msg = ( + f"Categorical is not ordered for operation {method}\n" + "you can use .as_ordered() to change the Categorical to an ordered one" + ) + method = getattr(np, method) + with pytest.raises(TypeError, match=re.escape(msg)): + method(cat) + + @pytest.mark.parametrize("kwarg", ["axis", "out", "keepdims"]) + @pytest.mark.parametrize("method", ["min", "max"]) + def test_numpy_min_max_unsupported_kwargs_raises(self, method, kwarg): + cat = Categorical(["a", "b", "c", "b"], ordered=True) + msg = ( + f"the '{kwarg}' parameter is not supported in the pandas implementation " + f"of {method}" + ) + if kwarg == "axis": + msg = r"`axis` must be fewer than the number of dimensions \(1\)" + kwargs = {kwarg: 42} + method = getattr(np, method) + with pytest.raises(ValueError, match=msg): + method(cat, **kwargs) + + @pytest.mark.parametrize("method, expected", [("min", "a"), ("max", "c")]) + def test_numpy_min_max_axis_equals_none(self, method, expected): + cat = Categorical(["a", "b", "c", "b"], ordered=True) + method = getattr(np, method) + result = method(cat, axis=None) + assert result == expected + + @pytest.mark.parametrize( + "values,categories,exp_mode", + [ + ([1, 1, 2, 4, 5, 5, 5], [5, 4, 3, 2, 1], [5]), + ([1, 1, 1, 4, 5, 5, 5], [5, 4, 3, 2, 1], [5, 1]), + ([1, 2, 3, 4, 5], [5, 4, 3, 2, 1], [5, 4, 3, 2, 1]), + ([np.nan, np.nan, np.nan, 4, 5], [5, 4, 3, 2, 1], [5, 4]), + ([np.nan, np.nan, np.nan, 4, 5, 4], [5, 4, 3, 2, 1], [4]), + ([np.nan, np.nan, 4, 5, 4], [5, 4, 3, 2, 1], [4]), + ], + ) + def test_mode(self, values, categories, exp_mode): + cat = Categorical(values, categories=categories, ordered=True) + res = Series(cat).mode()._values + exp = Categorical(exp_mode, categories=categories, ordered=True) + tm.assert_categorical_equal(res, exp) + + def test_searchsorted(self, ordered): + # https://github.com/pandas-dev/pandas/issues/8420 + # https://github.com/pandas-dev/pandas/issues/14522 + + cat = Categorical( + ["cheese", "milk", "apple", "bread", "bread"], + categories=["cheese", "milk", "apple", "bread"], + ordered=ordered, + ) + ser = Series(cat) + + # Searching for single item argument, side='left' (default) + res_cat = cat.searchsorted("apple") + assert res_cat == 2 + assert is_scalar(res_cat) + + res_ser = ser.searchsorted("apple") + assert res_ser == 2 + assert is_scalar(res_ser) + + # Searching for single item array, side='left' (default) + res_cat = cat.searchsorted(["bread"]) + res_ser = ser.searchsorted(["bread"]) + exp = np.array([3], dtype=np.intp) + tm.assert_numpy_array_equal(res_cat, exp) + tm.assert_numpy_array_equal(res_ser, exp) + + # Searching for several items array, side='right' + res_cat = cat.searchsorted(["apple", "bread"], side="right") + res_ser = ser.searchsorted(["apple", "bread"], side="right") + exp = np.array([3, 5], dtype=np.intp) + tm.assert_numpy_array_equal(res_cat, exp) + tm.assert_numpy_array_equal(res_ser, exp) + + # Searching for a single value that is not from the Categorical + with pytest.raises(TypeError, match="cucumber"): + cat.searchsorted("cucumber") + with pytest.raises(TypeError, match="cucumber"): + ser.searchsorted("cucumber") + + # Searching for multiple values one of each is not from the Categorical + msg = ( + "Cannot setitem on a Categorical with a new category, " + "set the categories first" + ) + with pytest.raises(TypeError, match=msg): + cat.searchsorted(["bread", "cucumber"]) + with pytest.raises(TypeError, match=msg): + ser.searchsorted(["bread", "cucumber"]) + + def test_unique(self, ordered): + # GH38140 + dtype = CategoricalDtype(["a", "b", "c"], ordered=ordered) + + # categories are reordered based on value when ordered=False + cat = Categorical(["a", "b", "c"], dtype=dtype) + res = cat.unique() + tm.assert_categorical_equal(res, cat) + + cat = Categorical(["a", "b", "a", "a"], dtype=dtype) + res = cat.unique() + tm.assert_categorical_equal(res, Categorical(["a", "b"], dtype=dtype)) + + cat = Categorical(["c", "a", "b", "a", "a"], dtype=dtype) + res = cat.unique() + exp_cat = Categorical(["c", "a", "b"], dtype=dtype) + tm.assert_categorical_equal(res, exp_cat) + + # nan must be removed + cat = Categorical(["b", np.nan, "b", np.nan, "a"], dtype=dtype) + res = cat.unique() + exp_cat = Categorical(["b", np.nan, "a"], dtype=dtype) + tm.assert_categorical_equal(res, exp_cat) + + def test_unique_index_series(self, ordered): + # GH38140 + dtype = CategoricalDtype([3, 2, 1], ordered=ordered) + + c = Categorical([3, 1, 2, 2, 1], dtype=dtype) + # Categorical.unique sorts categories by appearance order + # if ordered=False + exp = Categorical([3, 1, 2], dtype=dtype) + tm.assert_categorical_equal(c.unique(), exp) + + tm.assert_index_equal(Index(c).unique(), Index(exp)) + tm.assert_categorical_equal(Series(c).unique(), exp) + + c = Categorical([1, 1, 2, 2], dtype=dtype) + exp = Categorical([1, 2], dtype=dtype) + tm.assert_categorical_equal(c.unique(), exp) + tm.assert_index_equal(Index(c).unique(), Index(exp)) + tm.assert_categorical_equal(Series(c).unique(), exp) + + def test_shift(self): + # GH 9416 + cat = Categorical(["a", "b", "c", "d", "a"]) + + # shift forward + sp1 = cat.shift(1) + xp1 = Categorical([np.nan, "a", "b", "c", "d"]) + tm.assert_categorical_equal(sp1, xp1) + tm.assert_categorical_equal(cat[:-1], sp1[1:]) + + # shift back + sn2 = cat.shift(-2) + xp2 = Categorical( + ["c", "d", "a", np.nan, np.nan], categories=["a", "b", "c", "d"] + ) + tm.assert_categorical_equal(sn2, xp2) + tm.assert_categorical_equal(cat[2:], sn2[:-2]) + + # shift by zero + tm.assert_categorical_equal(cat, cat.shift(0)) + + def test_nbytes(self): + cat = Categorical([1, 2, 3]) + exp = 3 + 3 * 8 # 3 int8s for values + 3 int64s for categories + assert cat.nbytes == exp + + def test_memory_usage(self): + cat = Categorical([1, 2, 3]) + + # .categories is an index, so we include the hashtable + assert 0 < cat.nbytes <= cat.memory_usage() + assert 0 < cat.nbytes <= cat.memory_usage(deep=True) + + cat = Categorical(["foo", "foo", "bar"]) + assert cat.memory_usage(deep=True) > cat.nbytes + + if not PYPY: + # sys.getsizeof will call the .memory_usage with + # deep=True, and add on some GC overhead + diff = cat.memory_usage(deep=True) - sys.getsizeof(cat) + assert abs(diff) < 100 + + def test_map(self): + c = Categorical(list("ABABC"), categories=list("CBA"), ordered=True) + result = c.map(lambda x: x.lower(), na_action=None) + exp = Categorical(list("ababc"), categories=list("cba"), ordered=True) + tm.assert_categorical_equal(result, exp) + + c = Categorical(list("ABABC"), categories=list("ABC"), ordered=False) + result = c.map(lambda x: x.lower(), na_action=None) + exp = Categorical(list("ababc"), categories=list("abc"), ordered=False) + tm.assert_categorical_equal(result, exp) + + result = c.map(lambda x: 1, na_action=None) + # GH 12766: Return an index not an array + tm.assert_index_equal(result, Index(np.array([1] * 5, dtype=np.int64))) + + @pytest.mark.parametrize("value", [1, "True", [1, 2, 3], 5.0]) + def test_validate_inplace_raises(self, value): + cat = Categorical(["A", "B", "B", "C", "A"]) + msg = ( + 'For argument "inplace" expected type bool, ' + f"received type {type(value).__name__}" + ) + + with pytest.raises(ValueError, match=msg): + cat.sort_values(inplace=value) + + def test_quantile_empty(self): + # make sure we have correct itemsize on resulting codes + cat = Categorical(["A", "B"]) + idx = Index([0.0, 0.5]) + result = cat[:0]._quantile(idx, interpolation="linear") + assert result._codes.dtype == np.int8 + + expected = cat.take([-1, -1], allow_fill=True) + tm.assert_extension_array_equal(result, expected) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/categorical/test_api.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/categorical/test_api.py new file mode 100644 index 0000000000000000000000000000000000000000..a939ee5f6f53f805211d46773c625c8361203991 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/categorical/test_api.py @@ -0,0 +1,501 @@ +import re + +import numpy as np +import pytest + +from pandas.compat import PY311 + +from pandas import ( + Categorical, + CategoricalIndex, + DataFrame, + Index, + Series, + StringDtype, +) +import pandas._testing as tm +from pandas.core.arrays.categorical import recode_for_categories + + +class TestCategoricalAPI: + def test_to_list_deprecated(self): + # GH#51254 + cat1 = Categorical(list("acb"), ordered=False) + msg = "Categorical.to_list is deprecated and will be removed" + with tm.assert_produces_warning(FutureWarning, match=msg): + cat1.to_list() + + def test_ordered_api(self): + # GH 9347 + cat1 = Categorical(list("acb"), ordered=False) + tm.assert_index_equal(cat1.categories, Index(["a", "b", "c"])) + assert not cat1.ordered + + cat2 = Categorical(list("acb"), categories=list("bca"), ordered=False) + tm.assert_index_equal(cat2.categories, Index(["b", "c", "a"])) + assert not cat2.ordered + + cat3 = Categorical(list("acb"), ordered=True) + tm.assert_index_equal(cat3.categories, Index(["a", "b", "c"])) + assert cat3.ordered + + cat4 = Categorical(list("acb"), categories=list("bca"), ordered=True) + tm.assert_index_equal(cat4.categories, Index(["b", "c", "a"])) + assert cat4.ordered + + def test_set_ordered(self): + cat = Categorical(["a", "b", "c", "a"], ordered=True) + cat2 = cat.as_unordered() + assert not cat2.ordered + cat2 = cat.as_ordered() + assert cat2.ordered + + assert cat2.set_ordered(True).ordered + assert not cat2.set_ordered(False).ordered + + # removed in 0.19.0 + msg = ( + "property 'ordered' of 'Categorical' object has no setter" + if PY311 + else "can't set attribute" + ) + with pytest.raises(AttributeError, match=msg): + cat.ordered = True + with pytest.raises(AttributeError, match=msg): + cat.ordered = False + + def test_rename_categories(self): + cat = Categorical(["a", "b", "c", "a"]) + + # inplace=False: the old one must not be changed + res = cat.rename_categories([1, 2, 3]) + tm.assert_numpy_array_equal( + res.__array__(), np.array([1, 2, 3, 1], dtype=np.int64) + ) + tm.assert_index_equal(res.categories, Index([1, 2, 3])) + + exp_cat = np.array(["a", "b", "c", "a"], dtype=np.object_) + tm.assert_numpy_array_equal(cat.__array__(), exp_cat) + + exp_cat = Index(["a", "b", "c"]) + tm.assert_index_equal(cat.categories, exp_cat) + + # GH18862 (let rename_categories take callables) + result = cat.rename_categories(lambda x: x.upper()) + expected = Categorical(["A", "B", "C", "A"]) + tm.assert_categorical_equal(result, expected) + + @pytest.mark.parametrize("new_categories", [[1, 2, 3, 4], [1, 2]]) + def test_rename_categories_wrong_length_raises(self, new_categories): + cat = Categorical(["a", "b", "c", "a"]) + msg = ( + "new categories need to have the same number of items as the " + "old categories!" + ) + with pytest.raises(ValueError, match=msg): + cat.rename_categories(new_categories) + + def test_rename_categories_series(self): + # https://github.com/pandas-dev/pandas/issues/17981 + c = Categorical(["a", "b"]) + result = c.rename_categories(Series([0, 1], index=["a", "b"])) + expected = Categorical([0, 1]) + tm.assert_categorical_equal(result, expected) + + def test_rename_categories_dict(self): + # GH 17336 + cat = Categorical(["a", "b", "c", "d"]) + res = cat.rename_categories({"a": 4, "b": 3, "c": 2, "d": 1}) + expected = Index([4, 3, 2, 1]) + tm.assert_index_equal(res.categories, expected) + + # Test for dicts of smaller length + cat = Categorical(["a", "b", "c", "d"]) + res = cat.rename_categories({"a": 1, "c": 3}) + + expected = Index([1, "b", 3, "d"]) + tm.assert_index_equal(res.categories, expected) + + # Test for dicts with bigger length + cat = Categorical(["a", "b", "c", "d"]) + res = cat.rename_categories({"a": 1, "b": 2, "c": 3, "d": 4, "e": 5, "f": 6}) + expected = Index([1, 2, 3, 4]) + tm.assert_index_equal(res.categories, expected) + + # Test for dicts with no items from old categories + cat = Categorical(["a", "b", "c", "d"]) + res = cat.rename_categories({"f": 1, "g": 3}) + + expected = Index(["a", "b", "c", "d"]) + tm.assert_index_equal(res.categories, expected) + + def test_reorder_categories(self): + cat = Categorical(["a", "b", "c", "a"], ordered=True) + old = cat.copy() + new = Categorical( + ["a", "b", "c", "a"], categories=["c", "b", "a"], ordered=True + ) + + res = cat.reorder_categories(["c", "b", "a"]) + # cat must be the same as before + tm.assert_categorical_equal(cat, old) + # only res is changed + tm.assert_categorical_equal(res, new) + + @pytest.mark.parametrize( + "new_categories", + [ + ["a"], # not all "old" included in "new" + ["a", "b", "d"], # still not all "old" in "new" + ["a", "b", "c", "d"], # all "old" included in "new", but too long + ], + ) + def test_reorder_categories_raises(self, new_categories): + cat = Categorical(["a", "b", "c", "a"], ordered=True) + msg = "items in new_categories are not the same as in old categories" + with pytest.raises(ValueError, match=msg): + cat.reorder_categories(new_categories) + + def test_add_categories(self): + cat = Categorical(["a", "b", "c", "a"], ordered=True) + old = cat.copy() + new = Categorical( + ["a", "b", "c", "a"], categories=["a", "b", "c", "d"], ordered=True + ) + + res = cat.add_categories("d") + tm.assert_categorical_equal(cat, old) + tm.assert_categorical_equal(res, new) + + res = cat.add_categories(["d"]) + tm.assert_categorical_equal(cat, old) + tm.assert_categorical_equal(res, new) + + # GH 9927 + cat = Categorical(list("abc"), ordered=True) + expected = Categorical(list("abc"), categories=list("abcde"), ordered=True) + # test with Series, np.array, index, list + res = cat.add_categories(Series(["d", "e"])) + tm.assert_categorical_equal(res, expected) + res = cat.add_categories(np.array(["d", "e"])) + tm.assert_categorical_equal(res, expected) + res = cat.add_categories(Index(["d", "e"])) + tm.assert_categorical_equal(res, expected) + res = cat.add_categories(["d", "e"]) + tm.assert_categorical_equal(res, expected) + + def test_add_categories_existing_raises(self): + # new is in old categories + cat = Categorical(["a", "b", "c", "d"], ordered=True) + msg = re.escape("new categories must not include old categories: {'d'}") + with pytest.raises(ValueError, match=msg): + cat.add_categories(["d"]) + + def test_add_categories_losing_dtype_information(self): + # GH#48812 + cat = Categorical(Series([1, 2], dtype="Int64")) + ser = Series([4], dtype="Int64") + result = cat.add_categories(ser) + expected = Categorical( + Series([1, 2], dtype="Int64"), categories=Series([1, 2, 4], dtype="Int64") + ) + tm.assert_categorical_equal(result, expected) + + cat = Categorical(Series(["a", "b", "a"], dtype=StringDtype())) + ser = Series(["d"], dtype=StringDtype()) + result = cat.add_categories(ser) + expected = Categorical( + Series(["a", "b", "a"], dtype=StringDtype()), + categories=Series(["a", "b", "d"], dtype=StringDtype()), + ) + tm.assert_categorical_equal(result, expected) + + def test_set_categories(self): + cat = Categorical(["a", "b", "c", "a"], ordered=True) + exp_categories = Index(["c", "b", "a"]) + exp_values = np.array(["a", "b", "c", "a"], dtype=np.object_) + + cat = cat.set_categories(["c", "b", "a"]) + res = cat.set_categories(["a", "b", "c"]) + # cat must be the same as before + tm.assert_index_equal(cat.categories, exp_categories) + tm.assert_numpy_array_equal(cat.__array__(), exp_values) + # only res is changed + exp_categories_back = Index(["a", "b", "c"]) + tm.assert_index_equal(res.categories, exp_categories_back) + tm.assert_numpy_array_equal(res.__array__(), exp_values) + + # not all "old" included in "new" -> all not included ones are now + # np.nan + cat = Categorical(["a", "b", "c", "a"], ordered=True) + res = cat.set_categories(["a"]) + tm.assert_numpy_array_equal(res.codes, np.array([0, -1, -1, 0], dtype=np.int8)) + + # still not all "old" in "new" + res = cat.set_categories(["a", "b", "d"]) + tm.assert_numpy_array_equal(res.codes, np.array([0, 1, -1, 0], dtype=np.int8)) + tm.assert_index_equal(res.categories, Index(["a", "b", "d"])) + + # all "old" included in "new" + cat = cat.set_categories(["a", "b", "c", "d"]) + exp_categories = Index(["a", "b", "c", "d"]) + tm.assert_index_equal(cat.categories, exp_categories) + + # internals... + c = Categorical([1, 2, 3, 4, 1], categories=[1, 2, 3, 4], ordered=True) + tm.assert_numpy_array_equal(c._codes, np.array([0, 1, 2, 3, 0], dtype=np.int8)) + tm.assert_index_equal(c.categories, Index([1, 2, 3, 4])) + + exp = np.array([1, 2, 3, 4, 1], dtype=np.int64) + tm.assert_numpy_array_equal(np.asarray(c), exp) + + # all "pointers" to '4' must be changed from 3 to 0,... + c = c.set_categories([4, 3, 2, 1]) + + # positions are changed + tm.assert_numpy_array_equal(c._codes, np.array([3, 2, 1, 0, 3], dtype=np.int8)) + + # categories are now in new order + tm.assert_index_equal(c.categories, Index([4, 3, 2, 1])) + + # output is the same + exp = np.array([1, 2, 3, 4, 1], dtype=np.int64) + tm.assert_numpy_array_equal(np.asarray(c), exp) + assert c.min() == 4 + assert c.max() == 1 + + # set_categories should set the ordering if specified + c2 = c.set_categories([4, 3, 2, 1], ordered=False) + assert not c2.ordered + + tm.assert_numpy_array_equal(np.asarray(c), np.asarray(c2)) + + # set_categories should pass thru the ordering + c2 = c.set_ordered(False).set_categories([4, 3, 2, 1]) + assert not c2.ordered + + tm.assert_numpy_array_equal(np.asarray(c), np.asarray(c2)) + + @pytest.mark.parametrize( + "values, categories, new_categories", + [ + # No NaNs, same cats, same order + (["a", "b", "a"], ["a", "b"], ["a", "b"]), + # No NaNs, same cats, different order + (["a", "b", "a"], ["a", "b"], ["b", "a"]), + # Same, unsorted + (["b", "a", "a"], ["a", "b"], ["a", "b"]), + # No NaNs, same cats, different order + (["b", "a", "a"], ["a", "b"], ["b", "a"]), + # NaNs + (["a", "b", "c"], ["a", "b"], ["a", "b"]), + (["a", "b", "c"], ["a", "b"], ["b", "a"]), + (["b", "a", "c"], ["a", "b"], ["a", "b"]), + (["b", "a", "c"], ["a", "b"], ["a", "b"]), + # Introduce NaNs + (["a", "b", "c"], ["a", "b"], ["a"]), + (["a", "b", "c"], ["a", "b"], ["b"]), + (["b", "a", "c"], ["a", "b"], ["a"]), + (["b", "a", "c"], ["a", "b"], ["a"]), + # No overlap + (["a", "b", "c"], ["a", "b"], ["d", "e"]), + ], + ) + @pytest.mark.parametrize("ordered", [True, False]) + def test_set_categories_many(self, values, categories, new_categories, ordered): + c = Categorical(values, categories) + expected = Categorical(values, new_categories, ordered) + result = c.set_categories(new_categories, ordered=ordered) + tm.assert_categorical_equal(result, expected) + + def test_set_categories_rename_less(self): + # GH 24675 + cat = Categorical(["A", "B"]) + result = cat.set_categories(["A"], rename=True) + expected = Categorical(["A", np.nan]) + tm.assert_categorical_equal(result, expected) + + def test_set_categories_private(self): + cat = Categorical(["a", "b", "c"], categories=["a", "b", "c", "d"]) + cat._set_categories(["a", "c", "d", "e"]) + expected = Categorical(["a", "c", "d"], categories=list("acde")) + tm.assert_categorical_equal(cat, expected) + + # fastpath + cat = Categorical(["a", "b", "c"], categories=["a", "b", "c", "d"]) + cat._set_categories(["a", "c", "d", "e"], fastpath=True) + expected = Categorical(["a", "c", "d"], categories=list("acde")) + tm.assert_categorical_equal(cat, expected) + + def test_remove_categories(self): + cat = Categorical(["a", "b", "c", "a"], ordered=True) + old = cat.copy() + new = Categorical(["a", "b", np.nan, "a"], categories=["a", "b"], ordered=True) + + res = cat.remove_categories("c") + tm.assert_categorical_equal(cat, old) + tm.assert_categorical_equal(res, new) + + res = cat.remove_categories(["c"]) + tm.assert_categorical_equal(cat, old) + tm.assert_categorical_equal(res, new) + + @pytest.mark.parametrize("removals", [["c"], ["c", np.nan], "c", ["c", "c"]]) + def test_remove_categories_raises(self, removals): + cat = Categorical(["a", "b", "a"]) + message = re.escape("removals must all be in old categories: {'c'}") + + with pytest.raises(ValueError, match=message): + cat.remove_categories(removals) + + def test_remove_unused_categories(self): + c = Categorical(["a", "b", "c", "d", "a"], categories=["a", "b", "c", "d", "e"]) + exp_categories_all = Index(["a", "b", "c", "d", "e"]) + exp_categories_dropped = Index(["a", "b", "c", "d"]) + + tm.assert_index_equal(c.categories, exp_categories_all) + + res = c.remove_unused_categories() + tm.assert_index_equal(res.categories, exp_categories_dropped) + tm.assert_index_equal(c.categories, exp_categories_all) + + # with NaN values (GH11599) + c = Categorical(["a", "b", "c", np.nan], categories=["a", "b", "c", "d", "e"]) + res = c.remove_unused_categories() + tm.assert_index_equal(res.categories, Index(np.array(["a", "b", "c"]))) + exp_codes = np.array([0, 1, 2, -1], dtype=np.int8) + tm.assert_numpy_array_equal(res.codes, exp_codes) + tm.assert_index_equal(c.categories, exp_categories_all) + + val = ["F", np.nan, "D", "B", "D", "F", np.nan] + cat = Categorical(values=val, categories=list("ABCDEFG")) + out = cat.remove_unused_categories() + tm.assert_index_equal(out.categories, Index(["B", "D", "F"])) + exp_codes = np.array([2, -1, 1, 0, 1, 2, -1], dtype=np.int8) + tm.assert_numpy_array_equal(out.codes, exp_codes) + assert out.tolist() == val + + alpha = list("abcdefghijklmnopqrstuvwxyz") + val = np.random.default_rng(2).choice(alpha[::2], 10000).astype("object") + val[np.random.default_rng(2).choice(len(val), 100)] = np.nan + + cat = Categorical(values=val, categories=alpha) + out = cat.remove_unused_categories() + assert out.tolist() == val.tolist() + + +class TestCategoricalAPIWithFactor: + def test_describe(self): + factor = Categorical(["a", "b", "b", "a", "a", "c", "c", "c"], ordered=True) + # string type + desc = factor.describe() + assert factor.ordered + exp_index = CategoricalIndex( + ["a", "b", "c"], name="categories", ordered=factor.ordered + ) + expected = DataFrame( + {"counts": [3, 2, 3], "freqs": [3 / 8.0, 2 / 8.0, 3 / 8.0]}, index=exp_index + ) + tm.assert_frame_equal(desc, expected) + + # check unused categories + cat = factor.copy() + cat = cat.set_categories(["a", "b", "c", "d"]) + desc = cat.describe() + + exp_index = CategoricalIndex( + list("abcd"), ordered=factor.ordered, name="categories" + ) + expected = DataFrame( + {"counts": [3, 2, 3, 0], "freqs": [3 / 8.0, 2 / 8.0, 3 / 8.0, 0]}, + index=exp_index, + ) + tm.assert_frame_equal(desc, expected) + + # check an integer one + cat = Categorical([1, 2, 3, 1, 2, 3, 3, 2, 1, 1, 1]) + desc = cat.describe() + exp_index = CategoricalIndex([1, 2, 3], ordered=cat.ordered, name="categories") + expected = DataFrame( + {"counts": [5, 3, 3], "freqs": [5 / 11.0, 3 / 11.0, 3 / 11.0]}, + index=exp_index, + ) + tm.assert_frame_equal(desc, expected) + + # https://github.com/pandas-dev/pandas/issues/3678 + # describe should work with NaN + cat = Categorical([np.nan, 1, 2, 2]) + desc = cat.describe() + expected = DataFrame( + {"counts": [1, 2, 1], "freqs": [1 / 4.0, 2 / 4.0, 1 / 4.0]}, + index=CategoricalIndex( + [1, 2, np.nan], categories=[1, 2], name="categories" + ), + ) + tm.assert_frame_equal(desc, expected) + + +class TestPrivateCategoricalAPI: + def test_codes_immutable(self): + # Codes should be read only + c = Categorical(["a", "b", "c", "a", np.nan]) + exp = np.array([0, 1, 2, 0, -1], dtype="int8") + tm.assert_numpy_array_equal(c.codes, exp) + + # Assignments to codes should raise + msg = ( + "property 'codes' of 'Categorical' object has no setter" + if PY311 + else "can't set attribute" + ) + with pytest.raises(AttributeError, match=msg): + c.codes = np.array([0, 1, 2, 0, 1], dtype="int8") + + # changes in the codes array should raise + codes = c.codes + + with pytest.raises(ValueError, match="assignment destination is read-only"): + codes[4] = 1 + + # But even after getting the codes, the original array should still be + # writeable! + c[4] = "a" + exp = np.array([0, 1, 2, 0, 0], dtype="int8") + tm.assert_numpy_array_equal(c.codes, exp) + c._codes[4] = 2 + exp = np.array([0, 1, 2, 0, 2], dtype="int8") + tm.assert_numpy_array_equal(c.codes, exp) + + @pytest.mark.parametrize( + "codes, old, new, expected", + [ + ([0, 1], ["a", "b"], ["a", "b"], [0, 1]), + ([0, 1], ["b", "a"], ["b", "a"], [0, 1]), + ([0, 1], ["a", "b"], ["b", "a"], [1, 0]), + ([0, 1], ["b", "a"], ["a", "b"], [1, 0]), + ([0, 1, 0, 1], ["a", "b"], ["a", "b", "c"], [0, 1, 0, 1]), + ([0, 1, 2, 2], ["a", "b", "c"], ["a", "b"], [0, 1, -1, -1]), + ([0, 1, -1], ["a", "b", "c"], ["a", "b", "c"], [0, 1, -1]), + ([0, 1, -1], ["a", "b", "c"], ["b"], [-1, 0, -1]), + ([0, 1, -1], ["a", "b", "c"], ["d"], [-1, -1, -1]), + ([0, 1, -1], ["a", "b", "c"], [], [-1, -1, -1]), + ([-1, -1], [], ["a", "b"], [-1, -1]), + ([1, 0], ["b", "a"], ["a", "b"], [0, 1]), + ], + ) + def test_recode_to_categories(self, codes, old, new, expected): + codes = np.asanyarray(codes, dtype=np.int8) + expected = np.asanyarray(expected, dtype=np.int8) + old = Index(old) + new = Index(new) + result = recode_for_categories(codes, old, new) + tm.assert_numpy_array_equal(result, expected) + + def test_recode_to_categories_large(self): + N = 1000 + codes = np.arange(N) + old = Index(codes) + expected = np.arange(N - 1, -1, -1, dtype=np.int16) + new = Index(expected) + result = recode_for_categories(codes, old, new) + tm.assert_numpy_array_equal(result, expected) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/categorical/test_astype.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/categorical/test_astype.py new file mode 100644 index 0000000000000000000000000000000000000000..a2a53af6ab1ad3701a58bcc6d00929ad2629d36b --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/categorical/test_astype.py @@ -0,0 +1,155 @@ +import numpy as np +import pytest + +from pandas import ( + Categorical, + CategoricalDtype, + CategoricalIndex, + DatetimeIndex, + Interval, + NaT, + Period, + Timestamp, + array, + to_datetime, +) +import pandas._testing as tm + + +class TestAstype: + @pytest.mark.parametrize("cls", [Categorical, CategoricalIndex]) + @pytest.mark.parametrize("values", [[1, np.nan], [Timestamp("2000"), NaT]]) + def test_astype_nan_to_int(self, cls, values): + # GH#28406 + obj = cls(values) + + msg = "Cannot (cast|convert)" + with pytest.raises((ValueError, TypeError), match=msg): + obj.astype(int) + + @pytest.mark.parametrize( + "expected", + [ + array(["2019", "2020"], dtype="datetime64[ns, UTC]"), + array([0, 0], dtype="timedelta64[ns]"), + array([Period("2019"), Period("2020")], dtype="period[Y-DEC]"), + array([Interval(0, 1), Interval(1, 2)], dtype="interval"), + array([1, np.nan], dtype="Int64"), + ], + ) + def test_astype_category_to_extension_dtype(self, expected): + # GH#28668 + result = expected.astype("category").astype(expected.dtype) + + tm.assert_extension_array_equal(result, expected) + + @pytest.mark.parametrize( + "dtype, expected", + [ + ( + "datetime64[ns]", + np.array(["2015-01-01T00:00:00.000000000"], dtype="datetime64[ns]"), + ), + ( + "datetime64[ns, MET]", + DatetimeIndex([Timestamp("2015-01-01 00:00:00+0100", tz="MET")]).array, + ), + ], + ) + def test_astype_to_datetime64(self, dtype, expected): + # GH#28448 + result = Categorical(["2015-01-01"]).astype(dtype) + assert result == expected + + def test_astype_str_int_categories_to_nullable_int(self): + # GH#39616 + dtype = CategoricalDtype([str(i) for i in range(5)]) + codes = np.random.default_rng(2).integers(5, size=20) + arr = Categorical.from_codes(codes, dtype=dtype) + + res = arr.astype("Int64") + expected = array(codes, dtype="Int64") + tm.assert_extension_array_equal(res, expected) + + def test_astype_str_int_categories_to_nullable_float(self): + # GH#39616 + dtype = CategoricalDtype([str(i / 2) for i in range(5)]) + codes = np.random.default_rng(2).integers(5, size=20) + arr = Categorical.from_codes(codes, dtype=dtype) + + res = arr.astype("Float64") + expected = array(codes, dtype="Float64") / 2 + tm.assert_extension_array_equal(res, expected) + + @pytest.mark.parametrize("ordered", [True, False]) + def test_astype(self, ordered): + # string + cat = Categorical(list("abbaaccc"), ordered=ordered) + result = cat.astype(object) + expected = np.array(cat) + tm.assert_numpy_array_equal(result, expected) + + msg = r"Cannot cast object|string dtype to float64" + with pytest.raises(ValueError, match=msg): + cat.astype(float) + + # numeric + cat = Categorical([0, 1, 2, 2, 1, 0, 1, 0, 2], ordered=ordered) + result = cat.astype(object) + expected = np.array(cat, dtype=object) + tm.assert_numpy_array_equal(result, expected) + + result = cat.astype(int) + expected = np.array(cat, dtype="int") + tm.assert_numpy_array_equal(result, expected) + + result = cat.astype(float) + expected = np.array(cat, dtype=float) + tm.assert_numpy_array_equal(result, expected) + + @pytest.mark.parametrize("dtype_ordered", [True, False]) + @pytest.mark.parametrize("cat_ordered", [True, False]) + def test_astype_category(self, dtype_ordered, cat_ordered): + # GH#10696/GH#18593 + data = list("abcaacbab") + cat = Categorical(data, categories=list("bac"), ordered=cat_ordered) + + # standard categories + dtype = CategoricalDtype(ordered=dtype_ordered) + result = cat.astype(dtype) + expected = Categorical(data, categories=cat.categories, ordered=dtype_ordered) + tm.assert_categorical_equal(result, expected) + + # non-standard categories + dtype = CategoricalDtype(list("adc"), dtype_ordered) + result = cat.astype(dtype) + expected = Categorical(data, dtype=dtype) + tm.assert_categorical_equal(result, expected) + + if dtype_ordered is False: + # dtype='category' can't specify ordered, so only test once + result = cat.astype("category") + expected = cat + tm.assert_categorical_equal(result, expected) + + def test_astype_object_datetime_categories(self): + # GH#40754 + cat = Categorical(to_datetime(["2021-03-27", NaT])) + result = cat.astype(object) + expected = np.array([Timestamp("2021-03-27 00:00:00"), NaT], dtype="object") + tm.assert_numpy_array_equal(result, expected) + + def test_astype_object_timestamp_categories(self): + # GH#18024 + cat = Categorical([Timestamp("2014-01-01")]) + result = cat.astype(object) + expected = np.array([Timestamp("2014-01-01 00:00:00")], dtype="object") + tm.assert_numpy_array_equal(result, expected) + + def test_astype_category_readonly_mask_values(self): + # GH#53658 + arr = array([0, 1, 2], dtype="Int64") + arr._mask.flags["WRITEABLE"] = False + result = arr.astype("category") + expected = array([0, 1, 2], dtype="Int64").astype("category") + tm.assert_extension_array_equal(result, expected) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/categorical/test_constructors.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/categorical/test_constructors.py new file mode 100644 index 0000000000000000000000000000000000000000..373f1c95463fc43feaaf50d9d984e539628a6b5e --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/categorical/test_constructors.py @@ -0,0 +1,783 @@ +from datetime import ( + date, + datetime, +) + +import numpy as np +import pytest + +from pandas._config import using_pyarrow_string_dtype + +from pandas.core.dtypes.common import ( + is_float_dtype, + is_integer_dtype, +) +from pandas.core.dtypes.dtypes import CategoricalDtype + +import pandas as pd +from pandas import ( + Categorical, + CategoricalIndex, + DatetimeIndex, + Index, + Interval, + IntervalIndex, + MultiIndex, + NaT, + Series, + Timestamp, + date_range, + period_range, + timedelta_range, +) +import pandas._testing as tm + + +class TestCategoricalConstructors: + def test_fastpath_deprecated(self): + codes = np.array([1, 2, 3]) + dtype = CategoricalDtype(categories=["a", "b", "c", "d"], ordered=False) + msg = "The 'fastpath' keyword in Categorical is deprecated" + with tm.assert_produces_warning(DeprecationWarning, match=msg): + Categorical(codes, dtype=dtype, fastpath=True) + + def test_categorical_from_cat_and_dtype_str_preserve_ordered(self): + # GH#49309 we should preserve orderedness in `res` + cat = Categorical([3, 1], categories=[3, 2, 1], ordered=True) + + res = Categorical(cat, dtype="category") + assert res.dtype.ordered + + def test_categorical_disallows_scalar(self): + # GH#38433 + with pytest.raises(TypeError, match="Categorical input must be list-like"): + Categorical("A", categories=["A", "B"]) + + def test_categorical_1d_only(self): + # ndim > 1 + msg = "> 1 ndim Categorical are not supported at this time" + with pytest.raises(NotImplementedError, match=msg): + Categorical(np.array([list("abcd")])) + + def test_validate_ordered(self): + # see gh-14058 + exp_msg = "'ordered' must either be 'True' or 'False'" + exp_err = TypeError + + # This should be a boolean. + ordered = np.array([0, 1, 2]) + + with pytest.raises(exp_err, match=exp_msg): + Categorical([1, 2, 3], ordered=ordered) + + with pytest.raises(exp_err, match=exp_msg): + Categorical.from_codes( + [0, 0, 1], categories=["a", "b", "c"], ordered=ordered + ) + + def test_constructor_empty(self): + # GH 17248 + c = Categorical([]) + expected = Index([]) + tm.assert_index_equal(c.categories, expected) + + c = Categorical([], categories=[1, 2, 3]) + expected = Index([1, 2, 3], dtype=np.int64) + tm.assert_index_equal(c.categories, expected) + + def test_constructor_empty_boolean(self): + # see gh-22702 + cat = Categorical([], categories=[True, False]) + categories = sorted(cat.categories.tolist()) + assert categories == [False, True] + + def test_constructor_tuples(self): + values = np.array([(1,), (1, 2), (1,), (1, 2)], dtype=object) + result = Categorical(values) + expected = Index([(1,), (1, 2)], tupleize_cols=False) + tm.assert_index_equal(result.categories, expected) + assert result.ordered is False + + def test_constructor_tuples_datetimes(self): + # numpy will auto reshape when all of the tuples are the + # same len, so add an extra one with 2 items and slice it off + values = np.array( + [ + (Timestamp("2010-01-01"),), + (Timestamp("2010-01-02"),), + (Timestamp("2010-01-01"),), + (Timestamp("2010-01-02"),), + ("a", "b"), + ], + dtype=object, + )[:-1] + result = Categorical(values) + expected = Index( + [(Timestamp("2010-01-01"),), (Timestamp("2010-01-02"),)], + tupleize_cols=False, + ) + tm.assert_index_equal(result.categories, expected) + + def test_constructor_unsortable(self): + # it works! + arr = np.array([1, 2, 3, datetime.now()], dtype="O") + factor = Categorical(arr, ordered=False) + assert not factor.ordered + + # this however will raise as cannot be sorted + msg = ( + "'values' is not ordered, please explicitly specify the " + "categories order by passing in a categories argument." + ) + with pytest.raises(TypeError, match=msg): + Categorical(arr, ordered=True) + + def test_constructor_interval(self): + result = Categorical( + [Interval(1, 2), Interval(2, 3), Interval(3, 6)], ordered=True + ) + ii = IntervalIndex([Interval(1, 2), Interval(2, 3), Interval(3, 6)]) + exp = Categorical(ii, ordered=True) + tm.assert_categorical_equal(result, exp) + tm.assert_index_equal(result.categories, ii) + + def test_constructor(self): + exp_arr = np.array(["a", "b", "c", "a", "b", "c"], dtype=np.object_) + c1 = Categorical(exp_arr) + tm.assert_numpy_array_equal(c1.__array__(), exp_arr) + c2 = Categorical(exp_arr, categories=["a", "b", "c"]) + tm.assert_numpy_array_equal(c2.__array__(), exp_arr) + c2 = Categorical(exp_arr, categories=["c", "b", "a"]) + tm.assert_numpy_array_equal(c2.__array__(), exp_arr) + + # categories must be unique + msg = "Categorical categories must be unique" + with pytest.raises(ValueError, match=msg): + Categorical([1, 2], [1, 2, 2]) + + with pytest.raises(ValueError, match=msg): + Categorical(["a", "b"], ["a", "b", "b"]) + + # The default should be unordered + c1 = Categorical(["a", "b", "c", "a"]) + assert not c1.ordered + + # Categorical as input + c1 = Categorical(["a", "b", "c", "a"]) + c2 = Categorical(c1) + tm.assert_categorical_equal(c1, c2) + + c1 = Categorical(["a", "b", "c", "a"], categories=["a", "b", "c", "d"]) + c2 = Categorical(c1) + tm.assert_categorical_equal(c1, c2) + + c1 = Categorical(["a", "b", "c", "a"], categories=["a", "c", "b"]) + c2 = Categorical(c1) + tm.assert_categorical_equal(c1, c2) + + c1 = Categorical(["a", "b", "c", "a"], categories=["a", "c", "b"]) + c2 = Categorical(c1, categories=["a", "b", "c"]) + tm.assert_numpy_array_equal(c1.__array__(), c2.__array__()) + tm.assert_index_equal(c2.categories, Index(["a", "b", "c"])) + + # Series of dtype category + c1 = Categorical(["a", "b", "c", "a"], categories=["a", "b", "c", "d"]) + c2 = Categorical(Series(c1)) + tm.assert_categorical_equal(c1, c2) + + c1 = Categorical(["a", "b", "c", "a"], categories=["a", "c", "b"]) + c2 = Categorical(Series(c1)) + tm.assert_categorical_equal(c1, c2) + + # Series + c1 = Categorical(["a", "b", "c", "a"]) + c2 = Categorical(Series(["a", "b", "c", "a"])) + tm.assert_categorical_equal(c1, c2) + + c1 = Categorical(["a", "b", "c", "a"], categories=["a", "b", "c", "d"]) + c2 = Categorical(Series(["a", "b", "c", "a"]), categories=["a", "b", "c", "d"]) + tm.assert_categorical_equal(c1, c2) + + # This should result in integer categories, not float! + cat = Categorical([1, 2, 3, np.nan], categories=[1, 2, 3]) + assert is_integer_dtype(cat.categories) + + # https://github.com/pandas-dev/pandas/issues/3678 + cat = Categorical([np.nan, 1, 2, 3]) + assert is_integer_dtype(cat.categories) + + # this should result in floats + cat = Categorical([np.nan, 1, 2.0, 3]) + assert is_float_dtype(cat.categories) + + cat = Categorical([np.nan, 1.0, 2.0, 3.0]) + assert is_float_dtype(cat.categories) + + # This doesn't work -> this would probably need some kind of "remember + # the original type" feature to try to cast the array interface result + # to... + + # vals = np.asarray(cat[cat.notna()]) + # assert is_integer_dtype(vals) + + # corner cases + cat = Categorical([1]) + assert len(cat.categories) == 1 + assert cat.categories[0] == 1 + assert len(cat.codes) == 1 + assert cat.codes[0] == 0 + + cat = Categorical(["a"]) + assert len(cat.categories) == 1 + assert cat.categories[0] == "a" + assert len(cat.codes) == 1 + assert cat.codes[0] == 0 + + # two arrays + # - when the first is an integer dtype and the second is not + # - when the resulting codes are all -1/NaN + with tm.assert_produces_warning(None): + Categorical([0, 1, 2, 0, 1, 2], categories=["a", "b", "c"]) + + with tm.assert_produces_warning(None): + Categorical([0, 1, 2, 0, 1, 2], categories=[3, 4, 5]) + + # the next one are from the old docs + with tm.assert_produces_warning(None): + Categorical([0, 1, 2, 0, 1, 2], [1, 2, 3]) + cat = Categorical([1, 2], categories=[1, 2, 3]) + + # this is a legitimate constructor + with tm.assert_produces_warning(None): + Categorical(np.array([], dtype="int64"), categories=[3, 2, 1], ordered=True) + + def test_constructor_with_existing_categories(self): + # GH25318: constructing with pd.Series used to bogusly skip recoding + # categories + c0 = Categorical(["a", "b", "c", "a"]) + c1 = Categorical(["a", "b", "c", "a"], categories=["b", "c"]) + + c2 = Categorical(c0, categories=c1.categories) + tm.assert_categorical_equal(c1, c2) + + c3 = Categorical(Series(c0), categories=c1.categories) + tm.assert_categorical_equal(c1, c3) + + def test_constructor_not_sequence(self): + # https://github.com/pandas-dev/pandas/issues/16022 + msg = r"^Parameter 'categories' must be list-like, was" + with pytest.raises(TypeError, match=msg): + Categorical(["a", "b"], categories="a") + + def test_constructor_with_null(self): + # Cannot have NaN in categories + msg = "Categorical categories cannot be null" + with pytest.raises(ValueError, match=msg): + Categorical([np.nan, "a", "b", "c"], categories=[np.nan, "a", "b", "c"]) + + with pytest.raises(ValueError, match=msg): + Categorical([None, "a", "b", "c"], categories=[None, "a", "b", "c"]) + + with pytest.raises(ValueError, match=msg): + Categorical( + DatetimeIndex(["nat", "20160101"]), + categories=[NaT, Timestamp("20160101")], + ) + + def test_constructor_with_index(self): + ci = CategoricalIndex(list("aabbca"), categories=list("cab")) + tm.assert_categorical_equal(ci.values, Categorical(ci)) + + ci = CategoricalIndex(list("aabbca"), categories=list("cab")) + tm.assert_categorical_equal( + ci.values, Categorical(ci.astype(object), categories=ci.categories) + ) + + def test_constructor_with_generator(self): + # This was raising an Error in isna(single_val).any() because isna + # returned a scalar for a generator + + exp = Categorical([0, 1, 2]) + cat = Categorical(x for x in [0, 1, 2]) + tm.assert_categorical_equal(cat, exp) + cat = Categorical(range(3)) + tm.assert_categorical_equal(cat, exp) + + MultiIndex.from_product([range(5), ["a", "b", "c"]]) + + # check that categories accept generators and sequences + cat = Categorical([0, 1, 2], categories=(x for x in [0, 1, 2])) + tm.assert_categorical_equal(cat, exp) + cat = Categorical([0, 1, 2], categories=range(3)) + tm.assert_categorical_equal(cat, exp) + + def test_constructor_with_rangeindex(self): + # RangeIndex is preserved in Categories + rng = Index(range(3)) + + cat = Categorical(rng) + tm.assert_index_equal(cat.categories, rng, exact=True) + + cat = Categorical([1, 2, 0], categories=rng) + tm.assert_index_equal(cat.categories, rng, exact=True) + + @pytest.mark.parametrize( + "dtl", + [ + date_range("1995-01-01 00:00:00", periods=5, freq="s"), + date_range("1995-01-01 00:00:00", periods=5, freq="s", tz="US/Eastern"), + timedelta_range("1 day", periods=5, freq="s"), + ], + ) + def test_constructor_with_datetimelike(self, dtl): + # see gh-12077 + # constructor with a datetimelike and NaT + + s = Series(dtl) + c = Categorical(s) + + expected = type(dtl)(s) + expected._data.freq = None + + tm.assert_index_equal(c.categories, expected) + tm.assert_numpy_array_equal(c.codes, np.arange(5, dtype="int8")) + + # with NaT + s2 = s.copy() + s2.iloc[-1] = NaT + c = Categorical(s2) + + expected = type(dtl)(s2.dropna()) + expected._data.freq = None + + tm.assert_index_equal(c.categories, expected) + + exp = np.array([0, 1, 2, 3, -1], dtype=np.int8) + tm.assert_numpy_array_equal(c.codes, exp) + + result = repr(c) + assert "NaT" in result + + def test_constructor_from_index_series_datetimetz(self): + idx = date_range("2015-01-01 10:00", freq="D", periods=3, tz="US/Eastern") + idx = idx._with_freq(None) # freq not preserved in result.categories + result = Categorical(idx) + tm.assert_index_equal(result.categories, idx) + + result = Categorical(Series(idx)) + tm.assert_index_equal(result.categories, idx) + + def test_constructor_date_objects(self): + # we dont cast date objects to timestamps, matching Index constructor + v = date.today() + + cat = Categorical([v, v]) + assert cat.categories.dtype == object + assert type(cat.categories[0]) is date + + def test_constructor_from_index_series_timedelta(self): + idx = timedelta_range("1 days", freq="D", periods=3) + idx = idx._with_freq(None) # freq not preserved in result.categories + result = Categorical(idx) + tm.assert_index_equal(result.categories, idx) + + result = Categorical(Series(idx)) + tm.assert_index_equal(result.categories, idx) + + def test_constructor_from_index_series_period(self): + idx = period_range("2015-01-01", freq="D", periods=3) + result = Categorical(idx) + tm.assert_index_equal(result.categories, idx) + + result = Categorical(Series(idx)) + tm.assert_index_equal(result.categories, idx) + + @pytest.mark.parametrize( + "values", + [ + np.array([1.0, 1.2, 1.8, np.nan]), + np.array([1, 2, 3], dtype="int64"), + ["a", "b", "c", np.nan], + [pd.Period("2014-01"), pd.Period("2014-02"), NaT], + [Timestamp("2014-01-01"), Timestamp("2014-01-02"), NaT], + [ + Timestamp("2014-01-01", tz="US/Eastern"), + Timestamp("2014-01-02", tz="US/Eastern"), + NaT, + ], + ], + ) + def test_constructor_invariant(self, values): + # GH 14190 + c = Categorical(values) + c2 = Categorical(c) + tm.assert_categorical_equal(c, c2) + + @pytest.mark.parametrize("ordered", [True, False]) + def test_constructor_with_dtype(self, ordered): + categories = ["b", "a", "c"] + dtype = CategoricalDtype(categories, ordered=ordered) + result = Categorical(["a", "b", "a", "c"], dtype=dtype) + expected = Categorical( + ["a", "b", "a", "c"], categories=categories, ordered=ordered + ) + tm.assert_categorical_equal(result, expected) + assert result.ordered is ordered + + def test_constructor_dtype_and_others_raises(self): + dtype = CategoricalDtype(["a", "b"], ordered=True) + msg = "Cannot specify `categories` or `ordered` together with `dtype`." + with pytest.raises(ValueError, match=msg): + Categorical(["a", "b"], categories=["a", "b"], dtype=dtype) + + with pytest.raises(ValueError, match=msg): + Categorical(["a", "b"], ordered=True, dtype=dtype) + + with pytest.raises(ValueError, match=msg): + Categorical(["a", "b"], ordered=False, dtype=dtype) + + @pytest.mark.parametrize("categories", [None, ["a", "b"], ["a", "c"]]) + @pytest.mark.parametrize("ordered", [True, False]) + def test_constructor_str_category(self, categories, ordered): + result = Categorical( + ["a", "b"], categories=categories, ordered=ordered, dtype="category" + ) + expected = Categorical(["a", "b"], categories=categories, ordered=ordered) + tm.assert_categorical_equal(result, expected) + + def test_constructor_str_unknown(self): + with pytest.raises(ValueError, match="Unknown dtype"): + Categorical([1, 2], dtype="foo") + + @pytest.mark.xfail(using_pyarrow_string_dtype(), reason="Can't be NumPy strings") + def test_constructor_np_strs(self): + # GH#31499 Hashtable.map_locations needs to work on np.str_ objects + cat = Categorical(["1", "0", "1"], [np.str_("0"), np.str_("1")]) + assert all(isinstance(x, np.str_) for x in cat.categories) + + def test_constructor_from_categorical_with_dtype(self): + dtype = CategoricalDtype(["a", "b", "c"], ordered=True) + values = Categorical(["a", "b", "d"]) + result = Categorical(values, dtype=dtype) + # We use dtype.categories, not values.categories + expected = Categorical( + ["a", "b", "d"], categories=["a", "b", "c"], ordered=True + ) + tm.assert_categorical_equal(result, expected) + + def test_constructor_from_categorical_with_unknown_dtype(self): + dtype = CategoricalDtype(None, ordered=True) + values = Categorical(["a", "b", "d"]) + result = Categorical(values, dtype=dtype) + # We use values.categories, not dtype.categories + expected = Categorical( + ["a", "b", "d"], categories=["a", "b", "d"], ordered=True + ) + tm.assert_categorical_equal(result, expected) + + def test_constructor_from_categorical_string(self): + values = Categorical(["a", "b", "d"]) + # use categories, ordered + result = Categorical( + values, categories=["a", "b", "c"], ordered=True, dtype="category" + ) + expected = Categorical( + ["a", "b", "d"], categories=["a", "b", "c"], ordered=True + ) + tm.assert_categorical_equal(result, expected) + + # No string + result = Categorical(values, categories=["a", "b", "c"], ordered=True) + tm.assert_categorical_equal(result, expected) + + def test_constructor_with_categorical_categories(self): + # GH17884 + expected = Categorical(["a", "b"], categories=["a", "b", "c"]) + + result = Categorical(["a", "b"], categories=Categorical(["a", "b", "c"])) + tm.assert_categorical_equal(result, expected) + + result = Categorical(["a", "b"], categories=CategoricalIndex(["a", "b", "c"])) + tm.assert_categorical_equal(result, expected) + + @pytest.mark.parametrize("klass", [lambda x: np.array(x, dtype=object), list]) + def test_construction_with_null(self, klass, nulls_fixture): + # https://github.com/pandas-dev/pandas/issues/31927 + values = klass(["a", nulls_fixture, "b"]) + result = Categorical(values) + + dtype = CategoricalDtype(["a", "b"]) + codes = [0, -1, 1] + expected = Categorical.from_codes(codes=codes, dtype=dtype) + + tm.assert_categorical_equal(result, expected) + + @pytest.mark.parametrize("validate", [True, False]) + def test_from_codes_nullable_int_categories(self, any_numeric_ea_dtype, validate): + # GH#39649 + cats = pd.array(range(5), dtype=any_numeric_ea_dtype) + codes = np.random.default_rng(2).integers(5, size=3) + dtype = CategoricalDtype(cats) + arr = Categorical.from_codes(codes, dtype=dtype, validate=validate) + assert arr.categories.dtype == cats.dtype + tm.assert_index_equal(arr.categories, Index(cats)) + + def test_from_codes_empty(self): + cat = ["a", "b", "c"] + result = Categorical.from_codes([], categories=cat) + expected = Categorical([], categories=cat) + + tm.assert_categorical_equal(result, expected) + + @pytest.mark.parametrize("validate", [True, False]) + def test_from_codes_validate(self, validate): + # GH53122 + dtype = CategoricalDtype(["a", "b"]) + if validate: + with pytest.raises(ValueError, match="codes need to be between "): + Categorical.from_codes([4, 5], dtype=dtype, validate=validate) + else: + # passes, though has incorrect codes, but that's the user responsibility + Categorical.from_codes([4, 5], dtype=dtype, validate=validate) + + def test_from_codes_too_few_categories(self): + dtype = CategoricalDtype(categories=[1, 2]) + msg = "codes need to be between " + with pytest.raises(ValueError, match=msg): + Categorical.from_codes([1, 2], categories=dtype.categories) + with pytest.raises(ValueError, match=msg): + Categorical.from_codes([1, 2], dtype=dtype) + + def test_from_codes_non_int_codes(self): + dtype = CategoricalDtype(categories=[1, 2]) + msg = "codes need to be array-like integers" + with pytest.raises(ValueError, match=msg): + Categorical.from_codes(["a"], categories=dtype.categories) + with pytest.raises(ValueError, match=msg): + Categorical.from_codes(["a"], dtype=dtype) + + def test_from_codes_non_unique_categories(self): + with pytest.raises(ValueError, match="Categorical categories must be unique"): + Categorical.from_codes([0, 1, 2], categories=["a", "a", "b"]) + + def test_from_codes_nan_cat_included(self): + with pytest.raises(ValueError, match="Categorical categories cannot be null"): + Categorical.from_codes([0, 1, 2], categories=["a", "b", np.nan]) + + def test_from_codes_too_negative(self): + dtype = CategoricalDtype(categories=["a", "b", "c"]) + msg = r"codes need to be between -1 and len\(categories\)-1" + with pytest.raises(ValueError, match=msg): + Categorical.from_codes([-2, 1, 2], categories=dtype.categories) + with pytest.raises(ValueError, match=msg): + Categorical.from_codes([-2, 1, 2], dtype=dtype) + + def test_from_codes(self): + dtype = CategoricalDtype(categories=["a", "b", "c"]) + exp = Categorical(["a", "b", "c"], ordered=False) + res = Categorical.from_codes([0, 1, 2], categories=dtype.categories) + tm.assert_categorical_equal(exp, res) + + res = Categorical.from_codes([0, 1, 2], dtype=dtype) + tm.assert_categorical_equal(exp, res) + + @pytest.mark.parametrize("klass", [Categorical, CategoricalIndex]) + def test_from_codes_with_categorical_categories(self, klass): + # GH17884 + expected = Categorical(["a", "b"], categories=["a", "b", "c"]) + + result = Categorical.from_codes([0, 1], categories=klass(["a", "b", "c"])) + tm.assert_categorical_equal(result, expected) + + @pytest.mark.parametrize("klass", [Categorical, CategoricalIndex]) + def test_from_codes_with_non_unique_categorical_categories(self, klass): + with pytest.raises(ValueError, match="Categorical categories must be unique"): + Categorical.from_codes([0, 1], klass(["a", "b", "a"])) + + def test_from_codes_with_nan_code(self): + # GH21767 + codes = [1, 2, np.nan] + dtype = CategoricalDtype(categories=["a", "b", "c"]) + with pytest.raises(ValueError, match="codes need to be array-like integers"): + Categorical.from_codes(codes, categories=dtype.categories) + with pytest.raises(ValueError, match="codes need to be array-like integers"): + Categorical.from_codes(codes, dtype=dtype) + + @pytest.mark.parametrize("codes", [[1.0, 2.0, 0], [1.1, 2.0, 0]]) + def test_from_codes_with_float(self, codes): + # GH21767 + # float codes should raise even if values are equal to integers + dtype = CategoricalDtype(categories=["a", "b", "c"]) + + msg = "codes need to be array-like integers" + with pytest.raises(ValueError, match=msg): + Categorical.from_codes(codes, dtype.categories) + with pytest.raises(ValueError, match=msg): + Categorical.from_codes(codes, dtype=dtype) + + def test_from_codes_with_dtype_raises(self): + msg = "Cannot specify" + with pytest.raises(ValueError, match=msg): + Categorical.from_codes( + [0, 1], categories=["a", "b"], dtype=CategoricalDtype(["a", "b"]) + ) + + with pytest.raises(ValueError, match=msg): + Categorical.from_codes( + [0, 1], ordered=True, dtype=CategoricalDtype(["a", "b"]) + ) + + def test_from_codes_neither(self): + msg = "Both were None" + with pytest.raises(ValueError, match=msg): + Categorical.from_codes([0, 1]) + + def test_from_codes_with_nullable_int(self): + codes = pd.array([0, 1], dtype="Int64") + categories = ["a", "b"] + + result = Categorical.from_codes(codes, categories=categories) + expected = Categorical.from_codes(codes.to_numpy(int), categories=categories) + + tm.assert_categorical_equal(result, expected) + + def test_from_codes_with_nullable_int_na_raises(self): + codes = pd.array([0, None], dtype="Int64") + categories = ["a", "b"] + + msg = "codes cannot contain NA values" + with pytest.raises(ValueError, match=msg): + Categorical.from_codes(codes, categories=categories) + + @pytest.mark.parametrize("dtype", [None, "category"]) + def test_from_inferred_categories(self, dtype): + cats = ["a", "b"] + codes = np.array([0, 0, 1, 1], dtype="i8") + result = Categorical._from_inferred_categories(cats, codes, dtype) + expected = Categorical.from_codes(codes, cats) + tm.assert_categorical_equal(result, expected) + + @pytest.mark.parametrize("dtype", [None, "category"]) + def test_from_inferred_categories_sorts(self, dtype): + cats = ["b", "a"] + codes = np.array([0, 1, 1, 1], dtype="i8") + result = Categorical._from_inferred_categories(cats, codes, dtype) + expected = Categorical.from_codes([1, 0, 0, 0], ["a", "b"]) + tm.assert_categorical_equal(result, expected) + + def test_from_inferred_categories_dtype(self): + cats = ["a", "b", "d"] + codes = np.array([0, 1, 0, 2], dtype="i8") + dtype = CategoricalDtype(["c", "b", "a"], ordered=True) + result = Categorical._from_inferred_categories(cats, codes, dtype) + expected = Categorical( + ["a", "b", "a", "d"], categories=["c", "b", "a"], ordered=True + ) + tm.assert_categorical_equal(result, expected) + + def test_from_inferred_categories_coerces(self): + cats = ["1", "2", "bad"] + codes = np.array([0, 0, 1, 2], dtype="i8") + dtype = CategoricalDtype([1, 2]) + result = Categorical._from_inferred_categories(cats, codes, dtype) + expected = Categorical([1, 1, 2, np.nan]) + tm.assert_categorical_equal(result, expected) + + @pytest.mark.parametrize("ordered", [None, True, False]) + def test_construction_with_ordered(self, ordered): + # GH 9347, 9190 + cat = Categorical([0, 1, 2], ordered=ordered) + assert cat.ordered == bool(ordered) + + def test_constructor_imaginary(self): + values = [1, 2, 3 + 1j] + c1 = Categorical(values) + tm.assert_index_equal(c1.categories, Index(values)) + tm.assert_numpy_array_equal(np.array(c1), np.array(values)) + + def test_constructor_string_and_tuples(self): + # GH 21416 + c = Categorical(np.array(["c", ("a", "b"), ("b", "a"), "c"], dtype=object)) + expected_index = Index([("a", "b"), ("b", "a"), "c"]) + assert c.categories.equals(expected_index) + + def test_interval(self): + idx = pd.interval_range(0, 10, periods=10) + cat = Categorical(idx, categories=idx) + expected_codes = np.arange(10, dtype="int8") + tm.assert_numpy_array_equal(cat.codes, expected_codes) + tm.assert_index_equal(cat.categories, idx) + + # infer categories + cat = Categorical(idx) + tm.assert_numpy_array_equal(cat.codes, expected_codes) + tm.assert_index_equal(cat.categories, idx) + + # list values + cat = Categorical(list(idx)) + tm.assert_numpy_array_equal(cat.codes, expected_codes) + tm.assert_index_equal(cat.categories, idx) + + # list values, categories + cat = Categorical(list(idx), categories=list(idx)) + tm.assert_numpy_array_equal(cat.codes, expected_codes) + tm.assert_index_equal(cat.categories, idx) + + # shuffled + values = idx.take([1, 2, 0]) + cat = Categorical(values, categories=idx) + tm.assert_numpy_array_equal(cat.codes, np.array([1, 2, 0], dtype="int8")) + tm.assert_index_equal(cat.categories, idx) + + # extra + values = pd.interval_range(8, 11, periods=3) + cat = Categorical(values, categories=idx) + expected_codes = np.array([8, 9, -1], dtype="int8") + tm.assert_numpy_array_equal(cat.codes, expected_codes) + tm.assert_index_equal(cat.categories, idx) + + # overlapping + idx = IntervalIndex([Interval(0, 2), Interval(0, 1)]) + cat = Categorical(idx, categories=idx) + expected_codes = np.array([0, 1], dtype="int8") + tm.assert_numpy_array_equal(cat.codes, expected_codes) + tm.assert_index_equal(cat.categories, idx) + + def test_categorical_extension_array_nullable(self, nulls_fixture): + # GH: + arr = pd.arrays.StringArray._from_sequence( + [nulls_fixture] * 2, dtype=pd.StringDtype() + ) + result = Categorical(arr) + assert arr.dtype == result.categories.dtype + expected = Categorical(Series([pd.NA, pd.NA], dtype=arr.dtype)) + tm.assert_categorical_equal(result, expected) + + def test_from_sequence_copy(self): + cat = Categorical(np.arange(5).repeat(2)) + result = Categorical._from_sequence(cat, dtype=cat.dtype, copy=False) + + # more generally, we'd be OK with a view + assert result._codes is cat._codes + + result = Categorical._from_sequence(cat, dtype=cat.dtype, copy=True) + + assert not tm.shares_memory(result, cat) + + def test_constructor_datetime64_non_nano(self): + categories = np.arange(10).view("M8[D]") + values = categories[::2].copy() + + cat = Categorical(values, categories=categories) + assert (cat == values).all() + + def test_constructor_preserves_freq(self): + # GH33830 freq retention in categorical + dti = date_range("2016-01-01", periods=5) + + expected = dti.freq + + cat = Categorical(dti) + result = cat.categories.freq + + assert expected == result diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/categorical/test_dtypes.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/categorical/test_dtypes.py new file mode 100644 index 0000000000000000000000000000000000000000..525663cad1745880bc5e683e7302afdc2c06a527 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/categorical/test_dtypes.py @@ -0,0 +1,139 @@ +import numpy as np +import pytest + +from pandas.core.dtypes.dtypes import CategoricalDtype + +from pandas import ( + Categorical, + CategoricalIndex, + Index, + IntervalIndex, + Series, + Timestamp, +) +import pandas._testing as tm + + +class TestCategoricalDtypes: + def test_categories_match_up_to_permutation(self): + # test dtype comparisons between cats + + c1 = Categorical(list("aabca"), categories=list("abc"), ordered=False) + c2 = Categorical(list("aabca"), categories=list("cab"), ordered=False) + c3 = Categorical(list("aabca"), categories=list("cab"), ordered=True) + assert c1._categories_match_up_to_permutation(c1) + assert c2._categories_match_up_to_permutation(c2) + assert c3._categories_match_up_to_permutation(c3) + assert c1._categories_match_up_to_permutation(c2) + assert not c1._categories_match_up_to_permutation(c3) + assert not c1._categories_match_up_to_permutation(Index(list("aabca"))) + assert not c1._categories_match_up_to_permutation(c1.astype(object)) + assert c1._categories_match_up_to_permutation(CategoricalIndex(c1)) + assert c1._categories_match_up_to_permutation( + CategoricalIndex(c1, categories=list("cab")) + ) + assert not c1._categories_match_up_to_permutation( + CategoricalIndex(c1, ordered=True) + ) + + # GH 16659 + s1 = Series(c1) + s2 = Series(c2) + s3 = Series(c3) + assert c1._categories_match_up_to_permutation(s1) + assert c2._categories_match_up_to_permutation(s2) + assert c3._categories_match_up_to_permutation(s3) + assert c1._categories_match_up_to_permutation(s2) + assert not c1._categories_match_up_to_permutation(s3) + assert not c1._categories_match_up_to_permutation(s1.astype(object)) + + def test_set_dtype_same(self): + c = Categorical(["a", "b", "c"]) + result = c._set_dtype(CategoricalDtype(["a", "b", "c"])) + tm.assert_categorical_equal(result, c) + + def test_set_dtype_new_categories(self): + c = Categorical(["a", "b", "c"]) + result = c._set_dtype(CategoricalDtype(list("abcd"))) + tm.assert_numpy_array_equal(result.codes, c.codes) + tm.assert_index_equal(result.dtype.categories, Index(list("abcd"))) + + @pytest.mark.parametrize( + "values, categories, new_categories", + [ + # No NaNs, same cats, same order + (["a", "b", "a"], ["a", "b"], ["a", "b"]), + # No NaNs, same cats, different order + (["a", "b", "a"], ["a", "b"], ["b", "a"]), + # Same, unsorted + (["b", "a", "a"], ["a", "b"], ["a", "b"]), + # No NaNs, same cats, different order + (["b", "a", "a"], ["a", "b"], ["b", "a"]), + # NaNs + (["a", "b", "c"], ["a", "b"], ["a", "b"]), + (["a", "b", "c"], ["a", "b"], ["b", "a"]), + (["b", "a", "c"], ["a", "b"], ["a", "b"]), + (["b", "a", "c"], ["a", "b"], ["a", "b"]), + # Introduce NaNs + (["a", "b", "c"], ["a", "b"], ["a"]), + (["a", "b", "c"], ["a", "b"], ["b"]), + (["b", "a", "c"], ["a", "b"], ["a"]), + (["b", "a", "c"], ["a", "b"], ["a"]), + # No overlap + (["a", "b", "c"], ["a", "b"], ["d", "e"]), + ], + ) + @pytest.mark.parametrize("ordered", [True, False]) + def test_set_dtype_many(self, values, categories, new_categories, ordered): + c = Categorical(values, categories) + expected = Categorical(values, new_categories, ordered) + result = c._set_dtype(expected.dtype) + tm.assert_categorical_equal(result, expected) + + def test_set_dtype_no_overlap(self): + c = Categorical(["a", "b", "c"], ["d", "e"]) + result = c._set_dtype(CategoricalDtype(["a", "b"])) + expected = Categorical([None, None, None], categories=["a", "b"]) + tm.assert_categorical_equal(result, expected) + + def test_codes_dtypes(self): + # GH 8453 + result = Categorical(["foo", "bar", "baz"]) + assert result.codes.dtype == "int8" + + result = Categorical([f"foo{i:05d}" for i in range(400)]) + assert result.codes.dtype == "int16" + + result = Categorical([f"foo{i:05d}" for i in range(40000)]) + assert result.codes.dtype == "int32" + + # adding cats + result = Categorical(["foo", "bar", "baz"]) + assert result.codes.dtype == "int8" + result = result.add_categories([f"foo{i:05d}" for i in range(400)]) + assert result.codes.dtype == "int16" + + # removing cats + result = result.remove_categories([f"foo{i:05d}" for i in range(300)]) + assert result.codes.dtype == "int8" + + def test_iter_python_types(self): + # GH-19909 + cat = Categorical([1, 2]) + assert isinstance(next(iter(cat)), int) + assert isinstance(cat.tolist()[0], int) + + def test_iter_python_types_datetime(self): + cat = Categorical([Timestamp("2017-01-01"), Timestamp("2017-01-02")]) + assert isinstance(next(iter(cat)), Timestamp) + assert isinstance(cat.tolist()[0], Timestamp) + + def test_interval_index_category(self): + # GH 38316 + index = IntervalIndex.from_breaks(np.arange(3, dtype="uint64")) + + result = CategoricalIndex(index).dtype.categories + expected = IntervalIndex.from_arrays( + [0, 1], [1, 2], dtype="interval[uint64, right]" + ) + tm.assert_index_equal(result, expected) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/categorical/test_indexing.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/categorical/test_indexing.py new file mode 100644 index 0000000000000000000000000000000000000000..5e1c5c64fa660f501d2b9d77c9181f47e013267f --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/categorical/test_indexing.py @@ -0,0 +1,388 @@ +import math + +import numpy as np +import pytest + +from pandas import ( + NA, + Categorical, + CategoricalIndex, + Index, + Interval, + IntervalIndex, + NaT, + PeriodIndex, + Series, + Timedelta, + Timestamp, +) +import pandas._testing as tm +import pandas.core.common as com + + +class TestCategoricalIndexingWithFactor: + def test_getitem(self): + factor = Categorical(["a", "b", "b", "a", "a", "c", "c", "c"], ordered=True) + assert factor[0] == "a" + assert factor[-1] == "c" + + subf = factor[[0, 1, 2]] + tm.assert_numpy_array_equal(subf._codes, np.array([0, 1, 1], dtype=np.int8)) + + subf = factor[np.asarray(factor) == "c"] + tm.assert_numpy_array_equal(subf._codes, np.array([2, 2, 2], dtype=np.int8)) + + def test_setitem(self): + factor = Categorical(["a", "b", "b", "a", "a", "c", "c", "c"], ordered=True) + # int/positional + c = factor.copy() + c[0] = "b" + assert c[0] == "b" + c[-1] = "a" + assert c[-1] == "a" + + # boolean + c = factor.copy() + indexer = np.zeros(len(c), dtype="bool") + indexer[0] = True + indexer[-1] = True + c[indexer] = "c" + expected = Categorical(["c", "b", "b", "a", "a", "c", "c", "c"], ordered=True) + + tm.assert_categorical_equal(c, expected) + + @pytest.mark.parametrize( + "other", + [Categorical(["b", "a"]), Categorical(["b", "a"], categories=["b", "a"])], + ) + def test_setitem_same_but_unordered(self, other): + # GH-24142 + target = Categorical(["a", "b"], categories=["a", "b"]) + mask = np.array([True, False]) + target[mask] = other[mask] + expected = Categorical(["b", "b"], categories=["a", "b"]) + tm.assert_categorical_equal(target, expected) + + @pytest.mark.parametrize( + "other", + [ + Categorical(["b", "a"], categories=["b", "a", "c"]), + Categorical(["b", "a"], categories=["a", "b", "c"]), + Categorical(["a", "a"], categories=["a"]), + Categorical(["b", "b"], categories=["b"]), + ], + ) + def test_setitem_different_unordered_raises(self, other): + # GH-24142 + target = Categorical(["a", "b"], categories=["a", "b"]) + mask = np.array([True, False]) + msg = "Cannot set a Categorical with another, without identical categories" + with pytest.raises(TypeError, match=msg): + target[mask] = other[mask] + + @pytest.mark.parametrize( + "other", + [ + Categorical(["b", "a"]), + Categorical(["b", "a"], categories=["b", "a"], ordered=True), + Categorical(["b", "a"], categories=["a", "b", "c"], ordered=True), + ], + ) + def test_setitem_same_ordered_raises(self, other): + # Gh-24142 + target = Categorical(["a", "b"], categories=["a", "b"], ordered=True) + mask = np.array([True, False]) + msg = "Cannot set a Categorical with another, without identical categories" + with pytest.raises(TypeError, match=msg): + target[mask] = other[mask] + + def test_setitem_tuple(self): + # GH#20439 + cat = Categorical([(0, 1), (0, 2), (0, 1)]) + + # This should not raise + cat[1] = cat[0] + assert cat[1] == (0, 1) + + def test_setitem_listlike(self): + # GH#9469 + # properly coerce the input indexers + + cat = Categorical( + np.random.default_rng(2).integers(0, 5, size=150000).astype(np.int8) + ).add_categories([-1000]) + indexer = np.array([100000]).astype(np.int64) + cat[indexer] = -1000 + + # we are asserting the code result here + # which maps to the -1000 category + result = cat.codes[np.array([100000]).astype(np.int64)] + tm.assert_numpy_array_equal(result, np.array([5], dtype="int8")) + + +class TestCategoricalIndexing: + def test_getitem_slice(self): + cat = Categorical(["a", "b", "c", "d", "a", "b", "c"]) + sliced = cat[3] + assert sliced == "d" + + sliced = cat[3:5] + expected = Categorical(["d", "a"], categories=["a", "b", "c", "d"]) + tm.assert_categorical_equal(sliced, expected) + + def test_getitem_listlike(self): + # GH 9469 + # properly coerce the input indexers + + c = Categorical( + np.random.default_rng(2).integers(0, 5, size=150000).astype(np.int8) + ) + result = c.codes[np.array([100000]).astype(np.int64)] + expected = c[np.array([100000]).astype(np.int64)].codes + tm.assert_numpy_array_equal(result, expected) + + def test_periodindex(self): + idx1 = PeriodIndex( + ["2014-01", "2014-01", "2014-02", "2014-02", "2014-03", "2014-03"], + freq="M", + ) + + cat1 = Categorical(idx1) + str(cat1) + exp_arr = np.array([0, 0, 1, 1, 2, 2], dtype=np.int8) + exp_idx = PeriodIndex(["2014-01", "2014-02", "2014-03"], freq="M") + tm.assert_numpy_array_equal(cat1._codes, exp_arr) + tm.assert_index_equal(cat1.categories, exp_idx) + + idx2 = PeriodIndex( + ["2014-03", "2014-03", "2014-02", "2014-01", "2014-03", "2014-01"], + freq="M", + ) + cat2 = Categorical(idx2, ordered=True) + str(cat2) + exp_arr = np.array([2, 2, 1, 0, 2, 0], dtype=np.int8) + exp_idx2 = PeriodIndex(["2014-01", "2014-02", "2014-03"], freq="M") + tm.assert_numpy_array_equal(cat2._codes, exp_arr) + tm.assert_index_equal(cat2.categories, exp_idx2) + + idx3 = PeriodIndex( + [ + "2013-12", + "2013-11", + "2013-10", + "2013-09", + "2013-08", + "2013-07", + "2013-05", + ], + freq="M", + ) + cat3 = Categorical(idx3, ordered=True) + exp_arr = np.array([6, 5, 4, 3, 2, 1, 0], dtype=np.int8) + exp_idx = PeriodIndex( + [ + "2013-05", + "2013-07", + "2013-08", + "2013-09", + "2013-10", + "2013-11", + "2013-12", + ], + freq="M", + ) + tm.assert_numpy_array_equal(cat3._codes, exp_arr) + tm.assert_index_equal(cat3.categories, exp_idx) + + @pytest.mark.parametrize( + "null_val", + [None, np.nan, NaT, NA, math.nan, "NaT", "nat", "NAT", "nan", "NaN", "NAN"], + ) + def test_periodindex_on_null_types(self, null_val): + # GH 46673 + result = PeriodIndex(["2022-04-06", "2022-04-07", null_val], freq="D") + expected = PeriodIndex(["2022-04-06", "2022-04-07", "NaT"], dtype="period[D]") + assert result[2] is NaT + tm.assert_index_equal(result, expected) + + @pytest.mark.parametrize("new_categories", [[1, 2, 3, 4], [1, 2]]) + def test_categories_assignments_wrong_length_raises(self, new_categories): + cat = Categorical(["a", "b", "c", "a"]) + msg = ( + "new categories need to have the same number of items " + "as the old categories!" + ) + with pytest.raises(ValueError, match=msg): + cat.rename_categories(new_categories) + + # Combinations of sorted/unique: + @pytest.mark.parametrize( + "idx_values", [[1, 2, 3, 4], [1, 3, 2, 4], [1, 3, 3, 4], [1, 2, 2, 4]] + ) + # Combinations of missing/unique + @pytest.mark.parametrize("key_values", [[1, 2], [1, 5], [1, 1], [5, 5]]) + @pytest.mark.parametrize("key_class", [Categorical, CategoricalIndex]) + @pytest.mark.parametrize("dtype", [None, "category", "key"]) + def test_get_indexer_non_unique(self, idx_values, key_values, key_class, dtype): + # GH 21448 + key = key_class(key_values, categories=range(1, 5)) + + if dtype == "key": + dtype = key.dtype + + # Test for flat index and CategoricalIndex with same/different cats: + idx = Index(idx_values, dtype=dtype) + expected, exp_miss = idx.get_indexer_non_unique(key_values) + result, res_miss = idx.get_indexer_non_unique(key) + + tm.assert_numpy_array_equal(expected, result) + tm.assert_numpy_array_equal(exp_miss, res_miss) + + exp_unique = idx.unique().get_indexer(key_values) + res_unique = idx.unique().get_indexer(key) + tm.assert_numpy_array_equal(res_unique, exp_unique) + + def test_where_unobserved_nan(self): + ser = Series(Categorical(["a", "b"])) + result = ser.where([True, False]) + expected = Series(Categorical(["a", None], categories=["a", "b"])) + tm.assert_series_equal(result, expected) + + # all NA + ser = Series(Categorical(["a", "b"])) + result = ser.where([False, False]) + expected = Series(Categorical([None, None], categories=["a", "b"])) + tm.assert_series_equal(result, expected) + + def test_where_unobserved_categories(self): + ser = Series(Categorical(["a", "b", "c"], categories=["d", "c", "b", "a"])) + result = ser.where([True, True, False], other="b") + expected = Series(Categorical(["a", "b", "b"], categories=ser.cat.categories)) + tm.assert_series_equal(result, expected) + + def test_where_other_categorical(self): + ser = Series(Categorical(["a", "b", "c"], categories=["d", "c", "b", "a"])) + other = Categorical(["b", "c", "a"], categories=["a", "c", "b", "d"]) + result = ser.where([True, False, True], other) + expected = Series(Categorical(["a", "c", "c"], dtype=ser.dtype)) + tm.assert_series_equal(result, expected) + + def test_where_new_category_raises(self): + ser = Series(Categorical(["a", "b", "c"])) + msg = "Cannot setitem on a Categorical with a new category" + with pytest.raises(TypeError, match=msg): + ser.where([True, False, True], "d") + + def test_where_ordered_differs_rasies(self): + ser = Series( + Categorical(["a", "b", "c"], categories=["d", "c", "b", "a"], ordered=True) + ) + other = Categorical( + ["b", "c", "a"], categories=["a", "c", "b", "d"], ordered=True + ) + with pytest.raises(TypeError, match="without identical categories"): + ser.where([True, False, True], other) + + +class TestContains: + def test_contains(self): + # GH#21508 + cat = Categorical(list("aabbca"), categories=list("cab")) + + assert "b" in cat + assert "z" not in cat + assert np.nan not in cat + with pytest.raises(TypeError, match="unhashable type: 'list'"): + assert [1] in cat + + # assert codes NOT in index + assert 0 not in cat + assert 1 not in cat + + cat = Categorical(list("aabbca") + [np.nan], categories=list("cab")) + assert np.nan in cat + + @pytest.mark.parametrize( + "item, expected", + [ + (Interval(0, 1), True), + (1.5, True), + (Interval(0.5, 1.5), False), + ("a", False), + (Timestamp(1), False), + (Timedelta(1), False), + ], + ids=str, + ) + def test_contains_interval(self, item, expected): + # GH#23705 + cat = Categorical(IntervalIndex.from_breaks(range(3))) + result = item in cat + assert result is expected + + def test_contains_list(self): + # GH#21729 + cat = Categorical([1, 2, 3]) + + assert "a" not in cat + + with pytest.raises(TypeError, match="unhashable type"): + ["a"] in cat + + with pytest.raises(TypeError, match="unhashable type"): + ["a", "b"] in cat + + +@pytest.mark.parametrize("index", [True, False]) +def test_mask_with_boolean(index): + ser = Series(range(3)) + idx = Categorical([True, False, True]) + if index: + idx = CategoricalIndex(idx) + + assert com.is_bool_indexer(idx) + result = ser[idx] + expected = ser[idx.astype("object")] + tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize("index", [True, False]) +def test_mask_with_boolean_na_treated_as_false(index): + # https://github.com/pandas-dev/pandas/issues/31503 + ser = Series(range(3)) + idx = Categorical([True, False, None]) + if index: + idx = CategoricalIndex(idx) + + result = ser[idx] + expected = ser[idx.fillna(False)] + + tm.assert_series_equal(result, expected) + + +@pytest.fixture +def non_coercible_categorical(monkeypatch): + """ + Monkeypatch Categorical.__array__ to ensure no implicit conversion. + + Raises + ------ + ValueError + When Categorical.__array__ is called. + """ + + # TODO(Categorical): identify other places where this may be + # useful and move to a conftest.py + def array(self, dtype=None): + raise ValueError("I cannot be converted.") + + with monkeypatch.context() as m: + m.setattr(Categorical, "__array__", array) + yield + + +def test_series_at(): + arr = Categorical(["a", "b", "c"]) + ser = Series(arr) + result = ser.at[0] + assert result == "a" diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/categorical/test_map.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/categorical/test_map.py new file mode 100644 index 0000000000000000000000000000000000000000..3d41b7cc7094d237fa8d31501ce90a99b04fe4e6 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/categorical/test_map.py @@ -0,0 +1,154 @@ +import numpy as np +import pytest + +import pandas as pd +from pandas import ( + Categorical, + Index, + Series, +) +import pandas._testing as tm + + +@pytest.fixture(params=[None, "ignore"]) +def na_action(request): + return request.param + + +@pytest.mark.parametrize( + "data, categories", + [ + (list("abcbca"), list("cab")), + (pd.interval_range(0, 3).repeat(3), pd.interval_range(0, 3)), + ], + ids=["string", "interval"], +) +def test_map_str(data, categories, ordered, na_action): + # GH 31202 - override base class since we want to maintain categorical/ordered + cat = Categorical(data, categories=categories, ordered=ordered) + result = cat.map(str, na_action=na_action) + expected = Categorical( + map(str, data), categories=map(str, categories), ordered=ordered + ) + tm.assert_categorical_equal(result, expected) + + +def test_map(na_action): + cat = Categorical(list("ABABC"), categories=list("CBA"), ordered=True) + result = cat.map(lambda x: x.lower(), na_action=na_action) + exp = Categorical(list("ababc"), categories=list("cba"), ordered=True) + tm.assert_categorical_equal(result, exp) + + cat = Categorical(list("ABABC"), categories=list("BAC"), ordered=False) + result = cat.map(lambda x: x.lower(), na_action=na_action) + exp = Categorical(list("ababc"), categories=list("bac"), ordered=False) + tm.assert_categorical_equal(result, exp) + + # GH 12766: Return an index not an array + result = cat.map(lambda x: 1, na_action=na_action) + exp = Index(np.array([1] * 5, dtype=np.int64)) + tm.assert_index_equal(result, exp) + + # change categories dtype + cat = Categorical(list("ABABC"), categories=list("BAC"), ordered=False) + + def f(x): + return {"A": 10, "B": 20, "C": 30}.get(x) + + result = cat.map(f, na_action=na_action) + exp = Categorical([10, 20, 10, 20, 30], categories=[20, 10, 30], ordered=False) + tm.assert_categorical_equal(result, exp) + + mapper = Series([10, 20, 30], index=["A", "B", "C"]) + result = cat.map(mapper, na_action=na_action) + tm.assert_categorical_equal(result, exp) + + result = cat.map({"A": 10, "B": 20, "C": 30}, na_action=na_action) + tm.assert_categorical_equal(result, exp) + + +@pytest.mark.parametrize( + ("data", "f", "expected"), + ( + ([1, 1, np.nan], pd.isna, Index([False, False, True])), + ([1, 2, np.nan], pd.isna, Index([False, False, True])), + ([1, 1, np.nan], {1: False}, Categorical([False, False, np.nan])), + ([1, 2, np.nan], {1: False, 2: False}, Index([False, False, np.nan])), + ( + [1, 1, np.nan], + Series([False, False]), + Categorical([False, False, np.nan]), + ), + ( + [1, 2, np.nan], + Series([False] * 3), + Index([False, False, np.nan]), + ), + ), +) +def test_map_with_nan_none(data, f, expected): # GH 24241 + values = Categorical(data) + result = values.map(f, na_action=None) + if isinstance(expected, Categorical): + tm.assert_categorical_equal(result, expected) + else: + tm.assert_index_equal(result, expected) + + +@pytest.mark.parametrize( + ("data", "f", "expected"), + ( + ([1, 1, np.nan], pd.isna, Categorical([False, False, np.nan])), + ([1, 2, np.nan], pd.isna, Index([False, False, np.nan])), + ([1, 1, np.nan], {1: False}, Categorical([False, False, np.nan])), + ([1, 2, np.nan], {1: False, 2: False}, Index([False, False, np.nan])), + ( + [1, 1, np.nan], + Series([False, False]), + Categorical([False, False, np.nan]), + ), + ( + [1, 2, np.nan], + Series([False, False, False]), + Index([False, False, np.nan]), + ), + ), +) +def test_map_with_nan_ignore(data, f, expected): # GH 24241 + values = Categorical(data) + result = values.map(f, na_action="ignore") + if data[1] == 1: + tm.assert_categorical_equal(result, expected) + else: + tm.assert_index_equal(result, expected) + + +def test_map_with_dict_or_series(na_action): + orig_values = ["a", "B", 1, "a"] + new_values = ["one", 2, 3.0, "one"] + cat = Categorical(orig_values) + + mapper = Series(new_values[:-1], index=orig_values[:-1]) + result = cat.map(mapper, na_action=na_action) + + # Order of categories in result can be different + expected = Categorical(new_values, categories=[3.0, 2, "one"]) + tm.assert_categorical_equal(result, expected) + + mapper = dict(zip(orig_values[:-1], new_values[:-1])) + result = cat.map(mapper, na_action=na_action) + # Order of categories in result can be different + tm.assert_categorical_equal(result, expected) + + +def test_map_na_action_no_default_deprecated(): + # GH51645 + cat = Categorical(["a", "b", "c"]) + msg = ( + "The default value of 'ignore' for the `na_action` parameter in " + "pandas.Categorical.map is deprecated and will be " + "changed to 'None' in a future version. Please set na_action to the " + "desired value to avoid seeing this warning" + ) + with tm.assert_produces_warning(FutureWarning, match=msg): + cat.map(lambda x: x) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/categorical/test_missing.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/categorical/test_missing.py new file mode 100644 index 0000000000000000000000000000000000000000..0eeb01b74608890daf81fef083adb29e797e57ce --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/categorical/test_missing.py @@ -0,0 +1,216 @@ +import collections + +import numpy as np +import pytest + +from pandas.core.dtypes.dtypes import CategoricalDtype + +import pandas as pd +from pandas import ( + Categorical, + DataFrame, + Index, + Series, + isna, +) +import pandas._testing as tm + + +class TestCategoricalMissing: + def test_isna(self): + exp = np.array([False, False, True]) + cat = Categorical(["a", "b", np.nan]) + res = cat.isna() + + tm.assert_numpy_array_equal(res, exp) + + def test_na_flags_int_categories(self): + # #1457 + + categories = list(range(10)) + labels = np.random.default_rng(2).integers(0, 10, 20) + labels[::5] = -1 + + cat = Categorical(labels, categories) + repr(cat) + + tm.assert_numpy_array_equal(isna(cat), labels == -1) + + def test_nan_handling(self): + # Nans are represented as -1 in codes + c = Categorical(["a", "b", np.nan, "a"]) + tm.assert_index_equal(c.categories, Index(["a", "b"])) + tm.assert_numpy_array_equal(c._codes, np.array([0, 1, -1, 0], dtype=np.int8)) + c[1] = np.nan + tm.assert_index_equal(c.categories, Index(["a", "b"])) + tm.assert_numpy_array_equal(c._codes, np.array([0, -1, -1, 0], dtype=np.int8)) + + # Adding nan to categories should make assigned nan point to the + # category! + c = Categorical(["a", "b", np.nan, "a"]) + tm.assert_index_equal(c.categories, Index(["a", "b"])) + tm.assert_numpy_array_equal(c._codes, np.array([0, 1, -1, 0], dtype=np.int8)) + + def test_set_dtype_nans(self): + c = Categorical(["a", "b", np.nan]) + result = c._set_dtype(CategoricalDtype(["a", "c"])) + tm.assert_numpy_array_equal(result.codes, np.array([0, -1, -1], dtype="int8")) + + def test_set_item_nan(self): + cat = Categorical([1, 2, 3]) + cat[1] = np.nan + + exp = Categorical([1, np.nan, 3], categories=[1, 2, 3]) + tm.assert_categorical_equal(cat, exp) + + @pytest.mark.parametrize( + "fillna_kwargs, msg", + [ + ( + {"value": 1, "method": "ffill"}, + "Cannot specify both 'value' and 'method'.", + ), + ({}, "Must specify a fill 'value' or 'method'."), + ({"method": "bad"}, "Invalid fill method. Expecting .* bad"), + ( + {"value": Series([1, 2, 3, 4, "a"])}, + "Cannot setitem on a Categorical with a new category", + ), + ], + ) + def test_fillna_raises(self, fillna_kwargs, msg): + # https://github.com/pandas-dev/pandas/issues/19682 + # https://github.com/pandas-dev/pandas/issues/13628 + cat = Categorical([1, 2, 3, None, None]) + + if len(fillna_kwargs) == 1 and "value" in fillna_kwargs: + err = TypeError + else: + err = ValueError + + with pytest.raises(err, match=msg): + cat.fillna(**fillna_kwargs) + + @pytest.mark.parametrize("named", [True, False]) + def test_fillna_iterable_category(self, named): + # https://github.com/pandas-dev/pandas/issues/21097 + if named: + Point = collections.namedtuple("Point", "x y") + else: + Point = lambda *args: args # tuple + cat = Categorical(np.array([Point(0, 0), Point(0, 1), None], dtype=object)) + result = cat.fillna(Point(0, 0)) + expected = Categorical([Point(0, 0), Point(0, 1), Point(0, 0)]) + + tm.assert_categorical_equal(result, expected) + + # Case where the Point is not among our categories; we want ValueError, + # not NotImplementedError GH#41914 + cat = Categorical(np.array([Point(1, 0), Point(0, 1), None], dtype=object)) + msg = "Cannot setitem on a Categorical with a new category" + with pytest.raises(TypeError, match=msg): + cat.fillna(Point(0, 0)) + + def test_fillna_array(self): + # accept Categorical or ndarray value if it holds appropriate values + cat = Categorical(["A", "B", "C", None, None]) + + other = cat.fillna("C") + result = cat.fillna(other) + tm.assert_categorical_equal(result, other) + assert isna(cat[-1]) # didn't modify original inplace + + other = np.array(["A", "B", "C", "B", "A"]) + result = cat.fillna(other) + expected = Categorical(["A", "B", "C", "B", "A"], dtype=cat.dtype) + tm.assert_categorical_equal(result, expected) + assert isna(cat[-1]) # didn't modify original inplace + + @pytest.mark.parametrize( + "values, expected", + [ + ([1, 2, 3], np.array([False, False, False])), + ([1, 2, np.nan], np.array([False, False, True])), + ([1, 2, np.inf], np.array([False, False, True])), + ([1, 2, pd.NA], np.array([False, False, True])), + ], + ) + def test_use_inf_as_na(self, values, expected): + # https://github.com/pandas-dev/pandas/issues/33594 + msg = "use_inf_as_na option is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + with pd.option_context("mode.use_inf_as_na", True): + cat = Categorical(values) + result = cat.isna() + tm.assert_numpy_array_equal(result, expected) + + result = Series(cat).isna() + expected = Series(expected) + tm.assert_series_equal(result, expected) + + result = DataFrame(cat).isna() + expected = DataFrame(expected) + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize( + "values, expected", + [ + ([1, 2, 3], np.array([False, False, False])), + ([1, 2, np.nan], np.array([False, False, True])), + ([1, 2, np.inf], np.array([False, False, True])), + ([1, 2, pd.NA], np.array([False, False, True])), + ], + ) + def test_use_inf_as_na_outside_context(self, values, expected): + # https://github.com/pandas-dev/pandas/issues/33594 + # Using isna directly for Categorical will fail in general here + cat = Categorical(values) + + msg = "use_inf_as_na option is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + with pd.option_context("mode.use_inf_as_na", True): + result = isna(cat) + tm.assert_numpy_array_equal(result, expected) + + result = isna(Series(cat)) + expected = Series(expected) + tm.assert_series_equal(result, expected) + + result = isna(DataFrame(cat)) + expected = DataFrame(expected) + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize( + "a1, a2, categories", + [ + (["a", "b", "c"], [np.nan, "a", "b"], ["a", "b", "c"]), + ([1, 2, 3], [np.nan, 1, 2], [1, 2, 3]), + ], + ) + def test_compare_categorical_with_missing(self, a1, a2, categories): + # GH 28384 + cat_type = CategoricalDtype(categories) + + # != + result = Series(a1, dtype=cat_type) != Series(a2, dtype=cat_type) + expected = Series(a1) != Series(a2) + tm.assert_series_equal(result, expected) + + # == + result = Series(a1, dtype=cat_type) == Series(a2, dtype=cat_type) + expected = Series(a1) == Series(a2) + tm.assert_series_equal(result, expected) + + @pytest.mark.parametrize( + "na_value, dtype", + [ + (pd.NaT, "datetime64[ns]"), + (None, "float64"), + (np.nan, "float64"), + (pd.NA, "float64"), + ], + ) + def test_categorical_only_missing_values_no_cast(self, na_value, dtype): + # GH#44900 + result = Categorical([na_value, na_value]) + tm.assert_index_equal(result.categories, Index([], dtype=dtype)) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/categorical/test_operators.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/categorical/test_operators.py new file mode 100644 index 0000000000000000000000000000000000000000..4174d2adc810b872e7ec0b1e3ca820e3d2c3920d --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/categorical/test_operators.py @@ -0,0 +1,414 @@ +import numpy as np +import pytest + +import pandas as pd +from pandas import ( + Categorical, + DataFrame, + Series, + Timestamp, + date_range, +) +import pandas._testing as tm + + +class TestCategoricalOpsWithFactor: + def test_categories_none_comparisons(self): + factor = Categorical(["a", "b", "b", "a", "a", "c", "c", "c"], ordered=True) + tm.assert_categorical_equal(factor, factor) + + def test_comparisons(self): + factor = Categorical(["a", "b", "b", "a", "a", "c", "c", "c"], ordered=True) + result = factor[factor == "a"] + expected = factor[np.asarray(factor) == "a"] + tm.assert_categorical_equal(result, expected) + + result = factor[factor != "a"] + expected = factor[np.asarray(factor) != "a"] + tm.assert_categorical_equal(result, expected) + + result = factor[factor < "c"] + expected = factor[np.asarray(factor) < "c"] + tm.assert_categorical_equal(result, expected) + + result = factor[factor > "a"] + expected = factor[np.asarray(factor) > "a"] + tm.assert_categorical_equal(result, expected) + + result = factor[factor >= "b"] + expected = factor[np.asarray(factor) >= "b"] + tm.assert_categorical_equal(result, expected) + + result = factor[factor <= "b"] + expected = factor[np.asarray(factor) <= "b"] + tm.assert_categorical_equal(result, expected) + + n = len(factor) + + other = factor[np.random.default_rng(2).permutation(n)] + result = factor == other + expected = np.asarray(factor) == np.asarray(other) + tm.assert_numpy_array_equal(result, expected) + + result = factor == "d" + expected = np.zeros(len(factor), dtype=bool) + tm.assert_numpy_array_equal(result, expected) + + # comparisons with categoricals + cat_rev = Categorical(["a", "b", "c"], categories=["c", "b", "a"], ordered=True) + cat_rev_base = Categorical( + ["b", "b", "b"], categories=["c", "b", "a"], ordered=True + ) + cat = Categorical(["a", "b", "c"], ordered=True) + cat_base = Categorical(["b", "b", "b"], categories=cat.categories, ordered=True) + + # comparisons need to take categories ordering into account + res_rev = cat_rev > cat_rev_base + exp_rev = np.array([True, False, False]) + tm.assert_numpy_array_equal(res_rev, exp_rev) + + res_rev = cat_rev < cat_rev_base + exp_rev = np.array([False, False, True]) + tm.assert_numpy_array_equal(res_rev, exp_rev) + + res = cat > cat_base + exp = np.array([False, False, True]) + tm.assert_numpy_array_equal(res, exp) + + # Only categories with same categories can be compared + msg = "Categoricals can only be compared if 'categories' are the same" + with pytest.raises(TypeError, match=msg): + cat > cat_rev + + cat_rev_base2 = Categorical(["b", "b", "b"], categories=["c", "b", "a", "d"]) + + with pytest.raises(TypeError, match=msg): + cat_rev > cat_rev_base2 + + # Only categories with same ordering information can be compared + cat_unordered = cat.set_ordered(False) + assert not (cat > cat).any() + + with pytest.raises(TypeError, match=msg): + cat > cat_unordered + + # comparison (in both directions) with Series will raise + s = Series(["b", "b", "b"], dtype=object) + msg = ( + "Cannot compare a Categorical for op __gt__ with type " + r"" + ) + with pytest.raises(TypeError, match=msg): + cat > s + with pytest.raises(TypeError, match=msg): + cat_rev > s + with pytest.raises(TypeError, match=msg): + s < cat + with pytest.raises(TypeError, match=msg): + s < cat_rev + + # comparison with numpy.array will raise in both direction, but only on + # newer numpy versions + a = np.array(["b", "b", "b"], dtype=object) + with pytest.raises(TypeError, match=msg): + cat > a + with pytest.raises(TypeError, match=msg): + cat_rev > a + + # Make sure that unequal comparison take the categories order in + # account + cat_rev = Categorical(list("abc"), categories=list("cba"), ordered=True) + exp = np.array([True, False, False]) + res = cat_rev > "b" + tm.assert_numpy_array_equal(res, exp) + + # check that zero-dim array gets unboxed + res = cat_rev > np.array("b") + tm.assert_numpy_array_equal(res, exp) + + +class TestCategoricalOps: + @pytest.mark.parametrize( + "categories", + [["a", "b"], [0, 1], [Timestamp("2019"), Timestamp("2020")]], + ) + def test_not_equal_with_na(self, categories): + # https://github.com/pandas-dev/pandas/issues/32276 + c1 = Categorical.from_codes([-1, 0], categories=categories) + c2 = Categorical.from_codes([0, 1], categories=categories) + + result = c1 != c2 + + assert result.all() + + def test_compare_frame(self): + # GH#24282 check that Categorical.__cmp__(DataFrame) defers to frame + data = ["a", "b", 2, "a"] + cat = Categorical(data) + + df = DataFrame(cat) + + result = cat == df.T + expected = DataFrame([[True, True, True, True]]) + tm.assert_frame_equal(result, expected) + + result = cat[::-1] != df.T + expected = DataFrame([[False, True, True, False]]) + tm.assert_frame_equal(result, expected) + + def test_compare_frame_raises(self, comparison_op): + # alignment raises unless we transpose + op = comparison_op + cat = Categorical(["a", "b", 2, "a"]) + df = DataFrame(cat) + msg = "Unable to coerce to Series, length must be 1: given 4" + with pytest.raises(ValueError, match=msg): + op(cat, df) + + def test_datetime_categorical_comparison(self): + dt_cat = Categorical(date_range("2014-01-01", periods=3), ordered=True) + tm.assert_numpy_array_equal(dt_cat > dt_cat[0], np.array([False, True, True])) + tm.assert_numpy_array_equal(dt_cat[0] < dt_cat, np.array([False, True, True])) + + def test_reflected_comparison_with_scalars(self): + # GH8658 + cat = Categorical([1, 2, 3], ordered=True) + tm.assert_numpy_array_equal(cat > cat[0], np.array([False, True, True])) + tm.assert_numpy_array_equal(cat[0] < cat, np.array([False, True, True])) + + def test_comparison_with_unknown_scalars(self): + # https://github.com/pandas-dev/pandas/issues/9836#issuecomment-92123057 + # and following comparisons with scalars not in categories should raise + # for unequal comps, but not for equal/not equal + cat = Categorical([1, 2, 3], ordered=True) + + msg = "Invalid comparison between dtype=category and int" + with pytest.raises(TypeError, match=msg): + cat < 4 + with pytest.raises(TypeError, match=msg): + cat > 4 + with pytest.raises(TypeError, match=msg): + 4 < cat + with pytest.raises(TypeError, match=msg): + 4 > cat + + tm.assert_numpy_array_equal(cat == 4, np.array([False, False, False])) + tm.assert_numpy_array_equal(cat != 4, np.array([True, True, True])) + + def test_comparison_with_tuple(self): + cat = Categorical(np.array(["foo", (0, 1), 3, (0, 1)], dtype=object)) + + result = cat == "foo" + expected = np.array([True, False, False, False], dtype=bool) + tm.assert_numpy_array_equal(result, expected) + + result = cat == (0, 1) + expected = np.array([False, True, False, True], dtype=bool) + tm.assert_numpy_array_equal(result, expected) + + result = cat != (0, 1) + tm.assert_numpy_array_equal(result, ~expected) + + @pytest.mark.filterwarnings("ignore::RuntimeWarning") + def test_comparison_of_ordered_categorical_with_nan_to_scalar( + self, compare_operators_no_eq_ne + ): + # https://github.com/pandas-dev/pandas/issues/26504 + # BUG: fix ordered categorical comparison with missing values (#26504 ) + # and following comparisons with scalars in categories with missing + # values should be evaluated as False + + cat = Categorical([1, 2, 3, None], categories=[1, 2, 3], ordered=True) + scalar = 2 + expected = getattr(np.array(cat), compare_operators_no_eq_ne)(scalar) + actual = getattr(cat, compare_operators_no_eq_ne)(scalar) + tm.assert_numpy_array_equal(actual, expected) + + @pytest.mark.filterwarnings("ignore::RuntimeWarning") + def test_comparison_of_ordered_categorical_with_nan_to_listlike( + self, compare_operators_no_eq_ne + ): + # https://github.com/pandas-dev/pandas/issues/26504 + # and following comparisons of missing values in ordered Categorical + # with listlike should be evaluated as False + + cat = Categorical([1, 2, 3, None], categories=[1, 2, 3], ordered=True) + other = Categorical([2, 2, 2, 2], categories=[1, 2, 3], ordered=True) + expected = getattr(np.array(cat), compare_operators_no_eq_ne)(2) + actual = getattr(cat, compare_operators_no_eq_ne)(other) + tm.assert_numpy_array_equal(actual, expected) + + @pytest.mark.parametrize( + "data,reverse,base", + [(list("abc"), list("cba"), list("bbb")), ([1, 2, 3], [3, 2, 1], [2, 2, 2])], + ) + def test_comparisons(self, data, reverse, base): + cat_rev = Series(Categorical(data, categories=reverse, ordered=True)) + cat_rev_base = Series(Categorical(base, categories=reverse, ordered=True)) + cat = Series(Categorical(data, ordered=True)) + cat_base = Series( + Categorical(base, categories=cat.cat.categories, ordered=True) + ) + s = Series(base, dtype=object if base == list("bbb") else None) + a = np.array(base) + + # comparisons need to take categories ordering into account + res_rev = cat_rev > cat_rev_base + exp_rev = Series([True, False, False]) + tm.assert_series_equal(res_rev, exp_rev) + + res_rev = cat_rev < cat_rev_base + exp_rev = Series([False, False, True]) + tm.assert_series_equal(res_rev, exp_rev) + + res = cat > cat_base + exp = Series([False, False, True]) + tm.assert_series_equal(res, exp) + + scalar = base[1] + res = cat > scalar + exp = Series([False, False, True]) + exp2 = cat.values > scalar + tm.assert_series_equal(res, exp) + tm.assert_numpy_array_equal(res.values, exp2) + res_rev = cat_rev > scalar + exp_rev = Series([True, False, False]) + exp_rev2 = cat_rev.values > scalar + tm.assert_series_equal(res_rev, exp_rev) + tm.assert_numpy_array_equal(res_rev.values, exp_rev2) + + # Only categories with same categories can be compared + msg = "Categoricals can only be compared if 'categories' are the same" + with pytest.raises(TypeError, match=msg): + cat > cat_rev + + # categorical cannot be compared to Series or numpy array, and also + # not the other way around + msg = ( + "Cannot compare a Categorical for op __gt__ with type " + r"" + ) + with pytest.raises(TypeError, match=msg): + cat > s + with pytest.raises(TypeError, match=msg): + cat_rev > s + with pytest.raises(TypeError, match=msg): + cat > a + with pytest.raises(TypeError, match=msg): + cat_rev > a + + with pytest.raises(TypeError, match=msg): + s < cat + with pytest.raises(TypeError, match=msg): + s < cat_rev + + with pytest.raises(TypeError, match=msg): + a < cat + with pytest.raises(TypeError, match=msg): + a < cat_rev + + @pytest.mark.parametrize( + "ctor", + [ + lambda *args, **kwargs: Categorical(*args, **kwargs), + lambda *args, **kwargs: Series(Categorical(*args, **kwargs)), + ], + ) + def test_unordered_different_order_equal(self, ctor): + # https://github.com/pandas-dev/pandas/issues/16014 + c1 = ctor(["a", "b"], categories=["a", "b"], ordered=False) + c2 = ctor(["a", "b"], categories=["b", "a"], ordered=False) + assert (c1 == c2).all() + + c1 = ctor(["a", "b"], categories=["a", "b"], ordered=False) + c2 = ctor(["b", "a"], categories=["b", "a"], ordered=False) + assert (c1 != c2).all() + + c1 = ctor(["a", "a"], categories=["a", "b"], ordered=False) + c2 = ctor(["b", "b"], categories=["b", "a"], ordered=False) + assert (c1 != c2).all() + + c1 = ctor(["a", "a"], categories=["a", "b"], ordered=False) + c2 = ctor(["a", "b"], categories=["b", "a"], ordered=False) + result = c1 == c2 + tm.assert_numpy_array_equal(np.array(result), np.array([True, False])) + + def test_unordered_different_categories_raises(self): + c1 = Categorical(["a", "b"], categories=["a", "b"], ordered=False) + c2 = Categorical(["a", "c"], categories=["c", "a"], ordered=False) + + with pytest.raises(TypeError, match=("Categoricals can only be compared")): + c1 == c2 + + def test_compare_different_lengths(self): + c1 = Categorical([], categories=["a", "b"]) + c2 = Categorical([], categories=["a"]) + + msg = "Categoricals can only be compared if 'categories' are the same." + with pytest.raises(TypeError, match=msg): + c1 == c2 + + def test_compare_unordered_different_order(self): + # https://github.com/pandas-dev/pandas/issues/16603#issuecomment- + # 349290078 + a = Categorical(["a"], categories=["a", "b"]) + b = Categorical(["b"], categories=["b", "a"]) + assert not a.equals(b) + + def test_numeric_like_ops(self): + df = DataFrame({"value": np.random.default_rng(2).integers(0, 10000, 100)}) + labels = [f"{i} - {i + 499}" for i in range(0, 10000, 500)] + cat_labels = Categorical(labels, labels) + + df = df.sort_values(by=["value"], ascending=True) + df["value_group"] = pd.cut( + df.value, range(0, 10500, 500), right=False, labels=cat_labels + ) + + # numeric ops should not succeed + for op, str_rep in [ + ("__add__", r"\+"), + ("__sub__", "-"), + ("__mul__", r"\*"), + ("__truediv__", "/"), + ]: + msg = f"Series cannot perform the operation {str_rep}|unsupported operand" + with pytest.raises(TypeError, match=msg): + getattr(df, op)(df) + + # reduction ops should not succeed (unless specifically defined, e.g. + # min/max) + s = df["value_group"] + for op in ["kurt", "skew", "var", "std", "mean", "sum", "median"]: + msg = f"does not support reduction '{op}'" + with pytest.raises(TypeError, match=msg): + getattr(s, op)(numeric_only=False) + + def test_numeric_like_ops_series(self): + # numpy ops + s = Series(Categorical([1, 2, 3, 4])) + with pytest.raises(TypeError, match="does not support reduction 'sum'"): + np.sum(s) + + @pytest.mark.parametrize( + "op, str_rep", + [ + ("__add__", r"\+"), + ("__sub__", "-"), + ("__mul__", r"\*"), + ("__truediv__", "/"), + ], + ) + def test_numeric_like_ops_series_arith(self, op, str_rep): + # numeric ops on a Series + s = Series(Categorical([1, 2, 3, 4])) + msg = f"Series cannot perform the operation {str_rep}|unsupported operand" + with pytest.raises(TypeError, match=msg): + getattr(s, op)(2) + + def test_numeric_like_ops_series_invalid(self): + # invalid ufunc + s = Series(Categorical([1, 2, 3, 4])) + msg = "Object with dtype category cannot perform the numpy op log" + with pytest.raises(TypeError, match=msg): + np.log(s) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/categorical/test_replace.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/categorical/test_replace.py new file mode 100644 index 0000000000000000000000000000000000000000..3c677142846d73f7cfd08c6681ff0d7814b55bd1 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/categorical/test_replace.py @@ -0,0 +1,111 @@ +import pytest + +import pandas as pd +from pandas import Categorical +import pandas._testing as tm + + +@pytest.mark.parametrize( + "to_replace,value,expected,flip_categories", + [ + # one-to-one + (1, 2, [2, 2, 3], False), + (1, 4, [4, 2, 3], False), + (4, 1, [1, 2, 3], False), + (5, 6, [1, 2, 3], False), + # many-to-one + ([1], 2, [2, 2, 3], False), + ([1, 2], 3, [3, 3, 3], False), + ([1, 2], 4, [4, 4, 3], False), + ((1, 2, 4), 5, [5, 5, 3], False), + ((5, 6), 2, [1, 2, 3], False), + ([1], [2], [2, 2, 3], False), + ([1, 4], [5, 2], [5, 2, 3], False), + # GH49404: overlap between to_replace and value + ([1, 2, 3], [2, 3, 4], [2, 3, 4], False), + # GH50872, GH46884: replace with null + (1, None, [None, 2, 3], False), + (1, pd.NA, [None, 2, 3], False), + # check_categorical sorts categories, which crashes on mixed dtypes + (3, "4", [1, 2, "4"], False), + ([1, 2, "3"], "5", ["5", "5", 3], True), + ], +) +@pytest.mark.filterwarnings( + "ignore:.*with CategoricalDtype is deprecated:FutureWarning" +) +def test_replace_categorical_series(to_replace, value, expected, flip_categories): + # GH 31720 + + ser = pd.Series([1, 2, 3], dtype="category") + result = ser.replace(to_replace, value) + expected = pd.Series(expected, dtype="category") + ser.replace(to_replace, value, inplace=True) + + if flip_categories: + expected = expected.cat.set_categories(expected.cat.categories[::-1]) + + tm.assert_series_equal(expected, result, check_category_order=False) + tm.assert_series_equal(expected, ser, check_category_order=False) + + +@pytest.mark.parametrize( + "to_replace, value, result, expected_error_msg", + [ + ("b", "c", ["a", "c"], "Categorical.categories are different"), + ("c", "d", ["a", "b"], None), + # https://github.com/pandas-dev/pandas/issues/33288 + ("a", "a", ["a", "b"], None), + ("b", None, ["a", None], "Categorical.categories length are different"), + ], +) +def test_replace_categorical(to_replace, value, result, expected_error_msg): + # GH#26988 + cat = Categorical(["a", "b"]) + expected = Categorical(result) + msg = ( + r"The behavior of Series\.replace \(and DataFrame.replace\) " + "with CategoricalDtype" + ) + warn = FutureWarning if expected_error_msg is not None else None + with tm.assert_produces_warning(warn, match=msg): + result = pd.Series(cat, copy=False).replace(to_replace, value)._values + + tm.assert_categorical_equal(result, expected) + if to_replace == "b": # the "c" test is supposed to be unchanged + with pytest.raises(AssertionError, match=expected_error_msg): + # ensure non-inplace call does not affect original + tm.assert_categorical_equal(cat, expected) + + ser = pd.Series(cat, copy=False) + with tm.assert_produces_warning(warn, match=msg): + ser.replace(to_replace, value, inplace=True) + tm.assert_categorical_equal(cat, expected) + + +def test_replace_categorical_ea_dtype(): + # GH49404 + cat = Categorical(pd.array(["a", "b"], dtype="string")) + msg = ( + r"The behavior of Series\.replace \(and DataFrame.replace\) " + "with CategoricalDtype" + ) + with tm.assert_produces_warning(FutureWarning, match=msg): + result = pd.Series(cat).replace(["a", "b"], ["c", pd.NA])._values + expected = Categorical(pd.array(["c", pd.NA], dtype="string")) + tm.assert_categorical_equal(result, expected) + + +def test_replace_maintain_ordering(): + # GH51016 + dtype = pd.CategoricalDtype([0, 1, 2], ordered=True) + ser = pd.Series([0, 1, 2], dtype=dtype) + msg = ( + r"The behavior of Series\.replace \(and DataFrame.replace\) " + "with CategoricalDtype" + ) + with tm.assert_produces_warning(FutureWarning, match=msg): + result = ser.replace(0, 2) + expected_dtype = pd.CategoricalDtype([1, 2], ordered=True) + expected = pd.Series([2, 1, 2], dtype=expected_dtype) + tm.assert_series_equal(expected, result, check_category_order=True) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/categorical/test_repr.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/categorical/test_repr.py new file mode 100644 index 0000000000000000000000000000000000000000..ef0315130215cc762e2fad6fc07a97c9f4b94eb8 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/categorical/test_repr.py @@ -0,0 +1,550 @@ +import numpy as np +import pytest + +from pandas._config import using_pyarrow_string_dtype + +from pandas import ( + Categorical, + CategoricalDtype, + CategoricalIndex, + Index, + Series, + date_range, + option_context, + period_range, + timedelta_range, +) + + +class TestCategoricalReprWithFactor: + def test_print(self, using_infer_string): + factor = Categorical(["a", "b", "b", "a", "a", "c", "c", "c"], ordered=True) + if using_infer_string: + expected = [ + "['a', 'b', 'b', 'a', 'a', 'c', 'c', 'c']", + "Categories (3, string): [a < b < c]", + ] + else: + expected = [ + "['a', 'b', 'b', 'a', 'a', 'c', 'c', 'c']", + "Categories (3, object): ['a' < 'b' < 'c']", + ] + expected = "\n".join(expected) + actual = repr(factor) + assert actual == expected + + +class TestCategoricalRepr: + def test_big_print(self): + codes = np.array([0, 1, 2, 0, 1, 2] * 100) + dtype = CategoricalDtype(categories=Index(["a", "b", "c"], dtype=object)) + factor = Categorical.from_codes(codes, dtype=dtype) + expected = [ + "['a', 'b', 'c', 'a', 'b', ..., 'b', 'c', 'a', 'b', 'c']", + "Length: 600", + "Categories (3, object): ['a', 'b', 'c']", + ] + expected = "\n".join(expected) + + actual = repr(factor) + + assert actual == expected + + def test_empty_print(self): + factor = Categorical([], Index(["a", "b", "c"], dtype=object)) + expected = "[], Categories (3, object): ['a', 'b', 'c']" + actual = repr(factor) + assert actual == expected + + assert expected == actual + factor = Categorical([], Index(["a", "b", "c"], dtype=object), ordered=True) + expected = "[], Categories (3, object): ['a' < 'b' < 'c']" + actual = repr(factor) + assert expected == actual + + factor = Categorical([], []) + expected = "[], Categories (0, object): []" + assert expected == repr(factor) + + def test_print_none_width(self): + # GH10087 + a = Series(Categorical([1, 2, 3, 4])) + exp = ( + "0 1\n1 2\n2 3\n3 4\n" + "dtype: category\nCategories (4, int64): [1, 2, 3, 4]" + ) + + with option_context("display.width", None): + assert exp == repr(a) + + @pytest.mark.skipif( + using_pyarrow_string_dtype(), + reason="Change once infer_string is set to True by default", + ) + def test_unicode_print(self): + c = Categorical(["aaaaa", "bb", "cccc"] * 20) + expected = """\ +['aaaaa', 'bb', 'cccc', 'aaaaa', 'bb', ..., 'bb', 'cccc', 'aaaaa', 'bb', 'cccc'] +Length: 60 +Categories (3, object): ['aaaaa', 'bb', 'cccc']""" + + assert repr(c) == expected + + c = Categorical(["ああああ", "いいいいい", "ううううううう"] * 20) + expected = """\ +['ああああ', 'いいいいい', 'ううううううう', 'ああああ', 'いいいいい', ..., 'いいいいい', 'ううううううう', 'ああああ', 'いいいいい', 'ううううううう'] +Length: 60 +Categories (3, object): ['ああああ', 'いいいいい', 'ううううううう']""" # noqa: E501 + + assert repr(c) == expected + + # unicode option should not affect to Categorical, as it doesn't care + # the repr width + with option_context("display.unicode.east_asian_width", True): + c = Categorical(["ああああ", "いいいいい", "ううううううう"] * 20) + expected = """['ああああ', 'いいいいい', 'ううううううう', 'ああああ', 'いいいいい', ..., 'いいいいい', 'ううううううう', 'ああああ', 'いいいいい', 'ううううううう'] +Length: 60 +Categories (3, object): ['ああああ', 'いいいいい', 'ううううううう']""" # noqa: E501 + + assert repr(c) == expected + + def test_categorical_repr(self): + c = Categorical([1, 2, 3]) + exp = """[1, 2, 3] +Categories (3, int64): [1, 2, 3]""" + + assert repr(c) == exp + + c = Categorical([1, 2, 3, 1, 2, 3], categories=[1, 2, 3]) + exp = """[1, 2, 3, 1, 2, 3] +Categories (3, int64): [1, 2, 3]""" + + assert repr(c) == exp + + c = Categorical([1, 2, 3, 4, 5] * 10) + exp = """[1, 2, 3, 4, 5, ..., 1, 2, 3, 4, 5] +Length: 50 +Categories (5, int64): [1, 2, 3, 4, 5]""" + + assert repr(c) == exp + + c = Categorical(np.arange(20, dtype=np.int64)) + exp = """[0, 1, 2, 3, 4, ..., 15, 16, 17, 18, 19] +Length: 20 +Categories (20, int64): [0, 1, 2, 3, ..., 16, 17, 18, 19]""" + + assert repr(c) == exp + + def test_categorical_repr_ordered(self): + c = Categorical([1, 2, 3], ordered=True) + exp = """[1, 2, 3] +Categories (3, int64): [1 < 2 < 3]""" + + assert repr(c) == exp + + c = Categorical([1, 2, 3, 1, 2, 3], categories=[1, 2, 3], ordered=True) + exp = """[1, 2, 3, 1, 2, 3] +Categories (3, int64): [1 < 2 < 3]""" + + assert repr(c) == exp + + c = Categorical([1, 2, 3, 4, 5] * 10, ordered=True) + exp = """[1, 2, 3, 4, 5, ..., 1, 2, 3, 4, 5] +Length: 50 +Categories (5, int64): [1 < 2 < 3 < 4 < 5]""" + + assert repr(c) == exp + + c = Categorical(np.arange(20, dtype=np.int64), ordered=True) + exp = """[0, 1, 2, 3, 4, ..., 15, 16, 17, 18, 19] +Length: 20 +Categories (20, int64): [0 < 1 < 2 < 3 ... 16 < 17 < 18 < 19]""" + + assert repr(c) == exp + + def test_categorical_repr_datetime(self): + idx = date_range("2011-01-01 09:00", freq="h", periods=5) + c = Categorical(idx) + + exp = ( + "[2011-01-01 09:00:00, 2011-01-01 10:00:00, 2011-01-01 11:00:00, " + "2011-01-01 12:00:00, 2011-01-01 13:00:00]\n" + "Categories (5, datetime64[ns]): [2011-01-01 09:00:00, " + "2011-01-01 10:00:00, 2011-01-01 11:00:00,\n" + " 2011-01-01 12:00:00, " + "2011-01-01 13:00:00]" + "" + ) + assert repr(c) == exp + + c = Categorical(idx.append(idx), categories=idx) + exp = ( + "[2011-01-01 09:00:00, 2011-01-01 10:00:00, 2011-01-01 11:00:00, " + "2011-01-01 12:00:00, 2011-01-01 13:00:00, 2011-01-01 09:00:00, " + "2011-01-01 10:00:00, 2011-01-01 11:00:00, 2011-01-01 12:00:00, " + "2011-01-01 13:00:00]\n" + "Categories (5, datetime64[ns]): [2011-01-01 09:00:00, " + "2011-01-01 10:00:00, 2011-01-01 11:00:00,\n" + " 2011-01-01 12:00:00, " + "2011-01-01 13:00:00]" + ) + + assert repr(c) == exp + + idx = date_range("2011-01-01 09:00", freq="h", periods=5, tz="US/Eastern") + c = Categorical(idx) + exp = ( + "[2011-01-01 09:00:00-05:00, 2011-01-01 10:00:00-05:00, " + "2011-01-01 11:00:00-05:00, 2011-01-01 12:00:00-05:00, " + "2011-01-01 13:00:00-05:00]\n" + "Categories (5, datetime64[ns, US/Eastern]): " + "[2011-01-01 09:00:00-05:00, 2011-01-01 10:00:00-05:00,\n" + " " + "2011-01-01 11:00:00-05:00, 2011-01-01 12:00:00-05:00,\n" + " " + "2011-01-01 13:00:00-05:00]" + ) + + assert repr(c) == exp + + c = Categorical(idx.append(idx), categories=idx) + exp = ( + "[2011-01-01 09:00:00-05:00, 2011-01-01 10:00:00-05:00, " + "2011-01-01 11:00:00-05:00, 2011-01-01 12:00:00-05:00, " + "2011-01-01 13:00:00-05:00, 2011-01-01 09:00:00-05:00, " + "2011-01-01 10:00:00-05:00, 2011-01-01 11:00:00-05:00, " + "2011-01-01 12:00:00-05:00, 2011-01-01 13:00:00-05:00]\n" + "Categories (5, datetime64[ns, US/Eastern]): " + "[2011-01-01 09:00:00-05:00, 2011-01-01 10:00:00-05:00,\n" + " " + "2011-01-01 11:00:00-05:00, 2011-01-01 12:00:00-05:00,\n" + " " + "2011-01-01 13:00:00-05:00]" + ) + + assert repr(c) == exp + + def test_categorical_repr_datetime_ordered(self): + idx = date_range("2011-01-01 09:00", freq="h", periods=5) + c = Categorical(idx, ordered=True) + exp = """[2011-01-01 09:00:00, 2011-01-01 10:00:00, 2011-01-01 11:00:00, 2011-01-01 12:00:00, 2011-01-01 13:00:00] +Categories (5, datetime64[ns]): [2011-01-01 09:00:00 < 2011-01-01 10:00:00 < 2011-01-01 11:00:00 < + 2011-01-01 12:00:00 < 2011-01-01 13:00:00]""" # noqa: E501 + + assert repr(c) == exp + + c = Categorical(idx.append(idx), categories=idx, ordered=True) + exp = """[2011-01-01 09:00:00, 2011-01-01 10:00:00, 2011-01-01 11:00:00, 2011-01-01 12:00:00, 2011-01-01 13:00:00, 2011-01-01 09:00:00, 2011-01-01 10:00:00, 2011-01-01 11:00:00, 2011-01-01 12:00:00, 2011-01-01 13:00:00] +Categories (5, datetime64[ns]): [2011-01-01 09:00:00 < 2011-01-01 10:00:00 < 2011-01-01 11:00:00 < + 2011-01-01 12:00:00 < 2011-01-01 13:00:00]""" # noqa: E501 + + assert repr(c) == exp + + idx = date_range("2011-01-01 09:00", freq="h", periods=5, tz="US/Eastern") + c = Categorical(idx, ordered=True) + exp = """[2011-01-01 09:00:00-05:00, 2011-01-01 10:00:00-05:00, 2011-01-01 11:00:00-05:00, 2011-01-01 12:00:00-05:00, 2011-01-01 13:00:00-05:00] +Categories (5, datetime64[ns, US/Eastern]): [2011-01-01 09:00:00-05:00 < 2011-01-01 10:00:00-05:00 < + 2011-01-01 11:00:00-05:00 < 2011-01-01 12:00:00-05:00 < + 2011-01-01 13:00:00-05:00]""" # noqa: E501 + + assert repr(c) == exp + + c = Categorical(idx.append(idx), categories=idx, ordered=True) + exp = """[2011-01-01 09:00:00-05:00, 2011-01-01 10:00:00-05:00, 2011-01-01 11:00:00-05:00, 2011-01-01 12:00:00-05:00, 2011-01-01 13:00:00-05:00, 2011-01-01 09:00:00-05:00, 2011-01-01 10:00:00-05:00, 2011-01-01 11:00:00-05:00, 2011-01-01 12:00:00-05:00, 2011-01-01 13:00:00-05:00] +Categories (5, datetime64[ns, US/Eastern]): [2011-01-01 09:00:00-05:00 < 2011-01-01 10:00:00-05:00 < + 2011-01-01 11:00:00-05:00 < 2011-01-01 12:00:00-05:00 < + 2011-01-01 13:00:00-05:00]""" # noqa: E501 + + assert repr(c) == exp + + def test_categorical_repr_int_with_nan(self): + c = Categorical([1, 2, np.nan]) + c_exp = """[1, 2, NaN]\nCategories (2, int64): [1, 2]""" + assert repr(c) == c_exp + + s = Series([1, 2, np.nan], dtype="object").astype("category") + s_exp = """0 1\n1 2\n2 NaN +dtype: category +Categories (2, int64): [1, 2]""" + assert repr(s) == s_exp + + def test_categorical_repr_period(self): + idx = period_range("2011-01-01 09:00", freq="h", periods=5) + c = Categorical(idx) + exp = """[2011-01-01 09:00, 2011-01-01 10:00, 2011-01-01 11:00, 2011-01-01 12:00, 2011-01-01 13:00] +Categories (5, period[h]): [2011-01-01 09:00, 2011-01-01 10:00, 2011-01-01 11:00, 2011-01-01 12:00, + 2011-01-01 13:00]""" # noqa: E501 + + assert repr(c) == exp + + c = Categorical(idx.append(idx), categories=idx) + exp = """[2011-01-01 09:00, 2011-01-01 10:00, 2011-01-01 11:00, 2011-01-01 12:00, 2011-01-01 13:00, 2011-01-01 09:00, 2011-01-01 10:00, 2011-01-01 11:00, 2011-01-01 12:00, 2011-01-01 13:00] +Categories (5, period[h]): [2011-01-01 09:00, 2011-01-01 10:00, 2011-01-01 11:00, 2011-01-01 12:00, + 2011-01-01 13:00]""" # noqa: E501 + + assert repr(c) == exp + + idx = period_range("2011-01", freq="M", periods=5) + c = Categorical(idx) + exp = """[2011-01, 2011-02, 2011-03, 2011-04, 2011-05] +Categories (5, period[M]): [2011-01, 2011-02, 2011-03, 2011-04, 2011-05]""" + + assert repr(c) == exp + + c = Categorical(idx.append(idx), categories=idx) + exp = """[2011-01, 2011-02, 2011-03, 2011-04, 2011-05, 2011-01, 2011-02, 2011-03, 2011-04, 2011-05] +Categories (5, period[M]): [2011-01, 2011-02, 2011-03, 2011-04, 2011-05]""" # noqa: E501 + + assert repr(c) == exp + + def test_categorical_repr_period_ordered(self): + idx = period_range("2011-01-01 09:00", freq="h", periods=5) + c = Categorical(idx, ordered=True) + exp = """[2011-01-01 09:00, 2011-01-01 10:00, 2011-01-01 11:00, 2011-01-01 12:00, 2011-01-01 13:00] +Categories (5, period[h]): [2011-01-01 09:00 < 2011-01-01 10:00 < 2011-01-01 11:00 < 2011-01-01 12:00 < + 2011-01-01 13:00]""" # noqa: E501 + + assert repr(c) == exp + + c = Categorical(idx.append(idx), categories=idx, ordered=True) + exp = """[2011-01-01 09:00, 2011-01-01 10:00, 2011-01-01 11:00, 2011-01-01 12:00, 2011-01-01 13:00, 2011-01-01 09:00, 2011-01-01 10:00, 2011-01-01 11:00, 2011-01-01 12:00, 2011-01-01 13:00] +Categories (5, period[h]): [2011-01-01 09:00 < 2011-01-01 10:00 < 2011-01-01 11:00 < 2011-01-01 12:00 < + 2011-01-01 13:00]""" # noqa: E501 + + assert repr(c) == exp + + idx = period_range("2011-01", freq="M", periods=5) + c = Categorical(idx, ordered=True) + exp = """[2011-01, 2011-02, 2011-03, 2011-04, 2011-05] +Categories (5, period[M]): [2011-01 < 2011-02 < 2011-03 < 2011-04 < 2011-05]""" + + assert repr(c) == exp + + c = Categorical(idx.append(idx), categories=idx, ordered=True) + exp = """[2011-01, 2011-02, 2011-03, 2011-04, 2011-05, 2011-01, 2011-02, 2011-03, 2011-04, 2011-05] +Categories (5, period[M]): [2011-01 < 2011-02 < 2011-03 < 2011-04 < 2011-05]""" # noqa: E501 + + assert repr(c) == exp + + def test_categorical_repr_timedelta(self): + idx = timedelta_range("1 days", periods=5) + c = Categorical(idx) + exp = """[1 days, 2 days, 3 days, 4 days, 5 days] +Categories (5, timedelta64[ns]): [1 days, 2 days, 3 days, 4 days, 5 days]""" + + assert repr(c) == exp + + c = Categorical(idx.append(idx), categories=idx) + exp = """[1 days, 2 days, 3 days, 4 days, 5 days, 1 days, 2 days, 3 days, 4 days, 5 days] +Categories (5, timedelta64[ns]): [1 days, 2 days, 3 days, 4 days, 5 days]""" # noqa: E501 + + assert repr(c) == exp + + idx = timedelta_range("1 hours", periods=20) + c = Categorical(idx) + exp = """[0 days 01:00:00, 1 days 01:00:00, 2 days 01:00:00, 3 days 01:00:00, 4 days 01:00:00, ..., 15 days 01:00:00, 16 days 01:00:00, 17 days 01:00:00, 18 days 01:00:00, 19 days 01:00:00] +Length: 20 +Categories (20, timedelta64[ns]): [0 days 01:00:00, 1 days 01:00:00, 2 days 01:00:00, + 3 days 01:00:00, ..., 16 days 01:00:00, 17 days 01:00:00, + 18 days 01:00:00, 19 days 01:00:00]""" # noqa: E501 + + assert repr(c) == exp + + c = Categorical(idx.append(idx), categories=idx) + exp = """[0 days 01:00:00, 1 days 01:00:00, 2 days 01:00:00, 3 days 01:00:00, 4 days 01:00:00, ..., 15 days 01:00:00, 16 days 01:00:00, 17 days 01:00:00, 18 days 01:00:00, 19 days 01:00:00] +Length: 40 +Categories (20, timedelta64[ns]): [0 days 01:00:00, 1 days 01:00:00, 2 days 01:00:00, + 3 days 01:00:00, ..., 16 days 01:00:00, 17 days 01:00:00, + 18 days 01:00:00, 19 days 01:00:00]""" # noqa: E501 + + assert repr(c) == exp + + def test_categorical_repr_timedelta_ordered(self): + idx = timedelta_range("1 days", periods=5) + c = Categorical(idx, ordered=True) + exp = """[1 days, 2 days, 3 days, 4 days, 5 days] +Categories (5, timedelta64[ns]): [1 days < 2 days < 3 days < 4 days < 5 days]""" + + assert repr(c) == exp + + c = Categorical(idx.append(idx), categories=idx, ordered=True) + exp = """[1 days, 2 days, 3 days, 4 days, 5 days, 1 days, 2 days, 3 days, 4 days, 5 days] +Categories (5, timedelta64[ns]): [1 days < 2 days < 3 days < 4 days < 5 days]""" # noqa: E501 + + assert repr(c) == exp + + idx = timedelta_range("1 hours", periods=20) + c = Categorical(idx, ordered=True) + exp = """[0 days 01:00:00, 1 days 01:00:00, 2 days 01:00:00, 3 days 01:00:00, 4 days 01:00:00, ..., 15 days 01:00:00, 16 days 01:00:00, 17 days 01:00:00, 18 days 01:00:00, 19 days 01:00:00] +Length: 20 +Categories (20, timedelta64[ns]): [0 days 01:00:00 < 1 days 01:00:00 < 2 days 01:00:00 < + 3 days 01:00:00 ... 16 days 01:00:00 < 17 days 01:00:00 < + 18 days 01:00:00 < 19 days 01:00:00]""" # noqa: E501 + + assert repr(c) == exp + + c = Categorical(idx.append(idx), categories=idx, ordered=True) + exp = """[0 days 01:00:00, 1 days 01:00:00, 2 days 01:00:00, 3 days 01:00:00, 4 days 01:00:00, ..., 15 days 01:00:00, 16 days 01:00:00, 17 days 01:00:00, 18 days 01:00:00, 19 days 01:00:00] +Length: 40 +Categories (20, timedelta64[ns]): [0 days 01:00:00 < 1 days 01:00:00 < 2 days 01:00:00 < + 3 days 01:00:00 ... 16 days 01:00:00 < 17 days 01:00:00 < + 18 days 01:00:00 < 19 days 01:00:00]""" # noqa: E501 + + assert repr(c) == exp + + def test_categorical_index_repr(self): + idx = CategoricalIndex(Categorical([1, 2, 3])) + exp = """CategoricalIndex([1, 2, 3], categories=[1, 2, 3], ordered=False, dtype='category')""" # noqa: E501 + assert repr(idx) == exp + + i = CategoricalIndex(Categorical(np.arange(10, dtype=np.int64))) + exp = """CategoricalIndex([0, 1, 2, 3, 4, 5, 6, 7, 8, 9], categories=[0, 1, 2, 3, ..., 6, 7, 8, 9], ordered=False, dtype='category')""" # noqa: E501 + assert repr(i) == exp + + def test_categorical_index_repr_ordered(self): + i = CategoricalIndex(Categorical([1, 2, 3], ordered=True)) + exp = """CategoricalIndex([1, 2, 3], categories=[1, 2, 3], ordered=True, dtype='category')""" # noqa: E501 + assert repr(i) == exp + + i = CategoricalIndex(Categorical(np.arange(10, dtype=np.int64), ordered=True)) + exp = """CategoricalIndex([0, 1, 2, 3, 4, 5, 6, 7, 8, 9], categories=[0, 1, 2, 3, ..., 6, 7, 8, 9], ordered=True, dtype='category')""" # noqa: E501 + assert repr(i) == exp + + def test_categorical_index_repr_datetime(self): + idx = date_range("2011-01-01 09:00", freq="h", periods=5) + i = CategoricalIndex(Categorical(idx)) + exp = """CategoricalIndex(['2011-01-01 09:00:00', '2011-01-01 10:00:00', + '2011-01-01 11:00:00', '2011-01-01 12:00:00', + '2011-01-01 13:00:00'], + categories=[2011-01-01 09:00:00, 2011-01-01 10:00:00, 2011-01-01 11:00:00, 2011-01-01 12:00:00, 2011-01-01 13:00:00], ordered=False, dtype='category')""" # noqa: E501 + + assert repr(i) == exp + + idx = date_range("2011-01-01 09:00", freq="h", periods=5, tz="US/Eastern") + i = CategoricalIndex(Categorical(idx)) + exp = """CategoricalIndex(['2011-01-01 09:00:00-05:00', '2011-01-01 10:00:00-05:00', + '2011-01-01 11:00:00-05:00', '2011-01-01 12:00:00-05:00', + '2011-01-01 13:00:00-05:00'], + categories=[2011-01-01 09:00:00-05:00, 2011-01-01 10:00:00-05:00, 2011-01-01 11:00:00-05:00, 2011-01-01 12:00:00-05:00, 2011-01-01 13:00:00-05:00], ordered=False, dtype='category')""" # noqa: E501 + + assert repr(i) == exp + + def test_categorical_index_repr_datetime_ordered(self): + idx = date_range("2011-01-01 09:00", freq="h", periods=5) + i = CategoricalIndex(Categorical(idx, ordered=True)) + exp = """CategoricalIndex(['2011-01-01 09:00:00', '2011-01-01 10:00:00', + '2011-01-01 11:00:00', '2011-01-01 12:00:00', + '2011-01-01 13:00:00'], + categories=[2011-01-01 09:00:00, 2011-01-01 10:00:00, 2011-01-01 11:00:00, 2011-01-01 12:00:00, 2011-01-01 13:00:00], ordered=True, dtype='category')""" # noqa: E501 + + assert repr(i) == exp + + idx = date_range("2011-01-01 09:00", freq="h", periods=5, tz="US/Eastern") + i = CategoricalIndex(Categorical(idx, ordered=True)) + exp = """CategoricalIndex(['2011-01-01 09:00:00-05:00', '2011-01-01 10:00:00-05:00', + '2011-01-01 11:00:00-05:00', '2011-01-01 12:00:00-05:00', + '2011-01-01 13:00:00-05:00'], + categories=[2011-01-01 09:00:00-05:00, 2011-01-01 10:00:00-05:00, 2011-01-01 11:00:00-05:00, 2011-01-01 12:00:00-05:00, 2011-01-01 13:00:00-05:00], ordered=True, dtype='category')""" # noqa: E501 + + assert repr(i) == exp + + i = CategoricalIndex(Categorical(idx.append(idx), ordered=True)) + exp = """CategoricalIndex(['2011-01-01 09:00:00-05:00', '2011-01-01 10:00:00-05:00', + '2011-01-01 11:00:00-05:00', '2011-01-01 12:00:00-05:00', + '2011-01-01 13:00:00-05:00', '2011-01-01 09:00:00-05:00', + '2011-01-01 10:00:00-05:00', '2011-01-01 11:00:00-05:00', + '2011-01-01 12:00:00-05:00', '2011-01-01 13:00:00-05:00'], + categories=[2011-01-01 09:00:00-05:00, 2011-01-01 10:00:00-05:00, 2011-01-01 11:00:00-05:00, 2011-01-01 12:00:00-05:00, 2011-01-01 13:00:00-05:00], ordered=True, dtype='category')""" # noqa: E501 + + assert repr(i) == exp + + def test_categorical_index_repr_period(self): + # test all length + idx = period_range("2011-01-01 09:00", freq="h", periods=1) + i = CategoricalIndex(Categorical(idx)) + exp = """CategoricalIndex(['2011-01-01 09:00'], categories=[2011-01-01 09:00], ordered=False, dtype='category')""" # noqa: E501 + assert repr(i) == exp + + idx = period_range("2011-01-01 09:00", freq="h", periods=2) + i = CategoricalIndex(Categorical(idx)) + exp = """CategoricalIndex(['2011-01-01 09:00', '2011-01-01 10:00'], categories=[2011-01-01 09:00, 2011-01-01 10:00], ordered=False, dtype='category')""" # noqa: E501 + assert repr(i) == exp + + idx = period_range("2011-01-01 09:00", freq="h", periods=3) + i = CategoricalIndex(Categorical(idx)) + exp = """CategoricalIndex(['2011-01-01 09:00', '2011-01-01 10:00', '2011-01-01 11:00'], categories=[2011-01-01 09:00, 2011-01-01 10:00, 2011-01-01 11:00], ordered=False, dtype='category')""" # noqa: E501 + assert repr(i) == exp + + idx = period_range("2011-01-01 09:00", freq="h", periods=5) + i = CategoricalIndex(Categorical(idx)) + exp = """CategoricalIndex(['2011-01-01 09:00', '2011-01-01 10:00', '2011-01-01 11:00', + '2011-01-01 12:00', '2011-01-01 13:00'], + categories=[2011-01-01 09:00, 2011-01-01 10:00, 2011-01-01 11:00, 2011-01-01 12:00, 2011-01-01 13:00], ordered=False, dtype='category')""" # noqa: E501 + + assert repr(i) == exp + + i = CategoricalIndex(Categorical(idx.append(idx))) + exp = """CategoricalIndex(['2011-01-01 09:00', '2011-01-01 10:00', '2011-01-01 11:00', + '2011-01-01 12:00', '2011-01-01 13:00', '2011-01-01 09:00', + '2011-01-01 10:00', '2011-01-01 11:00', '2011-01-01 12:00', + '2011-01-01 13:00'], + categories=[2011-01-01 09:00, 2011-01-01 10:00, 2011-01-01 11:00, 2011-01-01 12:00, 2011-01-01 13:00], ordered=False, dtype='category')""" # noqa: E501 + + assert repr(i) == exp + + idx = period_range("2011-01", freq="M", periods=5) + i = CategoricalIndex(Categorical(idx)) + exp = """CategoricalIndex(['2011-01', '2011-02', '2011-03', '2011-04', '2011-05'], categories=[2011-01, 2011-02, 2011-03, 2011-04, 2011-05], ordered=False, dtype='category')""" # noqa: E501 + assert repr(i) == exp + + def test_categorical_index_repr_period_ordered(self): + idx = period_range("2011-01-01 09:00", freq="h", periods=5) + i = CategoricalIndex(Categorical(idx, ordered=True)) + exp = """CategoricalIndex(['2011-01-01 09:00', '2011-01-01 10:00', '2011-01-01 11:00', + '2011-01-01 12:00', '2011-01-01 13:00'], + categories=[2011-01-01 09:00, 2011-01-01 10:00, 2011-01-01 11:00, 2011-01-01 12:00, 2011-01-01 13:00], ordered=True, dtype='category')""" # noqa: E501 + + assert repr(i) == exp + + idx = period_range("2011-01", freq="M", periods=5) + i = CategoricalIndex(Categorical(idx, ordered=True)) + exp = """CategoricalIndex(['2011-01', '2011-02', '2011-03', '2011-04', '2011-05'], categories=[2011-01, 2011-02, 2011-03, 2011-04, 2011-05], ordered=True, dtype='category')""" # noqa: E501 + assert repr(i) == exp + + def test_categorical_index_repr_timedelta(self): + idx = timedelta_range("1 days", periods=5) + i = CategoricalIndex(Categorical(idx)) + exp = """CategoricalIndex(['1 days', '2 days', '3 days', '4 days', '5 days'], categories=[1 days, 2 days, 3 days, 4 days, 5 days], ordered=False, dtype='category')""" # noqa: E501 + assert repr(i) == exp + + idx = timedelta_range("1 hours", periods=10) + i = CategoricalIndex(Categorical(idx)) + exp = """CategoricalIndex(['0 days 01:00:00', '1 days 01:00:00', '2 days 01:00:00', + '3 days 01:00:00', '4 days 01:00:00', '5 days 01:00:00', + '6 days 01:00:00', '7 days 01:00:00', '8 days 01:00:00', + '9 days 01:00:00'], + categories=[0 days 01:00:00, 1 days 01:00:00, 2 days 01:00:00, 3 days 01:00:00, ..., 6 days 01:00:00, 7 days 01:00:00, 8 days 01:00:00, 9 days 01:00:00], ordered=False, dtype='category')""" # noqa: E501 + + assert repr(i) == exp + + def test_categorical_index_repr_timedelta_ordered(self): + idx = timedelta_range("1 days", periods=5) + i = CategoricalIndex(Categorical(idx, ordered=True)) + exp = """CategoricalIndex(['1 days', '2 days', '3 days', '4 days', '5 days'], categories=[1 days, 2 days, 3 days, 4 days, 5 days], ordered=True, dtype='category')""" # noqa: E501 + assert repr(i) == exp + + idx = timedelta_range("1 hours", periods=10) + i = CategoricalIndex(Categorical(idx, ordered=True)) + exp = """CategoricalIndex(['0 days 01:00:00', '1 days 01:00:00', '2 days 01:00:00', + '3 days 01:00:00', '4 days 01:00:00', '5 days 01:00:00', + '6 days 01:00:00', '7 days 01:00:00', '8 days 01:00:00', + '9 days 01:00:00'], + categories=[0 days 01:00:00, 1 days 01:00:00, 2 days 01:00:00, 3 days 01:00:00, ..., 6 days 01:00:00, 7 days 01:00:00, 8 days 01:00:00, 9 days 01:00:00], ordered=True, dtype='category')""" # noqa: E501 + + assert repr(i) == exp + + def test_categorical_str_repr(self): + # GH 33676 + result = repr(Categorical([1, "2", 3, 4])) + expected = "[1, '2', 3, 4]\nCategories (4, object): [1, 3, 4, '2']" + assert result == expected diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/categorical/test_sorting.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/categorical/test_sorting.py new file mode 100644 index 0000000000000000000000000000000000000000..ae527065b3fb970263609881d217f5c6d2761231 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/categorical/test_sorting.py @@ -0,0 +1,128 @@ +import numpy as np +import pytest + +from pandas import ( + Categorical, + Index, +) +import pandas._testing as tm + + +class TestCategoricalSort: + def test_argsort(self): + c = Categorical([5, 3, 1, 4, 2], ordered=True) + + expected = np.array([2, 4, 1, 3, 0]) + tm.assert_numpy_array_equal( + c.argsort(ascending=True), expected, check_dtype=False + ) + + expected = expected[::-1] + tm.assert_numpy_array_equal( + c.argsort(ascending=False), expected, check_dtype=False + ) + + def test_numpy_argsort(self): + c = Categorical([5, 3, 1, 4, 2], ordered=True) + + expected = np.array([2, 4, 1, 3, 0]) + tm.assert_numpy_array_equal(np.argsort(c), expected, check_dtype=False) + + tm.assert_numpy_array_equal( + np.argsort(c, kind="mergesort"), expected, check_dtype=False + ) + + msg = "the 'axis' parameter is not supported" + with pytest.raises(ValueError, match=msg): + np.argsort(c, axis=0) + + msg = "the 'order' parameter is not supported" + with pytest.raises(ValueError, match=msg): + np.argsort(c, order="C") + + def test_sort_values(self): + # unordered cats are sortable + cat = Categorical(["a", "b", "b", "a"], ordered=False) + cat.sort_values() + + cat = Categorical(["a", "c", "b", "d"], ordered=True) + + # sort_values + res = cat.sort_values() + exp = np.array(["a", "b", "c", "d"], dtype=object) + tm.assert_numpy_array_equal(res.__array__(), exp) + tm.assert_index_equal(res.categories, cat.categories) + + cat = Categorical( + ["a", "c", "b", "d"], categories=["a", "b", "c", "d"], ordered=True + ) + res = cat.sort_values() + exp = np.array(["a", "b", "c", "d"], dtype=object) + tm.assert_numpy_array_equal(res.__array__(), exp) + tm.assert_index_equal(res.categories, cat.categories) + + res = cat.sort_values(ascending=False) + exp = np.array(["d", "c", "b", "a"], dtype=object) + tm.assert_numpy_array_equal(res.__array__(), exp) + tm.assert_index_equal(res.categories, cat.categories) + + # sort (inplace order) + cat1 = cat.copy() + orig_codes = cat1._codes + cat1.sort_values(inplace=True) + assert cat1._codes is orig_codes + exp = np.array(["a", "b", "c", "d"], dtype=object) + tm.assert_numpy_array_equal(cat1.__array__(), exp) + tm.assert_index_equal(res.categories, cat.categories) + + # reverse + cat = Categorical(["a", "c", "c", "b", "d"], ordered=True) + res = cat.sort_values(ascending=False) + exp_val = np.array(["d", "c", "c", "b", "a"], dtype=object) + exp_categories = Index(["a", "b", "c", "d"]) + tm.assert_numpy_array_equal(res.__array__(), exp_val) + tm.assert_index_equal(res.categories, exp_categories) + + def test_sort_values_na_position(self): + # see gh-12882 + cat = Categorical([5, 2, np.nan, 2, np.nan], ordered=True) + exp_categories = Index([2, 5]) + + exp = np.array([2.0, 2.0, 5.0, np.nan, np.nan]) + res = cat.sort_values() # default arguments + tm.assert_numpy_array_equal(res.__array__(), exp) + tm.assert_index_equal(res.categories, exp_categories) + + exp = np.array([np.nan, np.nan, 2.0, 2.0, 5.0]) + res = cat.sort_values(ascending=True, na_position="first") + tm.assert_numpy_array_equal(res.__array__(), exp) + tm.assert_index_equal(res.categories, exp_categories) + + exp = np.array([np.nan, np.nan, 5.0, 2.0, 2.0]) + res = cat.sort_values(ascending=False, na_position="first") + tm.assert_numpy_array_equal(res.__array__(), exp) + tm.assert_index_equal(res.categories, exp_categories) + + exp = np.array([2.0, 2.0, 5.0, np.nan, np.nan]) + res = cat.sort_values(ascending=True, na_position="last") + tm.assert_numpy_array_equal(res.__array__(), exp) + tm.assert_index_equal(res.categories, exp_categories) + + exp = np.array([5.0, 2.0, 2.0, np.nan, np.nan]) + res = cat.sort_values(ascending=False, na_position="last") + tm.assert_numpy_array_equal(res.__array__(), exp) + tm.assert_index_equal(res.categories, exp_categories) + + cat = Categorical(["a", "c", "b", "d", np.nan], ordered=True) + res = cat.sort_values(ascending=False, na_position="last") + exp_val = np.array(["d", "c", "b", "a", np.nan], dtype=object) + exp_categories = Index(["a", "b", "c", "d"]) + tm.assert_numpy_array_equal(res.__array__(), exp_val) + tm.assert_index_equal(res.categories, exp_categories) + + cat = Categorical(["a", "c", "b", "d", np.nan], ordered=True) + res = cat.sort_values(ascending=False, na_position="first") + exp_val = np.array([np.nan, "d", "c", "b", "a"], dtype=object) + exp_categories = Index(["a", "b", "c", "d"]) + tm.assert_numpy_array_equal(res.__array__(), exp_val) + tm.assert_index_equal(res.categories, exp_categories) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/categorical/test_subclass.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/categorical/test_subclass.py new file mode 100644 index 0000000000000000000000000000000000000000..5b0c0a44e655d5dd943f95415336204aa12f0b67 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/categorical/test_subclass.py @@ -0,0 +1,26 @@ +from pandas import Categorical +import pandas._testing as tm + + +class SubclassedCategorical(Categorical): + pass + + +class TestCategoricalSubclassing: + def test_constructor(self): + sc = SubclassedCategorical(["a", "b", "c"]) + assert isinstance(sc, SubclassedCategorical) + tm.assert_categorical_equal(sc, Categorical(["a", "b", "c"])) + + def test_from_codes(self): + sc = SubclassedCategorical.from_codes([1, 0, 2], ["a", "b", "c"]) + assert isinstance(sc, SubclassedCategorical) + exp = Categorical.from_codes([1, 0, 2], ["a", "b", "c"]) + tm.assert_categorical_equal(sc, exp) + + def test_map(self): + sc = SubclassedCategorical(["a", "b", "c"]) + res = sc.map(lambda x: x.upper(), na_action=None) + assert isinstance(res, SubclassedCategorical) + exp = Categorical(["A", "B", "C"]) + tm.assert_categorical_equal(res, exp) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/categorical/test_take.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/categorical/test_take.py new file mode 100644 index 0000000000000000000000000000000000000000..373f1b30a13c2daff23e14a3e0640e7a716cceb3 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/categorical/test_take.py @@ -0,0 +1,89 @@ +import numpy as np +import pytest + +from pandas import Categorical +import pandas._testing as tm + + +@pytest.fixture(params=[True, False]) +def allow_fill(request): + """Boolean 'allow_fill' parameter for Categorical.take""" + return request.param + + +class TestTake: + # https://github.com/pandas-dev/pandas/issues/20664 + + def test_take_default_allow_fill(self): + cat = Categorical(["a", "b"]) + with tm.assert_produces_warning(None): + result = cat.take([0, -1]) + + assert result.equals(cat) + + def test_take_positive_no_warning(self): + cat = Categorical(["a", "b"]) + with tm.assert_produces_warning(None): + cat.take([0, 0]) + + def test_take_bounds(self, allow_fill): + # https://github.com/pandas-dev/pandas/issues/20664 + cat = Categorical(["a", "b", "a"]) + if allow_fill: + msg = "indices are out-of-bounds" + else: + msg = "index 4 is out of bounds for( axis 0 with)? size 3" + with pytest.raises(IndexError, match=msg): + cat.take([4, 5], allow_fill=allow_fill) + + def test_take_empty(self, allow_fill): + # https://github.com/pandas-dev/pandas/issues/20664 + cat = Categorical([], categories=["a", "b"]) + if allow_fill: + msg = "indices are out-of-bounds" + else: + msg = "cannot do a non-empty take from an empty axes" + with pytest.raises(IndexError, match=msg): + cat.take([0], allow_fill=allow_fill) + + def test_positional_take(self, ordered): + cat = Categorical(["a", "a", "b", "b"], categories=["b", "a"], ordered=ordered) + result = cat.take([0, 1, 2], allow_fill=False) + expected = Categorical( + ["a", "a", "b"], categories=cat.categories, ordered=ordered + ) + tm.assert_categorical_equal(result, expected) + + def test_positional_take_unobserved(self, ordered): + cat = Categorical(["a", "b"], categories=["a", "b", "c"], ordered=ordered) + result = cat.take([1, 0], allow_fill=False) + expected = Categorical(["b", "a"], categories=cat.categories, ordered=ordered) + tm.assert_categorical_equal(result, expected) + + def test_take_allow_fill(self): + # https://github.com/pandas-dev/pandas/issues/23296 + cat = Categorical(["a", "a", "b"]) + result = cat.take([0, -1, -1], allow_fill=True) + expected = Categorical(["a", np.nan, np.nan], categories=["a", "b"]) + tm.assert_categorical_equal(result, expected) + + def test_take_fill_with_negative_one(self): + # -1 was a category + cat = Categorical([-1, 0, 1]) + result = cat.take([0, -1, 1], allow_fill=True, fill_value=-1) + expected = Categorical([-1, -1, 0], categories=[-1, 0, 1]) + tm.assert_categorical_equal(result, expected) + + def test_take_fill_value(self): + # https://github.com/pandas-dev/pandas/issues/23296 + cat = Categorical(["a", "b", "c"]) + result = cat.take([0, 1, -1], fill_value="a", allow_fill=True) + expected = Categorical(["a", "b", "a"], categories=["a", "b", "c"]) + tm.assert_categorical_equal(result, expected) + + def test_take_fill_value_new_raises(self): + # https://github.com/pandas-dev/pandas/issues/23296 + cat = Categorical(["a", "b", "c"]) + xpr = r"Cannot setitem on a Categorical with a new category \(d\)" + with pytest.raises(TypeError, match=xpr): + cat.take([0, 1, -1], fill_value="d", allow_fill=True) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/categorical/test_warnings.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/categorical/test_warnings.py new file mode 100644 index 0000000000000000000000000000000000000000..68c59706a6c3bf93908108c337b51c8da187cbb4 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/categorical/test_warnings.py @@ -0,0 +1,19 @@ +import pytest + +import pandas._testing as tm + + +class TestCategoricalWarnings: + def test_tab_complete_warning(self, ip): + # https://github.com/pandas-dev/pandas/issues/16409 + pytest.importorskip("IPython", minversion="6.0.0") + from IPython.core.completer import provisionalcompleter + + code = "import pandas as pd; c = pd.Categorical([])" + ip.run_cell(code) + + # GH 31324 newer jedi version raises Deprecation warning; + # appears resolved 2021-02-02 + with tm.assert_produces_warning(None, raise_on_extra_warnings=False): + with provisionalcompleter("ignore"): + list(ip.Completer.completions("c.", 1)) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/datetimes/__init__.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/datetimes/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/datetimes/test_constructors.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/datetimes/test_constructors.py new file mode 100644 index 0000000000000000000000000000000000000000..3652b5fec46bbe7a519dd2c3a196ac87bd74784f --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/datetimes/test_constructors.py @@ -0,0 +1,284 @@ +import numpy as np +import pytest + +from pandas._libs import iNaT + +from pandas.core.dtypes.dtypes import DatetimeTZDtype + +import pandas as pd +import pandas._testing as tm +from pandas.core.arrays import DatetimeArray + + +class TestDatetimeArrayConstructor: + def test_from_sequence_invalid_type(self): + mi = pd.MultiIndex.from_product([np.arange(5), np.arange(5)]) + with pytest.raises(TypeError, match="Cannot create a DatetimeArray"): + DatetimeArray._from_sequence(mi, dtype="M8[ns]") + + def test_only_1dim_accepted(self): + arr = np.array([0, 1, 2, 3], dtype="M8[h]").astype("M8[ns]") + + depr_msg = "DatetimeArray.__init__ is deprecated" + with tm.assert_produces_warning(FutureWarning, match=depr_msg): + with pytest.raises(ValueError, match="Only 1-dimensional"): + # 3-dim, we allow 2D to sneak in for ops purposes GH#29853 + DatetimeArray(arr.reshape(2, 2, 1)) + + with tm.assert_produces_warning(FutureWarning, match=depr_msg): + with pytest.raises(ValueError, match="Only 1-dimensional"): + # 0-dim + DatetimeArray(arr[[0]].squeeze()) + + def test_freq_validation(self): + # GH#24623 check that invalid instances cannot be created with the + # public constructor + arr = np.arange(5, dtype=np.int64) * 3600 * 10**9 + + msg = ( + "Inferred frequency h from passed values does not " + "conform to passed frequency W-SUN" + ) + depr_msg = "DatetimeArray.__init__ is deprecated" + with tm.assert_produces_warning(FutureWarning, match=depr_msg): + with pytest.raises(ValueError, match=msg): + DatetimeArray(arr, freq="W") + + @pytest.mark.parametrize( + "meth", + [ + DatetimeArray._from_sequence, + pd.to_datetime, + pd.DatetimeIndex, + ], + ) + def test_mixing_naive_tzaware_raises(self, meth): + # GH#24569 + arr = np.array([pd.Timestamp("2000"), pd.Timestamp("2000", tz="CET")]) + + msg = ( + "Cannot mix tz-aware with tz-naive values|" + "Tz-aware datetime.datetime cannot be converted " + "to datetime64 unless utc=True" + ) + + for obj in [arr, arr[::-1]]: + # check that we raise regardless of whether naive is found + # before aware or vice-versa + with pytest.raises(ValueError, match=msg): + meth(obj) + + def test_from_pandas_array(self): + arr = pd.array(np.arange(5, dtype=np.int64)) * 3600 * 10**9 + + result = DatetimeArray._from_sequence(arr, dtype="M8[ns]")._with_freq("infer") + + expected = pd.date_range("1970-01-01", periods=5, freq="h")._data + tm.assert_datetime_array_equal(result, expected) + + def test_mismatched_timezone_raises(self): + depr_msg = "DatetimeArray.__init__ is deprecated" + with tm.assert_produces_warning(FutureWarning, match=depr_msg): + arr = DatetimeArray( + np.array(["2000-01-01T06:00:00"], dtype="M8[ns]"), + dtype=DatetimeTZDtype(tz="US/Central"), + ) + dtype = DatetimeTZDtype(tz="US/Eastern") + msg = r"dtype=datetime64\[ns.*\] does not match data dtype datetime64\[ns.*\]" + with tm.assert_produces_warning(FutureWarning, match=depr_msg): + with pytest.raises(TypeError, match=msg): + DatetimeArray(arr, dtype=dtype) + + # also with mismatched tzawareness + with tm.assert_produces_warning(FutureWarning, match=depr_msg): + with pytest.raises(TypeError, match=msg): + DatetimeArray(arr, dtype=np.dtype("M8[ns]")) + with tm.assert_produces_warning(FutureWarning, match=depr_msg): + with pytest.raises(TypeError, match=msg): + DatetimeArray(arr.tz_localize(None), dtype=arr.dtype) + + def test_non_array_raises(self): + depr_msg = "DatetimeArray.__init__ is deprecated" + with tm.assert_produces_warning(FutureWarning, match=depr_msg): + with pytest.raises(ValueError, match="list"): + DatetimeArray([1, 2, 3]) + + def test_bool_dtype_raises(self): + arr = np.array([1, 2, 3], dtype="bool") + + depr_msg = "DatetimeArray.__init__ is deprecated" + msg = "Unexpected value for 'dtype': 'bool'. Must be" + with tm.assert_produces_warning(FutureWarning, match=depr_msg): + with pytest.raises(ValueError, match=msg): + DatetimeArray(arr) + + msg = r"dtype bool cannot be converted to datetime64\[ns\]" + with pytest.raises(TypeError, match=msg): + DatetimeArray._from_sequence(arr, dtype="M8[ns]") + + with pytest.raises(TypeError, match=msg): + pd.DatetimeIndex(arr) + + with pytest.raises(TypeError, match=msg): + pd.to_datetime(arr) + + def test_incorrect_dtype_raises(self): + depr_msg = "DatetimeArray.__init__ is deprecated" + with tm.assert_produces_warning(FutureWarning, match=depr_msg): + with pytest.raises(ValueError, match="Unexpected value for 'dtype'."): + DatetimeArray(np.array([1, 2, 3], dtype="i8"), dtype="category") + + with tm.assert_produces_warning(FutureWarning, match=depr_msg): + with pytest.raises(ValueError, match="Unexpected value for 'dtype'."): + DatetimeArray(np.array([1, 2, 3], dtype="i8"), dtype="m8[s]") + + with tm.assert_produces_warning(FutureWarning, match=depr_msg): + with pytest.raises(ValueError, match="Unexpected value for 'dtype'."): + DatetimeArray(np.array([1, 2, 3], dtype="i8"), dtype="M8[D]") + + def test_mismatched_values_dtype_units(self): + arr = np.array([1, 2, 3], dtype="M8[s]") + dtype = np.dtype("M8[ns]") + msg = "Values resolution does not match dtype." + depr_msg = "DatetimeArray.__init__ is deprecated" + + with tm.assert_produces_warning(FutureWarning, match=depr_msg): + with pytest.raises(ValueError, match=msg): + DatetimeArray(arr, dtype=dtype) + + dtype2 = DatetimeTZDtype(tz="UTC", unit="ns") + with tm.assert_produces_warning(FutureWarning, match=depr_msg): + with pytest.raises(ValueError, match=msg): + DatetimeArray(arr, dtype=dtype2) + + def test_freq_infer_raises(self): + depr_msg = "DatetimeArray.__init__ is deprecated" + with tm.assert_produces_warning(FutureWarning, match=depr_msg): + with pytest.raises(ValueError, match="Frequency inference"): + DatetimeArray(np.array([1, 2, 3], dtype="i8"), freq="infer") + + def test_copy(self): + data = np.array([1, 2, 3], dtype="M8[ns]") + arr = DatetimeArray._from_sequence(data, copy=False) + assert arr._ndarray is data + + arr = DatetimeArray._from_sequence(data, copy=True) + assert arr._ndarray is not data + + @pytest.mark.parametrize("unit", ["s", "ms", "us", "ns"]) + def test_numpy_datetime_unit(self, unit): + data = np.array([1, 2, 3], dtype=f"M8[{unit}]") + arr = DatetimeArray._from_sequence(data) + assert arr.unit == unit + assert arr[0].unit == unit + + +class TestSequenceToDT64NS: + def test_tz_dtype_mismatch_raises(self): + arr = DatetimeArray._from_sequence( + ["2000"], dtype=DatetimeTZDtype(tz="US/Central") + ) + with pytest.raises(TypeError, match="data is already tz-aware"): + DatetimeArray._from_sequence(arr, dtype=DatetimeTZDtype(tz="UTC")) + + def test_tz_dtype_matches(self): + dtype = DatetimeTZDtype(tz="US/Central") + arr = DatetimeArray._from_sequence(["2000"], dtype=dtype) + result = DatetimeArray._from_sequence(arr, dtype=dtype) + tm.assert_equal(arr, result) + + @pytest.mark.parametrize("order", ["F", "C"]) + def test_2d(self, order): + dti = pd.date_range("2016-01-01", periods=6, tz="US/Pacific") + arr = np.array(dti, dtype=object).reshape(3, 2) + if order == "F": + arr = arr.T + + res = DatetimeArray._from_sequence(arr, dtype=dti.dtype) + expected = DatetimeArray._from_sequence(arr.ravel(), dtype=dti.dtype).reshape( + arr.shape + ) + tm.assert_datetime_array_equal(res, expected) + + +# ---------------------------------------------------------------------------- +# Arrow interaction + + +EXTREME_VALUES = [0, 123456789, None, iNaT, 2**63 - 1, -(2**63) + 1] +FINE_TO_COARSE_SAFE = [123_000_000_000, None, -123_000_000_000] +COARSE_TO_FINE_SAFE = [123, None, -123] + + +@pytest.mark.parametrize( + ("pa_unit", "pd_unit", "pa_tz", "pd_tz", "data"), + [ + ("s", "s", "UTC", "UTC", EXTREME_VALUES), + ("ms", "ms", "UTC", "Europe/Berlin", EXTREME_VALUES), + ("us", "us", "US/Eastern", "UTC", EXTREME_VALUES), + ("ns", "ns", "US/Central", "Asia/Kolkata", EXTREME_VALUES), + ("ns", "s", "UTC", "UTC", FINE_TO_COARSE_SAFE), + ("us", "ms", "UTC", "Europe/Berlin", FINE_TO_COARSE_SAFE), + ("ms", "us", "US/Eastern", "UTC", COARSE_TO_FINE_SAFE), + ("s", "ns", "US/Central", "Asia/Kolkata", COARSE_TO_FINE_SAFE), + ], +) +def test_from_arrow_with_different_units_and_timezones_with( + pa_unit, pd_unit, pa_tz, pd_tz, data +): + pa = pytest.importorskip("pyarrow") + + pa_type = pa.timestamp(pa_unit, tz=pa_tz) + arr = pa.array(data, type=pa_type) + dtype = DatetimeTZDtype(unit=pd_unit, tz=pd_tz) + + result = dtype.__from_arrow__(arr) + expected = DatetimeArray._from_sequence(data, dtype=f"M8[{pa_unit}, UTC]").astype( + dtype, copy=False + ) + tm.assert_extension_array_equal(result, expected) + + result = dtype.__from_arrow__(pa.chunked_array([arr])) + tm.assert_extension_array_equal(result, expected) + + +@pytest.mark.parametrize( + ("unit", "tz"), + [ + ("s", "UTC"), + ("ms", "Europe/Berlin"), + ("us", "US/Eastern"), + ("ns", "Asia/Kolkata"), + ("ns", "UTC"), + ], +) +def test_from_arrow_from_empty(unit, tz): + pa = pytest.importorskip("pyarrow") + + data = [] + arr = pa.array(data) + dtype = DatetimeTZDtype(unit=unit, tz=tz) + + result = dtype.__from_arrow__(arr) + expected = DatetimeArray._from_sequence(np.array(data, dtype=f"datetime64[{unit}]")) + expected = expected.tz_localize(tz=tz) + tm.assert_extension_array_equal(result, expected) + + result = dtype.__from_arrow__(pa.chunked_array([arr])) + tm.assert_extension_array_equal(result, expected) + + +def test_from_arrow_from_integers(): + pa = pytest.importorskip("pyarrow") + + data = [0, 123456789, None, 2**63 - 1, iNaT, -123456789] + arr = pa.array(data) + dtype = DatetimeTZDtype(unit="ns", tz="UTC") + + result = dtype.__from_arrow__(arr) + expected = DatetimeArray._from_sequence(np.array(data, dtype="datetime64[ns]")) + expected = expected.tz_localize("UTC") + tm.assert_extension_array_equal(result, expected) + + result = dtype.__from_arrow__(pa.chunked_array([arr])) + tm.assert_extension_array_equal(result, expected) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/datetimes/test_cumulative.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/datetimes/test_cumulative.py new file mode 100644 index 0000000000000000000000000000000000000000..e9d2dfdd0048a42a3f23e41be1d45a89aae11d23 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/datetimes/test_cumulative.py @@ -0,0 +1,44 @@ +import pytest + +import pandas._testing as tm +from pandas.core.arrays import DatetimeArray + + +class TestAccumulator: + def test_accumulators_freq(self): + # GH#50297 + arr = DatetimeArray._from_sequence( + [ + "2000-01-01", + "2000-01-02", + "2000-01-03", + ], + dtype="M8[ns]", + )._with_freq("infer") + result = arr._accumulate("cummin") + expected = DatetimeArray._from_sequence(["2000-01-01"] * 3, dtype="M8[ns]") + tm.assert_datetime_array_equal(result, expected) + + result = arr._accumulate("cummax") + expected = DatetimeArray._from_sequence( + [ + "2000-01-01", + "2000-01-02", + "2000-01-03", + ], + dtype="M8[ns]", + ) + tm.assert_datetime_array_equal(result, expected) + + @pytest.mark.parametrize("func", ["cumsum", "cumprod"]) + def test_accumulators_disallowed(self, func): + # GH#50297 + arr = DatetimeArray._from_sequence( + [ + "2000-01-01", + "2000-01-02", + ], + dtype="M8[ns]", + )._with_freq("infer") + with pytest.raises(TypeError, match=f"Accumulation {func}"): + arr._accumulate(func) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/datetimes/test_reductions.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/datetimes/test_reductions.py new file mode 100644 index 0000000000000000000000000000000000000000..a941546b13a567b705f61a3a667119cd55a2f0e4 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/datetimes/test_reductions.py @@ -0,0 +1,183 @@ +import numpy as np +import pytest + +from pandas.core.dtypes.dtypes import DatetimeTZDtype + +import pandas as pd +from pandas import NaT +import pandas._testing as tm +from pandas.core.arrays import DatetimeArray + + +class TestReductions: + @pytest.fixture(params=["s", "ms", "us", "ns"]) + def unit(self, request): + return request.param + + @pytest.fixture + def arr1d(self, tz_naive_fixture): + """Fixture returning DatetimeArray with parametrized timezones""" + tz = tz_naive_fixture + dtype = DatetimeTZDtype(tz=tz) if tz is not None else np.dtype("M8[ns]") + arr = DatetimeArray._from_sequence( + [ + "2000-01-03", + "2000-01-03", + "NaT", + "2000-01-02", + "2000-01-05", + "2000-01-04", + ], + dtype=dtype, + ) + return arr + + def test_min_max(self, arr1d, unit): + arr = arr1d + arr = arr.as_unit(unit) + tz = arr.tz + + result = arr.min() + expected = pd.Timestamp("2000-01-02", tz=tz).as_unit(unit) + assert result == expected + assert result.unit == expected.unit + + result = arr.max() + expected = pd.Timestamp("2000-01-05", tz=tz).as_unit(unit) + assert result == expected + assert result.unit == expected.unit + + result = arr.min(skipna=False) + assert result is NaT + + result = arr.max(skipna=False) + assert result is NaT + + @pytest.mark.parametrize("tz", [None, "US/Central"]) + @pytest.mark.parametrize("skipna", [True, False]) + def test_min_max_empty(self, skipna, tz): + dtype = DatetimeTZDtype(tz=tz) if tz is not None else np.dtype("M8[ns]") + arr = DatetimeArray._from_sequence([], dtype=dtype) + result = arr.min(skipna=skipna) + assert result is NaT + + result = arr.max(skipna=skipna) + assert result is NaT + + @pytest.mark.parametrize("tz", [None, "US/Central"]) + @pytest.mark.parametrize("skipna", [True, False]) + def test_median_empty(self, skipna, tz): + dtype = DatetimeTZDtype(tz=tz) if tz is not None else np.dtype("M8[ns]") + arr = DatetimeArray._from_sequence([], dtype=dtype) + result = arr.median(skipna=skipna) + assert result is NaT + + arr = arr.reshape(0, 3) + result = arr.median(axis=0, skipna=skipna) + expected = type(arr)._from_sequence([NaT, NaT, NaT], dtype=arr.dtype) + tm.assert_equal(result, expected) + + result = arr.median(axis=1, skipna=skipna) + expected = type(arr)._from_sequence([], dtype=arr.dtype) + tm.assert_equal(result, expected) + + def test_median(self, arr1d): + arr = arr1d + + result = arr.median() + assert result == arr[0] + result = arr.median(skipna=False) + assert result is NaT + + result = arr.dropna().median(skipna=False) + assert result == arr[0] + + result = arr.median(axis=0) + assert result == arr[0] + + def test_median_axis(self, arr1d): + arr = arr1d + assert arr.median(axis=0) == arr.median() + assert arr.median(axis=0, skipna=False) is NaT + + msg = r"abs\(axis\) must be less than ndim" + with pytest.raises(ValueError, match=msg): + arr.median(axis=1) + + @pytest.mark.filterwarnings("ignore:All-NaN slice encountered:RuntimeWarning") + def test_median_2d(self, arr1d): + arr = arr1d.reshape(1, -1) + + # axis = None + assert arr.median() == arr1d.median() + assert arr.median(skipna=False) is NaT + + # axis = 0 + result = arr.median(axis=0) + expected = arr1d + tm.assert_equal(result, expected) + + # Since column 3 is all-NaT, we get NaT there with or without skipna + result = arr.median(axis=0, skipna=False) + expected = arr1d + tm.assert_equal(result, expected) + + # axis = 1 + result = arr.median(axis=1) + expected = type(arr)._from_sequence([arr1d.median()], dtype=arr.dtype) + tm.assert_equal(result, expected) + + result = arr.median(axis=1, skipna=False) + expected = type(arr)._from_sequence([NaT], dtype=arr.dtype) + tm.assert_equal(result, expected) + + def test_mean(self, arr1d): + arr = arr1d + + # manually verified result + expected = arr[0] + 0.4 * pd.Timedelta(days=1) + + result = arr.mean() + assert result == expected + result = arr.mean(skipna=False) + assert result is NaT + + result = arr.dropna().mean(skipna=False) + assert result == expected + + result = arr.mean(axis=0) + assert result == expected + + def test_mean_2d(self): + dti = pd.date_range("2016-01-01", periods=6, tz="US/Pacific") + dta = dti._data.reshape(3, 2) + + result = dta.mean(axis=0) + expected = dta[1] + tm.assert_datetime_array_equal(result, expected) + + result = dta.mean(axis=1) + expected = dta[:, 0] + pd.Timedelta(hours=12) + tm.assert_datetime_array_equal(result, expected) + + result = dta.mean(axis=None) + expected = dti.mean() + assert result == expected + + @pytest.mark.parametrize("skipna", [True, False]) + def test_mean_empty(self, arr1d, skipna): + arr = arr1d[:0] + + assert arr.mean(skipna=skipna) is NaT + + arr2d = arr.reshape(0, 3) + result = arr2d.mean(axis=0, skipna=skipna) + expected = DatetimeArray._from_sequence([NaT, NaT, NaT], dtype=arr.dtype) + tm.assert_datetime_array_equal(result, expected) + + result = arr2d.mean(axis=1, skipna=skipna) + expected = arr # i.e. 1D, empty + tm.assert_datetime_array_equal(result, expected) + + result = arr2d.mean(axis=None, skipna=skipna) + assert result is NaT diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/floating/test_comparison.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/floating/test_comparison.py new file mode 100644 index 0000000000000000000000000000000000000000..a429649f1ce1dc10fc9610faa73a81dd94255b37 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/floating/test_comparison.py @@ -0,0 +1,65 @@ +import numpy as np +import pytest + +import pandas as pd +import pandas._testing as tm +from pandas.core.arrays import FloatingArray +from pandas.tests.arrays.masked_shared import ( + ComparisonOps, + NumericOps, +) + + +class TestComparisonOps(NumericOps, ComparisonOps): + @pytest.mark.parametrize("other", [True, False, pd.NA, -1.0, 0.0, 1]) + def test_scalar(self, other, comparison_op, dtype): + ComparisonOps.test_scalar(self, other, comparison_op, dtype) + + def test_compare_with_integerarray(self, comparison_op): + op = comparison_op + a = pd.array([0, 1, None] * 3, dtype="Int64") + b = pd.array([0] * 3 + [1] * 3 + [None] * 3, dtype="Float64") + other = b.astype("Int64") + expected = op(a, other) + result = op(a, b) + tm.assert_extension_array_equal(result, expected) + expected = op(other, a) + result = op(b, a) + tm.assert_extension_array_equal(result, expected) + + +def test_equals(): + # GH-30652 + # equals is generally tested in /tests/extension/base/methods, but this + # specifically tests that two arrays of the same class but different dtype + # do not evaluate equal + a1 = pd.array([1, 2, None], dtype="Float64") + a2 = pd.array([1, 2, None], dtype="Float32") + assert a1.equals(a2) is False + + +def test_equals_nan_vs_na(): + # GH#44382 + + mask = np.zeros(3, dtype=bool) + data = np.array([1.0, np.nan, 3.0], dtype=np.float64) + + left = FloatingArray(data, mask) + assert left.equals(left) + tm.assert_extension_array_equal(left, left) + + assert left.equals(left.copy()) + assert left.equals(FloatingArray(data.copy(), mask.copy())) + + mask2 = np.array([False, True, False], dtype=bool) + data2 = np.array([1.0, 2.0, 3.0], dtype=np.float64) + right = FloatingArray(data2, mask2) + assert right.equals(right) + tm.assert_extension_array_equal(right, right) + + assert not left.equals(right) + + # with mask[1] = True, the only difference is data[1], which should + # not matter for equals + mask[1] = True + assert left.equals(right) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/floating/test_concat.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/floating/test_concat.py new file mode 100644 index 0000000000000000000000000000000000000000..2174a834aa959b88d899971f83247258a94476e3 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/floating/test_concat.py @@ -0,0 +1,20 @@ +import pytest + +import pandas as pd +import pandas._testing as tm + + +@pytest.mark.parametrize( + "to_concat_dtypes, result_dtype", + [ + (["Float64", "Float64"], "Float64"), + (["Float32", "Float64"], "Float64"), + (["Float32", "Float32"], "Float32"), + ], +) +def test_concat_series(to_concat_dtypes, result_dtype): + result = pd.concat([pd.Series([1, 2, pd.NA], dtype=t) for t in to_concat_dtypes]) + expected = pd.concat([pd.Series([1, 2, pd.NA], dtype=object)] * 2).astype( + result_dtype + ) + tm.assert_series_equal(result, expected) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/floating/test_contains.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/floating/test_contains.py new file mode 100644 index 0000000000000000000000000000000000000000..956642697bf3285e5c661c43047a5f0dafa83144 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/floating/test_contains.py @@ -0,0 +1,12 @@ +import numpy as np + +import pandas as pd + + +def test_contains_nan(): + # GH#52840 + arr = pd.array(range(5)) / 0 + + assert np.isnan(arr._data[0]) + assert not arr.isna()[0] + assert np.nan in arr diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/floating/test_to_numpy.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/floating/test_to_numpy.py new file mode 100644 index 0000000000000000000000000000000000000000..e954cecba417afd71059a35f7506c650eb780373 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/floating/test_to_numpy.py @@ -0,0 +1,132 @@ +import numpy as np +import pytest + +import pandas as pd +import pandas._testing as tm +from pandas.core.arrays import FloatingArray + + +@pytest.mark.parametrize("box", [True, False], ids=["series", "array"]) +def test_to_numpy(box): + con = pd.Series if box else pd.array + + # default (with or without missing values) -> object dtype + arr = con([0.1, 0.2, 0.3], dtype="Float64") + result = arr.to_numpy() + expected = np.array([0.1, 0.2, 0.3], dtype="float64") + tm.assert_numpy_array_equal(result, expected) + + arr = con([0.1, 0.2, None], dtype="Float64") + result = arr.to_numpy() + expected = np.array([0.1, 0.2, np.nan], dtype="float64") + tm.assert_numpy_array_equal(result, expected) + + +@pytest.mark.parametrize("box", [True, False], ids=["series", "array"]) +def test_to_numpy_float(box): + con = pd.Series if box else pd.array + + # no missing values -> can convert to float, otherwise raises + arr = con([0.1, 0.2, 0.3], dtype="Float64") + result = arr.to_numpy(dtype="float64") + expected = np.array([0.1, 0.2, 0.3], dtype="float64") + tm.assert_numpy_array_equal(result, expected) + + arr = con([0.1, 0.2, None], dtype="Float64") + result = arr.to_numpy(dtype="float64") + expected = np.array([0.1, 0.2, np.nan], dtype="float64") + tm.assert_numpy_array_equal(result, expected) + + result = arr.to_numpy(dtype="float64", na_value=np.nan) + expected = np.array([0.1, 0.2, np.nan], dtype="float64") + tm.assert_numpy_array_equal(result, expected) + + +@pytest.mark.parametrize("box", [True, False], ids=["series", "array"]) +def test_to_numpy_int(box): + con = pd.Series if box else pd.array + + # no missing values -> can convert to int, otherwise raises + arr = con([1.0, 2.0, 3.0], dtype="Float64") + result = arr.to_numpy(dtype="int64") + expected = np.array([1, 2, 3], dtype="int64") + tm.assert_numpy_array_equal(result, expected) + + arr = con([1.0, 2.0, None], dtype="Float64") + with pytest.raises(ValueError, match="cannot convert to 'int64'-dtype"): + result = arr.to_numpy(dtype="int64") + + # automatic casting (floors the values) + arr = con([0.1, 0.9, 1.1], dtype="Float64") + result = arr.to_numpy(dtype="int64") + expected = np.array([0, 0, 1], dtype="int64") + tm.assert_numpy_array_equal(result, expected) + + +@pytest.mark.parametrize("box", [True, False], ids=["series", "array"]) +def test_to_numpy_na_value(box): + con = pd.Series if box else pd.array + + arr = con([0.0, 1.0, None], dtype="Float64") + result = arr.to_numpy(dtype=object, na_value=None) + expected = np.array([0.0, 1.0, None], dtype="object") + tm.assert_numpy_array_equal(result, expected) + + result = arr.to_numpy(dtype=bool, na_value=False) + expected = np.array([False, True, False], dtype="bool") + tm.assert_numpy_array_equal(result, expected) + + result = arr.to_numpy(dtype="int64", na_value=-99) + expected = np.array([0, 1, -99], dtype="int64") + tm.assert_numpy_array_equal(result, expected) + + +def test_to_numpy_na_value_with_nan(): + # array with both NaN and NA -> only fill NA with `na_value` + arr = FloatingArray(np.array([0.0, np.nan, 0.0]), np.array([False, False, True])) + result = arr.to_numpy(dtype="float64", na_value=-1) + expected = np.array([0.0, np.nan, -1.0], dtype="float64") + tm.assert_numpy_array_equal(result, expected) + + +@pytest.mark.parametrize("dtype", ["float64", "float32", "int32", "int64", "bool"]) +@pytest.mark.parametrize("box", [True, False], ids=["series", "array"]) +def test_to_numpy_dtype(box, dtype): + con = pd.Series if box else pd.array + arr = con([0.0, 1.0], dtype="Float64") + + result = arr.to_numpy(dtype=dtype) + expected = np.array([0, 1], dtype=dtype) + tm.assert_numpy_array_equal(result, expected) + + +@pytest.mark.parametrize("dtype", ["int32", "int64", "bool"]) +@pytest.mark.parametrize("box", [True, False], ids=["series", "array"]) +def test_to_numpy_na_raises(box, dtype): + con = pd.Series if box else pd.array + arr = con([0.0, 1.0, None], dtype="Float64") + with pytest.raises(ValueError, match=dtype): + arr.to_numpy(dtype=dtype) + + +@pytest.mark.parametrize("box", [True, False], ids=["series", "array"]) +def test_to_numpy_string(box, dtype): + con = pd.Series if box else pd.array + arr = con([0.0, 1.0, None], dtype="Float64") + + result = arr.to_numpy(dtype="str") + expected = np.array([0.0, 1.0, pd.NA], dtype=f"{tm.ENDIAN}U32") + tm.assert_numpy_array_equal(result, expected) + + +def test_to_numpy_copy(): + # to_numpy can be zero-copy if no missing values + arr = pd.array([0.1, 0.2, 0.3], dtype="Float64") + result = arr.to_numpy(dtype="float64") + result[0] = 10 + tm.assert_extension_array_equal(arr, pd.array([10, 0.2, 0.3], dtype="Float64")) + + arr = pd.array([0.1, 0.2, 0.3], dtype="Float64") + result = arr.to_numpy(dtype="float64", copy=True) + result[0] = 10 + tm.assert_extension_array_equal(arr, pd.array([0.1, 0.2, 0.3], dtype="Float64")) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/integer/__init__.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/integer/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/integer/conftest.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/integer/conftest.py new file mode 100644 index 0000000000000000000000000000000000000000..f73400dfe689e91c4c2b457c4be1a0a41380fd6a --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/integer/conftest.py @@ -0,0 +1,68 @@ +import numpy as np +import pytest + +import pandas as pd +from pandas.core.arrays.integer import ( + Int8Dtype, + Int16Dtype, + Int32Dtype, + Int64Dtype, + UInt8Dtype, + UInt16Dtype, + UInt32Dtype, + UInt64Dtype, +) + + +@pytest.fixture( + params=[ + Int8Dtype, + Int16Dtype, + Int32Dtype, + Int64Dtype, + UInt8Dtype, + UInt16Dtype, + UInt32Dtype, + UInt64Dtype, + ] +) +def dtype(request): + """Parametrized fixture returning integer 'dtype'""" + return request.param() + + +@pytest.fixture +def data(dtype): + """ + Fixture returning 'data' array with valid and missing values according to + parametrized integer 'dtype'. + + Used to test dtype conversion with and without missing values. + """ + return pd.array( + list(range(8)) + [np.nan] + list(range(10, 98)) + [np.nan] + [99, 100], + dtype=dtype, + ) + + +@pytest.fixture +def data_missing(dtype): + """ + Fixture returning array with exactly one NaN and one valid integer, + according to parametrized integer 'dtype'. + + Used to test dtype conversion with and without missing values. + """ + return pd.array([np.nan, 1], dtype=dtype) + + +@pytest.fixture(params=["data", "data_missing"]) +def all_data(request, data, data_missing): + """Parametrized fixture returning 'data' or 'data_missing' integer arrays. + + Used to test dtype conversion with and without missing values. + """ + if request.param == "data": + return data + elif request.param == "data_missing": + return data_missing diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/integer/test_arithmetic.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/integer/test_arithmetic.py new file mode 100644 index 0000000000000000000000000000000000000000..8acd298f37a0795851c4c37bceff5a8222f521ed --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/integer/test_arithmetic.py @@ -0,0 +1,385 @@ +import operator + +import numpy as np +import pytest + +import pandas as pd +import pandas._testing as tm +from pandas.core import ops +from pandas.core.arrays import FloatingArray + +# Basic test for the arithmetic array ops +# ----------------------------------------------------------------------------- + + +@pytest.mark.parametrize( + "opname, exp", + [("add", [1, 3, None, None, 9]), ("mul", [0, 2, None, None, 20])], + ids=["add", "mul"], +) +def test_add_mul(dtype, opname, exp): + a = pd.array([0, 1, None, 3, 4], dtype=dtype) + b = pd.array([1, 2, 3, None, 5], dtype=dtype) + + # array / array + expected = pd.array(exp, dtype=dtype) + + op = getattr(operator, opname) + result = op(a, b) + tm.assert_extension_array_equal(result, expected) + + op = getattr(ops, "r" + opname) + result = op(a, b) + tm.assert_extension_array_equal(result, expected) + + +def test_sub(dtype): + a = pd.array([1, 2, 3, None, 5], dtype=dtype) + b = pd.array([0, 1, None, 3, 4], dtype=dtype) + + result = a - b + expected = pd.array([1, 1, None, None, 1], dtype=dtype) + tm.assert_extension_array_equal(result, expected) + + +def test_div(dtype): + a = pd.array([1, 2, 3, None, 5], dtype=dtype) + b = pd.array([0, 1, None, 3, 4], dtype=dtype) + + result = a / b + expected = pd.array([np.inf, 2, None, None, 1.25], dtype="Float64") + tm.assert_extension_array_equal(result, expected) + + +@pytest.mark.parametrize("zero, negative", [(0, False), (0.0, False), (-0.0, True)]) +def test_divide_by_zero(zero, negative): + # https://github.com/pandas-dev/pandas/issues/27398, GH#22793 + a = pd.array([0, 1, -1, None], dtype="Int64") + result = a / zero + expected = FloatingArray( + np.array([np.nan, np.inf, -np.inf, 1], dtype="float64"), + np.array([False, False, False, True]), + ) + if negative: + expected *= -1 + tm.assert_extension_array_equal(result, expected) + + +def test_floordiv(dtype): + a = pd.array([1, 2, 3, None, 5], dtype=dtype) + b = pd.array([0, 1, None, 3, 4], dtype=dtype) + + result = a // b + # Series op sets 1//0 to np.inf, which IntegerArray does not do (yet) + expected = pd.array([0, 2, None, None, 1], dtype=dtype) + tm.assert_extension_array_equal(result, expected) + + +def test_floordiv_by_int_zero_no_mask(any_int_ea_dtype): + # GH 48223: Aligns with non-masked floordiv + # but differs from numpy + # https://github.com/pandas-dev/pandas/issues/30188#issuecomment-564452740 + ser = pd.Series([0, 1], dtype=any_int_ea_dtype) + result = 1 // ser + expected = pd.Series([np.inf, 1.0], dtype="Float64") + tm.assert_series_equal(result, expected) + + ser_non_nullable = ser.astype(ser.dtype.numpy_dtype) + result = 1 // ser_non_nullable + expected = expected.astype(np.float64) + tm.assert_series_equal(result, expected) + + +def test_mod(dtype): + a = pd.array([1, 2, 3, None, 5], dtype=dtype) + b = pd.array([0, 1, None, 3, 4], dtype=dtype) + + result = a % b + expected = pd.array([0, 0, None, None, 1], dtype=dtype) + tm.assert_extension_array_equal(result, expected) + + +def test_pow_scalar(): + a = pd.array([-1, 0, 1, None, 2], dtype="Int64") + result = a**0 + expected = pd.array([1, 1, 1, 1, 1], dtype="Int64") + tm.assert_extension_array_equal(result, expected) + + result = a**1 + expected = pd.array([-1, 0, 1, None, 2], dtype="Int64") + tm.assert_extension_array_equal(result, expected) + + result = a**pd.NA + expected = pd.array([None, None, 1, None, None], dtype="Int64") + tm.assert_extension_array_equal(result, expected) + + result = a**np.nan + expected = FloatingArray( + np.array([np.nan, np.nan, 1, np.nan, np.nan], dtype="float64"), + np.array([False, False, False, True, False]), + ) + tm.assert_extension_array_equal(result, expected) + + # reversed + a = a[1:] # Can't raise integers to negative powers. + + result = 0**a + expected = pd.array([1, 0, None, 0], dtype="Int64") + tm.assert_extension_array_equal(result, expected) + + result = 1**a + expected = pd.array([1, 1, 1, 1], dtype="Int64") + tm.assert_extension_array_equal(result, expected) + + result = pd.NA**a + expected = pd.array([1, None, None, None], dtype="Int64") + tm.assert_extension_array_equal(result, expected) + + result = np.nan**a + expected = FloatingArray( + np.array([1, np.nan, np.nan, np.nan], dtype="float64"), + np.array([False, False, True, False]), + ) + tm.assert_extension_array_equal(result, expected) + + +def test_pow_array(): + a = pd.array([0, 0, 0, 1, 1, 1, None, None, None]) + b = pd.array([0, 1, None, 0, 1, None, 0, 1, None]) + result = a**b + expected = pd.array([1, 0, None, 1, 1, 1, 1, None, None]) + tm.assert_extension_array_equal(result, expected) + + +def test_rpow_one_to_na(): + # https://github.com/pandas-dev/pandas/issues/22022 + # https://github.com/pandas-dev/pandas/issues/29997 + arr = pd.array([np.nan, np.nan], dtype="Int64") + result = np.array([1.0, 2.0]) ** arr + expected = pd.array([1.0, np.nan], dtype="Float64") + tm.assert_extension_array_equal(result, expected) + + +@pytest.mark.parametrize("other", [0, 0.5]) +def test_numpy_zero_dim_ndarray(other): + arr = pd.array([1, None, 2]) + result = arr + np.array(other) + expected = arr + other + tm.assert_equal(result, expected) + + +# Test generic characteristics / errors +# ----------------------------------------------------------------------------- + + +def test_error_invalid_values(data, all_arithmetic_operators, using_infer_string): + op = all_arithmetic_operators + s = pd.Series(data) + ops = getattr(s, op) + + if using_infer_string: + import pyarrow as pa + + errs = (TypeError, pa.lib.ArrowNotImplementedError, NotImplementedError) + else: + errs = TypeError + + # invalid scalars + msg = "|".join( + [ + r"can only perform ops with numeric values", + r"IntegerArray cannot perform the operation mod", + r"unsupported operand type", + r"can only concatenate str \(not \"int\"\) to str", + "not all arguments converted during string", + "ufunc '.*' not supported for the input types, and the inputs could not", + "ufunc '.*' did not contain a loop with signature matching types", + "Addition/subtraction of integers and integer-arrays with Timestamp", + "has no kernel", + "not implemented", + "The 'out' kwarg is necessary. Use numpy.strings.multiply without it.", + ] + ) + with pytest.raises(errs, match=msg): + ops("foo") + with pytest.raises(errs, match=msg): + ops(pd.Timestamp("20180101")) + + # invalid array-likes + str_ser = pd.Series("foo", index=s.index) + # with pytest.raises(TypeError, match=msg): + if ( + all_arithmetic_operators + in [ + "__mul__", + "__rmul__", + ] + and not using_infer_string + ): # (data[~data.isna()] >= 0).all(): + res = ops(str_ser) + expected = pd.Series(["foo" * x for x in data], index=s.index) + expected = expected.fillna(np.nan) + # TODO: doing this fillna to keep tests passing as we make + # assert_almost_equal stricter, but the expected with pd.NA seems + # more-correct than np.nan here. + tm.assert_series_equal(res, expected) + else: + with pytest.raises(errs, match=msg): + ops(str_ser) + + msg = "|".join( + [ + "can only perform ops with numeric values", + "cannot perform .* with this index type: DatetimeArray", + "Addition/subtraction of integers and integer-arrays " + "with DatetimeArray is no longer supported. *", + "unsupported operand type", + r"can only concatenate str \(not \"int\"\) to str", + "not all arguments converted during string", + "cannot subtract DatetimeArray from ndarray", + "has no kernel", + "not implemented", + ] + ) + with pytest.raises(errs, match=msg): + ops(pd.Series(pd.date_range("20180101", periods=len(s)))) + + +# Various +# ----------------------------------------------------------------------------- + + +# TODO test unsigned overflow + + +def test_arith_coerce_scalar(data, all_arithmetic_operators): + op = tm.get_op_from_name(all_arithmetic_operators) + s = pd.Series(data) + other = 0.01 + + result = op(s, other) + expected = op(s.astype(float), other) + expected = expected.astype("Float64") + + # rmod results in NaN that wasn't NA in original nullable Series -> unmask it + if all_arithmetic_operators == "__rmod__": + mask = (s == 0).fillna(False).to_numpy(bool) + expected.array._mask[mask] = False + + tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize("other", [1.0, np.array(1.0)]) +def test_arithmetic_conversion(all_arithmetic_operators, other): + # if we have a float operand we should have a float result + # if that is equal to an integer + op = tm.get_op_from_name(all_arithmetic_operators) + + s = pd.Series([1, 2, 3], dtype="Int64") + result = op(s, other) + assert result.dtype == "Float64" + + +def test_cross_type_arithmetic(): + df = pd.DataFrame( + { + "A": pd.Series([1, 2, np.nan], dtype="Int64"), + "B": pd.Series([1, np.nan, 3], dtype="UInt8"), + "C": [1, 2, 3], + } + ) + + result = df.A + df.C + expected = pd.Series([2, 4, np.nan], dtype="Int64") + tm.assert_series_equal(result, expected) + + result = (df.A + df.C) * 3 == 12 + expected = pd.Series([False, True, None], dtype="boolean") + tm.assert_series_equal(result, expected) + + result = df.A + df.B + expected = pd.Series([2, np.nan, np.nan], dtype="Int64") + tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize("op", ["mean"]) +def test_reduce_to_float(op): + # some reduce ops always return float, even if the result + # is a rounded number + df = pd.DataFrame( + { + "A": ["a", "b", "b"], + "B": [1, None, 3], + "C": pd.array([1, None, 3], dtype="Int64"), + } + ) + + # op + result = getattr(df.C, op)() + assert isinstance(result, float) + + # groupby + result = getattr(df.groupby("A"), op)() + + expected = pd.DataFrame( + {"B": np.array([1.0, 3.0]), "C": pd.array([1, 3], dtype="Float64")}, + index=pd.Index(["a", "b"], name="A"), + ) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize( + "source, neg_target, abs_target", + [ + ([1, 2, 3], [-1, -2, -3], [1, 2, 3]), + ([1, 2, None], [-1, -2, None], [1, 2, None]), + ([-1, 0, 1], [1, 0, -1], [1, 0, 1]), + ], +) +def test_unary_int_operators(any_signed_int_ea_dtype, source, neg_target, abs_target): + dtype = any_signed_int_ea_dtype + arr = pd.array(source, dtype=dtype) + neg_result, pos_result, abs_result = -arr, +arr, abs(arr) + neg_target = pd.array(neg_target, dtype=dtype) + abs_target = pd.array(abs_target, dtype=dtype) + + tm.assert_extension_array_equal(neg_result, neg_target) + tm.assert_extension_array_equal(pos_result, arr) + assert not tm.shares_memory(pos_result, arr) + tm.assert_extension_array_equal(abs_result, abs_target) + + +def test_values_multiplying_large_series_by_NA(): + # GH#33701 + + result = pd.NA * pd.Series(np.zeros(10001)) + expected = pd.Series([pd.NA] * 10001) + + tm.assert_series_equal(result, expected) + + +def test_bitwise(dtype): + left = pd.array([1, None, 3, 4], dtype=dtype) + right = pd.array([None, 3, 5, 4], dtype=dtype) + + result = left | right + expected = pd.array([None, None, 3 | 5, 4 | 4], dtype=dtype) + tm.assert_extension_array_equal(result, expected) + + result = left & right + expected = pd.array([None, None, 3 & 5, 4 & 4], dtype=dtype) + tm.assert_extension_array_equal(result, expected) + + result = left ^ right + expected = pd.array([None, None, 3 ^ 5, 4 ^ 4], dtype=dtype) + tm.assert_extension_array_equal(result, expected) + + # TODO: desired behavior when operating with boolean? defer? + + floats = right.astype("Float64") + with pytest.raises(TypeError, match="unsupported operand type"): + left | floats + with pytest.raises(TypeError, match="unsupported operand type"): + left & floats + with pytest.raises(TypeError, match="unsupported operand type"): + left ^ floats diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/integer/test_comparison.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/integer/test_comparison.py new file mode 100644 index 0000000000000000000000000000000000000000..568b0b087bf1db9610960dba12ea2e0bab8f1729 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/integer/test_comparison.py @@ -0,0 +1,39 @@ +import pytest + +import pandas as pd +import pandas._testing as tm +from pandas.tests.arrays.masked_shared import ( + ComparisonOps, + NumericOps, +) + + +class TestComparisonOps(NumericOps, ComparisonOps): + @pytest.mark.parametrize("other", [True, False, pd.NA, -1, 0, 1]) + def test_scalar(self, other, comparison_op, dtype): + ComparisonOps.test_scalar(self, other, comparison_op, dtype) + + def test_compare_to_int(self, dtype, comparison_op): + # GH 28930 + op_name = f"__{comparison_op.__name__}__" + s1 = pd.Series([1, None, 3], dtype=dtype) + s2 = pd.Series([1, None, 3], dtype="float") + + method = getattr(s1, op_name) + result = method(2) + + method = getattr(s2, op_name) + expected = method(2).astype("boolean") + expected[s2.isna()] = pd.NA + + tm.assert_series_equal(result, expected) + + +def test_equals(): + # GH-30652 + # equals is generally tested in /tests/extension/base/methods, but this + # specifically tests that two arrays of the same class but different dtype + # do not evaluate equal + a1 = pd.array([1, 2, None], dtype="Int64") + a2 = pd.array([1, 2, None], dtype="Int32") + assert a1.equals(a2) is False diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/integer/test_concat.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/integer/test_concat.py new file mode 100644 index 0000000000000000000000000000000000000000..feba574da548fd597c25103f67821145bccec9ed --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/integer/test_concat.py @@ -0,0 +1,69 @@ +import numpy as np +import pytest + +import pandas as pd +import pandas._testing as tm + + +@pytest.mark.parametrize( + "to_concat_dtypes, result_dtype", + [ + (["Int64", "Int64"], "Int64"), + (["UInt64", "UInt64"], "UInt64"), + (["Int8", "Int8"], "Int8"), + (["Int8", "Int16"], "Int16"), + (["UInt8", "Int8"], "Int16"), + (["Int32", "UInt32"], "Int64"), + (["Int64", "UInt64"], "Float64"), + (["Int64", "boolean"], "object"), + (["UInt8", "boolean"], "object"), + ], +) +def test_concat_series(to_concat_dtypes, result_dtype): + # we expect the same dtypes as we would get with non-masked inputs, + # just masked where available. + + result = pd.concat([pd.Series([0, 1, pd.NA], dtype=t) for t in to_concat_dtypes]) + expected = pd.concat([pd.Series([0, 1, pd.NA], dtype=object)] * 2).astype( + result_dtype + ) + tm.assert_series_equal(result, expected) + + # order doesn't matter for result + result = pd.concat( + [pd.Series([0, 1, pd.NA], dtype=t) for t in to_concat_dtypes[::-1]] + ) + expected = pd.concat([pd.Series([0, 1, pd.NA], dtype=object)] * 2).astype( + result_dtype + ) + tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize( + "to_concat_dtypes, result_dtype", + [ + (["Int64", "int64"], "Int64"), + (["UInt64", "uint64"], "UInt64"), + (["Int8", "int8"], "Int8"), + (["Int8", "int16"], "Int16"), + (["UInt8", "int8"], "Int16"), + (["Int32", "uint32"], "Int64"), + (["Int64", "uint64"], "Float64"), + (["Int64", "bool"], "object"), + (["UInt8", "bool"], "object"), + ], +) +def test_concat_series_with_numpy(to_concat_dtypes, result_dtype): + # we expect the same dtypes as we would get with non-masked inputs, + # just masked where available. + + s1 = pd.Series([0, 1, pd.NA], dtype=to_concat_dtypes[0]) + s2 = pd.Series(np.array([0, 1], dtype=to_concat_dtypes[1])) + result = pd.concat([s1, s2], ignore_index=True) + expected = pd.Series([0, 1, pd.NA, 0, 1], dtype=object).astype(result_dtype) + tm.assert_series_equal(result, expected) + + # order doesn't matter for result + result = pd.concat([s2, s1], ignore_index=True) + expected = pd.Series([0, 1, 0, 1, pd.NA], dtype=object).astype(result_dtype) + tm.assert_series_equal(result, expected) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/integer/test_construction.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/integer/test_construction.py new file mode 100644 index 0000000000000000000000000000000000000000..64fe40e53a9d287b6240a908b7ed9d0cf7ec1396 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/integer/test_construction.py @@ -0,0 +1,245 @@ +import numpy as np +import pytest + +import pandas as pd +import pandas._testing as tm +from pandas.api.types import is_integer +from pandas.core.arrays import IntegerArray +from pandas.core.arrays.integer import ( + Int8Dtype, + Int32Dtype, + Int64Dtype, +) + + +@pytest.fixture(params=[pd.array, IntegerArray._from_sequence]) +def constructor(request): + """Fixture returning parametrized IntegerArray from given sequence. + + Used to test dtype conversions. + """ + return request.param + + +def test_uses_pandas_na(): + a = pd.array([1, None], dtype=Int64Dtype()) + assert a[1] is pd.NA + + +def test_from_dtype_from_float(data): + # construct from our dtype & string dtype + dtype = data.dtype + + # from float + expected = pd.Series(data) + result = pd.Series(data.to_numpy(na_value=np.nan, dtype="float"), dtype=str(dtype)) + tm.assert_series_equal(result, expected) + + # from int / list + expected = pd.Series(data) + result = pd.Series(np.array(data).tolist(), dtype=str(dtype)) + tm.assert_series_equal(result, expected) + + # from int / array + expected = pd.Series(data).dropna().reset_index(drop=True) + dropped = np.array(data.dropna()).astype(np.dtype(dtype.type)) + result = pd.Series(dropped, dtype=str(dtype)) + tm.assert_series_equal(result, expected) + + +def test_conversions(data_missing): + # astype to object series + df = pd.DataFrame({"A": data_missing}) + result = df["A"].astype("object") + expected = pd.Series(np.array([pd.NA, 1], dtype=object), name="A") + tm.assert_series_equal(result, expected) + + # convert to object ndarray + # we assert that we are exactly equal + # including type conversions of scalars + result = df["A"].astype("object").values + expected = np.array([pd.NA, 1], dtype=object) + tm.assert_numpy_array_equal(result, expected) + + for r, e in zip(result, expected): + if pd.isnull(r): + assert pd.isnull(e) + elif is_integer(r): + assert r == e + assert is_integer(e) + else: + assert r == e + assert type(r) == type(e) + + +def test_integer_array_constructor(): + values = np.array([1, 2, 3, 4], dtype="int64") + mask = np.array([False, False, False, True], dtype="bool") + + result = IntegerArray(values, mask) + expected = pd.array([1, 2, 3, np.nan], dtype="Int64") + tm.assert_extension_array_equal(result, expected) + + msg = r".* should be .* numpy array. Use the 'pd.array' function instead" + with pytest.raises(TypeError, match=msg): + IntegerArray(values.tolist(), mask) + + with pytest.raises(TypeError, match=msg): + IntegerArray(values, mask.tolist()) + + with pytest.raises(TypeError, match=msg): + IntegerArray(values.astype(float), mask) + msg = r"__init__\(\) missing 1 required positional argument: 'mask'" + with pytest.raises(TypeError, match=msg): + IntegerArray(values) + + +def test_integer_array_constructor_copy(): + values = np.array([1, 2, 3, 4], dtype="int64") + mask = np.array([False, False, False, True], dtype="bool") + + result = IntegerArray(values, mask) + assert result._data is values + assert result._mask is mask + + result = IntegerArray(values, mask, copy=True) + assert result._data is not values + assert result._mask is not mask + + +@pytest.mark.parametrize( + "a, b", + [ + ([1, None], [1, np.nan]), + ([None], [np.nan]), + ([None, np.nan], [np.nan, np.nan]), + ([np.nan, np.nan], [np.nan, np.nan]), + ], +) +def test_to_integer_array_none_is_nan(a, b): + result = pd.array(a, dtype="Int64") + expected = pd.array(b, dtype="Int64") + tm.assert_extension_array_equal(result, expected) + + +@pytest.mark.parametrize( + "values", + [ + ["foo", "bar"], + "foo", + 1, + 1.0, + pd.date_range("20130101", periods=2), + np.array(["foo"]), + [[1, 2], [3, 4]], + [np.nan, {"a": 1}], + ], +) +def test_to_integer_array_error(values): + # error in converting existing arrays to IntegerArrays + msg = "|".join( + [ + r"cannot be converted to IntegerDtype", + r"invalid literal for int\(\) with base 10:", + r"values must be a 1D list-like", + r"Cannot pass scalar", + r"int\(\) argument must be a string", + ] + ) + with pytest.raises((ValueError, TypeError), match=msg): + pd.array(values, dtype="Int64") + + with pytest.raises((ValueError, TypeError), match=msg): + IntegerArray._from_sequence(values) + + +def test_to_integer_array_inferred_dtype(constructor): + # if values has dtype -> respect it + result = constructor(np.array([1, 2], dtype="int8")) + assert result.dtype == Int8Dtype() + result = constructor(np.array([1, 2], dtype="int32")) + assert result.dtype == Int32Dtype() + + # if values have no dtype -> always int64 + result = constructor([1, 2]) + assert result.dtype == Int64Dtype() + + +def test_to_integer_array_dtype_keyword(constructor): + result = constructor([1, 2], dtype="Int8") + assert result.dtype == Int8Dtype() + + # if values has dtype -> override it + result = constructor(np.array([1, 2], dtype="int8"), dtype="Int32") + assert result.dtype == Int32Dtype() + + +def test_to_integer_array_float(): + result = IntegerArray._from_sequence([1.0, 2.0], dtype="Int64") + expected = pd.array([1, 2], dtype="Int64") + tm.assert_extension_array_equal(result, expected) + + with pytest.raises(TypeError, match="cannot safely cast non-equivalent"): + IntegerArray._from_sequence([1.5, 2.0], dtype="Int64") + + # for float dtypes, the itemsize is not preserved + result = IntegerArray._from_sequence( + np.array([1.0, 2.0], dtype="float32"), dtype="Int64" + ) + assert result.dtype == Int64Dtype() + + +def test_to_integer_array_str(): + result = IntegerArray._from_sequence(["1", "2", None], dtype="Int64") + expected = pd.array([1, 2, np.nan], dtype="Int64") + tm.assert_extension_array_equal(result, expected) + + with pytest.raises( + ValueError, match=r"invalid literal for int\(\) with base 10: .*" + ): + IntegerArray._from_sequence(["1", "2", ""], dtype="Int64") + + with pytest.raises( + ValueError, match=r"invalid literal for int\(\) with base 10: .*" + ): + IntegerArray._from_sequence(["1.5", "2.0"], dtype="Int64") + + +@pytest.mark.parametrize( + "bool_values, int_values, target_dtype, expected_dtype", + [ + ([False, True], [0, 1], Int64Dtype(), Int64Dtype()), + ([False, True], [0, 1], "Int64", Int64Dtype()), + ([False, True, np.nan], [0, 1, np.nan], Int64Dtype(), Int64Dtype()), + ], +) +def test_to_integer_array_bool( + constructor, bool_values, int_values, target_dtype, expected_dtype +): + result = constructor(bool_values, dtype=target_dtype) + assert result.dtype == expected_dtype + expected = pd.array(int_values, dtype=target_dtype) + tm.assert_extension_array_equal(result, expected) + + +@pytest.mark.parametrize( + "values, to_dtype, result_dtype", + [ + (np.array([1], dtype="int64"), None, Int64Dtype), + (np.array([1, np.nan]), None, Int64Dtype), + (np.array([1, np.nan]), "int8", Int8Dtype), + ], +) +def test_to_integer_array(values, to_dtype, result_dtype): + # convert existing arrays to IntegerArrays + result = IntegerArray._from_sequence(values, dtype=to_dtype) + assert result.dtype == result_dtype() + expected = pd.array(values, dtype=result_dtype()) + tm.assert_extension_array_equal(result, expected) + + +def test_integer_array_from_boolean(): + # GH31104 + expected = pd.array(np.array([True, False]), dtype="Int64") + result = pd.array(np.array([True, False], dtype=object), dtype="Int64") + tm.assert_extension_array_equal(result, expected) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/integer/test_dtypes.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/integer/test_dtypes.py new file mode 100644 index 0000000000000000000000000000000000000000..8620763988e0676e00d8e61affa75848fe38dd48 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/integer/test_dtypes.py @@ -0,0 +1,294 @@ +import numpy as np +import pytest + +from pandas.core.dtypes.generic import ABCIndex + +import pandas as pd +import pandas._testing as tm +from pandas.core.arrays.integer import ( + Int8Dtype, + UInt32Dtype, +) + + +def test_dtypes(dtype): + # smoke tests on auto dtype construction + + if dtype.is_signed_integer: + assert np.dtype(dtype.type).kind == "i" + else: + assert np.dtype(dtype.type).kind == "u" + assert dtype.name is not None + + +@pytest.mark.parametrize("op", ["sum", "min", "max", "prod"]) +def test_preserve_dtypes(op): + # for ops that enable (mean would actually work here + # but generally it is a float return value) + df = pd.DataFrame( + { + "A": ["a", "b", "b"], + "B": [1, None, 3], + "C": pd.array([1, None, 3], dtype="Int64"), + } + ) + + # op + result = getattr(df.C, op)() + if op in {"sum", "prod", "min", "max"}: + assert isinstance(result, np.int64) + else: + assert isinstance(result, int) + + # groupby + result = getattr(df.groupby("A"), op)() + + expected = pd.DataFrame( + {"B": np.array([1.0, 3.0]), "C": pd.array([1, 3], dtype="Int64")}, + index=pd.Index(["a", "b"], name="A"), + ) + tm.assert_frame_equal(result, expected) + + +def test_astype_nansafe(): + # see gh-22343 + arr = pd.array([np.nan, 1, 2], dtype="Int8") + msg = "cannot convert NA to integer" + + with pytest.raises(ValueError, match=msg): + arr.astype("uint32") + + +@pytest.mark.parametrize("dropna", [True, False]) +def test_construct_index(all_data, dropna): + # ensure that we do not coerce to different Index dtype or non-index + + all_data = all_data[:10] + if dropna: + other = np.array(all_data[~all_data.isna()]) + else: + other = all_data + + result = pd.Index(pd.array(other, dtype=all_data.dtype)) + expected = pd.Index(other, dtype=all_data.dtype) + assert all_data.dtype == expected.dtype # dont coerce to object + + tm.assert_index_equal(result, expected) + + +@pytest.mark.parametrize("dropna", [True, False]) +def test_astype_index(all_data, dropna): + # as an int/uint index to Index + + all_data = all_data[:10] + if dropna: + other = all_data[~all_data.isna()] + else: + other = all_data + + dtype = all_data.dtype + idx = pd.Index(np.array(other)) + assert isinstance(idx, ABCIndex) + + result = idx.astype(dtype) + expected = idx.astype(object).astype(dtype) + tm.assert_index_equal(result, expected) + + +def test_astype(all_data): + all_data = all_data[:10] + + ints = all_data[~all_data.isna()] + mixed = all_data + dtype = Int8Dtype() + + # coerce to same type - ints + s = pd.Series(ints) + result = s.astype(all_data.dtype) + expected = pd.Series(ints) + tm.assert_series_equal(result, expected) + + # coerce to same other - ints + s = pd.Series(ints) + result = s.astype(dtype) + expected = pd.Series(ints, dtype=dtype) + tm.assert_series_equal(result, expected) + + # coerce to same numpy_dtype - ints + s = pd.Series(ints) + result = s.astype(all_data.dtype.numpy_dtype) + expected = pd.Series(ints._data.astype(all_data.dtype.numpy_dtype)) + tm.assert_series_equal(result, expected) + + # coerce to same type - mixed + s = pd.Series(mixed) + result = s.astype(all_data.dtype) + expected = pd.Series(mixed) + tm.assert_series_equal(result, expected) + + # coerce to same other - mixed + s = pd.Series(mixed) + result = s.astype(dtype) + expected = pd.Series(mixed, dtype=dtype) + tm.assert_series_equal(result, expected) + + # coerce to same numpy_dtype - mixed + s = pd.Series(mixed) + msg = "cannot convert NA to integer" + with pytest.raises(ValueError, match=msg): + s.astype(all_data.dtype.numpy_dtype) + + # coerce to object + s = pd.Series(mixed) + result = s.astype("object") + expected = pd.Series(np.asarray(mixed, dtype=object)) + tm.assert_series_equal(result, expected) + + +def test_astype_copy(): + arr = pd.array([1, 2, 3, None], dtype="Int64") + orig = pd.array([1, 2, 3, None], dtype="Int64") + + # copy=True -> ensure both data and mask are actual copies + result = arr.astype("Int64", copy=True) + assert result is not arr + assert not tm.shares_memory(result, arr) + result[0] = 10 + tm.assert_extension_array_equal(arr, orig) + result[0] = pd.NA + tm.assert_extension_array_equal(arr, orig) + + # copy=False + result = arr.astype("Int64", copy=False) + assert result is arr + assert np.shares_memory(result._data, arr._data) + assert np.shares_memory(result._mask, arr._mask) + result[0] = 10 + assert arr[0] == 10 + result[0] = pd.NA + assert arr[0] is pd.NA + + # astype to different dtype -> always needs a copy -> even with copy=False + # we need to ensure that also the mask is actually copied + arr = pd.array([1, 2, 3, None], dtype="Int64") + orig = pd.array([1, 2, 3, None], dtype="Int64") + + result = arr.astype("Int32", copy=False) + assert not tm.shares_memory(result, arr) + result[0] = 10 + tm.assert_extension_array_equal(arr, orig) + result[0] = pd.NA + tm.assert_extension_array_equal(arr, orig) + + +def test_astype_to_larger_numpy(): + a = pd.array([1, 2], dtype="Int32") + result = a.astype("int64") + expected = np.array([1, 2], dtype="int64") + tm.assert_numpy_array_equal(result, expected) + + a = pd.array([1, 2], dtype="UInt32") + result = a.astype("uint64") + expected = np.array([1, 2], dtype="uint64") + tm.assert_numpy_array_equal(result, expected) + + +@pytest.mark.parametrize("dtype", [Int8Dtype(), "Int8", UInt32Dtype(), "UInt32"]) +def test_astype_specific_casting(dtype): + s = pd.Series([1, 2, 3], dtype="Int64") + result = s.astype(dtype) + expected = pd.Series([1, 2, 3], dtype=dtype) + tm.assert_series_equal(result, expected) + + s = pd.Series([1, 2, 3, None], dtype="Int64") + result = s.astype(dtype) + expected = pd.Series([1, 2, 3, None], dtype=dtype) + tm.assert_series_equal(result, expected) + + +def test_astype_floating(): + arr = pd.array([1, 2, None], dtype="Int64") + result = arr.astype("Float64") + expected = pd.array([1.0, 2.0, None], dtype="Float64") + tm.assert_extension_array_equal(result, expected) + + +def test_astype_dt64(): + # GH#32435 + arr = pd.array([1, 2, 3, pd.NA]) * 10**9 + + result = arr.astype("datetime64[ns]") + + expected = np.array([1, 2, 3, "NaT"], dtype="M8[s]").astype("M8[ns]") + tm.assert_numpy_array_equal(result, expected) + + +def test_construct_cast_invalid(dtype): + msg = "cannot safely" + arr = [1.2, 2.3, 3.7] + with pytest.raises(TypeError, match=msg): + pd.array(arr, dtype=dtype) + + with pytest.raises(TypeError, match=msg): + pd.Series(arr).astype(dtype) + + arr = [1.2, 2.3, 3.7, np.nan] + with pytest.raises(TypeError, match=msg): + pd.array(arr, dtype=dtype) + + with pytest.raises(TypeError, match=msg): + pd.Series(arr).astype(dtype) + + +@pytest.mark.parametrize("in_series", [True, False]) +def test_to_numpy_na_nan(in_series): + a = pd.array([0, 1, None], dtype="Int64") + if in_series: + a = pd.Series(a) + + result = a.to_numpy(dtype="float64", na_value=np.nan) + expected = np.array([0.0, 1.0, np.nan], dtype="float64") + tm.assert_numpy_array_equal(result, expected) + + result = a.to_numpy(dtype="int64", na_value=-1) + expected = np.array([0, 1, -1], dtype="int64") + tm.assert_numpy_array_equal(result, expected) + + result = a.to_numpy(dtype="bool", na_value=False) + expected = np.array([False, True, False], dtype="bool") + tm.assert_numpy_array_equal(result, expected) + + +@pytest.mark.parametrize("in_series", [True, False]) +@pytest.mark.parametrize("dtype", ["int32", "int64", "bool"]) +def test_to_numpy_dtype(dtype, in_series): + a = pd.array([0, 1], dtype="Int64") + if in_series: + a = pd.Series(a) + + result = a.to_numpy(dtype=dtype) + expected = np.array([0, 1], dtype=dtype) + tm.assert_numpy_array_equal(result, expected) + + +@pytest.mark.parametrize("dtype", ["int64", "bool"]) +def test_to_numpy_na_raises(dtype): + a = pd.array([0, 1, None], dtype="Int64") + with pytest.raises(ValueError, match=dtype): + a.to_numpy(dtype=dtype) + + +def test_astype_str(): + a = pd.array([1, 2, None], dtype="Int64") + expected = np.array(["1", "2", ""], dtype=f"{tm.ENDIAN}U21") + + tm.assert_numpy_array_equal(a.astype(str), expected) + tm.assert_numpy_array_equal(a.astype("str"), expected) + + +def test_astype_boolean(): + # https://github.com/pandas-dev/pandas/issues/31102 + a = pd.array([1, 0, -1, 2, None], dtype="Int64") + result = a.astype("boolean") + expected = pd.array([True, False, True, True, None], dtype="boolean") + tm.assert_extension_array_equal(result, expected) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/integer/test_function.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/integer/test_function.py new file mode 100644 index 0000000000000000000000000000000000000000..d48b636a98feb94c5eaeee9eefbf5730fe9c6f79 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/integer/test_function.py @@ -0,0 +1,203 @@ +import numpy as np +import pytest + +import pandas as pd +import pandas._testing as tm +from pandas.core.arrays import FloatingArray + + +@pytest.mark.parametrize("ufunc", [np.abs, np.sign]) +# np.sign emits a warning with nans, +@pytest.mark.filterwarnings("ignore:invalid value encountered in sign:RuntimeWarning") +def test_ufuncs_single_int(ufunc): + a = pd.array([1, 2, -3, np.nan]) + result = ufunc(a) + expected = pd.array(ufunc(a.astype(float)), dtype="Int64") + tm.assert_extension_array_equal(result, expected) + + s = pd.Series(a) + result = ufunc(s) + expected = pd.Series(pd.array(ufunc(a.astype(float)), dtype="Int64")) + tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize("ufunc", [np.log, np.exp, np.sin, np.cos, np.sqrt]) +def test_ufuncs_single_float(ufunc): + a = pd.array([1, 2, -3, np.nan]) + with np.errstate(invalid="ignore"): + result = ufunc(a) + expected = FloatingArray(ufunc(a.astype(float)), mask=a._mask) + tm.assert_extension_array_equal(result, expected) + + s = pd.Series(a) + with np.errstate(invalid="ignore"): + result = ufunc(s) + expected = pd.Series(expected) + tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize("ufunc", [np.add, np.subtract]) +def test_ufuncs_binary_int(ufunc): + # two IntegerArrays + a = pd.array([1, 2, -3, np.nan]) + result = ufunc(a, a) + expected = pd.array(ufunc(a.astype(float), a.astype(float)), dtype="Int64") + tm.assert_extension_array_equal(result, expected) + + # IntegerArray with numpy array + arr = np.array([1, 2, 3, 4]) + result = ufunc(a, arr) + expected = pd.array(ufunc(a.astype(float), arr), dtype="Int64") + tm.assert_extension_array_equal(result, expected) + + result = ufunc(arr, a) + expected = pd.array(ufunc(arr, a.astype(float)), dtype="Int64") + tm.assert_extension_array_equal(result, expected) + + # IntegerArray with scalar + result = ufunc(a, 1) + expected = pd.array(ufunc(a.astype(float), 1), dtype="Int64") + tm.assert_extension_array_equal(result, expected) + + result = ufunc(1, a) + expected = pd.array(ufunc(1, a.astype(float)), dtype="Int64") + tm.assert_extension_array_equal(result, expected) + + +def test_ufunc_binary_output(): + a = pd.array([1, 2, np.nan]) + result = np.modf(a) + expected = np.modf(a.to_numpy(na_value=np.nan, dtype="float")) + expected = (pd.array(expected[0]), pd.array(expected[1])) + + assert isinstance(result, tuple) + assert len(result) == 2 + + for x, y in zip(result, expected): + tm.assert_extension_array_equal(x, y) + + +@pytest.mark.parametrize("values", [[0, 1], [0, None]]) +def test_ufunc_reduce_raises(values): + arr = pd.array(values) + + res = np.add.reduce(arr) + expected = arr.sum(skipna=False) + tm.assert_almost_equal(res, expected) + + +@pytest.mark.parametrize( + "pandasmethname, kwargs", + [ + ("var", {"ddof": 0}), + ("var", {"ddof": 1}), + ("std", {"ddof": 0}), + ("std", {"ddof": 1}), + ("kurtosis", {}), + ("skew", {}), + ("sem", {}), + ], +) +def test_stat_method(pandasmethname, kwargs): + s = pd.Series(data=[1, 2, 3, 4, 5, 6, np.nan, np.nan], dtype="Int64") + pandasmeth = getattr(s, pandasmethname) + result = pandasmeth(**kwargs) + s2 = pd.Series(data=[1, 2, 3, 4, 5, 6], dtype="Int64") + pandasmeth = getattr(s2, pandasmethname) + expected = pandasmeth(**kwargs) + assert expected == result + + +def test_value_counts_na(): + arr = pd.array([1, 2, 1, pd.NA], dtype="Int64") + result = arr.value_counts(dropna=False) + ex_index = pd.Index([1, 2, pd.NA], dtype="Int64") + assert ex_index.dtype == "Int64" + expected = pd.Series([2, 1, 1], index=ex_index, dtype="Int64", name="count") + tm.assert_series_equal(result, expected) + + result = arr.value_counts(dropna=True) + expected = pd.Series([2, 1], index=arr[:2], dtype="Int64", name="count") + assert expected.index.dtype == arr.dtype + tm.assert_series_equal(result, expected) + + +def test_value_counts_empty(): + # https://github.com/pandas-dev/pandas/issues/33317 + ser = pd.Series([], dtype="Int64") + result = ser.value_counts() + idx = pd.Index([], dtype=ser.dtype) + assert idx.dtype == ser.dtype + expected = pd.Series([], index=idx, dtype="Int64", name="count") + tm.assert_series_equal(result, expected) + + +def test_value_counts_with_normalize(): + # GH 33172 + ser = pd.Series([1, 2, 1, pd.NA], dtype="Int64") + result = ser.value_counts(normalize=True) + expected = pd.Series([2, 1], index=ser[:2], dtype="Float64", name="proportion") / 3 + assert expected.index.dtype == ser.dtype + tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize("skipna", [True, False]) +@pytest.mark.parametrize("min_count", [0, 4]) +def test_integer_array_sum(skipna, min_count, any_int_ea_dtype): + dtype = any_int_ea_dtype + arr = pd.array([1, 2, 3, None], dtype=dtype) + result = arr.sum(skipna=skipna, min_count=min_count) + if skipna and min_count == 0: + assert result == 6 + else: + assert result is pd.NA + + +@pytest.mark.parametrize("skipna", [True, False]) +@pytest.mark.parametrize("method", ["min", "max"]) +def test_integer_array_min_max(skipna, method, any_int_ea_dtype): + dtype = any_int_ea_dtype + arr = pd.array([0, 1, None], dtype=dtype) + func = getattr(arr, method) + result = func(skipna=skipna) + if skipna: + assert result == (0 if method == "min" else 1) + else: + assert result is pd.NA + + +@pytest.mark.parametrize("skipna", [True, False]) +@pytest.mark.parametrize("min_count", [0, 9]) +def test_integer_array_prod(skipna, min_count, any_int_ea_dtype): + dtype = any_int_ea_dtype + arr = pd.array([1, 2, None], dtype=dtype) + result = arr.prod(skipna=skipna, min_count=min_count) + if skipna and min_count == 0: + assert result == 2 + else: + assert result is pd.NA + + +@pytest.mark.parametrize( + "values, expected", [([1, 2, 3], 6), ([1, 2, 3, None], 6), ([None], 0)] +) +def test_integer_array_numpy_sum(values, expected): + arr = pd.array(values, dtype="Int64") + result = np.sum(arr) + assert result == expected + + +@pytest.mark.parametrize("op", ["sum", "prod", "min", "max"]) +def test_dataframe_reductions(op): + # https://github.com/pandas-dev/pandas/pull/32867 + # ensure the integers are not cast to float during reductions + df = pd.DataFrame({"a": pd.array([1, 2], dtype="Int64")}) + result = df.max() + assert isinstance(result["a"], np.int64) + + +# TODO(jreback) - these need testing / are broken + +# shift + +# set_index (destroys type) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/integer/test_indexing.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/integer/test_indexing.py new file mode 100644 index 0000000000000000000000000000000000000000..4b953d699108b2aed1c992cf3a33f3013b298254 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/integer/test_indexing.py @@ -0,0 +1,19 @@ +import pandas as pd +import pandas._testing as tm + + +def test_array_setitem_nullable_boolean_mask(): + # GH 31446 + ser = pd.Series([1, 2], dtype="Int64") + result = ser.where(ser > 1) + expected = pd.Series([pd.NA, 2], dtype="Int64") + tm.assert_series_equal(result, expected) + + +def test_array_setitem(): + # GH 31446 + arr = pd.Series([1, 2], dtype="Int64").array + arr[arr > 1] = 1 + + expected = pd.array([1, 1], dtype="Int64") + tm.assert_extension_array_equal(arr, expected) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/integer/test_reduction.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/integer/test_reduction.py new file mode 100644 index 0000000000000000000000000000000000000000..db04862e4ea0797ffcc1562bbcb603448131fae8 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/integer/test_reduction.py @@ -0,0 +1,125 @@ +import numpy as np +import pytest + +import pandas as pd +from pandas import ( + DataFrame, + Series, + array, +) +import pandas._testing as tm + + +@pytest.mark.parametrize( + "op, expected", + [ + ["sum", np.int64(3)], + ["prod", np.int64(2)], + ["min", np.int64(1)], + ["max", np.int64(2)], + ["mean", np.float64(1.5)], + ["median", np.float64(1.5)], + ["var", np.float64(0.5)], + ["std", np.float64(0.5**0.5)], + ["skew", pd.NA], + ["kurt", pd.NA], + ["any", True], + ["all", True], + ], +) +def test_series_reductions(op, expected): + ser = Series([1, 2], dtype="Int64") + result = getattr(ser, op)() + tm.assert_equal(result, expected) + + +@pytest.mark.parametrize( + "op, expected", + [ + ["sum", Series([3], index=["a"], dtype="Int64")], + ["prod", Series([2], index=["a"], dtype="Int64")], + ["min", Series([1], index=["a"], dtype="Int64")], + ["max", Series([2], index=["a"], dtype="Int64")], + ["mean", Series([1.5], index=["a"], dtype="Float64")], + ["median", Series([1.5], index=["a"], dtype="Float64")], + ["var", Series([0.5], index=["a"], dtype="Float64")], + ["std", Series([0.5**0.5], index=["a"], dtype="Float64")], + ["skew", Series([pd.NA], index=["a"], dtype="Float64")], + ["kurt", Series([pd.NA], index=["a"], dtype="Float64")], + ["any", Series([True], index=["a"], dtype="boolean")], + ["all", Series([True], index=["a"], dtype="boolean")], + ], +) +def test_dataframe_reductions(op, expected): + df = DataFrame({"a": array([1, 2], dtype="Int64")}) + result = getattr(df, op)() + tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize( + "op, expected", + [ + ["sum", array([1, 3], dtype="Int64")], + ["prod", array([1, 3], dtype="Int64")], + ["min", array([1, 3], dtype="Int64")], + ["max", array([1, 3], dtype="Int64")], + ["mean", array([1, 3], dtype="Float64")], + ["median", array([1, 3], dtype="Float64")], + ["var", array([pd.NA], dtype="Float64")], + ["std", array([pd.NA], dtype="Float64")], + ["skew", array([pd.NA], dtype="Float64")], + ["any", array([True, True], dtype="boolean")], + ["all", array([True, True], dtype="boolean")], + ], +) +def test_groupby_reductions(op, expected): + df = DataFrame( + { + "A": ["a", "b", "b"], + "B": array([1, None, 3], dtype="Int64"), + } + ) + result = getattr(df.groupby("A"), op)() + expected = DataFrame(expected, index=pd.Index(["a", "b"], name="A"), columns=["B"]) + + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize( + "op, expected", + [ + ["sum", Series([4, 4], index=["B", "C"], dtype="Float64")], + ["prod", Series([3, 3], index=["B", "C"], dtype="Float64")], + ["min", Series([1, 1], index=["B", "C"], dtype="Float64")], + ["max", Series([3, 3], index=["B", "C"], dtype="Float64")], + ["mean", Series([2, 2], index=["B", "C"], dtype="Float64")], + ["median", Series([2, 2], index=["B", "C"], dtype="Float64")], + ["var", Series([2, 2], index=["B", "C"], dtype="Float64")], + ["std", Series([2**0.5, 2**0.5], index=["B", "C"], dtype="Float64")], + ["skew", Series([pd.NA, pd.NA], index=["B", "C"], dtype="Float64")], + ["kurt", Series([pd.NA, pd.NA], index=["B", "C"], dtype="Float64")], + ["any", Series([True, True, True], index=["A", "B", "C"], dtype="boolean")], + ["all", Series([True, True, True], index=["A", "B", "C"], dtype="boolean")], + ], +) +def test_mixed_reductions(op, expected, using_infer_string): + if op in ["any", "all"] and using_infer_string: + expected = expected.astype("bool") + df = DataFrame( + { + "A": ["a", "b", "b"], + "B": [1, None, 3], + "C": array([1, None, 3], dtype="Int64"), + } + ) + + # series + result = getattr(df.C, op)() + tm.assert_equal(result, expected["C"]) + + # frame + if op in ["any", "all"]: + result = getattr(df, op)() + else: + result = getattr(df, op)(numeric_only=True) + tm.assert_series_equal(result, expected) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/integer/test_repr.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/integer/test_repr.py new file mode 100644 index 0000000000000000000000000000000000000000..168210eed5d06a461bbf42dd1e1fae3db0fd851c --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/integer/test_repr.py @@ -0,0 +1,67 @@ +import numpy as np +import pytest + +import pandas as pd +from pandas.core.arrays.integer import ( + Int8Dtype, + Int16Dtype, + Int32Dtype, + Int64Dtype, + UInt8Dtype, + UInt16Dtype, + UInt32Dtype, + UInt64Dtype, +) + + +def test_dtypes(dtype): + # smoke tests on auto dtype construction + + if dtype.is_signed_integer: + assert np.dtype(dtype.type).kind == "i" + else: + assert np.dtype(dtype.type).kind == "u" + assert dtype.name is not None + + +@pytest.mark.parametrize( + "dtype, expected", + [ + (Int8Dtype(), "Int8Dtype()"), + (Int16Dtype(), "Int16Dtype()"), + (Int32Dtype(), "Int32Dtype()"), + (Int64Dtype(), "Int64Dtype()"), + (UInt8Dtype(), "UInt8Dtype()"), + (UInt16Dtype(), "UInt16Dtype()"), + (UInt32Dtype(), "UInt32Dtype()"), + (UInt64Dtype(), "UInt64Dtype()"), + ], +) +def test_repr_dtype(dtype, expected): + assert repr(dtype) == expected + + +def test_repr_array(): + result = repr(pd.array([1, None, 3])) + expected = "\n[1, , 3]\nLength: 3, dtype: Int64" + assert result == expected + + +def test_repr_array_long(): + data = pd.array([1, 2, None] * 1000) + expected = ( + "\n" + "[ 1, 2, , 1, 2, , 1, 2, , 1,\n" + " ...\n" + " , 1, 2, , 1, 2, , 1, 2, ]\n" + "Length: 3000, dtype: Int64" + ) + result = repr(data) + assert result == expected + + +def test_frame_repr(data_missing): + df = pd.DataFrame({"A": data_missing}) + result = repr(df) + expected = " A\n0 \n1 1" + assert result == expected diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/interval/__init__.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/interval/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/interval/test_astype.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/interval/test_astype.py new file mode 100644 index 0000000000000000000000000000000000000000..d7a2140f817f3a8e5689d001768cf5642118b105 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/interval/test_astype.py @@ -0,0 +1,28 @@ +import pytest + +from pandas import ( + Categorical, + CategoricalDtype, + Index, + IntervalIndex, +) +import pandas._testing as tm + + +class TestAstype: + @pytest.mark.parametrize("ordered", [True, False]) + def test_astype_categorical_retains_ordered(self, ordered): + index = IntervalIndex.from_breaks(range(5)) + arr = index._data + + dtype = CategoricalDtype(None, ordered=ordered) + + expected = Categorical(list(arr), ordered=ordered) + result = arr.astype(dtype) + assert result.ordered is ordered + tm.assert_categorical_equal(result, expected) + + # test IntervalIndex.astype while we're at it. + result = index.astype(dtype) + expected = Index(expected) + tm.assert_index_equal(result, expected) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/interval/test_formats.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/interval/test_formats.py new file mode 100644 index 0000000000000000000000000000000000000000..535efee51937473071c9490330eb68364769d5aa --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/interval/test_formats.py @@ -0,0 +1,13 @@ +from pandas.core.arrays import IntervalArray + + +def test_repr(): + # GH#25022 + arr = IntervalArray.from_tuples([(0, 1), (1, 2)]) + result = repr(arr) + expected = ( + "\n" + "[(0, 1], (1, 2]]\n" + "Length: 2, dtype: interval[int64, right]" + ) + assert result == expected diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/interval/test_interval.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/interval/test_interval.py new file mode 100644 index 0000000000000000000000000000000000000000..be4b2c3e7e74cddfaf8b2efa08879d3a6d8f1757 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/interval/test_interval.py @@ -0,0 +1,231 @@ +import numpy as np +import pytest + +import pandas as pd +from pandas import ( + Index, + Interval, + IntervalIndex, + Timedelta, + Timestamp, + date_range, + timedelta_range, +) +import pandas._testing as tm +from pandas.core.arrays import IntervalArray + + +@pytest.fixture( + params=[ + (Index([0, 2, 4]), Index([1, 3, 5])), + (Index([0.0, 1.0, 2.0]), Index([1.0, 2.0, 3.0])), + (timedelta_range("0 days", periods=3), timedelta_range("1 day", periods=3)), + (date_range("20170101", periods=3), date_range("20170102", periods=3)), + ( + date_range("20170101", periods=3, tz="US/Eastern"), + date_range("20170102", periods=3, tz="US/Eastern"), + ), + ], + ids=lambda x: str(x[0].dtype), +) +def left_right_dtypes(request): + """ + Fixture for building an IntervalArray from various dtypes + """ + return request.param + + +class TestAttributes: + @pytest.mark.parametrize( + "left, right", + [ + (0, 1), + (Timedelta("0 days"), Timedelta("1 day")), + (Timestamp("2018-01-01"), Timestamp("2018-01-02")), + ( + Timestamp("2018-01-01", tz="US/Eastern"), + Timestamp("2018-01-02", tz="US/Eastern"), + ), + ], + ) + @pytest.mark.parametrize("constructor", [IntervalArray, IntervalIndex]) + def test_is_empty(self, constructor, left, right, closed): + # GH27219 + tuples = [(left, left), (left, right), np.nan] + expected = np.array([closed != "both", False, False]) + result = constructor.from_tuples(tuples, closed=closed).is_empty + tm.assert_numpy_array_equal(result, expected) + + +class TestMethods: + @pytest.mark.parametrize("new_closed", ["left", "right", "both", "neither"]) + def test_set_closed(self, closed, new_closed): + # GH 21670 + array = IntervalArray.from_breaks(range(10), closed=closed) + result = array.set_closed(new_closed) + expected = IntervalArray.from_breaks(range(10), closed=new_closed) + tm.assert_extension_array_equal(result, expected) + + @pytest.mark.parametrize( + "other", + [ + Interval(0, 1, closed="right"), + IntervalArray.from_breaks([1, 2, 3, 4], closed="right"), + ], + ) + def test_where_raises(self, other): + # GH#45768 The IntervalArray methods raises; the Series method coerces + ser = pd.Series(IntervalArray.from_breaks([1, 2, 3, 4], closed="left")) + mask = np.array([True, False, True]) + match = "'value.closed' is 'right', expected 'left'." + with pytest.raises(ValueError, match=match): + ser.array._where(mask, other) + + res = ser.where(mask, other=other) + expected = ser.astype(object).where(mask, other) + tm.assert_series_equal(res, expected) + + def test_shift(self): + # https://github.com/pandas-dev/pandas/issues/31495, GH#22428, GH#31502 + a = IntervalArray.from_breaks([1, 2, 3]) + result = a.shift() + # int -> float + expected = IntervalArray.from_tuples([(np.nan, np.nan), (1.0, 2.0)]) + tm.assert_interval_array_equal(result, expected) + + msg = "can only insert Interval objects and NA into an IntervalArray" + with pytest.raises(TypeError, match=msg): + a.shift(1, fill_value=pd.NaT) + + def test_shift_datetime(self): + # GH#31502, GH#31504 + a = IntervalArray.from_breaks(date_range("2000", periods=4)) + result = a.shift(2) + expected = a.take([-1, -1, 0], allow_fill=True) + tm.assert_interval_array_equal(result, expected) + + result = a.shift(-1) + expected = a.take([1, 2, -1], allow_fill=True) + tm.assert_interval_array_equal(result, expected) + + msg = "can only insert Interval objects and NA into an IntervalArray" + with pytest.raises(TypeError, match=msg): + a.shift(1, fill_value=np.timedelta64("NaT", "ns")) + + +class TestSetitem: + def test_set_na(self, left_right_dtypes): + left, right = left_right_dtypes + left = left.copy(deep=True) + right = right.copy(deep=True) + result = IntervalArray.from_arrays(left, right) + + if result.dtype.subtype.kind not in ["m", "M"]: + msg = "'value' should be an interval type, got <.*NaTType'> instead." + with pytest.raises(TypeError, match=msg): + result[0] = pd.NaT + if result.dtype.subtype.kind in ["i", "u"]: + msg = "Cannot set float NaN to integer-backed IntervalArray" + # GH#45484 TypeError, not ValueError, matches what we get with + # non-NA un-holdable value. + with pytest.raises(TypeError, match=msg): + result[0] = np.nan + return + + result[0] = np.nan + + expected_left = Index([left._na_value] + list(left[1:])) + expected_right = Index([right._na_value] + list(right[1:])) + expected = IntervalArray.from_arrays(expected_left, expected_right) + + tm.assert_extension_array_equal(result, expected) + + def test_setitem_mismatched_closed(self): + arr = IntervalArray.from_breaks(range(4)) + orig = arr.copy() + other = arr.set_closed("both") + + msg = "'value.closed' is 'both', expected 'right'" + with pytest.raises(ValueError, match=msg): + arr[0] = other[0] + with pytest.raises(ValueError, match=msg): + arr[:1] = other[:1] + with pytest.raises(ValueError, match=msg): + arr[:0] = other[:0] + with pytest.raises(ValueError, match=msg): + arr[:] = other[::-1] + with pytest.raises(ValueError, match=msg): + arr[:] = list(other[::-1]) + with pytest.raises(ValueError, match=msg): + arr[:] = other[::-1].astype(object) + with pytest.raises(ValueError, match=msg): + arr[:] = other[::-1].astype("category") + + # empty list should be no-op + arr[:0] = [] + tm.assert_interval_array_equal(arr, orig) + + +class TestReductions: + def test_min_max_invalid_axis(self, left_right_dtypes): + left, right = left_right_dtypes + left = left.copy(deep=True) + right = right.copy(deep=True) + arr = IntervalArray.from_arrays(left, right) + + msg = "`axis` must be fewer than the number of dimensions" + for axis in [-2, 1]: + with pytest.raises(ValueError, match=msg): + arr.min(axis=axis) + with pytest.raises(ValueError, match=msg): + arr.max(axis=axis) + + msg = "'>=' not supported between" + with pytest.raises(TypeError, match=msg): + arr.min(axis="foo") + with pytest.raises(TypeError, match=msg): + arr.max(axis="foo") + + def test_min_max(self, left_right_dtypes, index_or_series_or_array): + # GH#44746 + left, right = left_right_dtypes + left = left.copy(deep=True) + right = right.copy(deep=True) + arr = IntervalArray.from_arrays(left, right) + + # The expected results below are only valid if monotonic + assert left.is_monotonic_increasing + assert Index(arr).is_monotonic_increasing + + MIN = arr[0] + MAX = arr[-1] + + indexer = np.arange(len(arr)) + np.random.default_rng(2).shuffle(indexer) + arr = arr.take(indexer) + + arr_na = arr.insert(2, np.nan) + + arr = index_or_series_or_array(arr) + arr_na = index_or_series_or_array(arr_na) + + for skipna in [True, False]: + res = arr.min(skipna=skipna) + assert res == MIN + assert type(res) == type(MIN) + + res = arr.max(skipna=skipna) + assert res == MAX + assert type(res) == type(MAX) + + res = arr_na.min(skipna=False) + assert np.isnan(res) + res = arr_na.max(skipna=False) + assert np.isnan(res) + + res = arr_na.min(skipna=True) + assert res == MIN + assert type(res) == type(MIN) + res = arr_na.max(skipna=True) + assert res == MAX + assert type(res) == type(MAX) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/interval/test_interval_pyarrow.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/interval/test_interval_pyarrow.py new file mode 100644 index 0000000000000000000000000000000000000000..ef8701be81e2b9248c29fc4e901161fd18d72bbe --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/interval/test_interval_pyarrow.py @@ -0,0 +1,160 @@ +import numpy as np +import pytest + +import pandas as pd +import pandas._testing as tm +from pandas.core.arrays import IntervalArray + + +def test_arrow_extension_type(): + pa = pytest.importorskip("pyarrow") + + from pandas.core.arrays.arrow.extension_types import ArrowIntervalType + + p1 = ArrowIntervalType(pa.int64(), "left") + p2 = ArrowIntervalType(pa.int64(), "left") + p3 = ArrowIntervalType(pa.int64(), "right") + + assert p1.closed == "left" + assert p1 == p2 + assert p1 != p3 + assert hash(p1) == hash(p2) + assert hash(p1) != hash(p3) + + +def test_arrow_array(): + pa = pytest.importorskip("pyarrow") + + from pandas.core.arrays.arrow.extension_types import ArrowIntervalType + + intervals = pd.interval_range(1, 5, freq=1).array + + result = pa.array(intervals) + assert isinstance(result.type, ArrowIntervalType) + assert result.type.closed == intervals.closed + assert result.type.subtype == pa.int64() + assert result.storage.field("left").equals(pa.array([1, 2, 3, 4], type="int64")) + assert result.storage.field("right").equals(pa.array([2, 3, 4, 5], type="int64")) + + expected = pa.array([{"left": i, "right": i + 1} for i in range(1, 5)]) + assert result.storage.equals(expected) + + # convert to its storage type + result = pa.array(intervals, type=expected.type) + assert result.equals(expected) + + # unsupported conversions + with pytest.raises(TypeError, match="Not supported to convert IntervalArray"): + pa.array(intervals, type="float64") + + with pytest.raises(TypeError, match="Not supported to convert IntervalArray"): + pa.array(intervals, type=ArrowIntervalType(pa.float64(), "left")) + + +def test_arrow_array_missing(): + pa = pytest.importorskip("pyarrow") + + from pandas.core.arrays.arrow.extension_types import ArrowIntervalType + + arr = IntervalArray.from_breaks([0.0, 1.0, 2.0, 3.0]) + arr[1] = None + + result = pa.array(arr) + assert isinstance(result.type, ArrowIntervalType) + assert result.type.closed == arr.closed + assert result.type.subtype == pa.float64() + + # fields have missing values (not NaN) + left = pa.array([0.0, None, 2.0], type="float64") + right = pa.array([1.0, None, 3.0], type="float64") + assert result.storage.field("left").equals(left) + assert result.storage.field("right").equals(right) + + # structarray itself also has missing values on the array level + vals = [ + {"left": 0.0, "right": 1.0}, + {"left": None, "right": None}, + {"left": 2.0, "right": 3.0}, + ] + expected = pa.StructArray.from_pandas(vals, mask=np.array([False, True, False])) + assert result.storage.equals(expected) + + +@pytest.mark.filterwarnings( + "ignore:Passing a BlockManager to DataFrame:DeprecationWarning" +) +@pytest.mark.parametrize( + "breaks", + [[0.0, 1.0, 2.0, 3.0], pd.date_range("2017", periods=4, freq="D")], + ids=["float", "datetime64[ns]"], +) +def test_arrow_table_roundtrip(breaks): + pa = pytest.importorskip("pyarrow") + + from pandas.core.arrays.arrow.extension_types import ArrowIntervalType + + arr = IntervalArray.from_breaks(breaks) + arr[1] = None + df = pd.DataFrame({"a": arr}) + + table = pa.table(df) + assert isinstance(table.field("a").type, ArrowIntervalType) + result = table.to_pandas() + assert isinstance(result["a"].dtype, pd.IntervalDtype) + tm.assert_frame_equal(result, df) + + table2 = pa.concat_tables([table, table]) + result = table2.to_pandas() + expected = pd.concat([df, df], ignore_index=True) + tm.assert_frame_equal(result, expected) + + # GH#41040 + table = pa.table( + [pa.chunked_array([], type=table.column(0).type)], schema=table.schema + ) + result = table.to_pandas() + tm.assert_frame_equal(result, expected[0:0]) + + +@pytest.mark.filterwarnings( + "ignore:Passing a BlockManager to DataFrame:DeprecationWarning" +) +@pytest.mark.parametrize( + "breaks", + [[0.0, 1.0, 2.0, 3.0], pd.date_range("2017", periods=4, freq="D")], + ids=["float", "datetime64[ns]"], +) +def test_arrow_table_roundtrip_without_metadata(breaks): + pa = pytest.importorskip("pyarrow") + + arr = IntervalArray.from_breaks(breaks) + arr[1] = None + df = pd.DataFrame({"a": arr}) + + table = pa.table(df) + # remove the metadata + table = table.replace_schema_metadata() + assert table.schema.metadata is None + + result = table.to_pandas() + assert isinstance(result["a"].dtype, pd.IntervalDtype) + tm.assert_frame_equal(result, df) + + +def test_from_arrow_from_raw_struct_array(): + # in case pyarrow lost the Interval extension type (eg on parquet roundtrip + # with datetime64[ns] subtype, see GH-45881), still allow conversion + # from arrow to IntervalArray + pa = pytest.importorskip("pyarrow") + + arr = pa.array([{"left": 0, "right": 1}, {"left": 1, "right": 2}]) + dtype = pd.IntervalDtype(np.dtype("int64"), closed="neither") + + result = dtype.__from_arrow__(arr) + expected = IntervalArray.from_breaks( + np.array([0, 1, 2], dtype="int64"), closed="neither" + ) + tm.assert_extension_array_equal(result, expected) + + result = dtype.__from_arrow__(pa.chunked_array([arr])) + tm.assert_extension_array_equal(result, expected) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/interval/test_overlaps.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/interval/test_overlaps.py new file mode 100644 index 0000000000000000000000000000000000000000..4853bec51106c05781a4c11921f0e082934147ec --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/interval/test_overlaps.py @@ -0,0 +1,93 @@ +"""Tests for Interval-Interval operations, such as overlaps, contains, etc.""" +import numpy as np +import pytest + +from pandas import ( + Interval, + IntervalIndex, + Timedelta, + Timestamp, +) +import pandas._testing as tm +from pandas.core.arrays import IntervalArray + + +@pytest.fixture(params=[IntervalArray, IntervalIndex]) +def constructor(request): + """ + Fixture for testing both interval container classes. + """ + return request.param + + +@pytest.fixture( + params=[ + (Timedelta("0 days"), Timedelta("1 day")), + (Timestamp("2018-01-01"), Timedelta("1 day")), + (0, 1), + ], + ids=lambda x: type(x[0]).__name__, +) +def start_shift(request): + """ + Fixture for generating intervals of different types from a start value + and a shift value that can be added to start to generate an endpoint. + """ + return request.param + + +class TestOverlaps: + def test_overlaps_interval(self, constructor, start_shift, closed, other_closed): + start, shift = start_shift + interval = Interval(start, start + 3 * shift, other_closed) + + # intervals: identical, nested, spanning, partial, adjacent, disjoint + tuples = [ + (start, start + 3 * shift), + (start + shift, start + 2 * shift), + (start - shift, start + 4 * shift), + (start + 2 * shift, start + 4 * shift), + (start + 3 * shift, start + 4 * shift), + (start + 4 * shift, start + 5 * shift), + ] + interval_container = constructor.from_tuples(tuples, closed) + + adjacent = interval.closed_right and interval_container.closed_left + expected = np.array([True, True, True, True, adjacent, False]) + result = interval_container.overlaps(interval) + tm.assert_numpy_array_equal(result, expected) + + @pytest.mark.parametrize("other_constructor", [IntervalArray, IntervalIndex]) + def test_overlaps_interval_container(self, constructor, other_constructor): + # TODO: modify this test when implemented + interval_container = constructor.from_breaks(range(5)) + other_container = other_constructor.from_breaks(range(5)) + with pytest.raises(NotImplementedError, match="^$"): + interval_container.overlaps(other_container) + + def test_overlaps_na(self, constructor, start_shift): + """NA values are marked as False""" + start, shift = start_shift + interval = Interval(start, start + shift) + + tuples = [ + (start, start + shift), + np.nan, + (start + 2 * shift, start + 3 * shift), + ] + interval_container = constructor.from_tuples(tuples) + + expected = np.array([True, False, False]) + result = interval_container.overlaps(interval) + tm.assert_numpy_array_equal(result, expected) + + @pytest.mark.parametrize( + "other", + [10, True, "foo", Timedelta("1 day"), Timestamp("2018-01-01")], + ids=lambda x: type(x).__name__, + ) + def test_overlaps_invalid_type(self, constructor, other): + interval_container = constructor.from_breaks(range(5)) + msg = f"`other` must be Interval-like, got {type(other).__name__}" + with pytest.raises(TypeError, match=msg): + interval_container.overlaps(other) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/masked/__init__.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/masked/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/masked/test_arithmetic.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/masked/test_arithmetic.py new file mode 100644 index 0000000000000000000000000000000000000000..f4b571ca627b3da34e943972ba70b03bae74417a --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/masked/test_arithmetic.py @@ -0,0 +1,248 @@ +from __future__ import annotations + +from typing import Any + +import numpy as np +import pytest + +import pandas as pd +import pandas._testing as tm + +# integer dtypes +arrays = [pd.array([1, 2, 3, None], dtype=dtype) for dtype in tm.ALL_INT_EA_DTYPES] +scalars: list[Any] = [2] * len(arrays) +# floating dtypes +arrays += [pd.array([0.1, 0.2, 0.3, None], dtype=dtype) for dtype in tm.FLOAT_EA_DTYPES] +scalars += [0.2, 0.2] +# boolean +arrays += [pd.array([True, False, True, None], dtype="boolean")] +scalars += [False] + + +@pytest.fixture(params=zip(arrays, scalars), ids=[a.dtype.name for a in arrays]) +def data(request): + """Fixture returning parametrized (array, scalar) tuple. + + Used to test equivalence of scalars, numpy arrays with array ops, and the + equivalence of DataFrame and Series ops. + """ + return request.param + + +def check_skip(data, op_name): + if isinstance(data.dtype, pd.BooleanDtype) and "sub" in op_name: + pytest.skip("subtract not implemented for boolean") + + +def is_bool_not_implemented(data, op_name): + # match non-masked behavior + return data.dtype.kind == "b" and op_name.strip("_").lstrip("r") in [ + "pow", + "truediv", + "floordiv", + ] + + +# Test equivalence of scalars, numpy arrays with array ops +# ----------------------------------------------------------------------------- + + +def test_array_scalar_like_equivalence(data, all_arithmetic_operators): + data, scalar = data + op = tm.get_op_from_name(all_arithmetic_operators) + check_skip(data, all_arithmetic_operators) + + scalar_array = pd.array([scalar] * len(data), dtype=data.dtype) + + # TODO also add len-1 array (np.array([scalar], dtype=data.dtype.numpy_dtype)) + for scalar in [scalar, data.dtype.type(scalar)]: + if is_bool_not_implemented(data, all_arithmetic_operators): + msg = "operator '.*' not implemented for bool dtypes" + with pytest.raises(NotImplementedError, match=msg): + op(data, scalar) + with pytest.raises(NotImplementedError, match=msg): + op(data, scalar_array) + else: + result = op(data, scalar) + expected = op(data, scalar_array) + tm.assert_extension_array_equal(result, expected) + + +def test_array_NA(data, all_arithmetic_operators): + data, _ = data + op = tm.get_op_from_name(all_arithmetic_operators) + check_skip(data, all_arithmetic_operators) + + scalar = pd.NA + scalar_array = pd.array([pd.NA] * len(data), dtype=data.dtype) + + mask = data._mask.copy() + + if is_bool_not_implemented(data, all_arithmetic_operators): + msg = "operator '.*' not implemented for bool dtypes" + with pytest.raises(NotImplementedError, match=msg): + op(data, scalar) + # GH#45421 check op doesn't alter data._mask inplace + tm.assert_numpy_array_equal(mask, data._mask) + return + + result = op(data, scalar) + # GH#45421 check op doesn't alter data._mask inplace + tm.assert_numpy_array_equal(mask, data._mask) + + expected = op(data, scalar_array) + tm.assert_numpy_array_equal(mask, data._mask) + + tm.assert_extension_array_equal(result, expected) + + +def test_numpy_array_equivalence(data, all_arithmetic_operators): + data, scalar = data + op = tm.get_op_from_name(all_arithmetic_operators) + check_skip(data, all_arithmetic_operators) + + numpy_array = np.array([scalar] * len(data), dtype=data.dtype.numpy_dtype) + pd_array = pd.array(numpy_array, dtype=data.dtype) + + if is_bool_not_implemented(data, all_arithmetic_operators): + msg = "operator '.*' not implemented for bool dtypes" + with pytest.raises(NotImplementedError, match=msg): + op(data, numpy_array) + with pytest.raises(NotImplementedError, match=msg): + op(data, pd_array) + return + + result = op(data, numpy_array) + expected = op(data, pd_array) + tm.assert_extension_array_equal(result, expected) + + +# Test equivalence with Series and DataFrame ops +# ----------------------------------------------------------------------------- + + +def test_frame(data, all_arithmetic_operators): + data, scalar = data + op = tm.get_op_from_name(all_arithmetic_operators) + check_skip(data, all_arithmetic_operators) + + # DataFrame with scalar + df = pd.DataFrame({"A": data}) + + if is_bool_not_implemented(data, all_arithmetic_operators): + msg = "operator '.*' not implemented for bool dtypes" + with pytest.raises(NotImplementedError, match=msg): + op(df, scalar) + with pytest.raises(NotImplementedError, match=msg): + op(data, scalar) + return + + result = op(df, scalar) + expected = pd.DataFrame({"A": op(data, scalar)}) + tm.assert_frame_equal(result, expected) + + +def test_series(data, all_arithmetic_operators): + data, scalar = data + op = tm.get_op_from_name(all_arithmetic_operators) + check_skip(data, all_arithmetic_operators) + + ser = pd.Series(data) + + others = [ + scalar, + np.array([scalar] * len(data), dtype=data.dtype.numpy_dtype), + pd.array([scalar] * len(data), dtype=data.dtype), + pd.Series([scalar] * len(data), dtype=data.dtype), + ] + + for other in others: + if is_bool_not_implemented(data, all_arithmetic_operators): + msg = "operator '.*' not implemented for bool dtypes" + with pytest.raises(NotImplementedError, match=msg): + op(ser, other) + + else: + result = op(ser, other) + expected = pd.Series(op(data, other)) + tm.assert_series_equal(result, expected) + + +# Test generic characteristics / errors +# ----------------------------------------------------------------------------- + + +def test_error_invalid_object(data, all_arithmetic_operators): + data, _ = data + + op = all_arithmetic_operators + opa = getattr(data, op) + + # 2d -> return NotImplemented + result = opa(pd.DataFrame({"A": data})) + assert result is NotImplemented + + msg = r"can only perform ops with 1-d structures" + with pytest.raises(NotImplementedError, match=msg): + opa(np.arange(len(data)).reshape(-1, len(data))) + + +def test_error_len_mismatch(data, all_arithmetic_operators): + # operating with a list-like with non-matching length raises + data, scalar = data + op = tm.get_op_from_name(all_arithmetic_operators) + + other = [scalar] * (len(data) - 1) + + err = ValueError + msg = "|".join( + [ + r"operands could not be broadcast together with shapes \(3,\) \(4,\)", + r"operands could not be broadcast together with shapes \(4,\) \(3,\)", + ] + ) + if data.dtype.kind == "b" and all_arithmetic_operators.strip("_") in [ + "sub", + "rsub", + ]: + err = TypeError + msg = ( + r"numpy boolean subtract, the `\-` operator, is not supported, use " + r"the bitwise_xor, the `\^` operator, or the logical_xor function instead" + ) + elif is_bool_not_implemented(data, all_arithmetic_operators): + msg = "operator '.*' not implemented for bool dtypes" + err = NotImplementedError + + for other in [other, np.array(other)]: + with pytest.raises(err, match=msg): + op(data, other) + + s = pd.Series(data) + with pytest.raises(err, match=msg): + op(s, other) + + +@pytest.mark.parametrize("op", ["__neg__", "__abs__", "__invert__"]) +def test_unary_op_does_not_propagate_mask(data, op): + # https://github.com/pandas-dev/pandas/issues/39943 + data, _ = data + ser = pd.Series(data) + + if op == "__invert__" and data.dtype.kind == "f": + # we follow numpy in raising + msg = "ufunc 'invert' not supported for the input types" + with pytest.raises(TypeError, match=msg): + getattr(ser, op)() + with pytest.raises(TypeError, match=msg): + getattr(data, op)() + with pytest.raises(TypeError, match=msg): + # Check that this is still the numpy behavior + getattr(data._data, op)() + + return + + result = getattr(ser, op)() + expected = result.copy(deep=True) + ser[0] = None + tm.assert_series_equal(result, expected) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/masked/test_arrow_compat.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/masked/test_arrow_compat.py new file mode 100644 index 0000000000000000000000000000000000000000..7a89656bd5aa0bff079f58fb1acaa1f7742ed8e8 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/masked/test_arrow_compat.py @@ -0,0 +1,209 @@ +import numpy as np +import pytest + +import pandas as pd +import pandas._testing as tm + +pytestmark = pytest.mark.filterwarnings( + "ignore:Passing a BlockManager to DataFrame:DeprecationWarning" +) + +pa = pytest.importorskip("pyarrow") + +from pandas.core.arrays.arrow._arrow_utils import pyarrow_array_to_numpy_and_mask + +arrays = [pd.array([1, 2, 3, None], dtype=dtype) for dtype in tm.ALL_INT_EA_DTYPES] +arrays += [pd.array([0.1, 0.2, 0.3, None], dtype=dtype) for dtype in tm.FLOAT_EA_DTYPES] +arrays += [pd.array([True, False, True, None], dtype="boolean")] + + +@pytest.fixture(params=arrays, ids=[a.dtype.name for a in arrays]) +def data(request): + """ + Fixture returning parametrized array from given dtype, including integer, + float and boolean + """ + return request.param + + +def test_arrow_array(data): + arr = pa.array(data) + expected = pa.array( + data.to_numpy(object, na_value=None), + type=pa.from_numpy_dtype(data.dtype.numpy_dtype), + ) + assert arr.equals(expected) + + +def test_arrow_roundtrip(data): + df = pd.DataFrame({"a": data}) + table = pa.table(df) + assert table.field("a").type == str(data.dtype.numpy_dtype) + + result = table.to_pandas() + assert result["a"].dtype == data.dtype + tm.assert_frame_equal(result, df) + + +def test_dataframe_from_arrow_types_mapper(): + def types_mapper(arrow_type): + if pa.types.is_boolean(arrow_type): + return pd.BooleanDtype() + elif pa.types.is_integer(arrow_type): + return pd.Int64Dtype() + + bools_array = pa.array([True, None, False], type=pa.bool_()) + ints_array = pa.array([1, None, 2], type=pa.int64()) + small_ints_array = pa.array([-1, 0, 7], type=pa.int8()) + record_batch = pa.RecordBatch.from_arrays( + [bools_array, ints_array, small_ints_array], ["bools", "ints", "small_ints"] + ) + result = record_batch.to_pandas(types_mapper=types_mapper) + bools = pd.Series([True, None, False], dtype="boolean") + ints = pd.Series([1, None, 2], dtype="Int64") + small_ints = pd.Series([-1, 0, 7], dtype="Int64") + expected = pd.DataFrame({"bools": bools, "ints": ints, "small_ints": small_ints}) + tm.assert_frame_equal(result, expected) + + +def test_arrow_load_from_zero_chunks(data): + # GH-41040 + + df = pd.DataFrame({"a": data[0:0]}) + table = pa.table(df) + assert table.field("a").type == str(data.dtype.numpy_dtype) + table = pa.table( + [pa.chunked_array([], type=table.field("a").type)], schema=table.schema + ) + result = table.to_pandas() + assert result["a"].dtype == data.dtype + tm.assert_frame_equal(result, df) + + +def test_arrow_from_arrow_uint(): + # https://github.com/pandas-dev/pandas/issues/31896 + # possible mismatch in types + + dtype = pd.UInt32Dtype() + result = dtype.__from_arrow__(pa.array([1, 2, 3, 4, None], type="int64")) + expected = pd.array([1, 2, 3, 4, None], dtype="UInt32") + + tm.assert_extension_array_equal(result, expected) + + +def test_arrow_sliced(data): + # https://github.com/pandas-dev/pandas/issues/38525 + + df = pd.DataFrame({"a": data}) + table = pa.table(df) + result = table.slice(2, None).to_pandas() + expected = df.iloc[2:].reset_index(drop=True) + tm.assert_frame_equal(result, expected) + + # no missing values + df2 = df.fillna(data[0]) + table = pa.table(df2) + result = table.slice(2, None).to_pandas() + expected = df2.iloc[2:].reset_index(drop=True) + tm.assert_frame_equal(result, expected) + + +@pytest.fixture +def np_dtype_to_arrays(any_real_numpy_dtype): + """ + Fixture returning actual and expected dtype, pandas and numpy arrays and + mask from a given numpy dtype + """ + np_dtype = np.dtype(any_real_numpy_dtype) + pa_type = pa.from_numpy_dtype(np_dtype) + + # None ensures the creation of a bitmask buffer. + pa_array = pa.array([0, 1, 2, None], type=pa_type) + # Since masked Arrow buffer slots are not required to contain a specific + # value, assert only the first three values of the created np.array + np_expected = np.array([0, 1, 2], dtype=np_dtype) + mask_expected = np.array([True, True, True, False]) + return np_dtype, pa_array, np_expected, mask_expected + + +def test_pyarrow_array_to_numpy_and_mask(np_dtype_to_arrays): + """ + Test conversion from pyarrow array to numpy array. + + Modifies the pyarrow buffer to contain padding and offset, which are + considered valid buffers by pyarrow. + + Also tests empty pyarrow arrays with non empty buffers. + See https://github.com/pandas-dev/pandas/issues/40896 + """ + np_dtype, pa_array, np_expected, mask_expected = np_dtype_to_arrays + data, mask = pyarrow_array_to_numpy_and_mask(pa_array, np_dtype) + tm.assert_numpy_array_equal(data[:3], np_expected) + tm.assert_numpy_array_equal(mask, mask_expected) + + mask_buffer = pa_array.buffers()[0] + data_buffer = pa_array.buffers()[1] + data_buffer_bytes = pa_array.buffers()[1].to_pybytes() + + # Add trailing padding to the buffer. + data_buffer_trail = pa.py_buffer(data_buffer_bytes + b"\x00") + pa_array_trail = pa.Array.from_buffers( + type=pa_array.type, + length=len(pa_array), + buffers=[mask_buffer, data_buffer_trail], + offset=pa_array.offset, + ) + pa_array_trail.validate() + data, mask = pyarrow_array_to_numpy_and_mask(pa_array_trail, np_dtype) + tm.assert_numpy_array_equal(data[:3], np_expected) + tm.assert_numpy_array_equal(mask, mask_expected) + + # Add offset to the buffer. + offset = b"\x00" * (pa_array.type.bit_width // 8) + data_buffer_offset = pa.py_buffer(offset + data_buffer_bytes) + mask_buffer_offset = pa.py_buffer(b"\x0E") + pa_array_offset = pa.Array.from_buffers( + type=pa_array.type, + length=len(pa_array), + buffers=[mask_buffer_offset, data_buffer_offset], + offset=pa_array.offset + 1, + ) + pa_array_offset.validate() + data, mask = pyarrow_array_to_numpy_and_mask(pa_array_offset, np_dtype) + tm.assert_numpy_array_equal(data[:3], np_expected) + tm.assert_numpy_array_equal(mask, mask_expected) + + # Empty array + np_expected_empty = np.array([], dtype=np_dtype) + mask_expected_empty = np.array([], dtype=np.bool_) + + pa_array_offset = pa.Array.from_buffers( + type=pa_array.type, + length=0, + buffers=[mask_buffer, data_buffer], + offset=pa_array.offset, + ) + pa_array_offset.validate() + data, mask = pyarrow_array_to_numpy_and_mask(pa_array_offset, np_dtype) + tm.assert_numpy_array_equal(data[:3], np_expected_empty) + tm.assert_numpy_array_equal(mask, mask_expected_empty) + + +@pytest.mark.parametrize( + "arr", [pa.nulls(10), pa.chunked_array([pa.nulls(4), pa.nulls(6)])] +) +def test_from_arrow_null(data, arr): + res = data.dtype.__from_arrow__(arr) + assert res.isna().all() + assert len(res) == 10 + + +def test_from_arrow_type_error(data): + # ensure that __from_arrow__ returns a TypeError when getting a wrong + # array type + + arr = pa.array(data).cast("string") + with pytest.raises(TypeError, match=None): + # we don't test the exact error message, only the fact that it raises + # a TypeError is relevant + data.dtype.__from_arrow__(arr) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/masked/test_function.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/masked/test_function.py new file mode 100644 index 0000000000000000000000000000000000000000..b259018cd6121c53c767e36e3c757211643262d6 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/masked/test_function.py @@ -0,0 +1,74 @@ +import numpy as np +import pytest + +from pandas.core.dtypes.common import is_integer_dtype + +import pandas as pd +import pandas._testing as tm +from pandas.core.arrays import BaseMaskedArray + +arrays = [pd.array([1, 2, 3, None], dtype=dtype) for dtype in tm.ALL_INT_EA_DTYPES] +arrays += [ + pd.array([0.141, -0.268, 5.895, None], dtype=dtype) for dtype in tm.FLOAT_EA_DTYPES +] + + +@pytest.fixture(params=arrays, ids=[a.dtype.name for a in arrays]) +def data(request): + """ + Fixture returning parametrized 'data' array with different integer and + floating point types + """ + return request.param + + +@pytest.fixture() +def numpy_dtype(data): + """ + Fixture returning numpy dtype from 'data' input array. + """ + # For integer dtype, the numpy conversion must be done to float + if is_integer_dtype(data): + numpy_dtype = float + else: + numpy_dtype = data.dtype.type + return numpy_dtype + + +def test_round(data, numpy_dtype): + # No arguments + result = data.round() + expected = pd.array( + np.round(data.to_numpy(dtype=numpy_dtype, na_value=None)), dtype=data.dtype + ) + tm.assert_extension_array_equal(result, expected) + + # Decimals argument + result = data.round(decimals=2) + expected = pd.array( + np.round(data.to_numpy(dtype=numpy_dtype, na_value=None), decimals=2), + dtype=data.dtype, + ) + tm.assert_extension_array_equal(result, expected) + + +def test_tolist(data): + result = data.tolist() + expected = list(data) + tm.assert_equal(result, expected) + + +def test_to_numpy(): + # GH#56991 + + class MyStringArray(BaseMaskedArray): + dtype = pd.StringDtype() + _dtype_cls = pd.StringDtype + _internal_fill_value = pd.NA + + arr = MyStringArray( + values=np.array(["a", "b", "c"]), mask=np.array([False, True, False]) + ) + result = arr.to_numpy() + expected = np.array(["a", pd.NA, "c"]) + tm.assert_numpy_array_equal(result, expected) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/masked/test_indexing.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/masked/test_indexing.py new file mode 100644 index 0000000000000000000000000000000000000000..28ee451a7ddd777bc19b1b7623ec2da64f700ede --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/masked/test_indexing.py @@ -0,0 +1,60 @@ +import re + +import numpy as np +import pytest + +import pandas as pd + + +class TestSetitemValidation: + def _check_setitem_invalid(self, arr, invalid): + msg = f"Invalid value '{str(invalid)}' for dtype {arr.dtype}" + msg = re.escape(msg) + with pytest.raises(TypeError, match=msg): + arr[0] = invalid + + with pytest.raises(TypeError, match=msg): + arr[:] = invalid + + with pytest.raises(TypeError, match=msg): + arr[[0]] = invalid + + # FIXME: don't leave commented-out + # with pytest.raises(TypeError): + # arr[[0]] = [invalid] + + # with pytest.raises(TypeError): + # arr[[0]] = np.array([invalid], dtype=object) + + # Series non-coercion, behavior subject to change + ser = pd.Series(arr) + with pytest.raises(TypeError, match=msg): + ser[0] = invalid + # TODO: so, so many other variants of this... + + _invalid_scalars = [ + 1 + 2j, + "True", + "1", + "1.0", + pd.NaT, + np.datetime64("NaT"), + np.timedelta64("NaT"), + ] + + @pytest.mark.parametrize( + "invalid", _invalid_scalars + [1, 1.0, np.int64(1), np.float64(1)] + ) + def test_setitem_validation_scalar_bool(self, invalid): + arr = pd.array([True, False, None], dtype="boolean") + self._check_setitem_invalid(arr, invalid) + + @pytest.mark.parametrize("invalid", _invalid_scalars + [True, 1.5, np.float64(1.5)]) + def test_setitem_validation_scalar_int(self, invalid, any_int_ea_dtype): + arr = pd.array([1, 2, None], dtype=any_int_ea_dtype) + self._check_setitem_invalid(arr, invalid) + + @pytest.mark.parametrize("invalid", _invalid_scalars + [True]) + def test_setitem_validation_scalar_float(self, invalid, float_ea_dtype): + arr = pd.array([1, 2, None], dtype=float_ea_dtype) + self._check_setitem_invalid(arr, invalid) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/numpy_/__init__.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/numpy_/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/numpy_/test_indexing.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/numpy_/test_indexing.py new file mode 100644 index 0000000000000000000000000000000000000000..225d64ad7d2580f877505f0ac3a459e2ea4f0f53 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/numpy_/test_indexing.py @@ -0,0 +1,41 @@ +import numpy as np + +from pandas.core.dtypes.common import is_scalar + +import pandas as pd +import pandas._testing as tm + + +class TestSearchsorted: + def test_searchsorted_string(self, string_dtype): + arr = pd.array(["a", "b", "c"], dtype=string_dtype) + + result = arr.searchsorted("a", side="left") + assert is_scalar(result) + assert result == 0 + + result = arr.searchsorted("a", side="right") + assert is_scalar(result) + assert result == 1 + + def test_searchsorted_numeric_dtypes_scalar(self, any_real_numpy_dtype): + arr = pd.array([1, 3, 90], dtype=any_real_numpy_dtype) + result = arr.searchsorted(30) + assert is_scalar(result) + assert result == 2 + + result = arr.searchsorted([30]) + expected = np.array([2], dtype=np.intp) + tm.assert_numpy_array_equal(result, expected) + + def test_searchsorted_numeric_dtypes_vector(self, any_real_numpy_dtype): + arr = pd.array([1, 3, 90], dtype=any_real_numpy_dtype) + result = arr.searchsorted([2, 30]) + expected = np.array([1, 2], dtype=np.intp) + tm.assert_numpy_array_equal(result, expected) + + def test_searchsorted_sorter(self, any_real_numpy_dtype): + arr = pd.array([3, 1, 2], dtype=any_real_numpy_dtype) + result = arr.searchsorted([0, 3], sorter=np.argsort(arr)) + expected = np.array([0, 2], dtype=np.intp) + tm.assert_numpy_array_equal(result, expected) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/numpy_/test_numpy.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/numpy_/test_numpy.py new file mode 100644 index 0000000000000000000000000000000000000000..5112ce262f7711d09e59870c399a588cb2a7fd91 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/numpy_/test_numpy.py @@ -0,0 +1,324 @@ +""" +Additional tests for NumpyExtensionArray that aren't covered by +the interface tests. +""" +import numpy as np +import pytest + +from pandas.core.dtypes.dtypes import NumpyEADtype + +import pandas as pd +import pandas._testing as tm +from pandas.arrays import NumpyExtensionArray + + +@pytest.fixture( + params=[ + np.array(["a", "b"], dtype=object), + np.array([0, 1], dtype=float), + np.array([0, 1], dtype=int), + np.array([0, 1 + 2j], dtype=complex), + np.array([True, False], dtype=bool), + np.array([0, 1], dtype="datetime64[ns]"), + np.array([0, 1], dtype="timedelta64[ns]"), + ] +) +def any_numpy_array(request): + """ + Parametrized fixture for NumPy arrays with different dtypes. + + This excludes string and bytes. + """ + return request.param + + +# ---------------------------------------------------------------------------- +# NumpyEADtype + + +@pytest.mark.parametrize( + "dtype, expected", + [ + ("bool", True), + ("int", True), + ("uint", True), + ("float", True), + ("complex", True), + ("str", False), + ("bytes", False), + ("datetime64[ns]", False), + ("object", False), + ("void", False), + ], +) +def test_is_numeric(dtype, expected): + dtype = NumpyEADtype(dtype) + assert dtype._is_numeric is expected + + +@pytest.mark.parametrize( + "dtype, expected", + [ + ("bool", True), + ("int", False), + ("uint", False), + ("float", False), + ("complex", False), + ("str", False), + ("bytes", False), + ("datetime64[ns]", False), + ("object", False), + ("void", False), + ], +) +def test_is_boolean(dtype, expected): + dtype = NumpyEADtype(dtype) + assert dtype._is_boolean is expected + + +def test_repr(): + dtype = NumpyEADtype(np.dtype("int64")) + assert repr(dtype) == "NumpyEADtype('int64')" + + +def test_constructor_from_string(): + result = NumpyEADtype.construct_from_string("int64") + expected = NumpyEADtype(np.dtype("int64")) + assert result == expected + + +def test_dtype_idempotent(any_numpy_dtype): + dtype = NumpyEADtype(any_numpy_dtype) + + result = NumpyEADtype(dtype) + assert result == dtype + + +# ---------------------------------------------------------------------------- +# Construction + + +def test_constructor_no_coercion(): + with pytest.raises(ValueError, match="NumPy array"): + NumpyExtensionArray([1, 2, 3]) + + +def test_series_constructor_with_copy(): + ndarray = np.array([1, 2, 3]) + ser = pd.Series(NumpyExtensionArray(ndarray), copy=True) + + assert ser.values is not ndarray + + +def test_series_constructor_with_astype(): + ndarray = np.array([1, 2, 3]) + result = pd.Series(NumpyExtensionArray(ndarray), dtype="float64") + expected = pd.Series([1.0, 2.0, 3.0], dtype="float64") + tm.assert_series_equal(result, expected) + + +def test_from_sequence_dtype(): + arr = np.array([1, 2, 3], dtype="int64") + result = NumpyExtensionArray._from_sequence(arr, dtype="uint64") + expected = NumpyExtensionArray(np.array([1, 2, 3], dtype="uint64")) + tm.assert_extension_array_equal(result, expected) + + +def test_constructor_copy(): + arr = np.array([0, 1]) + result = NumpyExtensionArray(arr, copy=True) + + assert not tm.shares_memory(result, arr) + + +def test_constructor_with_data(any_numpy_array): + nparr = any_numpy_array + arr = NumpyExtensionArray(nparr) + assert arr.dtype.numpy_dtype == nparr.dtype + + +# ---------------------------------------------------------------------------- +# Conversion + + +def test_to_numpy(): + arr = NumpyExtensionArray(np.array([1, 2, 3])) + result = arr.to_numpy() + assert result is arr._ndarray + + result = arr.to_numpy(copy=True) + assert result is not arr._ndarray + + result = arr.to_numpy(dtype="f8") + expected = np.array([1, 2, 3], dtype="f8") + tm.assert_numpy_array_equal(result, expected) + + +# ---------------------------------------------------------------------------- +# Setitem + + +def test_setitem_series(): + ser = pd.Series([1, 2, 3]) + ser.array[0] = 10 + expected = pd.Series([10, 2, 3]) + tm.assert_series_equal(ser, expected) + + +def test_setitem(any_numpy_array): + nparr = any_numpy_array + arr = NumpyExtensionArray(nparr, copy=True) + + arr[0] = arr[1] + nparr[0] = nparr[1] + + tm.assert_numpy_array_equal(arr.to_numpy(), nparr) + + +# ---------------------------------------------------------------------------- +# Reductions + + +def test_bad_reduce_raises(): + arr = np.array([1, 2, 3], dtype="int64") + arr = NumpyExtensionArray(arr) + msg = "cannot perform not_a_method with type int" + with pytest.raises(TypeError, match=msg): + arr._reduce(msg) + + +def test_validate_reduction_keyword_args(): + arr = NumpyExtensionArray(np.array([1, 2, 3])) + msg = "the 'keepdims' parameter is not supported .*all" + with pytest.raises(ValueError, match=msg): + arr.all(keepdims=True) + + +def test_np_max_nested_tuples(): + # case where checking in ufunc.nout works while checking for tuples + # does not + vals = [ + (("j", "k"), ("l", "m")), + (("l", "m"), ("o", "p")), + (("o", "p"), ("j", "k")), + ] + ser = pd.Series(vals) + arr = ser.array + + assert arr.max() is arr[2] + assert ser.max() is arr[2] + + result = np.maximum.reduce(arr) + assert result == arr[2] + + result = np.maximum.reduce(ser) + assert result == arr[2] + + +def test_np_reduce_2d(): + raw = np.arange(12).reshape(4, 3) + arr = NumpyExtensionArray(raw) + + res = np.maximum.reduce(arr, axis=0) + tm.assert_extension_array_equal(res, arr[-1]) + + alt = arr.max(axis=0) + tm.assert_extension_array_equal(alt, arr[-1]) + + +# ---------------------------------------------------------------------------- +# Ops + + +@pytest.mark.parametrize("ufunc", [np.abs, np.negative, np.positive]) +def test_ufunc_unary(ufunc): + arr = NumpyExtensionArray(np.array([-1.0, 0.0, 1.0])) + result = ufunc(arr) + expected = NumpyExtensionArray(ufunc(arr._ndarray)) + tm.assert_extension_array_equal(result, expected) + + # same thing but with the 'out' keyword + out = NumpyExtensionArray(np.array([-9.0, -9.0, -9.0])) + ufunc(arr, out=out) + tm.assert_extension_array_equal(out, expected) + + +def test_ufunc(): + arr = NumpyExtensionArray(np.array([-1.0, 0.0, 1.0])) + + r1, r2 = np.divmod(arr, np.add(arr, 2)) + e1, e2 = np.divmod(arr._ndarray, np.add(arr._ndarray, 2)) + e1 = NumpyExtensionArray(e1) + e2 = NumpyExtensionArray(e2) + tm.assert_extension_array_equal(r1, e1) + tm.assert_extension_array_equal(r2, e2) + + +def test_basic_binop(): + # Just a basic smoke test. The EA interface tests exercise this + # more thoroughly. + x = NumpyExtensionArray(np.array([1, 2, 3])) + result = x + x + expected = NumpyExtensionArray(np.array([2, 4, 6])) + tm.assert_extension_array_equal(result, expected) + + +@pytest.mark.parametrize("dtype", [None, object]) +def test_setitem_object_typecode(dtype): + arr = NumpyExtensionArray(np.array(["a", "b", "c"], dtype=dtype)) + arr[0] = "t" + expected = NumpyExtensionArray(np.array(["t", "b", "c"], dtype=dtype)) + tm.assert_extension_array_equal(arr, expected) + + +def test_setitem_no_coercion(): + # https://github.com/pandas-dev/pandas/issues/28150 + arr = NumpyExtensionArray(np.array([1, 2, 3])) + with pytest.raises(ValueError, match="int"): + arr[0] = "a" + + # With a value that we do coerce, check that we coerce the value + # and not the underlying array. + arr[0] = 2.5 + assert isinstance(arr[0], (int, np.integer)), type(arr[0]) + + +def test_setitem_preserves_views(): + # GH#28150, see also extension test of the same name + arr = NumpyExtensionArray(np.array([1, 2, 3])) + view1 = arr.view() + view2 = arr[:] + view3 = np.asarray(arr) + + arr[0] = 9 + assert view1[0] == 9 + assert view2[0] == 9 + assert view3[0] == 9 + + arr[-1] = 2.5 + view1[-1] = 5 + assert arr[-1] == 5 + + +@pytest.mark.parametrize("dtype", [np.int64, np.uint64]) +def test_quantile_empty(dtype): + # we should get back np.nans, not -1s + arr = NumpyExtensionArray(np.array([], dtype=dtype)) + idx = pd.Index([0.0, 0.5]) + + result = arr._quantile(idx, interpolation="linear") + expected = NumpyExtensionArray(np.array([np.nan, np.nan])) + tm.assert_extension_array_equal(result, expected) + + +def test_factorize_unsigned(): + # don't raise when calling factorize on unsigned int NumpyExtensionArray + arr = np.array([1, 2, 3], dtype=np.uint64) + obj = NumpyExtensionArray(arr) + + res_codes, res_unique = obj.factorize() + exp_codes, exp_unique = pd.factorize(arr) + + tm.assert_numpy_array_equal(res_codes, exp_codes) + + tm.assert_extension_array_equal(res_unique, NumpyExtensionArray(exp_unique)) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/sparse/__init__.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/sparse/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/sparse/test_accessor.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/sparse/test_accessor.py new file mode 100644 index 0000000000000000000000000000000000000000..87eb7bcfa9cee3e92386ad0f148b896c0e682b07 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/sparse/test_accessor.py @@ -0,0 +1,253 @@ +import string + +import numpy as np +import pytest + +import pandas as pd +from pandas import SparseDtype +import pandas._testing as tm +from pandas.core.arrays.sparse import SparseArray + + +class TestSeriesAccessor: + def test_to_dense(self): + ser = pd.Series([0, 1, 0, 10], dtype="Sparse[int64]") + result = ser.sparse.to_dense() + expected = pd.Series([0, 1, 0, 10]) + tm.assert_series_equal(result, expected) + + @pytest.mark.parametrize("attr", ["npoints", "density", "fill_value", "sp_values"]) + def test_get_attributes(self, attr): + arr = SparseArray([0, 1]) + ser = pd.Series(arr) + + result = getattr(ser.sparse, attr) + expected = getattr(arr, attr) + assert result == expected + + def test_from_coo(self): + scipy_sparse = pytest.importorskip("scipy.sparse") + + row = [0, 3, 1, 0] + col = [0, 3, 1, 2] + data = [4, 5, 7, 9] + + sp_array = scipy_sparse.coo_matrix((data, (row, col))) + result = pd.Series.sparse.from_coo(sp_array) + + index = pd.MultiIndex.from_arrays( + [ + np.array([0, 0, 1, 3], dtype=np.int32), + np.array([0, 2, 1, 3], dtype=np.int32), + ], + ) + expected = pd.Series([4, 9, 7, 5], index=index, dtype="Sparse[int]") + tm.assert_series_equal(result, expected) + + @pytest.mark.parametrize( + "sort_labels, expected_rows, expected_cols, expected_values_pos", + [ + ( + False, + [("b", 2), ("a", 2), ("b", 1), ("a", 1)], + [("z", 1), ("z", 2), ("x", 2), ("z", 0)], + {1: (1, 0), 3: (3, 3)}, + ), + ( + True, + [("a", 1), ("a", 2), ("b", 1), ("b", 2)], + [("x", 2), ("z", 0), ("z", 1), ("z", 2)], + {1: (1, 2), 3: (0, 1)}, + ), + ], + ) + def test_to_coo( + self, sort_labels, expected_rows, expected_cols, expected_values_pos + ): + sp_sparse = pytest.importorskip("scipy.sparse") + + values = SparseArray([0, np.nan, 1, 0, None, 3], fill_value=0) + index = pd.MultiIndex.from_tuples( + [ + ("b", 2, "z", 1), + ("a", 2, "z", 2), + ("a", 2, "z", 1), + ("a", 2, "x", 2), + ("b", 1, "z", 1), + ("a", 1, "z", 0), + ] + ) + ss = pd.Series(values, index=index) + + expected_A = np.zeros((4, 4)) + for value, (row, col) in expected_values_pos.items(): + expected_A[row, col] = value + + A, rows, cols = ss.sparse.to_coo( + row_levels=(0, 1), column_levels=(2, 3), sort_labels=sort_labels + ) + assert isinstance(A, sp_sparse.coo_matrix) + tm.assert_numpy_array_equal(A.toarray(), expected_A) + assert rows == expected_rows + assert cols == expected_cols + + def test_non_sparse_raises(self): + ser = pd.Series([1, 2, 3]) + with pytest.raises(AttributeError, match=".sparse"): + ser.sparse.density + + +class TestFrameAccessor: + def test_accessor_raises(self): + df = pd.DataFrame({"A": [0, 1]}) + with pytest.raises(AttributeError, match="sparse"): + df.sparse + + @pytest.mark.parametrize("format", ["csc", "csr", "coo"]) + @pytest.mark.parametrize("labels", [None, list(string.ascii_letters[:10])]) + @pytest.mark.parametrize("dtype", ["float64", "int64"]) + def test_from_spmatrix(self, format, labels, dtype): + sp_sparse = pytest.importorskip("scipy.sparse") + + sp_dtype = SparseDtype(dtype, np.array(0, dtype=dtype).item()) + + mat = sp_sparse.eye(10, format=format, dtype=dtype) + result = pd.DataFrame.sparse.from_spmatrix(mat, index=labels, columns=labels) + expected = pd.DataFrame( + np.eye(10, dtype=dtype), index=labels, columns=labels + ).astype(sp_dtype) + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize("format", ["csc", "csr", "coo"]) + def test_from_spmatrix_including_explicit_zero(self, format): + sp_sparse = pytest.importorskip("scipy.sparse") + + mat = sp_sparse.random(10, 2, density=0.5, format=format) + mat.data[0] = 0 + result = pd.DataFrame.sparse.from_spmatrix(mat) + dtype = SparseDtype("float64", 0.0) + expected = pd.DataFrame(mat.todense()).astype(dtype) + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize( + "columns", + [["a", "b"], pd.MultiIndex.from_product([["A"], ["a", "b"]]), ["a", "a"]], + ) + def test_from_spmatrix_columns(self, columns): + sp_sparse = pytest.importorskip("scipy.sparse") + + dtype = SparseDtype("float64", 0.0) + + mat = sp_sparse.random(10, 2, density=0.5) + result = pd.DataFrame.sparse.from_spmatrix(mat, columns=columns) + expected = pd.DataFrame(mat.toarray(), columns=columns).astype(dtype) + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize( + "colnames", [("A", "B"), (1, 2), (1, pd.NA), (0.1, 0.2), ("x", "x"), (0, 0)] + ) + def test_to_coo(self, colnames): + sp_sparse = pytest.importorskip("scipy.sparse") + + df = pd.DataFrame( + {colnames[0]: [0, 1, 0], colnames[1]: [1, 0, 0]}, dtype="Sparse[int64, 0]" + ) + result = df.sparse.to_coo() + expected = sp_sparse.coo_matrix(np.asarray(df)) + assert (result != expected).nnz == 0 + + @pytest.mark.parametrize("fill_value", [1, np.nan]) + def test_to_coo_nonzero_fill_val_raises(self, fill_value): + pytest.importorskip("scipy") + df = pd.DataFrame( + { + "A": SparseArray( + [fill_value, fill_value, fill_value, 2], fill_value=fill_value + ), + "B": SparseArray( + [fill_value, 2, fill_value, fill_value], fill_value=fill_value + ), + } + ) + with pytest.raises(ValueError, match="fill value must be 0"): + df.sparse.to_coo() + + def test_to_coo_midx_categorical(self): + # GH#50996 + sp_sparse = pytest.importorskip("scipy.sparse") + + midx = pd.MultiIndex.from_arrays( + [ + pd.CategoricalIndex(list("ab"), name="x"), + pd.CategoricalIndex([0, 1], name="y"), + ] + ) + + ser = pd.Series(1, index=midx, dtype="Sparse[int]") + result = ser.sparse.to_coo(row_levels=["x"], column_levels=["y"])[0] + expected = sp_sparse.coo_matrix( + (np.array([1, 1]), (np.array([0, 1]), np.array([0, 1]))), shape=(2, 2) + ) + assert (result != expected).nnz == 0 + + def test_to_dense(self): + df = pd.DataFrame( + { + "A": SparseArray([1, 0], dtype=SparseDtype("int64", 0)), + "B": SparseArray([1, 0], dtype=SparseDtype("int64", 1)), + "C": SparseArray([1.0, 0.0], dtype=SparseDtype("float64", 0.0)), + }, + index=["b", "a"], + ) + result = df.sparse.to_dense() + expected = pd.DataFrame( + {"A": [1, 0], "B": [1, 0], "C": [1.0, 0.0]}, index=["b", "a"] + ) + tm.assert_frame_equal(result, expected) + + def test_density(self): + df = pd.DataFrame( + { + "A": SparseArray([1, 0, 2, 1], fill_value=0), + "B": SparseArray([0, 1, 1, 1], fill_value=0), + } + ) + res = df.sparse.density + expected = 0.75 + assert res == expected + + @pytest.mark.parametrize("dtype", ["int64", "float64"]) + @pytest.mark.parametrize("dense_index", [True, False]) + def test_series_from_coo(self, dtype, dense_index): + sp_sparse = pytest.importorskip("scipy.sparse") + + A = sp_sparse.eye(3, format="coo", dtype=dtype) + result = pd.Series.sparse.from_coo(A, dense_index=dense_index) + + index = pd.MultiIndex.from_tuples( + [ + np.array([0, 0], dtype=np.int32), + np.array([1, 1], dtype=np.int32), + np.array([2, 2], dtype=np.int32), + ], + ) + expected = pd.Series(SparseArray(np.array([1, 1, 1], dtype=dtype)), index=index) + if dense_index: + expected = expected.reindex(pd.MultiIndex.from_product(index.levels)) + + tm.assert_series_equal(result, expected) + + def test_series_from_coo_incorrect_format_raises(self): + # gh-26554 + sp_sparse = pytest.importorskip("scipy.sparse") + + m = sp_sparse.csr_matrix(np.array([[0, 1], [0, 0]])) + with pytest.raises( + TypeError, match="Expected coo_matrix. Got csr_matrix instead." + ): + pd.Series.sparse.from_coo(m) + + def test_with_column_named_sparse(self): + # https://github.com/pandas-dev/pandas/issues/30758 + df = pd.DataFrame({"sparse": pd.arrays.SparseArray([1, 2])}) + assert isinstance(df.sparse, pd.core.arrays.sparse.accessor.SparseFrameAccessor) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/sparse/test_arithmetics.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/sparse/test_arithmetics.py new file mode 100644 index 0000000000000000000000000000000000000000..ffc93b4e4f176385ac7b2b8a0b51027cb0bad9f6 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/sparse/test_arithmetics.py @@ -0,0 +1,514 @@ +import operator + +import numpy as np +import pytest + +import pandas as pd +from pandas import SparseDtype +import pandas._testing as tm +from pandas.core.arrays.sparse import SparseArray + + +@pytest.fixture(params=["integer", "block"]) +def kind(request): + """kind kwarg to pass to SparseArray""" + return request.param + + +@pytest.fixture(params=[True, False]) +def mix(request): + """ + Fixture returning True or False, determining whether to operate + op(sparse, dense) instead of op(sparse, sparse) + """ + return request.param + + +class TestSparseArrayArithmetics: + def _assert(self, a, b): + # We have to use tm.assert_sp_array_equal. See GH #45126 + tm.assert_numpy_array_equal(a, b) + + def _check_numeric_ops(self, a, b, a_dense, b_dense, mix: bool, op): + # Check that arithmetic behavior matches non-Sparse Series arithmetic + + if isinstance(a_dense, np.ndarray): + expected = op(pd.Series(a_dense), b_dense).values + elif isinstance(b_dense, np.ndarray): + expected = op(a_dense, pd.Series(b_dense)).values + else: + raise NotImplementedError + + with np.errstate(invalid="ignore", divide="ignore"): + if mix: + result = op(a, b_dense).to_dense() + else: + result = op(a, b).to_dense() + + self._assert(result, expected) + + def _check_bool_result(self, res): + assert isinstance(res, SparseArray) + assert isinstance(res.dtype, SparseDtype) + assert res.dtype.subtype == np.bool_ + assert isinstance(res.fill_value, bool) + + def _check_comparison_ops(self, a, b, a_dense, b_dense): + with np.errstate(invalid="ignore"): + # Unfortunately, trying to wrap the computation of each expected + # value is with np.errstate() is too tedious. + # + # sparse & sparse + self._check_bool_result(a == b) + self._assert((a == b).to_dense(), a_dense == b_dense) + + self._check_bool_result(a != b) + self._assert((a != b).to_dense(), a_dense != b_dense) + + self._check_bool_result(a >= b) + self._assert((a >= b).to_dense(), a_dense >= b_dense) + + self._check_bool_result(a <= b) + self._assert((a <= b).to_dense(), a_dense <= b_dense) + + self._check_bool_result(a > b) + self._assert((a > b).to_dense(), a_dense > b_dense) + + self._check_bool_result(a < b) + self._assert((a < b).to_dense(), a_dense < b_dense) + + # sparse & dense + self._check_bool_result(a == b_dense) + self._assert((a == b_dense).to_dense(), a_dense == b_dense) + + self._check_bool_result(a != b_dense) + self._assert((a != b_dense).to_dense(), a_dense != b_dense) + + self._check_bool_result(a >= b_dense) + self._assert((a >= b_dense).to_dense(), a_dense >= b_dense) + + self._check_bool_result(a <= b_dense) + self._assert((a <= b_dense).to_dense(), a_dense <= b_dense) + + self._check_bool_result(a > b_dense) + self._assert((a > b_dense).to_dense(), a_dense > b_dense) + + self._check_bool_result(a < b_dense) + self._assert((a < b_dense).to_dense(), a_dense < b_dense) + + def _check_logical_ops(self, a, b, a_dense, b_dense): + # sparse & sparse + self._check_bool_result(a & b) + self._assert((a & b).to_dense(), a_dense & b_dense) + + self._check_bool_result(a | b) + self._assert((a | b).to_dense(), a_dense | b_dense) + # sparse & dense + self._check_bool_result(a & b_dense) + self._assert((a & b_dense).to_dense(), a_dense & b_dense) + + self._check_bool_result(a | b_dense) + self._assert((a | b_dense).to_dense(), a_dense | b_dense) + + @pytest.mark.parametrize("scalar", [0, 1, 3]) + @pytest.mark.parametrize("fill_value", [None, 0, 2]) + def test_float_scalar( + self, kind, mix, all_arithmetic_functions, fill_value, scalar, request + ): + op = all_arithmetic_functions + values = np.array([np.nan, 1, 2, 0, np.nan, 0, 1, 2, 1, np.nan]) + a = SparseArray(values, kind=kind, fill_value=fill_value) + self._check_numeric_ops(a, scalar, values, scalar, mix, op) + + def test_float_scalar_comparison(self, kind): + values = np.array([np.nan, 1, 2, 0, np.nan, 0, 1, 2, 1, np.nan]) + + a = SparseArray(values, kind=kind) + self._check_comparison_ops(a, 1, values, 1) + self._check_comparison_ops(a, 0, values, 0) + self._check_comparison_ops(a, 3, values, 3) + + a = SparseArray(values, kind=kind, fill_value=0) + self._check_comparison_ops(a, 1, values, 1) + self._check_comparison_ops(a, 0, values, 0) + self._check_comparison_ops(a, 3, values, 3) + + a = SparseArray(values, kind=kind, fill_value=2) + self._check_comparison_ops(a, 1, values, 1) + self._check_comparison_ops(a, 0, values, 0) + self._check_comparison_ops(a, 3, values, 3) + + def test_float_same_index_without_nans(self, kind, mix, all_arithmetic_functions): + # when sp_index are the same + op = all_arithmetic_functions + + values = np.array([0.0, 1.0, 2.0, 6.0, 0.0, 0.0, 1.0, 2.0, 1.0, 0.0]) + rvalues = np.array([0.0, 2.0, 3.0, 4.0, 0.0, 0.0, 1.0, 3.0, 2.0, 0.0]) + + a = SparseArray(values, kind=kind, fill_value=0) + b = SparseArray(rvalues, kind=kind, fill_value=0) + self._check_numeric_ops(a, b, values, rvalues, mix, op) + + def test_float_same_index_with_nans( + self, kind, mix, all_arithmetic_functions, request + ): + # when sp_index are the same + op = all_arithmetic_functions + values = np.array([np.nan, 1, 2, 0, np.nan, 0, 1, 2, 1, np.nan]) + rvalues = np.array([np.nan, 2, 3, 4, np.nan, 0, 1, 3, 2, np.nan]) + + a = SparseArray(values, kind=kind) + b = SparseArray(rvalues, kind=kind) + self._check_numeric_ops(a, b, values, rvalues, mix, op) + + def test_float_same_index_comparison(self, kind): + # when sp_index are the same + values = np.array([np.nan, 1, 2, 0, np.nan, 0, 1, 2, 1, np.nan]) + rvalues = np.array([np.nan, 2, 3, 4, np.nan, 0, 1, 3, 2, np.nan]) + + a = SparseArray(values, kind=kind) + b = SparseArray(rvalues, kind=kind) + self._check_comparison_ops(a, b, values, rvalues) + + values = np.array([0.0, 1.0, 2.0, 6.0, 0.0, 0.0, 1.0, 2.0, 1.0, 0.0]) + rvalues = np.array([0.0, 2.0, 3.0, 4.0, 0.0, 0.0, 1.0, 3.0, 2.0, 0.0]) + + a = SparseArray(values, kind=kind, fill_value=0) + b = SparseArray(rvalues, kind=kind, fill_value=0) + self._check_comparison_ops(a, b, values, rvalues) + + def test_float_array(self, kind, mix, all_arithmetic_functions): + op = all_arithmetic_functions + + values = np.array([np.nan, 1, 2, 0, np.nan, 0, 1, 2, 1, np.nan]) + rvalues = np.array([2, np.nan, 2, 3, np.nan, 0, 1, 5, 2, np.nan]) + + a = SparseArray(values, kind=kind) + b = SparseArray(rvalues, kind=kind) + self._check_numeric_ops(a, b, values, rvalues, mix, op) + self._check_numeric_ops(a, b * 0, values, rvalues * 0, mix, op) + + a = SparseArray(values, kind=kind, fill_value=0) + b = SparseArray(rvalues, kind=kind) + self._check_numeric_ops(a, b, values, rvalues, mix, op) + + a = SparseArray(values, kind=kind, fill_value=0) + b = SparseArray(rvalues, kind=kind, fill_value=0) + self._check_numeric_ops(a, b, values, rvalues, mix, op) + + a = SparseArray(values, kind=kind, fill_value=1) + b = SparseArray(rvalues, kind=kind, fill_value=2) + self._check_numeric_ops(a, b, values, rvalues, mix, op) + + def test_float_array_different_kind(self, mix, all_arithmetic_functions): + op = all_arithmetic_functions + + values = np.array([np.nan, 1, 2, 0, np.nan, 0, 1, 2, 1, np.nan]) + rvalues = np.array([2, np.nan, 2, 3, np.nan, 0, 1, 5, 2, np.nan]) + + a = SparseArray(values, kind="integer") + b = SparseArray(rvalues, kind="block") + self._check_numeric_ops(a, b, values, rvalues, mix, op) + self._check_numeric_ops(a, b * 0, values, rvalues * 0, mix, op) + + a = SparseArray(values, kind="integer", fill_value=0) + b = SparseArray(rvalues, kind="block") + self._check_numeric_ops(a, b, values, rvalues, mix, op) + + a = SparseArray(values, kind="integer", fill_value=0) + b = SparseArray(rvalues, kind="block", fill_value=0) + self._check_numeric_ops(a, b, values, rvalues, mix, op) + + a = SparseArray(values, kind="integer", fill_value=1) + b = SparseArray(rvalues, kind="block", fill_value=2) + self._check_numeric_ops(a, b, values, rvalues, mix, op) + + def test_float_array_comparison(self, kind): + values = np.array([np.nan, 1, 2, 0, np.nan, 0, 1, 2, 1, np.nan]) + rvalues = np.array([2, np.nan, 2, 3, np.nan, 0, 1, 5, 2, np.nan]) + + a = SparseArray(values, kind=kind) + b = SparseArray(rvalues, kind=kind) + self._check_comparison_ops(a, b, values, rvalues) + self._check_comparison_ops(a, b * 0, values, rvalues * 0) + + a = SparseArray(values, kind=kind, fill_value=0) + b = SparseArray(rvalues, kind=kind) + self._check_comparison_ops(a, b, values, rvalues) + + a = SparseArray(values, kind=kind, fill_value=0) + b = SparseArray(rvalues, kind=kind, fill_value=0) + self._check_comparison_ops(a, b, values, rvalues) + + a = SparseArray(values, kind=kind, fill_value=1) + b = SparseArray(rvalues, kind=kind, fill_value=2) + self._check_comparison_ops(a, b, values, rvalues) + + def test_int_array(self, kind, mix, all_arithmetic_functions): + op = all_arithmetic_functions + + # have to specify dtype explicitly until fixing GH 667 + dtype = np.int64 + + values = np.array([0, 1, 2, 0, 0, 0, 1, 2, 1, 0], dtype=dtype) + rvalues = np.array([2, 0, 2, 3, 0, 0, 1, 5, 2, 0], dtype=dtype) + + a = SparseArray(values, dtype=dtype, kind=kind) + assert a.dtype == SparseDtype(dtype) + b = SparseArray(rvalues, dtype=dtype, kind=kind) + assert b.dtype == SparseDtype(dtype) + + self._check_numeric_ops(a, b, values, rvalues, mix, op) + self._check_numeric_ops(a, b * 0, values, rvalues * 0, mix, op) + + a = SparseArray(values, fill_value=0, dtype=dtype, kind=kind) + assert a.dtype == SparseDtype(dtype) + b = SparseArray(rvalues, dtype=dtype, kind=kind) + assert b.dtype == SparseDtype(dtype) + + self._check_numeric_ops(a, b, values, rvalues, mix, op) + + a = SparseArray(values, fill_value=0, dtype=dtype, kind=kind) + assert a.dtype == SparseDtype(dtype) + b = SparseArray(rvalues, fill_value=0, dtype=dtype, kind=kind) + assert b.dtype == SparseDtype(dtype) + self._check_numeric_ops(a, b, values, rvalues, mix, op) + + a = SparseArray(values, fill_value=1, dtype=dtype, kind=kind) + assert a.dtype == SparseDtype(dtype, fill_value=1) + b = SparseArray(rvalues, fill_value=2, dtype=dtype, kind=kind) + assert b.dtype == SparseDtype(dtype, fill_value=2) + self._check_numeric_ops(a, b, values, rvalues, mix, op) + + def test_int_array_comparison(self, kind): + dtype = "int64" + # int32 NI ATM + + values = np.array([0, 1, 2, 0, 0, 0, 1, 2, 1, 0], dtype=dtype) + rvalues = np.array([2, 0, 2, 3, 0, 0, 1, 5, 2, 0], dtype=dtype) + + a = SparseArray(values, dtype=dtype, kind=kind) + b = SparseArray(rvalues, dtype=dtype, kind=kind) + self._check_comparison_ops(a, b, values, rvalues) + self._check_comparison_ops(a, b * 0, values, rvalues * 0) + + a = SparseArray(values, dtype=dtype, kind=kind, fill_value=0) + b = SparseArray(rvalues, dtype=dtype, kind=kind) + self._check_comparison_ops(a, b, values, rvalues) + + a = SparseArray(values, dtype=dtype, kind=kind, fill_value=0) + b = SparseArray(rvalues, dtype=dtype, kind=kind, fill_value=0) + self._check_comparison_ops(a, b, values, rvalues) + + a = SparseArray(values, dtype=dtype, kind=kind, fill_value=1) + b = SparseArray(rvalues, dtype=dtype, kind=kind, fill_value=2) + self._check_comparison_ops(a, b, values, rvalues) + + @pytest.mark.parametrize("fill_value", [True, False, np.nan]) + def test_bool_same_index(self, kind, fill_value): + # GH 14000 + # when sp_index are the same + values = np.array([True, False, True, True], dtype=np.bool_) + rvalues = np.array([True, False, True, True], dtype=np.bool_) + + a = SparseArray(values, kind=kind, dtype=np.bool_, fill_value=fill_value) + b = SparseArray(rvalues, kind=kind, dtype=np.bool_, fill_value=fill_value) + self._check_logical_ops(a, b, values, rvalues) + + @pytest.mark.parametrize("fill_value", [True, False, np.nan]) + def test_bool_array_logical(self, kind, fill_value): + # GH 14000 + # when sp_index are the same + values = np.array([True, False, True, False, True, True], dtype=np.bool_) + rvalues = np.array([True, False, False, True, False, True], dtype=np.bool_) + + a = SparseArray(values, kind=kind, dtype=np.bool_, fill_value=fill_value) + b = SparseArray(rvalues, kind=kind, dtype=np.bool_, fill_value=fill_value) + self._check_logical_ops(a, b, values, rvalues) + + def test_mixed_array_float_int(self, kind, mix, all_arithmetic_functions, request): + op = all_arithmetic_functions + rdtype = "int64" + values = np.array([np.nan, 1, 2, 0, np.nan, 0, 1, 2, 1, np.nan]) + rvalues = np.array([2, 0, 2, 3, 0, 0, 1, 5, 2, 0], dtype=rdtype) + + a = SparseArray(values, kind=kind) + b = SparseArray(rvalues, kind=kind) + assert b.dtype == SparseDtype(rdtype) + + self._check_numeric_ops(a, b, values, rvalues, mix, op) + self._check_numeric_ops(a, b * 0, values, rvalues * 0, mix, op) + + a = SparseArray(values, kind=kind, fill_value=0) + b = SparseArray(rvalues, kind=kind) + assert b.dtype == SparseDtype(rdtype) + self._check_numeric_ops(a, b, values, rvalues, mix, op) + + a = SparseArray(values, kind=kind, fill_value=0) + b = SparseArray(rvalues, kind=kind, fill_value=0) + assert b.dtype == SparseDtype(rdtype) + self._check_numeric_ops(a, b, values, rvalues, mix, op) + + a = SparseArray(values, kind=kind, fill_value=1) + b = SparseArray(rvalues, kind=kind, fill_value=2) + assert b.dtype == SparseDtype(rdtype, fill_value=2) + self._check_numeric_ops(a, b, values, rvalues, mix, op) + + def test_mixed_array_comparison(self, kind): + rdtype = "int64" + # int32 NI ATM + + values = np.array([np.nan, 1, 2, 0, np.nan, 0, 1, 2, 1, np.nan]) + rvalues = np.array([2, 0, 2, 3, 0, 0, 1, 5, 2, 0], dtype=rdtype) + + a = SparseArray(values, kind=kind) + b = SparseArray(rvalues, kind=kind) + assert b.dtype == SparseDtype(rdtype) + + self._check_comparison_ops(a, b, values, rvalues) + self._check_comparison_ops(a, b * 0, values, rvalues * 0) + + a = SparseArray(values, kind=kind, fill_value=0) + b = SparseArray(rvalues, kind=kind) + assert b.dtype == SparseDtype(rdtype) + self._check_comparison_ops(a, b, values, rvalues) + + a = SparseArray(values, kind=kind, fill_value=0) + b = SparseArray(rvalues, kind=kind, fill_value=0) + assert b.dtype == SparseDtype(rdtype) + self._check_comparison_ops(a, b, values, rvalues) + + a = SparseArray(values, kind=kind, fill_value=1) + b = SparseArray(rvalues, kind=kind, fill_value=2) + assert b.dtype == SparseDtype(rdtype, fill_value=2) + self._check_comparison_ops(a, b, values, rvalues) + + def test_xor(self): + s = SparseArray([True, True, False, False]) + t = SparseArray([True, False, True, False]) + result = s ^ t + sp_index = pd.core.arrays.sparse.IntIndex(4, np.array([0, 1, 2], dtype="int32")) + expected = SparseArray([False, True, True], sparse_index=sp_index) + tm.assert_sp_array_equal(result, expected) + + +@pytest.mark.parametrize("op", [operator.eq, operator.add]) +def test_with_list(op): + arr = SparseArray([0, 1], fill_value=0) + result = op(arr, [0, 1]) + expected = op(arr, SparseArray([0, 1])) + tm.assert_sp_array_equal(result, expected) + + +def test_with_dataframe(): + # GH#27910 + arr = SparseArray([0, 1], fill_value=0) + df = pd.DataFrame([[1, 2], [3, 4]]) + result = arr.__add__(df) + assert result is NotImplemented + + +def test_with_zerodim_ndarray(): + # GH#27910 + arr = SparseArray([0, 1], fill_value=0) + + result = arr * np.array(2) + expected = arr * 2 + tm.assert_sp_array_equal(result, expected) + + +@pytest.mark.parametrize("ufunc", [np.abs, np.exp]) +@pytest.mark.parametrize( + "arr", [SparseArray([0, 0, -1, 1]), SparseArray([None, None, -1, 1])] +) +def test_ufuncs(ufunc, arr): + result = ufunc(arr) + fill_value = ufunc(arr.fill_value) + expected = SparseArray(ufunc(np.asarray(arr)), fill_value=fill_value) + tm.assert_sp_array_equal(result, expected) + + +@pytest.mark.parametrize( + "a, b", + [ + (SparseArray([0, 0, 0]), np.array([0, 1, 2])), + (SparseArray([0, 0, 0], fill_value=1), np.array([0, 1, 2])), + (SparseArray([0, 0, 0], fill_value=1), np.array([0, 1, 2])), + (SparseArray([0, 0, 0], fill_value=1), np.array([0, 1, 2])), + (SparseArray([0, 0, 0], fill_value=1), np.array([0, 1, 2])), + ], +) +@pytest.mark.parametrize("ufunc", [np.add, np.greater]) +def test_binary_ufuncs(ufunc, a, b): + # can't say anything about fill value here. + result = ufunc(a, b) + expected = ufunc(np.asarray(a), np.asarray(b)) + assert isinstance(result, SparseArray) + tm.assert_numpy_array_equal(np.asarray(result), expected) + + +def test_ndarray_inplace(): + sparray = SparseArray([0, 2, 0, 0]) + ndarray = np.array([0, 1, 2, 3]) + ndarray += sparray + expected = np.array([0, 3, 2, 3]) + tm.assert_numpy_array_equal(ndarray, expected) + + +def test_sparray_inplace(): + sparray = SparseArray([0, 2, 0, 0]) + ndarray = np.array([0, 1, 2, 3]) + sparray += ndarray + expected = SparseArray([0, 3, 2, 3], fill_value=0) + tm.assert_sp_array_equal(sparray, expected) + + +@pytest.mark.parametrize("cons", [list, np.array, SparseArray]) +def test_mismatched_length_cmp_op(cons): + left = SparseArray([True, True]) + right = cons([True, True, True]) + with pytest.raises(ValueError, match="operands have mismatched length"): + left & right + + +@pytest.mark.parametrize("op", ["add", "sub", "mul", "truediv", "floordiv", "pow"]) +@pytest.mark.parametrize("fill_value", [np.nan, 3]) +def test_binary_operators(op, fill_value): + op = getattr(operator, op) + data1 = np.random.default_rng(2).standard_normal(20) + data2 = np.random.default_rng(2).standard_normal(20) + + data1[::2] = fill_value + data2[::3] = fill_value + + first = SparseArray(data1, fill_value=fill_value) + second = SparseArray(data2, fill_value=fill_value) + + with np.errstate(all="ignore"): + res = op(first, second) + exp = SparseArray( + op(first.to_dense(), second.to_dense()), fill_value=first.fill_value + ) + assert isinstance(res, SparseArray) + tm.assert_almost_equal(res.to_dense(), exp.to_dense()) + + res2 = op(first, second.to_dense()) + assert isinstance(res2, SparseArray) + tm.assert_sp_array_equal(res, res2) + + res3 = op(first.to_dense(), second) + assert isinstance(res3, SparseArray) + tm.assert_sp_array_equal(res, res3) + + res4 = op(first, 4) + assert isinstance(res4, SparseArray) + + # Ignore this if the actual op raises (e.g. pow). + try: + exp = op(first.to_dense(), 4) + exp_fv = op(first.fill_value, 4) + except ValueError: + pass + else: + tm.assert_almost_equal(res4.fill_value, exp_fv) + tm.assert_almost_equal(res4.to_dense(), exp) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/sparse/test_array.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/sparse/test_array.py new file mode 100644 index 0000000000000000000000000000000000000000..883d6ea3959ff6c12659c55762b889be18349ef7 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/sparse/test_array.py @@ -0,0 +1,480 @@ +import re + +import numpy as np +import pytest + +from pandas._libs.sparse import IntIndex + +import pandas as pd +from pandas import ( + SparseDtype, + isna, +) +import pandas._testing as tm +from pandas.core.arrays.sparse import SparseArray + + +@pytest.fixture +def arr_data(): + """Fixture returning numpy array with valid and missing entries""" + return np.array([np.nan, np.nan, 1, 2, 3, np.nan, 4, 5, np.nan, 6]) + + +@pytest.fixture +def arr(arr_data): + """Fixture returning SparseArray from 'arr_data'""" + return SparseArray(arr_data) + + +@pytest.fixture +def zarr(): + """Fixture returning SparseArray with integer entries and 'fill_value=0'""" + return SparseArray([0, 0, 1, 2, 3, 0, 4, 5, 0, 6], fill_value=0) + + +class TestSparseArray: + @pytest.mark.parametrize("fill_value", [0, None, np.nan]) + def test_shift_fill_value(self, fill_value): + # GH #24128 + sparse = SparseArray(np.array([1, 0, 0, 3, 0]), fill_value=8.0) + res = sparse.shift(1, fill_value=fill_value) + if isna(fill_value): + fill_value = res.dtype.na_value + exp = SparseArray(np.array([fill_value, 1, 0, 0, 3]), fill_value=8.0) + tm.assert_sp_array_equal(res, exp) + + def test_set_fill_value(self): + arr = SparseArray([1.0, np.nan, 2.0], fill_value=np.nan) + arr.fill_value = 2 + assert arr.fill_value == 2 + + arr = SparseArray([1, 0, 2], fill_value=0, dtype=np.int64) + arr.fill_value = 2 + assert arr.fill_value == 2 + + msg = "Allowing arbitrary scalar fill_value in SparseDtype is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + arr.fill_value = 3.1 + assert arr.fill_value == 3.1 + + arr.fill_value = np.nan + assert np.isnan(arr.fill_value) + + arr = SparseArray([True, False, True], fill_value=False, dtype=np.bool_) + arr.fill_value = True + assert arr.fill_value is True + + with tm.assert_produces_warning(FutureWarning, match=msg): + arr.fill_value = 0 + + arr.fill_value = np.nan + assert np.isnan(arr.fill_value) + + @pytest.mark.parametrize("val", [[1, 2, 3], np.array([1, 2]), (1, 2, 3)]) + def test_set_fill_invalid_non_scalar(self, val): + arr = SparseArray([True, False, True], fill_value=False, dtype=np.bool_) + msg = "fill_value must be a scalar" + + with pytest.raises(ValueError, match=msg): + arr.fill_value = val + + def test_copy(self, arr): + arr2 = arr.copy() + assert arr2.sp_values is not arr.sp_values + assert arr2.sp_index is arr.sp_index + + def test_values_asarray(self, arr_data, arr): + tm.assert_almost_equal(arr.to_dense(), arr_data) + + @pytest.mark.parametrize( + "data,shape,dtype", + [ + ([0, 0, 0, 0, 0], (5,), None), + ([], (0,), None), + ([0], (1,), None), + (["A", "A", np.nan, "B"], (4,), object), + ], + ) + def test_shape(self, data, shape, dtype): + # GH 21126 + out = SparseArray(data, dtype=dtype) + assert out.shape == shape + + @pytest.mark.parametrize( + "vals", + [ + [np.nan, np.nan, np.nan, np.nan, np.nan], + [1, np.nan, np.nan, 3, np.nan], + [1, np.nan, 0, 3, 0], + ], + ) + @pytest.mark.parametrize("fill_value", [None, 0]) + def test_dense_repr(self, vals, fill_value): + vals = np.array(vals) + arr = SparseArray(vals, fill_value=fill_value) + + res = arr.to_dense() + tm.assert_numpy_array_equal(res, vals) + + @pytest.mark.parametrize("fix", ["arr", "zarr"]) + def test_pickle(self, fix, request): + obj = request.getfixturevalue(fix) + unpickled = tm.round_trip_pickle(obj) + tm.assert_sp_array_equal(unpickled, obj) + + def test_generator_warnings(self): + sp_arr = SparseArray([1, 2, 3]) + with tm.assert_produces_warning(None): + for _ in sp_arr: + pass + + def test_where_retain_fill_value(self): + # GH#45691 don't lose fill_value on _where + arr = SparseArray([np.nan, 1.0], fill_value=0) + + mask = np.array([True, False]) + + res = arr._where(~mask, 1) + exp = SparseArray([1, 1.0], fill_value=0) + tm.assert_sp_array_equal(res, exp) + + ser = pd.Series(arr) + res = ser.where(~mask, 1) + tm.assert_series_equal(res, pd.Series(exp)) + + def test_fillna(self): + s = SparseArray([1, np.nan, np.nan, 3, np.nan]) + res = s.fillna(-1) + exp = SparseArray([1, -1, -1, 3, -1], fill_value=-1, dtype=np.float64) + tm.assert_sp_array_equal(res, exp) + + s = SparseArray([1, np.nan, np.nan, 3, np.nan], fill_value=0) + res = s.fillna(-1) + exp = SparseArray([1, -1, -1, 3, -1], fill_value=0, dtype=np.float64) + tm.assert_sp_array_equal(res, exp) + + s = SparseArray([1, np.nan, 0, 3, 0]) + res = s.fillna(-1) + exp = SparseArray([1, -1, 0, 3, 0], fill_value=-1, dtype=np.float64) + tm.assert_sp_array_equal(res, exp) + + s = SparseArray([1, np.nan, 0, 3, 0], fill_value=0) + res = s.fillna(-1) + exp = SparseArray([1, -1, 0, 3, 0], fill_value=0, dtype=np.float64) + tm.assert_sp_array_equal(res, exp) + + s = SparseArray([np.nan, np.nan, np.nan, np.nan]) + res = s.fillna(-1) + exp = SparseArray([-1, -1, -1, -1], fill_value=-1, dtype=np.float64) + tm.assert_sp_array_equal(res, exp) + + s = SparseArray([np.nan, np.nan, np.nan, np.nan], fill_value=0) + res = s.fillna(-1) + exp = SparseArray([-1, -1, -1, -1], fill_value=0, dtype=np.float64) + tm.assert_sp_array_equal(res, exp) + + # float dtype's fill_value is np.nan, replaced by -1 + s = SparseArray([0.0, 0.0, 0.0, 0.0]) + res = s.fillna(-1) + exp = SparseArray([0.0, 0.0, 0.0, 0.0], fill_value=-1) + tm.assert_sp_array_equal(res, exp) + + # int dtype shouldn't have missing. No changes. + s = SparseArray([0, 0, 0, 0]) + assert s.dtype == SparseDtype(np.int64) + assert s.fill_value == 0 + res = s.fillna(-1) + tm.assert_sp_array_equal(res, s) + + s = SparseArray([0, 0, 0, 0], fill_value=0) + assert s.dtype == SparseDtype(np.int64) + assert s.fill_value == 0 + res = s.fillna(-1) + exp = SparseArray([0, 0, 0, 0], fill_value=0) + tm.assert_sp_array_equal(res, exp) + + # fill_value can be nan if there is no missing hole. + # only fill_value will be changed + s = SparseArray([0, 0, 0, 0], fill_value=np.nan) + assert s.dtype == SparseDtype(np.int64, fill_value=np.nan) + assert np.isnan(s.fill_value) + res = s.fillna(-1) + exp = SparseArray([0, 0, 0, 0], fill_value=-1) + tm.assert_sp_array_equal(res, exp) + + def test_fillna_overlap(self): + s = SparseArray([1, np.nan, np.nan, 3, np.nan]) + # filling with existing value doesn't replace existing value with + # fill_value, i.e. existing 3 remains in sp_values + res = s.fillna(3) + exp = np.array([1, 3, 3, 3, 3], dtype=np.float64) + tm.assert_numpy_array_equal(res.to_dense(), exp) + + s = SparseArray([1, np.nan, np.nan, 3, np.nan], fill_value=0) + res = s.fillna(3) + exp = SparseArray([1, 3, 3, 3, 3], fill_value=0, dtype=np.float64) + tm.assert_sp_array_equal(res, exp) + + def test_nonzero(self): + # Tests regression #21172. + sa = SparseArray([float("nan"), float("nan"), 1, 0, 0, 2, 0, 0, 0, 3, 0, 0]) + expected = np.array([2, 5, 9], dtype=np.int32) + (result,) = sa.nonzero() + tm.assert_numpy_array_equal(expected, result) + + sa = SparseArray([0, 0, 1, 0, 0, 2, 0, 0, 0, 3, 0, 0]) + (result,) = sa.nonzero() + tm.assert_numpy_array_equal(expected, result) + + +class TestSparseArrayAnalytics: + @pytest.mark.parametrize( + "data,expected", + [ + ( + np.array([1, 2, 3, 4, 5], dtype=float), # non-null data + SparseArray(np.array([1.0, 3.0, 6.0, 10.0, 15.0])), + ), + ( + np.array([1, 2, np.nan, 4, 5], dtype=float), # null data + SparseArray(np.array([1.0, 3.0, np.nan, 7.0, 12.0])), + ), + ], + ) + @pytest.mark.parametrize("numpy", [True, False]) + def test_cumsum(self, data, expected, numpy): + cumsum = np.cumsum if numpy else lambda s: s.cumsum() + + out = cumsum(SparseArray(data)) + tm.assert_sp_array_equal(out, expected) + + out = cumsum(SparseArray(data, fill_value=np.nan)) + tm.assert_sp_array_equal(out, expected) + + out = cumsum(SparseArray(data, fill_value=2)) + tm.assert_sp_array_equal(out, expected) + + if numpy: # numpy compatibility checks. + msg = "the 'dtype' parameter is not supported" + with pytest.raises(ValueError, match=msg): + np.cumsum(SparseArray(data), dtype=np.int64) + + msg = "the 'out' parameter is not supported" + with pytest.raises(ValueError, match=msg): + np.cumsum(SparseArray(data), out=out) + else: + axis = 1 # SparseArray currently 1-D, so only axis = 0 is valid. + msg = re.escape(f"axis(={axis}) out of bounds") + with pytest.raises(ValueError, match=msg): + SparseArray(data).cumsum(axis=axis) + + def test_ufunc(self): + # GH 13853 make sure ufunc is applied to fill_value + sparse = SparseArray([1, np.nan, 2, np.nan, -2]) + result = SparseArray([1, np.nan, 2, np.nan, 2]) + tm.assert_sp_array_equal(abs(sparse), result) + tm.assert_sp_array_equal(np.abs(sparse), result) + + sparse = SparseArray([1, -1, 2, -2], fill_value=1) + result = SparseArray([1, 2, 2], sparse_index=sparse.sp_index, fill_value=1) + tm.assert_sp_array_equal(abs(sparse), result) + tm.assert_sp_array_equal(np.abs(sparse), result) + + sparse = SparseArray([1, -1, 2, -2], fill_value=-1) + exp = SparseArray([1, 1, 2, 2], fill_value=1) + tm.assert_sp_array_equal(abs(sparse), exp) + tm.assert_sp_array_equal(np.abs(sparse), exp) + + sparse = SparseArray([1, np.nan, 2, np.nan, -2]) + result = SparseArray(np.sin([1, np.nan, 2, np.nan, -2])) + tm.assert_sp_array_equal(np.sin(sparse), result) + + sparse = SparseArray([1, -1, 2, -2], fill_value=1) + result = SparseArray(np.sin([1, -1, 2, -2]), fill_value=np.sin(1)) + tm.assert_sp_array_equal(np.sin(sparse), result) + + sparse = SparseArray([1, -1, 0, -2], fill_value=0) + result = SparseArray(np.sin([1, -1, 0, -2]), fill_value=np.sin(0)) + tm.assert_sp_array_equal(np.sin(sparse), result) + + def test_ufunc_args(self): + # GH 13853 make sure ufunc is applied to fill_value, including its arg + sparse = SparseArray([1, np.nan, 2, np.nan, -2]) + result = SparseArray([2, np.nan, 3, np.nan, -1]) + tm.assert_sp_array_equal(np.add(sparse, 1), result) + + sparse = SparseArray([1, -1, 2, -2], fill_value=1) + result = SparseArray([2, 0, 3, -1], fill_value=2) + tm.assert_sp_array_equal(np.add(sparse, 1), result) + + sparse = SparseArray([1, -1, 0, -2], fill_value=0) + result = SparseArray([2, 0, 1, -1], fill_value=1) + tm.assert_sp_array_equal(np.add(sparse, 1), result) + + @pytest.mark.parametrize("fill_value", [0.0, np.nan]) + def test_modf(self, fill_value): + # https://github.com/pandas-dev/pandas/issues/26946 + sparse = SparseArray([fill_value] * 10 + [1.1, 2.2], fill_value=fill_value) + r1, r2 = np.modf(sparse) + e1, e2 = np.modf(np.asarray(sparse)) + tm.assert_sp_array_equal(r1, SparseArray(e1, fill_value=fill_value)) + tm.assert_sp_array_equal(r2, SparseArray(e2, fill_value=fill_value)) + + def test_nbytes_integer(self): + arr = SparseArray([1, 0, 0, 0, 2], kind="integer") + result = arr.nbytes + # (2 * 8) + 2 * 4 + assert result == 24 + + def test_nbytes_block(self): + arr = SparseArray([1, 2, 0, 0, 0], kind="block") + result = arr.nbytes + # (2 * 8) + 4 + 4 + # sp_values, blocs, blengths + assert result == 24 + + def test_asarray_datetime64(self): + s = SparseArray(pd.to_datetime(["2012", None, None, "2013"])) + np.asarray(s) + + def test_density(self): + arr = SparseArray([0, 1]) + assert arr.density == 0.5 + + def test_npoints(self): + arr = SparseArray([0, 1]) + assert arr.npoints == 1 + + +def test_setting_fill_value_fillna_still_works(): + # This is why letting users update fill_value / dtype is bad + # astype has the same problem. + arr = SparseArray([1.0, np.nan, 1.0], fill_value=0.0) + arr.fill_value = np.nan + result = arr.isna() + # Can't do direct comparison, since the sp_index will be different + # So let's convert to ndarray and check there. + result = np.asarray(result) + + expected = np.array([False, True, False]) + tm.assert_numpy_array_equal(result, expected) + + +def test_setting_fill_value_updates(): + arr = SparseArray([0.0, np.nan], fill_value=0) + arr.fill_value = np.nan + # use private constructor to get the index right + # otherwise both nans would be un-stored. + expected = SparseArray._simple_new( + sparse_array=np.array([np.nan]), + sparse_index=IntIndex(2, [1]), + dtype=SparseDtype(float, np.nan), + ) + tm.assert_sp_array_equal(arr, expected) + + +@pytest.mark.parametrize( + "arr,fill_value,loc", + [ + ([None, 1, 2], None, 0), + ([0, None, 2], None, 1), + ([0, 1, None], None, 2), + ([0, 1, 1, None, None], None, 3), + ([1, 1, 1, 2], None, -1), + ([], None, -1), + ([None, 1, 0, 0, None, 2], None, 0), + ([None, 1, 0, 0, None, 2], 1, 1), + ([None, 1, 0, 0, None, 2], 2, 5), + ([None, 1, 0, 0, None, 2], 3, -1), + ([None, 0, 0, 1, 2, 1], 0, 1), + ([None, 0, 0, 1, 2, 1], 1, 3), + ], +) +def test_first_fill_value_loc(arr, fill_value, loc): + result = SparseArray(arr, fill_value=fill_value)._first_fill_value_loc() + assert result == loc + + +@pytest.mark.parametrize( + "arr", + [ + [1, 2, np.nan, np.nan], + [1, np.nan, 2, np.nan], + [1, 2, np.nan], + [np.nan, 1, 0, 0, np.nan, 2], + [np.nan, 0, 0, 1, 2, 1], + ], +) +@pytest.mark.parametrize("fill_value", [np.nan, 0, 1]) +def test_unique_na_fill(arr, fill_value): + a = SparseArray(arr, fill_value=fill_value).unique() + b = pd.Series(arr).unique() + assert isinstance(a, SparseArray) + a = np.asarray(a) + tm.assert_numpy_array_equal(a, b) + + +def test_unique_all_sparse(): + # https://github.com/pandas-dev/pandas/issues/23168 + arr = SparseArray([0, 0]) + result = arr.unique() + expected = SparseArray([0]) + tm.assert_sp_array_equal(result, expected) + + +def test_map(): + arr = SparseArray([0, 1, 2]) + expected = SparseArray([10, 11, 12], fill_value=10) + + # dict + result = arr.map({0: 10, 1: 11, 2: 12}) + tm.assert_sp_array_equal(result, expected) + + # series + result = arr.map(pd.Series({0: 10, 1: 11, 2: 12})) + tm.assert_sp_array_equal(result, expected) + + # function + result = arr.map(pd.Series({0: 10, 1: 11, 2: 12})) + expected = SparseArray([10, 11, 12], fill_value=10) + tm.assert_sp_array_equal(result, expected) + + +def test_map_missing(): + arr = SparseArray([0, 1, 2]) + expected = SparseArray([10, 11, None], fill_value=10) + + result = arr.map({0: 10, 1: 11}) + tm.assert_sp_array_equal(result, expected) + + +@pytest.mark.parametrize("fill_value", [np.nan, 1]) +def test_dropna(fill_value): + # GH-28287 + arr = SparseArray([np.nan, 1], fill_value=fill_value) + exp = SparseArray([1.0], fill_value=fill_value) + tm.assert_sp_array_equal(arr.dropna(), exp) + + df = pd.DataFrame({"a": [0, 1], "b": arr}) + expected_df = pd.DataFrame({"a": [1], "b": exp}, index=pd.Index([1])) + tm.assert_equal(df.dropna(), expected_df) + + +def test_drop_duplicates_fill_value(): + # GH 11726 + df = pd.DataFrame(np.zeros((5, 5))).apply(lambda x: SparseArray(x, fill_value=0)) + result = df.drop_duplicates() + expected = pd.DataFrame({i: SparseArray([0.0], fill_value=0) for i in range(5)}) + tm.assert_frame_equal(result, expected) + + +def test_zero_sparse_column(): + # GH 27781 + df1 = pd.DataFrame({"A": SparseArray([0, 0, 0]), "B": [1, 2, 3]}) + df2 = pd.DataFrame({"A": SparseArray([0, 1, 0]), "B": [1, 2, 3]}) + result = df1.loc[df1["B"] != 2] + expected = df2.loc[df2["B"] != 2] + tm.assert_frame_equal(result, expected) + + expected = pd.DataFrame({"A": SparseArray([0, 0]), "B": [1, 3]}, index=[0, 2]) + tm.assert_frame_equal(result, expected) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/sparse/test_astype.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/sparse/test_astype.py new file mode 100644 index 0000000000000000000000000000000000000000..83a507e679d460d687b27d09a4bed8321e1199e2 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/sparse/test_astype.py @@ -0,0 +1,133 @@ +import numpy as np +import pytest + +from pandas._libs.sparse import IntIndex + +from pandas import ( + SparseDtype, + Timestamp, +) +import pandas._testing as tm +from pandas.core.arrays.sparse import SparseArray + + +class TestAstype: + def test_astype(self): + # float -> float + arr = SparseArray([None, None, 0, 2]) + result = arr.astype("Sparse[float32]") + expected = SparseArray([None, None, 0, 2], dtype=np.dtype("float32")) + tm.assert_sp_array_equal(result, expected) + + dtype = SparseDtype("float64", fill_value=0) + result = arr.astype(dtype) + expected = SparseArray._simple_new( + np.array([0.0, 2.0], dtype=dtype.subtype), IntIndex(4, [2, 3]), dtype + ) + tm.assert_sp_array_equal(result, expected) + + dtype = SparseDtype("int64", 0) + result = arr.astype(dtype) + expected = SparseArray._simple_new( + np.array([0, 2], dtype=np.int64), IntIndex(4, [2, 3]), dtype + ) + tm.assert_sp_array_equal(result, expected) + + arr = SparseArray([0, np.nan, 0, 1], fill_value=0) + with pytest.raises(ValueError, match="NA"): + arr.astype("Sparse[i8]") + + def test_astype_bool(self): + a = SparseArray([1, 0, 0, 1], dtype=SparseDtype(int, 0)) + result = a.astype(bool) + expected = np.array([1, 0, 0, 1], dtype=bool) + tm.assert_numpy_array_equal(result, expected) + + # update fill value + result = a.astype(SparseDtype(bool, False)) + expected = SparseArray( + [True, False, False, True], dtype=SparseDtype(bool, False) + ) + tm.assert_sp_array_equal(result, expected) + + def test_astype_all(self, any_real_numpy_dtype): + vals = np.array([1, 2, 3]) + arr = SparseArray(vals, fill_value=1) + typ = np.dtype(any_real_numpy_dtype) + res = arr.astype(typ) + tm.assert_numpy_array_equal(res, vals.astype(any_real_numpy_dtype)) + + @pytest.mark.parametrize( + "arr, dtype, expected", + [ + ( + SparseArray([0, 1]), + "float", + SparseArray([0.0, 1.0], dtype=SparseDtype(float, 0.0)), + ), + (SparseArray([0, 1]), bool, SparseArray([False, True])), + ( + SparseArray([0, 1], fill_value=1), + bool, + SparseArray([False, True], dtype=SparseDtype(bool, True)), + ), + pytest.param( + SparseArray([0, 1]), + "datetime64[ns]", + SparseArray( + np.array([0, 1], dtype="datetime64[ns]"), + dtype=SparseDtype("datetime64[ns]", Timestamp("1970")), + ), + ), + ( + SparseArray([0, 1, 10]), + str, + SparseArray(["0", "1", "10"], dtype=SparseDtype(str, "0")), + ), + (SparseArray(["10", "20"]), float, SparseArray([10.0, 20.0])), + ( + SparseArray([0, 1, 0]), + object, + SparseArray([0, 1, 0], dtype=SparseDtype(object, 0)), + ), + ], + ) + def test_astype_more(self, arr, dtype, expected): + result = arr.astype(arr.dtype.update_dtype(dtype)) + tm.assert_sp_array_equal(result, expected) + + def test_astype_nan_raises(self): + arr = SparseArray([1.0, np.nan]) + with pytest.raises(ValueError, match="Cannot convert non-finite"): + arr.astype(int) + + def test_astype_copy_false(self): + # GH#34456 bug caused by using .view instead of .astype in astype_nansafe + arr = SparseArray([1, 2, 3]) + + dtype = SparseDtype(float, 0) + + result = arr.astype(dtype, copy=False) + expected = SparseArray([1.0, 2.0, 3.0], fill_value=0.0) + tm.assert_sp_array_equal(result, expected) + + def test_astype_dt64_to_int64(self): + # GH#49631 match non-sparse behavior + values = np.array(["NaT", "2016-01-02", "2016-01-03"], dtype="M8[ns]") + + arr = SparseArray(values) + result = arr.astype("int64") + expected = values.astype("int64") + tm.assert_numpy_array_equal(result, expected) + + # we should also be able to cast to equivalent Sparse[int64] + dtype_int64 = SparseDtype("int64", np.iinfo(np.int64).min) + result2 = arr.astype(dtype_int64) + tm.assert_numpy_array_equal(result2.to_numpy(), expected) + + # GH#50087 we should match the non-sparse behavior regardless of + # if we have a fill_value other than NaT + dtype = SparseDtype("datetime64[ns]", values[1]) + arr3 = SparseArray(values, dtype=dtype) + result3 = arr3.astype("int64") + tm.assert_numpy_array_equal(result3, expected) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/sparse/test_combine_concat.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/sparse/test_combine_concat.py new file mode 100644 index 0000000000000000000000000000000000000000..0f09af269148bc6fec712b9b1df63cca6f44d248 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/sparse/test_combine_concat.py @@ -0,0 +1,62 @@ +import numpy as np +import pytest + +import pandas as pd +import pandas._testing as tm +from pandas.core.arrays.sparse import SparseArray + + +class TestSparseArrayConcat: + @pytest.mark.parametrize("kind", ["integer", "block"]) + def test_basic(self, kind): + a = SparseArray([1, 0, 0, 2], kind=kind) + b = SparseArray([1, 0, 2, 2], kind=kind) + + result = SparseArray._concat_same_type([a, b]) + # Can't make any assertions about the sparse index itself + # since we aren't don't merge sparse blocs across arrays + # in to_concat + expected = np.array([1, 2, 1, 2, 2], dtype="int64") + tm.assert_numpy_array_equal(result.sp_values, expected) + assert result.kind == kind + + @pytest.mark.parametrize("kind", ["integer", "block"]) + def test_uses_first_kind(self, kind): + other = "integer" if kind == "block" else "block" + a = SparseArray([1, 0, 0, 2], kind=kind) + b = SparseArray([1, 0, 2, 2], kind=other) + + result = SparseArray._concat_same_type([a, b]) + expected = np.array([1, 2, 1, 2, 2], dtype="int64") + tm.assert_numpy_array_equal(result.sp_values, expected) + assert result.kind == kind + + +@pytest.mark.parametrize( + "other, expected_dtype", + [ + # compatible dtype -> preserve sparse + (pd.Series([3, 4, 5], dtype="int64"), pd.SparseDtype("int64", 0)), + # (pd.Series([3, 4, 5], dtype="Int64"), pd.SparseDtype("int64", 0)), + # incompatible dtype -> Sparse[common dtype] + (pd.Series([1.5, 2.5, 3.5], dtype="float64"), pd.SparseDtype("float64", 0)), + # incompatible dtype -> Sparse[object] dtype + (pd.Series(["a", "b", "c"], dtype=object), pd.SparseDtype(object, 0)), + # categorical with compatible categories -> dtype of the categories + (pd.Series([3, 4, 5], dtype="category"), np.dtype("int64")), + (pd.Series([1.5, 2.5, 3.5], dtype="category"), np.dtype("float64")), + # categorical with incompatible categories -> object dtype + (pd.Series(["a", "b", "c"], dtype="category"), np.dtype(object)), + ], +) +def test_concat_with_non_sparse(other, expected_dtype): + # https://github.com/pandas-dev/pandas/issues/34336 + s_sparse = pd.Series([1, 0, 2], dtype=pd.SparseDtype("int64", 0)) + + result = pd.concat([s_sparse, other], ignore_index=True) + expected = pd.Series(list(s_sparse) + list(other)).astype(expected_dtype) + tm.assert_series_equal(result, expected) + + result = pd.concat([other, s_sparse], ignore_index=True) + expected = pd.Series(list(other) + list(s_sparse)).astype(expected_dtype) + tm.assert_series_equal(result, expected) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/sparse/test_constructors.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/sparse/test_constructors.py new file mode 100644 index 0000000000000000000000000000000000000000..2831c8abdaf137b6454ea8f73bff7e94a3ec1b2b --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/sparse/test_constructors.py @@ -0,0 +1,285 @@ +import numpy as np +import pytest + +from pandas._libs.sparse import IntIndex + +import pandas as pd +from pandas import ( + SparseDtype, + isna, +) +import pandas._testing as tm +from pandas.core.arrays.sparse import SparseArray + + +class TestConstructors: + def test_constructor_dtype(self): + arr = SparseArray([np.nan, 1, 2, np.nan]) + assert arr.dtype == SparseDtype(np.float64, np.nan) + assert arr.dtype.subtype == np.float64 + assert np.isnan(arr.fill_value) + + arr = SparseArray([np.nan, 1, 2, np.nan], fill_value=0) + assert arr.dtype == SparseDtype(np.float64, 0) + assert arr.fill_value == 0 + + arr = SparseArray([0, 1, 2, 4], dtype=np.float64) + assert arr.dtype == SparseDtype(np.float64, np.nan) + assert np.isnan(arr.fill_value) + + arr = SparseArray([0, 1, 2, 4], dtype=np.int64) + assert arr.dtype == SparseDtype(np.int64, 0) + assert arr.fill_value == 0 + + arr = SparseArray([0, 1, 2, 4], fill_value=0, dtype=np.int64) + assert arr.dtype == SparseDtype(np.int64, 0) + assert arr.fill_value == 0 + + arr = SparseArray([0, 1, 2, 4], dtype=None) + assert arr.dtype == SparseDtype(np.int64, 0) + assert arr.fill_value == 0 + + arr = SparseArray([0, 1, 2, 4], fill_value=0, dtype=None) + assert arr.dtype == SparseDtype(np.int64, 0) + assert arr.fill_value == 0 + + def test_constructor_dtype_str(self): + result = SparseArray([1, 2, 3], dtype="int") + expected = SparseArray([1, 2, 3], dtype=int) + tm.assert_sp_array_equal(result, expected) + + def test_constructor_sparse_dtype(self): + result = SparseArray([1, 0, 0, 1], dtype=SparseDtype("int64", -1)) + expected = SparseArray([1, 0, 0, 1], fill_value=-1, dtype=np.int64) + tm.assert_sp_array_equal(result, expected) + assert result.sp_values.dtype == np.dtype("int64") + + def test_constructor_sparse_dtype_str(self): + result = SparseArray([1, 0, 0, 1], dtype="Sparse[int32]") + expected = SparseArray([1, 0, 0, 1], dtype=np.int32) + tm.assert_sp_array_equal(result, expected) + assert result.sp_values.dtype == np.dtype("int32") + + def test_constructor_object_dtype(self): + # GH#11856 + arr = SparseArray(["A", "A", np.nan, "B"], dtype=object) + assert arr.dtype == SparseDtype(object) + assert np.isnan(arr.fill_value) + + arr = SparseArray(["A", "A", np.nan, "B"], dtype=object, fill_value="A") + assert arr.dtype == SparseDtype(object, "A") + assert arr.fill_value == "A" + + def test_constructor_object_dtype_bool_fill(self): + # GH#17574 + data = [False, 0, 100.0, 0.0] + arr = SparseArray(data, dtype=object, fill_value=False) + assert arr.dtype == SparseDtype(object, False) + assert arr.fill_value is False + arr_expected = np.array(data, dtype=object) + it = (type(x) == type(y) and x == y for x, y in zip(arr, arr_expected)) + assert np.fromiter(it, dtype=np.bool_).all() + + @pytest.mark.parametrize("dtype", [SparseDtype(int, 0), int]) + def test_constructor_na_dtype(self, dtype): + with pytest.raises(ValueError, match="Cannot convert"): + SparseArray([0, 1, np.nan], dtype=dtype) + + def test_constructor_warns_when_losing_timezone(self): + # GH#32501 warn when losing timezone information + dti = pd.date_range("2016-01-01", periods=3, tz="US/Pacific") + + expected = SparseArray(np.asarray(dti, dtype="datetime64[ns]")) + + with tm.assert_produces_warning(UserWarning): + result = SparseArray(dti) + + tm.assert_sp_array_equal(result, expected) + + with tm.assert_produces_warning(UserWarning): + result = SparseArray(pd.Series(dti)) + + tm.assert_sp_array_equal(result, expected) + + def test_constructor_spindex_dtype(self): + arr = SparseArray(data=[1, 2], sparse_index=IntIndex(4, [1, 2])) + # TODO: actionable? + # XXX: Behavior change: specifying SparseIndex no longer changes the + # fill_value + expected = SparseArray([0, 1, 2, 0], kind="integer") + tm.assert_sp_array_equal(arr, expected) + assert arr.dtype == SparseDtype(np.int64) + assert arr.fill_value == 0 + + arr = SparseArray( + data=[1, 2, 3], + sparse_index=IntIndex(4, [1, 2, 3]), + dtype=np.int64, + fill_value=0, + ) + exp = SparseArray([0, 1, 2, 3], dtype=np.int64, fill_value=0) + tm.assert_sp_array_equal(arr, exp) + assert arr.dtype == SparseDtype(np.int64) + assert arr.fill_value == 0 + + arr = SparseArray( + data=[1, 2], sparse_index=IntIndex(4, [1, 2]), fill_value=0, dtype=np.int64 + ) + exp = SparseArray([0, 1, 2, 0], fill_value=0, dtype=np.int64) + tm.assert_sp_array_equal(arr, exp) + assert arr.dtype == SparseDtype(np.int64) + assert arr.fill_value == 0 + + arr = SparseArray( + data=[1, 2, 3], + sparse_index=IntIndex(4, [1, 2, 3]), + dtype=None, + fill_value=0, + ) + exp = SparseArray([0, 1, 2, 3], dtype=None) + tm.assert_sp_array_equal(arr, exp) + assert arr.dtype == SparseDtype(np.int64) + assert arr.fill_value == 0 + + @pytest.mark.parametrize("sparse_index", [None, IntIndex(1, [0])]) + def test_constructor_spindex_dtype_scalar(self, sparse_index): + # scalar input + msg = "Constructing SparseArray with scalar data is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + arr = SparseArray(data=1, sparse_index=sparse_index, dtype=None) + exp = SparseArray([1], dtype=None) + tm.assert_sp_array_equal(arr, exp) + assert arr.dtype == SparseDtype(np.int64) + assert arr.fill_value == 0 + + with tm.assert_produces_warning(FutureWarning, match=msg): + arr = SparseArray(data=1, sparse_index=IntIndex(1, [0]), dtype=None) + exp = SparseArray([1], dtype=None) + tm.assert_sp_array_equal(arr, exp) + assert arr.dtype == SparseDtype(np.int64) + assert arr.fill_value == 0 + + def test_constructor_spindex_dtype_scalar_broadcasts(self): + arr = SparseArray( + data=[1, 2], sparse_index=IntIndex(4, [1, 2]), fill_value=0, dtype=None + ) + exp = SparseArray([0, 1, 2, 0], fill_value=0, dtype=None) + tm.assert_sp_array_equal(arr, exp) + assert arr.dtype == SparseDtype(np.int64) + assert arr.fill_value == 0 + + @pytest.mark.parametrize( + "data, fill_value", + [ + (np.array([1, 2]), 0), + (np.array([1.0, 2.0]), np.nan), + ([True, False], False), + ([pd.Timestamp("2017-01-01")], pd.NaT), + ], + ) + def test_constructor_inferred_fill_value(self, data, fill_value): + result = SparseArray(data).fill_value + + if isna(fill_value): + assert isna(result) + else: + assert result == fill_value + + @pytest.mark.parametrize("format", ["coo", "csc", "csr"]) + @pytest.mark.parametrize("size", [0, 10]) + def test_from_spmatrix(self, size, format): + sp_sparse = pytest.importorskip("scipy.sparse") + + mat = sp_sparse.random(size, 1, density=0.5, format=format) + result = SparseArray.from_spmatrix(mat) + + result = np.asarray(result) + expected = mat.toarray().ravel() + tm.assert_numpy_array_equal(result, expected) + + @pytest.mark.parametrize("format", ["coo", "csc", "csr"]) + def test_from_spmatrix_including_explicit_zero(self, format): + sp_sparse = pytest.importorskip("scipy.sparse") + + mat = sp_sparse.random(10, 1, density=0.5, format=format) + mat.data[0] = 0 + result = SparseArray.from_spmatrix(mat) + + result = np.asarray(result) + expected = mat.toarray().ravel() + tm.assert_numpy_array_equal(result, expected) + + def test_from_spmatrix_raises(self): + sp_sparse = pytest.importorskip("scipy.sparse") + + mat = sp_sparse.eye(5, 4, format="csc") + + with pytest.raises(ValueError, match="not '4'"): + SparseArray.from_spmatrix(mat) + + def test_constructor_from_too_large_array(self): + with pytest.raises(TypeError, match="expected dimension <= 1 data"): + SparseArray(np.arange(10).reshape((2, 5))) + + def test_constructor_from_sparse(self): + zarr = SparseArray([0, 0, 1, 2, 3, 0, 4, 5, 0, 6], fill_value=0) + res = SparseArray(zarr) + assert res.fill_value == 0 + tm.assert_almost_equal(res.sp_values, zarr.sp_values) + + def test_constructor_copy(self): + arr_data = np.array([np.nan, np.nan, 1, 2, 3, np.nan, 4, 5, np.nan, 6]) + arr = SparseArray(arr_data) + + cp = SparseArray(arr, copy=True) + cp.sp_values[:3] = 0 + assert not (arr.sp_values[:3] == 0).any() + + not_copy = SparseArray(arr) + not_copy.sp_values[:3] = 0 + assert (arr.sp_values[:3] == 0).all() + + def test_constructor_bool(self): + # GH#10648 + data = np.array([False, False, True, True, False, False]) + arr = SparseArray(data, fill_value=False, dtype=bool) + + assert arr.dtype == SparseDtype(bool) + tm.assert_numpy_array_equal(arr.sp_values, np.array([True, True])) + # Behavior change: np.asarray densifies. + # tm.assert_numpy_array_equal(arr.sp_values, np.asarray(arr)) + tm.assert_numpy_array_equal(arr.sp_index.indices, np.array([2, 3], np.int32)) + + dense = arr.to_dense() + assert dense.dtype == bool + tm.assert_numpy_array_equal(dense, data) + + def test_constructor_bool_fill_value(self): + arr = SparseArray([True, False, True], dtype=None) + assert arr.dtype == SparseDtype(np.bool_) + assert not arr.fill_value + + arr = SparseArray([True, False, True], dtype=np.bool_) + assert arr.dtype == SparseDtype(np.bool_) + assert not arr.fill_value + + arr = SparseArray([True, False, True], dtype=np.bool_, fill_value=True) + assert arr.dtype == SparseDtype(np.bool_, True) + assert arr.fill_value + + def test_constructor_float32(self): + # GH#10648 + data = np.array([1.0, np.nan, 3], dtype=np.float32) + arr = SparseArray(data, dtype=np.float32) + + assert arr.dtype == SparseDtype(np.float32) + tm.assert_numpy_array_equal(arr.sp_values, np.array([1, 3], dtype=np.float32)) + # Behavior change: np.asarray densifies. + # tm.assert_numpy_array_equal(arr.sp_values, np.asarray(arr)) + tm.assert_numpy_array_equal( + arr.sp_index.indices, np.array([0, 2], dtype=np.int32) + ) + + dense = arr.to_dense() + assert dense.dtype == np.float32 + tm.assert_numpy_array_equal(dense, data) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/sparse/test_dtype.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/sparse/test_dtype.py new file mode 100644 index 0000000000000000000000000000000000000000..234f4092421e592b9c11b668e25f29fa108f13f0 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/sparse/test_dtype.py @@ -0,0 +1,224 @@ +import re +import warnings + +import numpy as np +import pytest + +import pandas as pd +from pandas import SparseDtype + + +@pytest.mark.parametrize( + "dtype, fill_value", + [ + ("int", 0), + ("float", np.nan), + ("bool", False), + ("object", np.nan), + ("datetime64[ns]", np.datetime64("NaT", "ns")), + ("timedelta64[ns]", np.timedelta64("NaT", "ns")), + ], +) +def test_inferred_dtype(dtype, fill_value): + sparse_dtype = SparseDtype(dtype) + result = sparse_dtype.fill_value + if pd.isna(fill_value): + assert pd.isna(result) and type(result) == type(fill_value) + else: + assert result == fill_value + + +def test_from_sparse_dtype(): + dtype = SparseDtype("float", 0) + result = SparseDtype(dtype) + assert result.fill_value == 0 + + +def test_from_sparse_dtype_fill_value(): + dtype = SparseDtype("int", 1) + result = SparseDtype(dtype, fill_value=2) + expected = SparseDtype("int", 2) + assert result == expected + + +@pytest.mark.parametrize( + "dtype, fill_value", + [ + ("int", None), + ("float", None), + ("bool", None), + ("object", None), + ("datetime64[ns]", None), + ("timedelta64[ns]", None), + ("int", np.nan), + ("float", 0), + ], +) +def test_equal(dtype, fill_value): + a = SparseDtype(dtype, fill_value) + b = SparseDtype(dtype, fill_value) + assert a == b + assert b == a + + +def test_nans_equal(): + a = SparseDtype(float, float("nan")) + b = SparseDtype(float, np.nan) + assert a == b + assert b == a + + +with warnings.catch_warnings(): + msg = "Allowing arbitrary scalar fill_value in SparseDtype is deprecated" + warnings.filterwarnings("ignore", msg, category=FutureWarning) + + tups = [ + (SparseDtype("float64"), SparseDtype("float32")), + (SparseDtype("float64"), SparseDtype("float64", 0)), + (SparseDtype("float64"), SparseDtype("datetime64[ns]", np.nan)), + (SparseDtype(int, pd.NaT), SparseDtype(float, pd.NaT)), + (SparseDtype("float64"), np.dtype("float64")), + ] + + +@pytest.mark.parametrize( + "a, b", + tups, +) +def test_not_equal(a, b): + assert a != b + + +def test_construct_from_string_raises(): + with pytest.raises( + TypeError, match="Cannot construct a 'SparseDtype' from 'not a dtype'" + ): + SparseDtype.construct_from_string("not a dtype") + + +@pytest.mark.parametrize( + "dtype, expected", + [ + (SparseDtype(int), True), + (SparseDtype(float), True), + (SparseDtype(bool), True), + (SparseDtype(object), False), + (SparseDtype(str), False), + ], +) +def test_is_numeric(dtype, expected): + assert dtype._is_numeric is expected + + +def test_str_uses_object(): + result = SparseDtype(str).subtype + assert result == np.dtype("object") + + +@pytest.mark.parametrize( + "string, expected", + [ + ("Sparse[float64]", SparseDtype(np.dtype("float64"))), + ("Sparse[float32]", SparseDtype(np.dtype("float32"))), + ("Sparse[int]", SparseDtype(np.dtype("int"))), + ("Sparse[str]", SparseDtype(np.dtype("str"))), + ("Sparse[datetime64[ns]]", SparseDtype(np.dtype("datetime64[ns]"))), + ("Sparse", SparseDtype(np.dtype("float"), np.nan)), + ], +) +def test_construct_from_string(string, expected): + result = SparseDtype.construct_from_string(string) + assert result == expected + + +@pytest.mark.parametrize( + "a, b, expected", + [ + (SparseDtype(float, 0.0), SparseDtype(np.dtype("float"), 0.0), True), + (SparseDtype(int, 0), SparseDtype(int, 0), True), + (SparseDtype(float, float("nan")), SparseDtype(float, np.nan), True), + (SparseDtype(float, 0), SparseDtype(float, np.nan), False), + (SparseDtype(int, 0.0), SparseDtype(float, 0.0), False), + ], +) +def test_hash_equal(a, b, expected): + result = a == b + assert result is expected + + result = hash(a) == hash(b) + assert result is expected + + +@pytest.mark.parametrize( + "string, expected", + [ + ("Sparse[int]", "int"), + ("Sparse[int, 0]", "int"), + ("Sparse[int64]", "int64"), + ("Sparse[int64, 0]", "int64"), + ("Sparse[datetime64[ns], 0]", "datetime64[ns]"), + ], +) +def test_parse_subtype(string, expected): + subtype, _ = SparseDtype._parse_subtype(string) + assert subtype == expected + + +@pytest.mark.parametrize( + "string", ["Sparse[int, 1]", "Sparse[float, 0.0]", "Sparse[bool, True]"] +) +def test_construct_from_string_fill_value_raises(string): + with pytest.raises(TypeError, match="fill_value in the string is not"): + SparseDtype.construct_from_string(string) + + +@pytest.mark.parametrize( + "original, dtype, expected", + [ + (SparseDtype(int, 0), float, SparseDtype(float, 0.0)), + (SparseDtype(int, 1), float, SparseDtype(float, 1.0)), + (SparseDtype(int, 1), str, SparseDtype(object, "1")), + (SparseDtype(float, 1.5), int, SparseDtype(int, 1)), + ], +) +def test_update_dtype(original, dtype, expected): + result = original.update_dtype(dtype) + assert result == expected + + +@pytest.mark.parametrize( + "original, dtype, expected_error_msg", + [ + ( + SparseDtype(float, np.nan), + int, + re.escape("Cannot convert non-finite values (NA or inf) to integer"), + ), + ( + SparseDtype(str, "abc"), + int, + r"invalid literal for int\(\) with base 10: ('abc'|np\.str_\('abc'\))", + ), + ], +) +def test_update_dtype_raises(original, dtype, expected_error_msg): + with pytest.raises(ValueError, match=expected_error_msg): + original.update_dtype(dtype) + + +def test_repr(): + # GH-34352 + result = str(SparseDtype("int64", fill_value=0)) + expected = "Sparse[int64, 0]" + assert result == expected + + result = str(SparseDtype(object, fill_value="0")) + expected = "Sparse[object, '0']" + assert result == expected + + +def test_sparse_dtype_subtype_must_be_numpy_dtype(): + # GH#53160 + msg = "SparseDtype subtype must be a numpy dtype" + with pytest.raises(TypeError, match=msg): + SparseDtype("category", fill_value="c") diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/sparse/test_indexing.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/sparse/test_indexing.py new file mode 100644 index 0000000000000000000000000000000000000000..60029ac06ddb47ef0ad4ee35a75fd09ca12f7f53 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/sparse/test_indexing.py @@ -0,0 +1,302 @@ +import numpy as np +import pytest + +import pandas as pd +from pandas import SparseDtype +import pandas._testing as tm +from pandas.core.arrays.sparse import SparseArray + + +@pytest.fixture +def arr_data(): + return np.array([np.nan, np.nan, 1, 2, 3, np.nan, 4, 5, np.nan, 6]) + + +@pytest.fixture +def arr(arr_data): + return SparseArray(arr_data) + + +class TestGetitem: + def test_getitem(self, arr): + dense = arr.to_dense() + for i, value in enumerate(arr): + tm.assert_almost_equal(value, dense[i]) + tm.assert_almost_equal(arr[-i], dense[-i]) + + def test_getitem_arraylike_mask(self, arr): + arr = SparseArray([0, 1, 2]) + result = arr[[True, False, True]] + expected = SparseArray([0, 2]) + tm.assert_sp_array_equal(result, expected) + + @pytest.mark.parametrize( + "slc", + [ + np.s_[:], + np.s_[1:10], + np.s_[1:100], + np.s_[10:1], + np.s_[:-3], + np.s_[-5:-4], + np.s_[:-12], + np.s_[-12:], + np.s_[2:], + np.s_[2::3], + np.s_[::2], + np.s_[::-1], + np.s_[::-2], + np.s_[1:6:2], + np.s_[:-6:-2], + ], + ) + @pytest.mark.parametrize( + "as_dense", [[np.nan] * 10, [1] * 10, [np.nan] * 5 + [1] * 5, []] + ) + def test_getslice(self, slc, as_dense): + as_dense = np.array(as_dense) + arr = SparseArray(as_dense) + + result = arr[slc] + expected = SparseArray(as_dense[slc]) + + tm.assert_sp_array_equal(result, expected) + + def test_getslice_tuple(self): + dense = np.array([np.nan, 0, 3, 4, 0, 5, np.nan, np.nan, 0]) + + sparse = SparseArray(dense) + res = sparse[(slice(4, None),)] + exp = SparseArray(dense[4:]) + tm.assert_sp_array_equal(res, exp) + + sparse = SparseArray(dense, fill_value=0) + res = sparse[(slice(4, None),)] + exp = SparseArray(dense[4:], fill_value=0) + tm.assert_sp_array_equal(res, exp) + + msg = "too many indices for array" + with pytest.raises(IndexError, match=msg): + sparse[4:, :] + + with pytest.raises(IndexError, match=msg): + # check numpy compat + dense[4:, :] + + def test_boolean_slice_empty(self): + arr = SparseArray([0, 1, 2]) + res = arr[[False, False, False]] + assert res.dtype == arr.dtype + + def test_getitem_bool_sparse_array(self, arr): + # GH 23122 + spar_bool = SparseArray([False, True] * 5, dtype=np.bool_, fill_value=True) + exp = SparseArray([np.nan, 2, np.nan, 5, 6]) + tm.assert_sp_array_equal(arr[spar_bool], exp) + + spar_bool = ~spar_bool + res = arr[spar_bool] + exp = SparseArray([np.nan, 1, 3, 4, np.nan]) + tm.assert_sp_array_equal(res, exp) + + spar_bool = SparseArray( + [False, True, np.nan] * 3, dtype=np.bool_, fill_value=np.nan + ) + res = arr[spar_bool] + exp = SparseArray([np.nan, 3, 5]) + tm.assert_sp_array_equal(res, exp) + + def test_getitem_bool_sparse_array_as_comparison(self): + # GH 45110 + arr = SparseArray([1, 2, 3, 4, np.nan, np.nan], fill_value=np.nan) + res = arr[arr > 2] + exp = SparseArray([3.0, 4.0], fill_value=np.nan) + tm.assert_sp_array_equal(res, exp) + + def test_get_item(self, arr): + zarr = SparseArray([0, 0, 1, 2, 3, 0, 4, 5, 0, 6], fill_value=0) + + assert np.isnan(arr[1]) + assert arr[2] == 1 + assert arr[7] == 5 + + assert zarr[0] == 0 + assert zarr[2] == 1 + assert zarr[7] == 5 + + errmsg = "must be an integer between -10 and 10" + + with pytest.raises(IndexError, match=errmsg): + arr[11] + + with pytest.raises(IndexError, match=errmsg): + arr[-11] + + assert arr[-1] == arr[len(arr) - 1] + + +class TestSetitem: + def test_set_item(self, arr_data): + arr = SparseArray(arr_data).copy() + + def setitem(): + arr[5] = 3 + + def setslice(): + arr[1:5] = 2 + + with pytest.raises(TypeError, match="assignment via setitem"): + setitem() + + with pytest.raises(TypeError, match="assignment via setitem"): + setslice() + + +class TestTake: + def test_take_scalar_raises(self, arr): + msg = "'indices' must be an array, not a scalar '2'." + with pytest.raises(ValueError, match=msg): + arr.take(2) + + def test_take(self, arr_data, arr): + exp = SparseArray(np.take(arr_data, [2, 3])) + tm.assert_sp_array_equal(arr.take([2, 3]), exp) + + exp = SparseArray(np.take(arr_data, [0, 1, 2])) + tm.assert_sp_array_equal(arr.take([0, 1, 2]), exp) + + def test_take_all_empty(self): + sparse = pd.array([0, 0], dtype=SparseDtype("int64")) + result = sparse.take([0, 1], allow_fill=True, fill_value=np.nan) + tm.assert_sp_array_equal(sparse, result) + + def test_take_different_fill_value(self): + # Take with a different fill value shouldn't overwrite the original + sparse = pd.array([0.0], dtype=SparseDtype("float64", fill_value=0.0)) + result = sparse.take([0, -1], allow_fill=True, fill_value=np.nan) + expected = pd.array([0, np.nan], dtype=sparse.dtype) + tm.assert_sp_array_equal(expected, result) + + def test_take_fill_value(self): + data = np.array([1, np.nan, 0, 3, 0]) + sparse = SparseArray(data, fill_value=0) + + exp = SparseArray(np.take(data, [0]), fill_value=0) + tm.assert_sp_array_equal(sparse.take([0]), exp) + + exp = SparseArray(np.take(data, [1, 3, 4]), fill_value=0) + tm.assert_sp_array_equal(sparse.take([1, 3, 4]), exp) + + def test_take_negative(self, arr_data, arr): + exp = SparseArray(np.take(arr_data, [-1])) + tm.assert_sp_array_equal(arr.take([-1]), exp) + + exp = SparseArray(np.take(arr_data, [-4, -3, -2])) + tm.assert_sp_array_equal(arr.take([-4, -3, -2]), exp) + + def test_bad_take(self, arr): + with pytest.raises(IndexError, match="bounds"): + arr.take([11]) + + def test_take_filling(self): + # similar tests as GH 12631 + sparse = SparseArray([np.nan, np.nan, 1, np.nan, 4]) + result = sparse.take(np.array([1, 0, -1])) + expected = SparseArray([np.nan, np.nan, 4]) + tm.assert_sp_array_equal(result, expected) + + # TODO: actionable? + # XXX: test change: fill_value=True -> allow_fill=True + result = sparse.take(np.array([1, 0, -1]), allow_fill=True) + expected = SparseArray([np.nan, np.nan, np.nan]) + tm.assert_sp_array_equal(result, expected) + + # allow_fill=False + result = sparse.take(np.array([1, 0, -1]), allow_fill=False, fill_value=True) + expected = SparseArray([np.nan, np.nan, 4]) + tm.assert_sp_array_equal(result, expected) + + msg = "Invalid value in 'indices'" + with pytest.raises(ValueError, match=msg): + sparse.take(np.array([1, 0, -2]), allow_fill=True) + + with pytest.raises(ValueError, match=msg): + sparse.take(np.array([1, 0, -5]), allow_fill=True) + + msg = "out of bounds value in 'indices'" + with pytest.raises(IndexError, match=msg): + sparse.take(np.array([1, -6])) + with pytest.raises(IndexError, match=msg): + sparse.take(np.array([1, 5])) + with pytest.raises(IndexError, match=msg): + sparse.take(np.array([1, 5]), allow_fill=True) + + def test_take_filling_fill_value(self): + # same tests as GH#12631 + sparse = SparseArray([np.nan, 0, 1, 0, 4], fill_value=0) + result = sparse.take(np.array([1, 0, -1])) + expected = SparseArray([0, np.nan, 4], fill_value=0) + tm.assert_sp_array_equal(result, expected) + + # fill_value + result = sparse.take(np.array([1, 0, -1]), allow_fill=True) + # TODO: actionable? + # XXX: behavior change. + # the old way of filling self.fill_value doesn't follow EA rules. + # It's supposed to be self.dtype.na_value (nan in this case) + expected = SparseArray([0, np.nan, np.nan], fill_value=0) + tm.assert_sp_array_equal(result, expected) + + # allow_fill=False + result = sparse.take(np.array([1, 0, -1]), allow_fill=False, fill_value=True) + expected = SparseArray([0, np.nan, 4], fill_value=0) + tm.assert_sp_array_equal(result, expected) + + msg = "Invalid value in 'indices'." + with pytest.raises(ValueError, match=msg): + sparse.take(np.array([1, 0, -2]), allow_fill=True) + with pytest.raises(ValueError, match=msg): + sparse.take(np.array([1, 0, -5]), allow_fill=True) + + msg = "out of bounds value in 'indices'" + with pytest.raises(IndexError, match=msg): + sparse.take(np.array([1, -6])) + with pytest.raises(IndexError, match=msg): + sparse.take(np.array([1, 5])) + with pytest.raises(IndexError, match=msg): + sparse.take(np.array([1, 5]), fill_value=True) + + @pytest.mark.parametrize("kind", ["block", "integer"]) + def test_take_filling_all_nan(self, kind): + sparse = SparseArray([np.nan, np.nan, np.nan, np.nan, np.nan], kind=kind) + result = sparse.take(np.array([1, 0, -1])) + expected = SparseArray([np.nan, np.nan, np.nan], kind=kind) + tm.assert_sp_array_equal(result, expected) + + result = sparse.take(np.array([1, 0, -1]), fill_value=True) + expected = SparseArray([np.nan, np.nan, np.nan], kind=kind) + tm.assert_sp_array_equal(result, expected) + + msg = "out of bounds value in 'indices'" + with pytest.raises(IndexError, match=msg): + sparse.take(np.array([1, -6])) + with pytest.raises(IndexError, match=msg): + sparse.take(np.array([1, 5])) + with pytest.raises(IndexError, match=msg): + sparse.take(np.array([1, 5]), fill_value=True) + + +class TestWhere: + def test_where_retain_fill_value(self): + # GH#45691 don't lose fill_value on _where + arr = SparseArray([np.nan, 1.0], fill_value=0) + + mask = np.array([True, False]) + + res = arr._where(~mask, 1) + exp = SparseArray([1, 1.0], fill_value=0) + tm.assert_sp_array_equal(res, exp) + + ser = pd.Series(arr) + res = ser.where(~mask, 1) + tm.assert_series_equal(res, pd.Series(exp)) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/sparse/test_libsparse.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/sparse/test_libsparse.py new file mode 100644 index 0000000000000000000000000000000000000000..7a77a2064e7e097f924a3901994d131d98164ad6 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/sparse/test_libsparse.py @@ -0,0 +1,551 @@ +import operator + +import numpy as np +import pytest + +import pandas._libs.sparse as splib +import pandas.util._test_decorators as td + +from pandas import Series +import pandas._testing as tm +from pandas.core.arrays.sparse import ( + BlockIndex, + IntIndex, + make_sparse_index, +) + + +@pytest.fixture +def test_length(): + return 20 + + +@pytest.fixture( + params=[ + [ + [0, 7, 15], + [3, 5, 5], + [2, 9, 14], + [2, 3, 5], + [2, 9, 15], + [1, 3, 4], + ], + [ + [0, 5], + [4, 4], + [1], + [4], + [1], + [3], + ], + [ + [0], + [10], + [0, 5], + [3, 7], + [0, 5], + [3, 5], + ], + [ + [10], + [5], + [0, 12], + [5, 3], + [12], + [3], + ], + [ + [0, 10], + [4, 6], + [5, 17], + [4, 2], + [], + [], + ], + [ + [0], + [5], + [], + [], + [], + [], + ], + ], + ids=[ + "plain_case", + "delete_blocks", + "split_blocks", + "skip_block", + "no_intersect", + "one_empty", + ], +) +def cases(request): + return request.param + + +class TestSparseIndexUnion: + @pytest.mark.parametrize( + "xloc, xlen, yloc, ylen, eloc, elen", + [ + [[0], [5], [5], [4], [0], [9]], + [[0, 10], [5, 5], [2, 17], [5, 2], [0, 10, 17], [7, 5, 2]], + [[1], [5], [3], [5], [1], [7]], + [[2, 10], [4, 4], [4], [8], [2], [12]], + [[0, 5], [3, 5], [0], [7], [0], [10]], + [[2, 10], [4, 4], [4, 13], [8, 4], [2], [15]], + [[2], [15], [4, 9, 14], [3, 2, 2], [2], [15]], + [[0, 10], [3, 3], [5, 15], [2, 2], [0, 5, 10, 15], [3, 2, 3, 2]], + ], + ) + def test_index_make_union(self, xloc, xlen, yloc, ylen, eloc, elen, test_length): + # Case 1 + # x: ---- + # y: ---- + # r: -------- + # Case 2 + # x: ----- ----- + # y: ----- -- + # Case 3 + # x: ------ + # y: ------- + # r: ---------- + # Case 4 + # x: ------ ----- + # y: ------- + # r: ------------- + # Case 5 + # x: --- ----- + # y: ------- + # r: ------------- + # Case 6 + # x: ------ ----- + # y: ------- --- + # r: ------------- + # Case 7 + # x: ---------------------- + # y: ---- ---- --- + # r: ---------------------- + # Case 8 + # x: ---- --- + # y: --- --- + xindex = BlockIndex(test_length, xloc, xlen) + yindex = BlockIndex(test_length, yloc, ylen) + bresult = xindex.make_union(yindex) + assert isinstance(bresult, BlockIndex) + tm.assert_numpy_array_equal(bresult.blocs, np.array(eloc, dtype=np.int32)) + tm.assert_numpy_array_equal(bresult.blengths, np.array(elen, dtype=np.int32)) + + ixindex = xindex.to_int_index() + iyindex = yindex.to_int_index() + iresult = ixindex.make_union(iyindex) + assert isinstance(iresult, IntIndex) + tm.assert_numpy_array_equal(iresult.indices, bresult.to_int_index().indices) + + def test_int_index_make_union(self): + a = IntIndex(5, np.array([0, 3, 4], dtype=np.int32)) + b = IntIndex(5, np.array([0, 2], dtype=np.int32)) + res = a.make_union(b) + exp = IntIndex(5, np.array([0, 2, 3, 4], np.int32)) + assert res.equals(exp) + + a = IntIndex(5, np.array([], dtype=np.int32)) + b = IntIndex(5, np.array([0, 2], dtype=np.int32)) + res = a.make_union(b) + exp = IntIndex(5, np.array([0, 2], np.int32)) + assert res.equals(exp) + + a = IntIndex(5, np.array([], dtype=np.int32)) + b = IntIndex(5, np.array([], dtype=np.int32)) + res = a.make_union(b) + exp = IntIndex(5, np.array([], np.int32)) + assert res.equals(exp) + + a = IntIndex(5, np.array([0, 1, 2, 3, 4], dtype=np.int32)) + b = IntIndex(5, np.array([0, 1, 2, 3, 4], dtype=np.int32)) + res = a.make_union(b) + exp = IntIndex(5, np.array([0, 1, 2, 3, 4], np.int32)) + assert res.equals(exp) + + a = IntIndex(5, np.array([0, 1], dtype=np.int32)) + b = IntIndex(4, np.array([0, 1], dtype=np.int32)) + + msg = "Indices must reference same underlying length" + with pytest.raises(ValueError, match=msg): + a.make_union(b) + + +class TestSparseIndexIntersect: + @td.skip_if_windows + def test_intersect(self, cases, test_length): + xloc, xlen, yloc, ylen, eloc, elen = cases + xindex = BlockIndex(test_length, xloc, xlen) + yindex = BlockIndex(test_length, yloc, ylen) + expected = BlockIndex(test_length, eloc, elen) + longer_index = BlockIndex(test_length + 1, yloc, ylen) + + result = xindex.intersect(yindex) + assert result.equals(expected) + result = xindex.to_int_index().intersect(yindex.to_int_index()) + assert result.equals(expected.to_int_index()) + + msg = "Indices must reference same underlying length" + with pytest.raises(Exception, match=msg): + xindex.intersect(longer_index) + with pytest.raises(Exception, match=msg): + xindex.to_int_index().intersect(longer_index.to_int_index()) + + def test_intersect_empty(self): + xindex = IntIndex(4, np.array([], dtype=np.int32)) + yindex = IntIndex(4, np.array([2, 3], dtype=np.int32)) + assert xindex.intersect(yindex).equals(xindex) + assert yindex.intersect(xindex).equals(xindex) + + xindex = xindex.to_block_index() + yindex = yindex.to_block_index() + assert xindex.intersect(yindex).equals(xindex) + assert yindex.intersect(xindex).equals(xindex) + + @pytest.mark.parametrize( + "case", + [ + # Argument 2 to "IntIndex" has incompatible type "ndarray[Any, + # dtype[signedinteger[_32Bit]]]"; expected "Sequence[int]" + IntIndex(5, np.array([1, 2], dtype=np.int32)), # type: ignore[arg-type] + IntIndex(5, np.array([0, 2, 4], dtype=np.int32)), # type: ignore[arg-type] + IntIndex(0, np.array([], dtype=np.int32)), # type: ignore[arg-type] + IntIndex(5, np.array([], dtype=np.int32)), # type: ignore[arg-type] + ], + ) + def test_intersect_identical(self, case): + assert case.intersect(case).equals(case) + case = case.to_block_index() + assert case.intersect(case).equals(case) + + +class TestSparseIndexCommon: + def test_int_internal(self): + idx = make_sparse_index(4, np.array([2, 3], dtype=np.int32), kind="integer") + assert isinstance(idx, IntIndex) + assert idx.npoints == 2 + tm.assert_numpy_array_equal(idx.indices, np.array([2, 3], dtype=np.int32)) + + idx = make_sparse_index(4, np.array([], dtype=np.int32), kind="integer") + assert isinstance(idx, IntIndex) + assert idx.npoints == 0 + tm.assert_numpy_array_equal(idx.indices, np.array([], dtype=np.int32)) + + idx = make_sparse_index( + 4, np.array([0, 1, 2, 3], dtype=np.int32), kind="integer" + ) + assert isinstance(idx, IntIndex) + assert idx.npoints == 4 + tm.assert_numpy_array_equal(idx.indices, np.array([0, 1, 2, 3], dtype=np.int32)) + + def test_block_internal(self): + idx = make_sparse_index(4, np.array([2, 3], dtype=np.int32), kind="block") + assert isinstance(idx, BlockIndex) + assert idx.npoints == 2 + tm.assert_numpy_array_equal(idx.blocs, np.array([2], dtype=np.int32)) + tm.assert_numpy_array_equal(idx.blengths, np.array([2], dtype=np.int32)) + + idx = make_sparse_index(4, np.array([], dtype=np.int32), kind="block") + assert isinstance(idx, BlockIndex) + assert idx.npoints == 0 + tm.assert_numpy_array_equal(idx.blocs, np.array([], dtype=np.int32)) + tm.assert_numpy_array_equal(idx.blengths, np.array([], dtype=np.int32)) + + idx = make_sparse_index(4, np.array([0, 1, 2, 3], dtype=np.int32), kind="block") + assert isinstance(idx, BlockIndex) + assert idx.npoints == 4 + tm.assert_numpy_array_equal(idx.blocs, np.array([0], dtype=np.int32)) + tm.assert_numpy_array_equal(idx.blengths, np.array([4], dtype=np.int32)) + + idx = make_sparse_index(4, np.array([0, 2, 3], dtype=np.int32), kind="block") + assert isinstance(idx, BlockIndex) + assert idx.npoints == 3 + tm.assert_numpy_array_equal(idx.blocs, np.array([0, 2], dtype=np.int32)) + tm.assert_numpy_array_equal(idx.blengths, np.array([1, 2], dtype=np.int32)) + + @pytest.mark.parametrize("kind", ["integer", "block"]) + def test_lookup(self, kind): + idx = make_sparse_index(4, np.array([2, 3], dtype=np.int32), kind=kind) + assert idx.lookup(-1) == -1 + assert idx.lookup(0) == -1 + assert idx.lookup(1) == -1 + assert idx.lookup(2) == 0 + assert idx.lookup(3) == 1 + assert idx.lookup(4) == -1 + + idx = make_sparse_index(4, np.array([], dtype=np.int32), kind=kind) + + for i in range(-1, 5): + assert idx.lookup(i) == -1 + + idx = make_sparse_index(4, np.array([0, 1, 2, 3], dtype=np.int32), kind=kind) + assert idx.lookup(-1) == -1 + assert idx.lookup(0) == 0 + assert idx.lookup(1) == 1 + assert idx.lookup(2) == 2 + assert idx.lookup(3) == 3 + assert idx.lookup(4) == -1 + + idx = make_sparse_index(4, np.array([0, 2, 3], dtype=np.int32), kind=kind) + assert idx.lookup(-1) == -1 + assert idx.lookup(0) == 0 + assert idx.lookup(1) == -1 + assert idx.lookup(2) == 1 + assert idx.lookup(3) == 2 + assert idx.lookup(4) == -1 + + @pytest.mark.parametrize("kind", ["integer", "block"]) + def test_lookup_array(self, kind): + idx = make_sparse_index(4, np.array([2, 3], dtype=np.int32), kind=kind) + + res = idx.lookup_array(np.array([-1, 0, 2], dtype=np.int32)) + exp = np.array([-1, -1, 0], dtype=np.int32) + tm.assert_numpy_array_equal(res, exp) + + res = idx.lookup_array(np.array([4, 2, 1, 3], dtype=np.int32)) + exp = np.array([-1, 0, -1, 1], dtype=np.int32) + tm.assert_numpy_array_equal(res, exp) + + idx = make_sparse_index(4, np.array([], dtype=np.int32), kind=kind) + res = idx.lookup_array(np.array([-1, 0, 2, 4], dtype=np.int32)) + exp = np.array([-1, -1, -1, -1], dtype=np.int32) + tm.assert_numpy_array_equal(res, exp) + + idx = make_sparse_index(4, np.array([0, 1, 2, 3], dtype=np.int32), kind=kind) + res = idx.lookup_array(np.array([-1, 0, 2], dtype=np.int32)) + exp = np.array([-1, 0, 2], dtype=np.int32) + tm.assert_numpy_array_equal(res, exp) + + res = idx.lookup_array(np.array([4, 2, 1, 3], dtype=np.int32)) + exp = np.array([-1, 2, 1, 3], dtype=np.int32) + tm.assert_numpy_array_equal(res, exp) + + idx = make_sparse_index(4, np.array([0, 2, 3], dtype=np.int32), kind=kind) + res = idx.lookup_array(np.array([2, 1, 3, 0], dtype=np.int32)) + exp = np.array([1, -1, 2, 0], dtype=np.int32) + tm.assert_numpy_array_equal(res, exp) + + res = idx.lookup_array(np.array([1, 4, 2, 5], dtype=np.int32)) + exp = np.array([-1, -1, 1, -1], dtype=np.int32) + tm.assert_numpy_array_equal(res, exp) + + @pytest.mark.parametrize( + "idx, expected", + [ + [0, -1], + [5, 0], + [7, 2], + [8, -1], + [9, -1], + [10, -1], + [11, -1], + [12, 3], + [17, 8], + [18, -1], + ], + ) + def test_lookup_basics(self, idx, expected): + bindex = BlockIndex(20, [5, 12], [3, 6]) + assert bindex.lookup(idx) == expected + + iindex = bindex.to_int_index() + assert iindex.lookup(idx) == expected + + +class TestBlockIndex: + def test_block_internal(self): + idx = make_sparse_index(4, np.array([2, 3], dtype=np.int32), kind="block") + assert isinstance(idx, BlockIndex) + assert idx.npoints == 2 + tm.assert_numpy_array_equal(idx.blocs, np.array([2], dtype=np.int32)) + tm.assert_numpy_array_equal(idx.blengths, np.array([2], dtype=np.int32)) + + idx = make_sparse_index(4, np.array([], dtype=np.int32), kind="block") + assert isinstance(idx, BlockIndex) + assert idx.npoints == 0 + tm.assert_numpy_array_equal(idx.blocs, np.array([], dtype=np.int32)) + tm.assert_numpy_array_equal(idx.blengths, np.array([], dtype=np.int32)) + + idx = make_sparse_index(4, np.array([0, 1, 2, 3], dtype=np.int32), kind="block") + assert isinstance(idx, BlockIndex) + assert idx.npoints == 4 + tm.assert_numpy_array_equal(idx.blocs, np.array([0], dtype=np.int32)) + tm.assert_numpy_array_equal(idx.blengths, np.array([4], dtype=np.int32)) + + idx = make_sparse_index(4, np.array([0, 2, 3], dtype=np.int32), kind="block") + assert isinstance(idx, BlockIndex) + assert idx.npoints == 3 + tm.assert_numpy_array_equal(idx.blocs, np.array([0, 2], dtype=np.int32)) + tm.assert_numpy_array_equal(idx.blengths, np.array([1, 2], dtype=np.int32)) + + @pytest.mark.parametrize("i", [5, 10, 100, 101]) + def test_make_block_boundary(self, i): + idx = make_sparse_index(i, np.arange(0, i, 2, dtype=np.int32), kind="block") + + exp = np.arange(0, i, 2, dtype=np.int32) + tm.assert_numpy_array_equal(idx.blocs, exp) + tm.assert_numpy_array_equal(idx.blengths, np.ones(len(exp), dtype=np.int32)) + + def test_equals(self): + index = BlockIndex(10, [0, 4], [2, 5]) + + assert index.equals(index) + assert not index.equals(BlockIndex(10, [0, 4], [2, 6])) + + def test_check_integrity(self): + locs = [] + lengths = [] + + # 0-length OK + BlockIndex(0, locs, lengths) + + # also OK even though empty + BlockIndex(1, locs, lengths) + + msg = "Block 0 extends beyond end" + with pytest.raises(ValueError, match=msg): + BlockIndex(10, [5], [10]) + + msg = "Block 0 overlaps" + with pytest.raises(ValueError, match=msg): + BlockIndex(10, [2, 5], [5, 3]) + + def test_to_int_index(self): + locs = [0, 10] + lengths = [4, 6] + exp_inds = [0, 1, 2, 3, 10, 11, 12, 13, 14, 15] + + block = BlockIndex(20, locs, lengths) + dense = block.to_int_index() + + tm.assert_numpy_array_equal(dense.indices, np.array(exp_inds, dtype=np.int32)) + + def test_to_block_index(self): + index = BlockIndex(10, [0, 5], [4, 5]) + assert index.to_block_index() is index + + +class TestIntIndex: + def test_check_integrity(self): + # Too many indices than specified in self.length + msg = "Too many indices" + + with pytest.raises(ValueError, match=msg): + IntIndex(length=1, indices=[1, 2, 3]) + + # No index can be negative. + msg = "No index can be less than zero" + + with pytest.raises(ValueError, match=msg): + IntIndex(length=5, indices=[1, -2, 3]) + + # No index can be negative. + msg = "No index can be less than zero" + + with pytest.raises(ValueError, match=msg): + IntIndex(length=5, indices=[1, -2, 3]) + + # All indices must be less than the length. + msg = "All indices must be less than the length" + + with pytest.raises(ValueError, match=msg): + IntIndex(length=5, indices=[1, 2, 5]) + + with pytest.raises(ValueError, match=msg): + IntIndex(length=5, indices=[1, 2, 6]) + + # Indices must be strictly ascending. + msg = "Indices must be strictly increasing" + + with pytest.raises(ValueError, match=msg): + IntIndex(length=5, indices=[1, 3, 2]) + + with pytest.raises(ValueError, match=msg): + IntIndex(length=5, indices=[1, 3, 3]) + + def test_int_internal(self): + idx = make_sparse_index(4, np.array([2, 3], dtype=np.int32), kind="integer") + assert isinstance(idx, IntIndex) + assert idx.npoints == 2 + tm.assert_numpy_array_equal(idx.indices, np.array([2, 3], dtype=np.int32)) + + idx = make_sparse_index(4, np.array([], dtype=np.int32), kind="integer") + assert isinstance(idx, IntIndex) + assert idx.npoints == 0 + tm.assert_numpy_array_equal(idx.indices, np.array([], dtype=np.int32)) + + idx = make_sparse_index( + 4, np.array([0, 1, 2, 3], dtype=np.int32), kind="integer" + ) + assert isinstance(idx, IntIndex) + assert idx.npoints == 4 + tm.assert_numpy_array_equal(idx.indices, np.array([0, 1, 2, 3], dtype=np.int32)) + + def test_equals(self): + index = IntIndex(10, [0, 1, 2, 3, 4]) + assert index.equals(index) + assert not index.equals(IntIndex(10, [0, 1, 2, 3])) + + def test_to_block_index(self, cases, test_length): + xloc, xlen, yloc, ylen, _, _ = cases + xindex = BlockIndex(test_length, xloc, xlen) + yindex = BlockIndex(test_length, yloc, ylen) + + # see if survive the round trip + xbindex = xindex.to_int_index().to_block_index() + ybindex = yindex.to_int_index().to_block_index() + assert isinstance(xbindex, BlockIndex) + assert xbindex.equals(xindex) + assert ybindex.equals(yindex) + + def test_to_int_index(self): + index = IntIndex(10, [2, 3, 4, 5, 6]) + assert index.to_int_index() is index + + +class TestSparseOperators: + @pytest.mark.parametrize("opname", ["add", "sub", "mul", "truediv", "floordiv"]) + def test_op(self, opname, cases, test_length): + xloc, xlen, yloc, ylen, _, _ = cases + sparse_op = getattr(splib, f"sparse_{opname}_float64") + python_op = getattr(operator, opname) + + xindex = BlockIndex(test_length, xloc, xlen) + yindex = BlockIndex(test_length, yloc, ylen) + + xdindex = xindex.to_int_index() + ydindex = yindex.to_int_index() + + x = np.arange(xindex.npoints) * 10.0 + 1 + y = np.arange(yindex.npoints) * 100.0 + 1 + + xfill = 0 + yfill = 2 + + result_block_vals, rb_index, bfill = sparse_op( + x, xindex, xfill, y, yindex, yfill + ) + result_int_vals, ri_index, ifill = sparse_op( + x, xdindex, xfill, y, ydindex, yfill + ) + + assert rb_index.to_int_index().equals(ri_index) + tm.assert_numpy_array_equal(result_block_vals, result_int_vals) + assert bfill == ifill + + # check versus Series... + xseries = Series(x, xdindex.indices) + xseries = xseries.reindex(np.arange(test_length)).fillna(xfill) + + yseries = Series(y, ydindex.indices) + yseries = yseries.reindex(np.arange(test_length)).fillna(yfill) + + series_result = python_op(xseries, yseries) + series_result = series_result.reindex(ri_index.indices) + + tm.assert_numpy_array_equal(result_block_vals, series_result.values) + tm.assert_numpy_array_equal(result_int_vals, series_result.values) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/sparse/test_reductions.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/sparse/test_reductions.py new file mode 100644 index 0000000000000000000000000000000000000000..f44423d5e635c3f74725db219d48fcaca27c4d53 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/sparse/test_reductions.py @@ -0,0 +1,306 @@ +import numpy as np +import pytest + +from pandas import ( + NaT, + SparseDtype, + Timestamp, + isna, +) +from pandas.core.arrays.sparse import SparseArray + + +class TestReductions: + @pytest.mark.parametrize( + "data,pos,neg", + [ + ([True, True, True], True, False), + ([1, 2, 1], 1, 0), + ([1.0, 2.0, 1.0], 1.0, 0.0), + ], + ) + def test_all(self, data, pos, neg): + # GH#17570 + out = SparseArray(data).all() + assert out + + out = SparseArray(data, fill_value=pos).all() + assert out + + data[1] = neg + out = SparseArray(data).all() + assert not out + + out = SparseArray(data, fill_value=pos).all() + assert not out + + @pytest.mark.parametrize( + "data,pos,neg", + [ + ([True, True, True], True, False), + ([1, 2, 1], 1, 0), + ([1.0, 2.0, 1.0], 1.0, 0.0), + ], + ) + def test_numpy_all(self, data, pos, neg): + # GH#17570 + out = np.all(SparseArray(data)) + assert out + + out = np.all(SparseArray(data, fill_value=pos)) + assert out + + data[1] = neg + out = np.all(SparseArray(data)) + assert not out + + out = np.all(SparseArray(data, fill_value=pos)) + assert not out + + # raises with a different message on py2. + msg = "the 'out' parameter is not supported" + with pytest.raises(ValueError, match=msg): + np.all(SparseArray(data), out=np.array([])) + + @pytest.mark.parametrize( + "data,pos,neg", + [ + ([False, True, False], True, False), + ([0, 2, 0], 2, 0), + ([0.0, 2.0, 0.0], 2.0, 0.0), + ], + ) + def test_any(self, data, pos, neg): + # GH#17570 + out = SparseArray(data).any() + assert out + + out = SparseArray(data, fill_value=pos).any() + assert out + + data[1] = neg + out = SparseArray(data).any() + assert not out + + out = SparseArray(data, fill_value=pos).any() + assert not out + + @pytest.mark.parametrize( + "data,pos,neg", + [ + ([False, True, False], True, False), + ([0, 2, 0], 2, 0), + ([0.0, 2.0, 0.0], 2.0, 0.0), + ], + ) + def test_numpy_any(self, data, pos, neg): + # GH#17570 + out = np.any(SparseArray(data)) + assert out + + out = np.any(SparseArray(data, fill_value=pos)) + assert out + + data[1] = neg + out = np.any(SparseArray(data)) + assert not out + + out = np.any(SparseArray(data, fill_value=pos)) + assert not out + + msg = "the 'out' parameter is not supported" + with pytest.raises(ValueError, match=msg): + np.any(SparseArray(data), out=out) + + def test_sum(self): + data = np.arange(10).astype(float) + out = SparseArray(data).sum() + assert out == 45.0 + + data[5] = np.nan + out = SparseArray(data, fill_value=2).sum() + assert out == 40.0 + + out = SparseArray(data, fill_value=np.nan).sum() + assert out == 40.0 + + @pytest.mark.parametrize( + "arr", + [np.array([0, 1, np.nan, 1]), np.array([0, 1, 1])], + ) + @pytest.mark.parametrize("fill_value", [0, 1, np.nan]) + @pytest.mark.parametrize("min_count, expected", [(3, 2), (4, np.nan)]) + def test_sum_min_count(self, arr, fill_value, min_count, expected): + # GH#25777 + sparray = SparseArray(arr, fill_value=fill_value) + result = sparray.sum(min_count=min_count) + if np.isnan(expected): + assert np.isnan(result) + else: + assert result == expected + + def test_bool_sum_min_count(self): + spar_bool = SparseArray([False, True] * 5, dtype=np.bool_, fill_value=True) + res = spar_bool.sum(min_count=1) + assert res == 5 + res = spar_bool.sum(min_count=11) + assert isna(res) + + def test_numpy_sum(self): + data = np.arange(10).astype(float) + out = np.sum(SparseArray(data)) + assert out == 45.0 + + data[5] = np.nan + out = np.sum(SparseArray(data, fill_value=2)) + assert out == 40.0 + + out = np.sum(SparseArray(data, fill_value=np.nan)) + assert out == 40.0 + + msg = "the 'dtype' parameter is not supported" + with pytest.raises(ValueError, match=msg): + np.sum(SparseArray(data), dtype=np.int64) + + msg = "the 'out' parameter is not supported" + with pytest.raises(ValueError, match=msg): + np.sum(SparseArray(data), out=out) + + def test_mean(self): + data = np.arange(10).astype(float) + out = SparseArray(data).mean() + assert out == 4.5 + + data[5] = np.nan + out = SparseArray(data).mean() + assert out == 40.0 / 9 + + def test_numpy_mean(self): + data = np.arange(10).astype(float) + out = np.mean(SparseArray(data)) + assert out == 4.5 + + data[5] = np.nan + out = np.mean(SparseArray(data)) + assert out == 40.0 / 9 + + msg = "the 'dtype' parameter is not supported" + with pytest.raises(ValueError, match=msg): + np.mean(SparseArray(data), dtype=np.int64) + + msg = "the 'out' parameter is not supported" + with pytest.raises(ValueError, match=msg): + np.mean(SparseArray(data), out=out) + + +class TestMinMax: + @pytest.mark.parametrize( + "raw_data,max_expected,min_expected", + [ + (np.arange(5.0), [4], [0]), + (-np.arange(5.0), [0], [-4]), + (np.array([0, 1, 2, np.nan, 4]), [4], [0]), + (np.array([np.nan] * 5), [np.nan], [np.nan]), + (np.array([]), [np.nan], [np.nan]), + ], + ) + def test_nan_fill_value(self, raw_data, max_expected, min_expected): + arr = SparseArray(raw_data) + max_result = arr.max() + min_result = arr.min() + assert max_result in max_expected + assert min_result in min_expected + + max_result = arr.max(skipna=False) + min_result = arr.min(skipna=False) + if np.isnan(raw_data).any(): + assert np.isnan(max_result) + assert np.isnan(min_result) + else: + assert max_result in max_expected + assert min_result in min_expected + + @pytest.mark.parametrize( + "fill_value,max_expected,min_expected", + [ + (100, 100, 0), + (-100, 1, -100), + ], + ) + def test_fill_value(self, fill_value, max_expected, min_expected): + arr = SparseArray( + np.array([fill_value, 0, 1]), dtype=SparseDtype("int", fill_value) + ) + max_result = arr.max() + assert max_result == max_expected + + min_result = arr.min() + assert min_result == min_expected + + def test_only_fill_value(self): + fv = 100 + arr = SparseArray(np.array([fv, fv, fv]), dtype=SparseDtype("int", fv)) + assert len(arr._valid_sp_values) == 0 + + assert arr.max() == fv + assert arr.min() == fv + assert arr.max(skipna=False) == fv + assert arr.min(skipna=False) == fv + + @pytest.mark.parametrize("func", ["min", "max"]) + @pytest.mark.parametrize("data", [np.array([]), np.array([np.nan, np.nan])]) + @pytest.mark.parametrize( + "dtype,expected", + [ + (SparseDtype(np.float64, np.nan), np.nan), + (SparseDtype(np.float64, 5.0), np.nan), + (SparseDtype("datetime64[ns]", NaT), NaT), + (SparseDtype("datetime64[ns]", Timestamp("2018-05-05")), NaT), + ], + ) + def test_na_value_if_no_valid_values(self, func, data, dtype, expected): + arr = SparseArray(data, dtype=dtype) + result = getattr(arr, func)() + if expected is NaT: + # TODO: pin down whether we wrap datetime64("NaT") + assert result is NaT or np.isnat(result) + else: + assert np.isnan(result) + + +class TestArgmaxArgmin: + @pytest.mark.parametrize( + "arr,argmax_expected,argmin_expected", + [ + (SparseArray([1, 2, 0, 1, 2]), 1, 2), + (SparseArray([-1, -2, 0, -1, -2]), 2, 1), + (SparseArray([np.nan, 1, 0, 0, np.nan, -1]), 1, 5), + (SparseArray([np.nan, 1, 0, 0, np.nan, 2]), 5, 2), + (SparseArray([np.nan, 1, 0, 0, np.nan, 2], fill_value=-1), 5, 2), + (SparseArray([np.nan, 1, 0, 0, np.nan, 2], fill_value=0), 5, 2), + (SparseArray([np.nan, 1, 0, 0, np.nan, 2], fill_value=1), 5, 2), + (SparseArray([np.nan, 1, 0, 0, np.nan, 2], fill_value=2), 5, 2), + (SparseArray([np.nan, 1, 0, 0, np.nan, 2], fill_value=3), 5, 2), + (SparseArray([0] * 10 + [-1], fill_value=0), 0, 10), + (SparseArray([0] * 10 + [-1], fill_value=-1), 0, 10), + (SparseArray([0] * 10 + [-1], fill_value=1), 0, 10), + (SparseArray([-1] + [0] * 10, fill_value=0), 1, 0), + (SparseArray([1] + [0] * 10, fill_value=0), 0, 1), + (SparseArray([-1] + [0] * 10, fill_value=-1), 1, 0), + (SparseArray([1] + [0] * 10, fill_value=1), 0, 1), + ], + ) + def test_argmax_argmin(self, arr, argmax_expected, argmin_expected): + argmax_result = arr.argmax() + argmin_result = arr.argmin() + assert argmax_result == argmax_expected + assert argmin_result == argmin_expected + + @pytest.mark.parametrize( + "arr,method", + [(SparseArray([]), "argmax"), (SparseArray([]), "argmin")], + ) + def test_empty_array(self, arr, method): + msg = f"attempt to get {method} of an empty sequence" + with pytest.raises(ValueError, match=msg): + arr.argmax() if method == "argmax" else arr.argmin() diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/sparse/test_unary.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/sparse/test_unary.py new file mode 100644 index 0000000000000000000000000000000000000000..c00a73773fdd4795e3d5d7f030a591a060dc3bfc --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/sparse/test_unary.py @@ -0,0 +1,79 @@ +import operator + +import numpy as np +import pytest + +import pandas as pd +import pandas._testing as tm +from pandas.core.arrays import SparseArray + + +@pytest.mark.filterwarnings("ignore:invalid value encountered in cast:RuntimeWarning") +@pytest.mark.parametrize("fill_value", [0, np.nan]) +@pytest.mark.parametrize("op", [operator.pos, operator.neg]) +def test_unary_op(op, fill_value): + arr = np.array([0, 1, np.nan, 2]) + sparray = SparseArray(arr, fill_value=fill_value) + result = op(sparray) + expected = SparseArray(op(arr), fill_value=op(fill_value)) + tm.assert_sp_array_equal(result, expected) + + +@pytest.mark.parametrize("fill_value", [True, False]) +def test_invert(fill_value): + arr = np.array([True, False, False, True]) + sparray = SparseArray(arr, fill_value=fill_value) + result = ~sparray + expected = SparseArray(~arr, fill_value=not fill_value) + tm.assert_sp_array_equal(result, expected) + + result = ~pd.Series(sparray) + expected = pd.Series(expected) + tm.assert_series_equal(result, expected) + + result = ~pd.DataFrame({"A": sparray}) + expected = pd.DataFrame({"A": expected}) + tm.assert_frame_equal(result, expected) + + +class TestUnaryMethods: + @pytest.mark.filterwarnings( + "ignore:invalid value encountered in cast:RuntimeWarning" + ) + def test_neg_operator(self): + arr = SparseArray([-1, -2, np.nan, 3], fill_value=np.nan, dtype=np.int8) + res = -arr + exp = SparseArray([1, 2, np.nan, -3], fill_value=np.nan, dtype=np.int8) + tm.assert_sp_array_equal(exp, res) + + arr = SparseArray([-1, -2, 1, 3], fill_value=-1, dtype=np.int8) + res = -arr + exp = SparseArray([1, 2, -1, -3], fill_value=1, dtype=np.int8) + tm.assert_sp_array_equal(exp, res) + + @pytest.mark.filterwarnings( + "ignore:invalid value encountered in cast:RuntimeWarning" + ) + def test_abs_operator(self): + arr = SparseArray([-1, -2, np.nan, 3], fill_value=np.nan, dtype=np.int8) + res = abs(arr) + exp = SparseArray([1, 2, np.nan, 3], fill_value=np.nan, dtype=np.int8) + tm.assert_sp_array_equal(exp, res) + + arr = SparseArray([-1, -2, 1, 3], fill_value=-1, dtype=np.int8) + res = abs(arr) + exp = SparseArray([1, 2, 1, 3], fill_value=1, dtype=np.int8) + tm.assert_sp_array_equal(exp, res) + + def test_invert_operator(self): + arr = SparseArray([False, True, False, True], fill_value=False, dtype=np.bool_) + exp = SparseArray( + np.invert([False, True, False, True]), fill_value=True, dtype=np.bool_ + ) + res = ~arr + tm.assert_sp_array_equal(exp, res) + + arr = SparseArray([0, 1, 0, 2, 3, 0], fill_value=0, dtype=np.int32) + res = ~arr + exp = SparseArray([-1, -2, -1, -3, -4, -1], fill_value=-1, dtype=np.int32) + tm.assert_sp_array_equal(exp, res) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/string_/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/string_/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..8a4012e2492dd606725d9c8d2e748ce79d1259dc Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/string_/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/string_/__pycache__/test_string_arrow.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/string_/__pycache__/test_string_arrow.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b6798e63596013c8dfe94de837e2f4c43f1b8f62 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/string_/__pycache__/test_string_arrow.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/timedeltas/__init__.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/timedeltas/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/timedeltas/test_constructors.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/timedeltas/test_constructors.py new file mode 100644 index 0000000000000000000000000000000000000000..91b6f7fa222f9a668092a99a8371753e914008c8 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/timedeltas/test_constructors.py @@ -0,0 +1,103 @@ +import numpy as np +import pytest + +import pandas._testing as tm +from pandas.core.arrays import TimedeltaArray + + +class TestTimedeltaArrayConstructor: + def test_only_1dim_accepted(self): + # GH#25282 + arr = np.array([0, 1, 2, 3], dtype="m8[h]").astype("m8[ns]") + + depr_msg = "TimedeltaArray.__init__ is deprecated" + with tm.assert_produces_warning(FutureWarning, match=depr_msg): + with pytest.raises(ValueError, match="Only 1-dimensional"): + # 3-dim, we allow 2D to sneak in for ops purposes GH#29853 + TimedeltaArray(arr.reshape(2, 2, 1)) + + with tm.assert_produces_warning(FutureWarning, match=depr_msg): + with pytest.raises(ValueError, match="Only 1-dimensional"): + # 0-dim + TimedeltaArray(arr[[0]].squeeze()) + + def test_freq_validation(self): + # ensure that the public constructor cannot create an invalid instance + arr = np.array([0, 0, 1], dtype=np.int64) * 3600 * 10**9 + + msg = ( + "Inferred frequency None from passed values does not " + "conform to passed frequency D" + ) + depr_msg = "TimedeltaArray.__init__ is deprecated" + with tm.assert_produces_warning(FutureWarning, match=depr_msg): + with pytest.raises(ValueError, match=msg): + TimedeltaArray(arr.view("timedelta64[ns]"), freq="D") + + def test_non_array_raises(self): + depr_msg = "TimedeltaArray.__init__ is deprecated" + with tm.assert_produces_warning(FutureWarning, match=depr_msg): + with pytest.raises(ValueError, match="list"): + TimedeltaArray([1, 2, 3]) + + def test_other_type_raises(self): + msg = r"dtype bool cannot be converted to timedelta64\[ns\]" + with pytest.raises(TypeError, match=msg): + TimedeltaArray._from_sequence(np.array([1, 2, 3], dtype="bool")) + + def test_incorrect_dtype_raises(self): + msg = "dtype 'category' is invalid, should be np.timedelta64 dtype" + with pytest.raises(ValueError, match=msg): + TimedeltaArray._from_sequence( + np.array([1, 2, 3], dtype="i8"), dtype="category" + ) + + msg = "dtype 'int64' is invalid, should be np.timedelta64 dtype" + with pytest.raises(ValueError, match=msg): + TimedeltaArray._from_sequence( + np.array([1, 2, 3], dtype="i8"), dtype=np.dtype("int64") + ) + + msg = r"dtype 'datetime64\[ns\]' is invalid, should be np.timedelta64 dtype" + with pytest.raises(ValueError, match=msg): + TimedeltaArray._from_sequence( + np.array([1, 2, 3], dtype="i8"), dtype=np.dtype("M8[ns]") + ) + + msg = ( + r"dtype 'datetime64\[us, UTC\]' is invalid, should be np.timedelta64 dtype" + ) + with pytest.raises(ValueError, match=msg): + TimedeltaArray._from_sequence( + np.array([1, 2, 3], dtype="i8"), dtype="M8[us, UTC]" + ) + + msg = "Supported timedelta64 resolutions are 's', 'ms', 'us', 'ns'" + with pytest.raises(ValueError, match=msg): + TimedeltaArray._from_sequence( + np.array([1, 2, 3], dtype="i8"), dtype=np.dtype("m8[Y]") + ) + + def test_mismatched_values_dtype_units(self): + arr = np.array([1, 2, 3], dtype="m8[s]") + dtype = np.dtype("m8[ns]") + msg = r"Values resolution does not match dtype" + depr_msg = "TimedeltaArray.__init__ is deprecated" + + with tm.assert_produces_warning(FutureWarning, match=depr_msg): + with pytest.raises(ValueError, match=msg): + TimedeltaArray(arr, dtype=dtype) + + def test_copy(self): + data = np.array([1, 2, 3], dtype="m8[ns]") + arr = TimedeltaArray._from_sequence(data, copy=False) + assert arr._ndarray is data + + arr = TimedeltaArray._from_sequence(data, copy=True) + assert arr._ndarray is not data + assert arr._ndarray.base is not data + + def test_from_sequence_dtype(self): + msg = "dtype 'object' is invalid, should be np.timedelta64 dtype" + with pytest.raises(ValueError, match=msg): + TimedeltaArray._from_sequence([], dtype=object) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/timedeltas/test_cumulative.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/timedeltas/test_cumulative.py new file mode 100644 index 0000000000000000000000000000000000000000..2d8fe65f807e431d788e526eee058780b5bf979c --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/timedeltas/test_cumulative.py @@ -0,0 +1,20 @@ +import pytest + +import pandas._testing as tm +from pandas.core.arrays import TimedeltaArray + + +class TestAccumulator: + def test_accumulators_disallowed(self): + # GH#50297 + arr = TimedeltaArray._from_sequence(["1D", "2D"], dtype="m8[ns]") + with pytest.raises(TypeError, match="cumprod not supported"): + arr._accumulate("cumprod") + + def test_cumsum(self, unit): + # GH#50297 + dtype = f"m8[{unit}]" + arr = TimedeltaArray._from_sequence(["1D", "2D"], dtype=dtype) + result = arr._accumulate("cumsum") + expected = TimedeltaArray._from_sequence(["1D", "3D"], dtype=dtype) + tm.assert_timedelta_array_equal(result, expected) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/timedeltas/test_reductions.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/timedeltas/test_reductions.py new file mode 100644 index 0000000000000000000000000000000000000000..991dbf41c808794f9a53a3f3351a9b6e79a7c6fd --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/arrays/timedeltas/test_reductions.py @@ -0,0 +1,218 @@ +import numpy as np +import pytest + +import pandas as pd +from pandas import Timedelta +import pandas._testing as tm +from pandas.core import nanops +from pandas.core.arrays import TimedeltaArray + + +class TestReductions: + @pytest.mark.parametrize("name", ["std", "min", "max", "median", "mean"]) + @pytest.mark.parametrize("skipna", [True, False]) + def test_reductions_empty(self, name, skipna): + tdi = pd.TimedeltaIndex([]) + arr = tdi.array + + result = getattr(tdi, name)(skipna=skipna) + assert result is pd.NaT + + result = getattr(arr, name)(skipna=skipna) + assert result is pd.NaT + + @pytest.mark.parametrize("skipna", [True, False]) + def test_sum_empty(self, skipna): + tdi = pd.TimedeltaIndex([]) + arr = tdi.array + + result = tdi.sum(skipna=skipna) + assert isinstance(result, Timedelta) + assert result == Timedelta(0) + + result = arr.sum(skipna=skipna) + assert isinstance(result, Timedelta) + assert result == Timedelta(0) + + def test_min_max(self, unit): + dtype = f"m8[{unit}]" + arr = TimedeltaArray._from_sequence( + ["3h", "3h", "NaT", "2h", "5h", "4h"], dtype=dtype + ) + + result = arr.min() + expected = Timedelta("2h") + assert result == expected + + result = arr.max() + expected = Timedelta("5h") + assert result == expected + + result = arr.min(skipna=False) + assert result is pd.NaT + + result = arr.max(skipna=False) + assert result is pd.NaT + + def test_sum(self): + tdi = pd.TimedeltaIndex(["3h", "3h", "NaT", "2h", "5h", "4h"]) + arr = tdi.array + + result = arr.sum(skipna=True) + expected = Timedelta(hours=17) + assert isinstance(result, Timedelta) + assert result == expected + + result = tdi.sum(skipna=True) + assert isinstance(result, Timedelta) + assert result == expected + + result = arr.sum(skipna=False) + assert result is pd.NaT + + result = tdi.sum(skipna=False) + assert result is pd.NaT + + result = arr.sum(min_count=9) + assert result is pd.NaT + + result = tdi.sum(min_count=9) + assert result is pd.NaT + + result = arr.sum(min_count=1) + assert isinstance(result, Timedelta) + assert result == expected + + result = tdi.sum(min_count=1) + assert isinstance(result, Timedelta) + assert result == expected + + def test_npsum(self): + # GH#25282, GH#25335 np.sum should return a Timedelta, not timedelta64 + tdi = pd.TimedeltaIndex(["3h", "3h", "2h", "5h", "4h"]) + arr = tdi.array + + result = np.sum(tdi) + expected = Timedelta(hours=17) + assert isinstance(result, Timedelta) + assert result == expected + + result = np.sum(arr) + assert isinstance(result, Timedelta) + assert result == expected + + def test_sum_2d_skipna_false(self): + arr = np.arange(8).astype(np.int64).view("m8[s]").astype("m8[ns]").reshape(4, 2) + arr[-1, -1] = "Nat" + + tda = TimedeltaArray._from_sequence(arr) + + result = tda.sum(skipna=False) + assert result is pd.NaT + + result = tda.sum(axis=0, skipna=False) + expected = pd.TimedeltaIndex([Timedelta(seconds=12), pd.NaT])._values + tm.assert_timedelta_array_equal(result, expected) + + result = tda.sum(axis=1, skipna=False) + expected = pd.TimedeltaIndex( + [ + Timedelta(seconds=1), + Timedelta(seconds=5), + Timedelta(seconds=9), + pd.NaT, + ] + )._values + tm.assert_timedelta_array_equal(result, expected) + + # Adding a Timestamp makes this a test for DatetimeArray.std + @pytest.mark.parametrize( + "add", + [ + Timedelta(0), + pd.Timestamp("2021-01-01"), + pd.Timestamp("2021-01-01", tz="UTC"), + pd.Timestamp("2021-01-01", tz="Asia/Tokyo"), + ], + ) + def test_std(self, add): + tdi = pd.TimedeltaIndex(["0h", "4h", "NaT", "4h", "0h", "2h"]) + add + arr = tdi.array + + result = arr.std(skipna=True) + expected = Timedelta(hours=2) + assert isinstance(result, Timedelta) + assert result == expected + + result = tdi.std(skipna=True) + assert isinstance(result, Timedelta) + assert result == expected + + if getattr(arr, "tz", None) is None: + result = nanops.nanstd(np.asarray(arr), skipna=True) + assert isinstance(result, np.timedelta64) + assert result == expected + + result = arr.std(skipna=False) + assert result is pd.NaT + + result = tdi.std(skipna=False) + assert result is pd.NaT + + if getattr(arr, "tz", None) is None: + result = nanops.nanstd(np.asarray(arr), skipna=False) + assert isinstance(result, np.timedelta64) + assert np.isnat(result) + + def test_median(self): + tdi = pd.TimedeltaIndex(["0h", "3h", "NaT", "5h06m", "0h", "2h"]) + arr = tdi.array + + result = arr.median(skipna=True) + expected = Timedelta(hours=2) + assert isinstance(result, Timedelta) + assert result == expected + + result = tdi.median(skipna=True) + assert isinstance(result, Timedelta) + assert result == expected + + result = arr.median(skipna=False) + assert result is pd.NaT + + result = tdi.median(skipna=False) + assert result is pd.NaT + + def test_mean(self): + tdi = pd.TimedeltaIndex(["0h", "3h", "NaT", "5h06m", "0h", "2h"]) + arr = tdi._data + + # manually verified result + expected = Timedelta(arr.dropna()._ndarray.mean()) + + result = arr.mean() + assert result == expected + result = arr.mean(skipna=False) + assert result is pd.NaT + + result = arr.dropna().mean(skipna=False) + assert result == expected + + result = arr.mean(axis=0) + assert result == expected + + def test_mean_2d(self): + tdi = pd.timedelta_range("14 days", periods=6) + tda = tdi._data.reshape(3, 2) + + result = tda.mean(axis=0) + expected = tda[1] + tm.assert_timedelta_array_equal(result, expected) + + result = tda.mean(axis=1) + expected = tda[:, 0] + Timedelta(hours=12) + tm.assert_timedelta_array_equal(result, expected) + + result = tda.mean(axis=None) + expected = tdi.mean() + assert result == expected diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/base/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/base/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..2de0d8fa53e33fa5108d22086b36ee631d7c44fe Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/base/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/base/__pycache__/common.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/base/__pycache__/common.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..61f11b4fa0872b6480971199ea0fe74e73be7a42 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/base/__pycache__/common.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/base/__pycache__/test_constructors.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/base/__pycache__/test_constructors.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..de33c842386b5a949654149c9e9c4e8965fca34c Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/base/__pycache__/test_constructors.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/base/__pycache__/test_conversion.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/base/__pycache__/test_conversion.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..90d906e098d33f1458d4c138b02f485564c9ea85 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/base/__pycache__/test_conversion.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/base/__pycache__/test_fillna.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/base/__pycache__/test_fillna.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..336203c1bbcfcbe9c3cd0a7e558a62f8697ece3a Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/base/__pycache__/test_fillna.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/base/__pycache__/test_misc.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/base/__pycache__/test_misc.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e257489a19b3fbf1e9701a5082d0c8b1a1ff641e Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/base/__pycache__/test_misc.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/base/__pycache__/test_transpose.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/base/__pycache__/test_transpose.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d2273e9f475777dadb956fea0796234ff51f38cf Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/base/__pycache__/test_transpose.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/base/__pycache__/test_unique.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/base/__pycache__/test_unique.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..171cd94692c64c24e427b21c5469ab5c632e7cb9 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/base/__pycache__/test_unique.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/base/__pycache__/test_value_counts.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/base/__pycache__/test_value_counts.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..2aa931b17a0ff095ec027f63408eb99da9f5fbe3 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/base/__pycache__/test_value_counts.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/copy_view/__init__.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/copy_view/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/copy_view/test_array.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/copy_view/test_array.py new file mode 100644 index 0000000000000000000000000000000000000000..9a3f83e0293f539cd2a68e9eb515cd817d7eb48b --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/copy_view/test_array.py @@ -0,0 +1,190 @@ +import numpy as np +import pytest + +from pandas import ( + DataFrame, + Series, + date_range, +) +import pandas._testing as tm +from pandas.tests.copy_view.util import get_array + +# ----------------------------------------------------------------------------- +# Copy/view behaviour for accessing underlying array of Series/DataFrame + + +@pytest.mark.parametrize( + "method", + [lambda ser: ser.values, lambda ser: np.asarray(ser)], + ids=["values", "asarray"], +) +def test_series_values(using_copy_on_write, method): + ser = Series([1, 2, 3], name="name") + ser_orig = ser.copy() + + arr = method(ser) + + if using_copy_on_write: + # .values still gives a view but is read-only + assert np.shares_memory(arr, get_array(ser, "name")) + assert arr.flags.writeable is False + + # mutating series through arr therefore doesn't work + with pytest.raises(ValueError, match="read-only"): + arr[0] = 0 + tm.assert_series_equal(ser, ser_orig) + + # mutating the series itself still works + ser.iloc[0] = 0 + assert ser.values[0] == 0 + else: + assert arr.flags.writeable is True + arr[0] = 0 + assert ser.iloc[0] == 0 + + +@pytest.mark.parametrize( + "method", + [lambda df: df.values, lambda df: np.asarray(df)], + ids=["values", "asarray"], +) +def test_dataframe_values(using_copy_on_write, using_array_manager, method): + df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) + df_orig = df.copy() + + arr = method(df) + + if using_copy_on_write: + # .values still gives a view but is read-only + assert np.shares_memory(arr, get_array(df, "a")) + assert arr.flags.writeable is False + + # mutating series through arr therefore doesn't work + with pytest.raises(ValueError, match="read-only"): + arr[0, 0] = 0 + tm.assert_frame_equal(df, df_orig) + + # mutating the series itself still works + df.iloc[0, 0] = 0 + assert df.values[0, 0] == 0 + else: + assert arr.flags.writeable is True + arr[0, 0] = 0 + if not using_array_manager: + assert df.iloc[0, 0] == 0 + else: + tm.assert_frame_equal(df, df_orig) + + +def test_series_to_numpy(using_copy_on_write): + ser = Series([1, 2, 3], name="name") + ser_orig = ser.copy() + + # default: copy=False, no dtype or NAs + arr = ser.to_numpy() + if using_copy_on_write: + # to_numpy still gives a view but is read-only + assert np.shares_memory(arr, get_array(ser, "name")) + assert arr.flags.writeable is False + + # mutating series through arr therefore doesn't work + with pytest.raises(ValueError, match="read-only"): + arr[0] = 0 + tm.assert_series_equal(ser, ser_orig) + + # mutating the series itself still works + ser.iloc[0] = 0 + assert ser.values[0] == 0 + else: + assert arr.flags.writeable is True + arr[0] = 0 + assert ser.iloc[0] == 0 + + # specify copy=False gives a writeable array + ser = Series([1, 2, 3], name="name") + arr = ser.to_numpy(copy=True) + assert not np.shares_memory(arr, get_array(ser, "name")) + assert arr.flags.writeable is True + + # specifying a dtype that already causes a copy also gives a writeable array + ser = Series([1, 2, 3], name="name") + arr = ser.to_numpy(dtype="float64") + assert not np.shares_memory(arr, get_array(ser, "name")) + assert arr.flags.writeable is True + + +@pytest.mark.parametrize("order", ["F", "C"]) +def test_ravel_read_only(using_copy_on_write, order): + ser = Series([1, 2, 3]) + with tm.assert_produces_warning(FutureWarning, match="is deprecated"): + arr = ser.ravel(order=order) + if using_copy_on_write: + assert arr.flags.writeable is False + assert np.shares_memory(get_array(ser), arr) + + +def test_series_array_ea_dtypes(using_copy_on_write): + ser = Series([1, 2, 3], dtype="Int64") + arr = np.asarray(ser, dtype="int64") + assert np.shares_memory(arr, get_array(ser)) + if using_copy_on_write: + assert arr.flags.writeable is False + else: + assert arr.flags.writeable is True + + arr = np.asarray(ser) + assert np.shares_memory(arr, get_array(ser)) + if using_copy_on_write: + assert arr.flags.writeable is False + else: + assert arr.flags.writeable is True + + +def test_dataframe_array_ea_dtypes(using_copy_on_write): + df = DataFrame({"a": [1, 2, 3]}, dtype="Int64") + arr = np.asarray(df, dtype="int64") + assert np.shares_memory(arr, get_array(df, "a")) + if using_copy_on_write: + assert arr.flags.writeable is False + else: + assert arr.flags.writeable is True + + arr = np.asarray(df) + assert np.shares_memory(arr, get_array(df, "a")) + if using_copy_on_write: + assert arr.flags.writeable is False + else: + assert arr.flags.writeable is True + + +def test_dataframe_array_string_dtype(using_copy_on_write, using_array_manager): + df = DataFrame({"a": ["a", "b"]}, dtype="string") + arr = np.asarray(df) + if not using_array_manager: + assert np.shares_memory(arr, get_array(df, "a")) + if using_copy_on_write: + assert arr.flags.writeable is False + else: + assert arr.flags.writeable is True + + +def test_dataframe_multiple_numpy_dtypes(): + df = DataFrame({"a": [1, 2, 3], "b": 1.5}) + arr = np.asarray(df) + assert not np.shares_memory(arr, get_array(df, "a")) + assert arr.flags.writeable is True + + +def test_values_is_ea(using_copy_on_write): + df = DataFrame({"a": date_range("2012-01-01", periods=3)}) + arr = np.asarray(df) + if using_copy_on_write: + assert arr.flags.writeable is False + else: + assert arr.flags.writeable is True + + +def test_empty_dataframe(): + df = DataFrame() + arr = np.asarray(df) + assert arr.flags.writeable is True diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/copy_view/test_astype.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/copy_view/test_astype.py new file mode 100644 index 0000000000000000000000000000000000000000..d462ce3d3187daf1b414d45ffe8193500ac8487c --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/copy_view/test_astype.py @@ -0,0 +1,260 @@ +import pickle + +import numpy as np +import pytest + +from pandas.compat.pyarrow import pa_version_under12p0 +import pandas.util._test_decorators as td + +import pandas as pd +from pandas import ( + DataFrame, + Series, + Timestamp, + date_range, +) +import pandas._testing as tm +from pandas.tests.copy_view.util import get_array + + +def test_astype_single_dtype(using_copy_on_write): + df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": 1.5}) + df_orig = df.copy() + df2 = df.astype("float64") + + if using_copy_on_write: + assert np.shares_memory(get_array(df2, "c"), get_array(df, "c")) + assert not np.shares_memory(get_array(df2, "a"), get_array(df, "a")) + else: + assert not np.shares_memory(get_array(df2, "c"), get_array(df, "c")) + assert not np.shares_memory(get_array(df2, "a"), get_array(df, "a")) + + # mutating df2 triggers a copy-on-write for that column/block + df2.iloc[0, 2] = 5.5 + if using_copy_on_write: + assert not np.shares_memory(get_array(df2, "c"), get_array(df, "c")) + tm.assert_frame_equal(df, df_orig) + + # mutating parent also doesn't update result + df2 = df.astype("float64") + df.iloc[0, 2] = 5.5 + tm.assert_frame_equal(df2, df_orig.astype("float64")) + + +@pytest.mark.parametrize("dtype", ["int64", "Int64"]) +@pytest.mark.parametrize("new_dtype", ["int64", "Int64", "int64[pyarrow]"]) +def test_astype_avoids_copy(using_copy_on_write, dtype, new_dtype): + if new_dtype == "int64[pyarrow]": + pytest.importorskip("pyarrow") + df = DataFrame({"a": [1, 2, 3]}, dtype=dtype) + df_orig = df.copy() + df2 = df.astype(new_dtype) + + if using_copy_on_write: + assert np.shares_memory(get_array(df2, "a"), get_array(df, "a")) + else: + assert not np.shares_memory(get_array(df2, "a"), get_array(df, "a")) + + # mutating df2 triggers a copy-on-write for that column/block + df2.iloc[0, 0] = 10 + if using_copy_on_write: + assert not np.shares_memory(get_array(df2, "a"), get_array(df, "a")) + tm.assert_frame_equal(df, df_orig) + + # mutating parent also doesn't update result + df2 = df.astype(new_dtype) + df.iloc[0, 0] = 100 + tm.assert_frame_equal(df2, df_orig.astype(new_dtype)) + + +@pytest.mark.parametrize("dtype", ["float64", "int32", "Int32", "int32[pyarrow]"]) +def test_astype_different_target_dtype(using_copy_on_write, dtype): + if dtype == "int32[pyarrow]": + pytest.importorskip("pyarrow") + df = DataFrame({"a": [1, 2, 3]}) + df_orig = df.copy() + df2 = df.astype(dtype) + + assert not np.shares_memory(get_array(df2, "a"), get_array(df, "a")) + if using_copy_on_write: + assert df2._mgr._has_no_reference(0) + + df2.iloc[0, 0] = 5 + tm.assert_frame_equal(df, df_orig) + + # mutating parent also doesn't update result + df2 = df.astype(dtype) + df.iloc[0, 0] = 100 + tm.assert_frame_equal(df2, df_orig.astype(dtype)) + + +@td.skip_array_manager_invalid_test +def test_astype_numpy_to_ea(): + ser = Series([1, 2, 3]) + with pd.option_context("mode.copy_on_write", True): + result = ser.astype("Int64") + assert np.shares_memory(get_array(ser), get_array(result)) + + +@pytest.mark.parametrize( + "dtype, new_dtype", [("object", "string"), ("string", "object")] +) +def test_astype_string_and_object(using_copy_on_write, dtype, new_dtype): + df = DataFrame({"a": ["a", "b", "c"]}, dtype=dtype) + df_orig = df.copy() + df2 = df.astype(new_dtype) + + if using_copy_on_write: + assert np.shares_memory(get_array(df2, "a"), get_array(df, "a")) + else: + assert not np.shares_memory(get_array(df2, "a"), get_array(df, "a")) + + df2.iloc[0, 0] = "x" + tm.assert_frame_equal(df, df_orig) + + +@pytest.mark.parametrize( + "dtype, new_dtype", [("object", "string"), ("string", "object")] +) +def test_astype_string_and_object_update_original( + using_copy_on_write, dtype, new_dtype +): + df = DataFrame({"a": ["a", "b", "c"]}, dtype=dtype) + df2 = df.astype(new_dtype) + df_orig = df2.copy() + + if using_copy_on_write: + assert np.shares_memory(get_array(df2, "a"), get_array(df, "a")) + else: + assert not np.shares_memory(get_array(df2, "a"), get_array(df, "a")) + + df.iloc[0, 0] = "x" + tm.assert_frame_equal(df2, df_orig) + + +def test_astype_string_copy_on_pickle_roundrip(): + # https://github.com/pandas-dev/pandas/issues/54654 + # ensure_string_array may alter array inplace + base = Series(np.array([(1, 2), None, 1], dtype="object")) + base_copy = pickle.loads(pickle.dumps(base)) + base_copy.astype(str) + tm.assert_series_equal(base, base_copy) + + +def test_astype_dict_dtypes(using_copy_on_write): + df = DataFrame( + {"a": [1, 2, 3], "b": [4, 5, 6], "c": Series([1.5, 1.5, 1.5], dtype="float64")} + ) + df_orig = df.copy() + df2 = df.astype({"a": "float64", "c": "float64"}) + + if using_copy_on_write: + assert np.shares_memory(get_array(df2, "c"), get_array(df, "c")) + assert np.shares_memory(get_array(df2, "b"), get_array(df, "b")) + assert not np.shares_memory(get_array(df2, "a"), get_array(df, "a")) + else: + assert not np.shares_memory(get_array(df2, "c"), get_array(df, "c")) + assert not np.shares_memory(get_array(df2, "b"), get_array(df, "b")) + assert not np.shares_memory(get_array(df2, "a"), get_array(df, "a")) + + # mutating df2 triggers a copy-on-write for that column/block + df2.iloc[0, 2] = 5.5 + if using_copy_on_write: + assert not np.shares_memory(get_array(df2, "c"), get_array(df, "c")) + + df2.iloc[0, 1] = 10 + if using_copy_on_write: + assert not np.shares_memory(get_array(df2, "b"), get_array(df, "b")) + tm.assert_frame_equal(df, df_orig) + + +def test_astype_different_datetime_resos(using_copy_on_write): + df = DataFrame({"a": date_range("2019-12-31", periods=2, freq="D")}) + result = df.astype("datetime64[ms]") + + assert not np.shares_memory(get_array(df, "a"), get_array(result, "a")) + if using_copy_on_write: + assert result._mgr._has_no_reference(0) + + +def test_astype_different_timezones(using_copy_on_write): + df = DataFrame( + {"a": date_range("2019-12-31", periods=5, freq="D", tz="US/Pacific")} + ) + result = df.astype("datetime64[ns, Europe/Berlin]") + if using_copy_on_write: + assert not result._mgr._has_no_reference(0) + assert np.shares_memory(get_array(df, "a"), get_array(result, "a")) + + +def test_astype_different_timezones_different_reso(using_copy_on_write): + df = DataFrame( + {"a": date_range("2019-12-31", periods=5, freq="D", tz="US/Pacific")} + ) + result = df.astype("datetime64[ms, Europe/Berlin]") + if using_copy_on_write: + assert result._mgr._has_no_reference(0) + assert not np.shares_memory(get_array(df, "a"), get_array(result, "a")) + + +def test_astype_arrow_timestamp(using_copy_on_write): + pytest.importorskip("pyarrow") + df = DataFrame( + { + "a": [ + Timestamp("2020-01-01 01:01:01.000001"), + Timestamp("2020-01-01 01:01:01.000001"), + ] + }, + dtype="M8[ns]", + ) + result = df.astype("timestamp[ns][pyarrow]") + if using_copy_on_write: + assert not result._mgr._has_no_reference(0) + if pa_version_under12p0: + assert not np.shares_memory( + get_array(df, "a"), get_array(result, "a")._pa_array + ) + else: + assert np.shares_memory( + get_array(df, "a"), get_array(result, "a")._pa_array + ) + + +def test_convert_dtypes_infer_objects(using_copy_on_write): + ser = Series(["a", "b", "c"]) + ser_orig = ser.copy() + result = ser.convert_dtypes( + convert_integer=False, + convert_boolean=False, + convert_floating=False, + convert_string=False, + ) + + if using_copy_on_write: + assert np.shares_memory(get_array(ser), get_array(result)) + else: + assert not np.shares_memory(get_array(ser), get_array(result)) + + result.iloc[0] = "x" + tm.assert_series_equal(ser, ser_orig) + + +def test_convert_dtypes(using_copy_on_write): + df = DataFrame({"a": ["a", "b"], "b": [1, 2], "c": [1.5, 2.5], "d": [True, False]}) + df_orig = df.copy() + df2 = df.convert_dtypes() + + if using_copy_on_write: + assert np.shares_memory(get_array(df2, "a"), get_array(df, "a")) + assert np.shares_memory(get_array(df2, "d"), get_array(df, "d")) + assert np.shares_memory(get_array(df2, "b"), get_array(df, "b")) + assert np.shares_memory(get_array(df2, "c"), get_array(df, "c")) + else: + assert not np.shares_memory(get_array(df2, "a"), get_array(df, "a")) + assert not np.shares_memory(get_array(df2, "b"), get_array(df, "b")) + assert not np.shares_memory(get_array(df2, "c"), get_array(df, "c")) + assert not np.shares_memory(get_array(df2, "d"), get_array(df, "d")) + + df2.iloc[0, 0] = "x" + tm.assert_frame_equal(df, df_orig) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/copy_view/test_clip.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/copy_view/test_clip.py new file mode 100644 index 0000000000000000000000000000000000000000..7c87646424e2faf46b740692b007013fef1cfc75 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/copy_view/test_clip.py @@ -0,0 +1,101 @@ +import numpy as np + +from pandas import ( + DataFrame, + option_context, +) +import pandas._testing as tm +from pandas.tests.copy_view.util import get_array + + +def test_clip_inplace_reference(using_copy_on_write, warn_copy_on_write): + df = DataFrame({"a": [1.5, 2, 3]}) + df_copy = df.copy() + arr_a = get_array(df, "a") + view = df[:] + if warn_copy_on_write: + with tm.assert_cow_warning(): + df.clip(lower=2, inplace=True) + else: + df.clip(lower=2, inplace=True) + + if using_copy_on_write: + assert not np.shares_memory(get_array(df, "a"), arr_a) + assert df._mgr._has_no_reference(0) + assert view._mgr._has_no_reference(0) + tm.assert_frame_equal(df_copy, view) + else: + assert np.shares_memory(get_array(df, "a"), arr_a) + + +def test_clip_inplace_reference_no_op(using_copy_on_write): + df = DataFrame({"a": [1.5, 2, 3]}) + df_copy = df.copy() + arr_a = get_array(df, "a") + view = df[:] + df.clip(lower=0, inplace=True) + + assert np.shares_memory(get_array(df, "a"), arr_a) + + if using_copy_on_write: + assert not df._mgr._has_no_reference(0) + assert not view._mgr._has_no_reference(0) + tm.assert_frame_equal(df_copy, view) + + +def test_clip_inplace(using_copy_on_write): + df = DataFrame({"a": [1.5, 2, 3]}) + arr_a = get_array(df, "a") + df.clip(lower=2, inplace=True) + + assert np.shares_memory(get_array(df, "a"), arr_a) + + if using_copy_on_write: + assert df._mgr._has_no_reference(0) + + +def test_clip(using_copy_on_write): + df = DataFrame({"a": [1.5, 2, 3]}) + df_orig = df.copy() + df2 = df.clip(lower=2) + + assert not np.shares_memory(get_array(df2, "a"), get_array(df, "a")) + + if using_copy_on_write: + assert df._mgr._has_no_reference(0) + tm.assert_frame_equal(df_orig, df) + + +def test_clip_no_op(using_copy_on_write): + df = DataFrame({"a": [1.5, 2, 3]}) + df2 = df.clip(lower=0) + + if using_copy_on_write: + assert not df._mgr._has_no_reference(0) + assert np.shares_memory(get_array(df2, "a"), get_array(df, "a")) + else: + assert not np.shares_memory(get_array(df2, "a"), get_array(df, "a")) + + +def test_clip_chained_inplace(using_copy_on_write): + df = DataFrame({"a": [1, 4, 2], "b": 1}) + df_orig = df.copy() + if using_copy_on_write: + with tm.raises_chained_assignment_error(): + df["a"].clip(1, 2, inplace=True) + tm.assert_frame_equal(df, df_orig) + + with tm.raises_chained_assignment_error(): + df[["a"]].clip(1, 2, inplace=True) + tm.assert_frame_equal(df, df_orig) + else: + with tm.assert_produces_warning(FutureWarning, match="inplace method"): + df["a"].clip(1, 2, inplace=True) + + with tm.assert_produces_warning(None): + with option_context("mode.chained_assignment", None): + df[["a"]].clip(1, 2, inplace=True) + + with tm.assert_produces_warning(None): + with option_context("mode.chained_assignment", None): + df[df["a"] > 1].clip(1, 2, inplace=True) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/copy_view/test_constructors.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/copy_view/test_constructors.py new file mode 100644 index 0000000000000000000000000000000000000000..1aa458a62502890f56d9e00af02dd4d7b777a5bb --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/copy_view/test_constructors.py @@ -0,0 +1,382 @@ +import numpy as np +import pytest + +import pandas as pd +from pandas import ( + DataFrame, + DatetimeIndex, + Index, + Period, + PeriodIndex, + Series, + Timedelta, + TimedeltaIndex, + Timestamp, +) +import pandas._testing as tm +from pandas.tests.copy_view.util import get_array + +# ----------------------------------------------------------------------------- +# Copy/view behaviour for Series / DataFrame constructors + + +@pytest.mark.parametrize("dtype", [None, "int64"]) +def test_series_from_series(dtype, using_copy_on_write, warn_copy_on_write): + # Case: constructing a Series from another Series object follows CoW rules: + # a new object is returned and thus mutations are not propagated + ser = Series([1, 2, 3], name="name") + + # default is copy=False -> new Series is a shallow copy / view of original + result = Series(ser, dtype=dtype) + + # the shallow copy still shares memory + assert np.shares_memory(get_array(ser), get_array(result)) + + if using_copy_on_write: + assert result._mgr.blocks[0].refs.has_reference() + + if using_copy_on_write: + # mutating new series copy doesn't mutate original + result.iloc[0] = 0 + assert ser.iloc[0] == 1 + # mutating triggered a copy-on-write -> no longer shares memory + assert not np.shares_memory(get_array(ser), get_array(result)) + else: + # mutating shallow copy does mutate original + with tm.assert_cow_warning(warn_copy_on_write): + result.iloc[0] = 0 + assert ser.iloc[0] == 0 + # and still shares memory + assert np.shares_memory(get_array(ser), get_array(result)) + + # the same when modifying the parent + result = Series(ser, dtype=dtype) + + if using_copy_on_write: + # mutating original doesn't mutate new series + ser.iloc[0] = 0 + assert result.iloc[0] == 1 + else: + # mutating original does mutate shallow copy + with tm.assert_cow_warning(warn_copy_on_write): + ser.iloc[0] = 0 + assert result.iloc[0] == 0 + + +def test_series_from_series_with_reindex(using_copy_on_write, warn_copy_on_write): + # Case: constructing a Series from another Series with specifying an index + # that potentially requires a reindex of the values + ser = Series([1, 2, 3], name="name") + + # passing an index that doesn't actually require a reindex of the values + # -> without CoW we get an actual mutating view + for index in [ + ser.index, + ser.index.copy(), + list(ser.index), + ser.index.rename("idx"), + ]: + result = Series(ser, index=index) + assert np.shares_memory(ser.values, result.values) + with tm.assert_cow_warning(warn_copy_on_write): + result.iloc[0] = 0 + if using_copy_on_write: + assert ser.iloc[0] == 1 + else: + assert ser.iloc[0] == 0 + + # ensure that if an actual reindex is needed, we don't have any refs + # (mutating the result wouldn't trigger CoW) + result = Series(ser, index=[0, 1, 2, 3]) + assert not np.shares_memory(ser.values, result.values) + if using_copy_on_write: + assert not result._mgr.blocks[0].refs.has_reference() + + +@pytest.mark.parametrize("fastpath", [False, True]) +@pytest.mark.parametrize("dtype", [None, "int64"]) +@pytest.mark.parametrize("idx", [None, pd.RangeIndex(start=0, stop=3, step=1)]) +@pytest.mark.parametrize( + "arr", [np.array([1, 2, 3], dtype="int64"), pd.array([1, 2, 3], dtype="Int64")] +) +def test_series_from_array(using_copy_on_write, idx, dtype, fastpath, arr): + if idx is None or dtype is not None: + fastpath = False + msg = "The 'fastpath' keyword in pd.Series is deprecated" + with tm.assert_produces_warning(DeprecationWarning, match=msg): + ser = Series(arr, dtype=dtype, index=idx, fastpath=fastpath) + ser_orig = ser.copy() + data = getattr(arr, "_data", arr) + if using_copy_on_write: + assert not np.shares_memory(get_array(ser), data) + else: + assert np.shares_memory(get_array(ser), data) + + arr[0] = 100 + if using_copy_on_write: + tm.assert_series_equal(ser, ser_orig) + else: + expected = Series([100, 2, 3], dtype=dtype if dtype is not None else arr.dtype) + tm.assert_series_equal(ser, expected) + + +@pytest.mark.parametrize("copy", [True, False, None]) +def test_series_from_array_different_dtype(using_copy_on_write, copy): + arr = np.array([1, 2, 3], dtype="int64") + ser = Series(arr, dtype="int32", copy=copy) + assert not np.shares_memory(get_array(ser), arr) + + +@pytest.mark.parametrize( + "idx", + [ + Index([1, 2]), + DatetimeIndex([Timestamp("2019-12-31"), Timestamp("2020-12-31")]), + PeriodIndex([Period("2019-12-31"), Period("2020-12-31")]), + TimedeltaIndex([Timedelta("1 days"), Timedelta("2 days")]), + ], +) +def test_series_from_index(using_copy_on_write, idx): + ser = Series(idx) + expected = idx.copy(deep=True) + if using_copy_on_write: + assert np.shares_memory(get_array(ser), get_array(idx)) + assert not ser._mgr._has_no_reference(0) + else: + assert not np.shares_memory(get_array(ser), get_array(idx)) + ser.iloc[0] = ser.iloc[1] + tm.assert_index_equal(idx, expected) + + +def test_series_from_index_different_dtypes(using_copy_on_write): + idx = Index([1, 2, 3], dtype="int64") + ser = Series(idx, dtype="int32") + assert not np.shares_memory(get_array(ser), get_array(idx)) + if using_copy_on_write: + assert ser._mgr._has_no_reference(0) + + +@pytest.mark.filterwarnings("ignore:Setting a value on a view:FutureWarning") +@pytest.mark.parametrize("fastpath", [False, True]) +@pytest.mark.parametrize("dtype", [None, "int64"]) +@pytest.mark.parametrize("idx", [None, pd.RangeIndex(start=0, stop=3, step=1)]) +def test_series_from_block_manager(using_copy_on_write, idx, dtype, fastpath): + ser = Series([1, 2, 3], dtype="int64") + ser_orig = ser.copy() + msg = "The 'fastpath' keyword in pd.Series is deprecated" + with tm.assert_produces_warning(DeprecationWarning, match=msg): + ser2 = Series(ser._mgr, dtype=dtype, fastpath=fastpath, index=idx) + assert np.shares_memory(get_array(ser), get_array(ser2)) + if using_copy_on_write: + assert not ser2._mgr._has_no_reference(0) + + ser2.iloc[0] = 100 + if using_copy_on_write: + tm.assert_series_equal(ser, ser_orig) + else: + expected = Series([100, 2, 3]) + tm.assert_series_equal(ser, expected) + + +def test_series_from_block_manager_different_dtype(using_copy_on_write): + ser = Series([1, 2, 3], dtype="int64") + msg = "Passing a SingleBlockManager to Series" + with tm.assert_produces_warning(DeprecationWarning, match=msg): + ser2 = Series(ser._mgr, dtype="int32") + assert not np.shares_memory(get_array(ser), get_array(ser2)) + if using_copy_on_write: + assert ser2._mgr._has_no_reference(0) + + +@pytest.mark.parametrize("use_mgr", [True, False]) +@pytest.mark.parametrize("columns", [None, ["a"]]) +def test_dataframe_constructor_mgr_or_df( + using_copy_on_write, warn_copy_on_write, columns, use_mgr +): + df = DataFrame({"a": [1, 2, 3]}) + df_orig = df.copy() + + if use_mgr: + data = df._mgr + warn = DeprecationWarning + else: + data = df + warn = None + msg = "Passing a BlockManager to DataFrame" + with tm.assert_produces_warning(warn, match=msg, check_stacklevel=False): + new_df = DataFrame(data) + + assert np.shares_memory(get_array(df, "a"), get_array(new_df, "a")) + with tm.assert_cow_warning(warn_copy_on_write and not use_mgr): + new_df.iloc[0] = 100 + + if using_copy_on_write: + assert not np.shares_memory(get_array(df, "a"), get_array(new_df, "a")) + tm.assert_frame_equal(df, df_orig) + else: + assert np.shares_memory(get_array(df, "a"), get_array(new_df, "a")) + tm.assert_frame_equal(df, new_df) + + +@pytest.mark.parametrize("dtype", [None, "int64", "Int64"]) +@pytest.mark.parametrize("index", [None, [0, 1, 2]]) +@pytest.mark.parametrize("columns", [None, ["a", "b"], ["a", "b", "c"]]) +def test_dataframe_from_dict_of_series( + request, using_copy_on_write, warn_copy_on_write, columns, index, dtype +): + # Case: constructing a DataFrame from Series objects with copy=False + # has to do a lazy following CoW rules + # (the default for DataFrame(dict) is still to copy to ensure consolidation) + s1 = Series([1, 2, 3]) + s2 = Series([4, 5, 6]) + s1_orig = s1.copy() + expected = DataFrame( + {"a": [1, 2, 3], "b": [4, 5, 6]}, index=index, columns=columns, dtype=dtype + ) + + result = DataFrame( + {"a": s1, "b": s2}, index=index, columns=columns, dtype=dtype, copy=False + ) + + # the shallow copy still shares memory + assert np.shares_memory(get_array(result, "a"), get_array(s1)) + + # mutating the new dataframe doesn't mutate original + with tm.assert_cow_warning(warn_copy_on_write): + result.iloc[0, 0] = 10 + if using_copy_on_write: + assert not np.shares_memory(get_array(result, "a"), get_array(s1)) + tm.assert_series_equal(s1, s1_orig) + else: + assert s1.iloc[0] == 10 + + # the same when modifying the parent series + s1 = Series([1, 2, 3]) + s2 = Series([4, 5, 6]) + result = DataFrame( + {"a": s1, "b": s2}, index=index, columns=columns, dtype=dtype, copy=False + ) + with tm.assert_cow_warning(warn_copy_on_write): + s1.iloc[0] = 10 + if using_copy_on_write: + assert not np.shares_memory(get_array(result, "a"), get_array(s1)) + tm.assert_frame_equal(result, expected) + else: + assert result.iloc[0, 0] == 10 + + +@pytest.mark.parametrize("dtype", [None, "int64"]) +def test_dataframe_from_dict_of_series_with_reindex(dtype): + # Case: constructing a DataFrame from Series objects with copy=False + # and passing an index that requires an actual (no-view) reindex -> need + # to ensure the result doesn't have refs set up to unnecessarily trigger + # a copy on write + s1 = Series([1, 2, 3]) + s2 = Series([4, 5, 6]) + df = DataFrame({"a": s1, "b": s2}, index=[1, 2, 3], dtype=dtype, copy=False) + + # df should own its memory, so mutating shouldn't trigger a copy + arr_before = get_array(df, "a") + assert not np.shares_memory(arr_before, get_array(s1)) + df.iloc[0, 0] = 100 + arr_after = get_array(df, "a") + assert np.shares_memory(arr_before, arr_after) + + +@pytest.mark.parametrize("cons", [Series, Index]) +@pytest.mark.parametrize( + "data, dtype", [([1, 2], None), ([1, 2], "int64"), (["a", "b"], None)] +) +def test_dataframe_from_series_or_index( + using_copy_on_write, warn_copy_on_write, data, dtype, cons +): + obj = cons(data, dtype=dtype) + obj_orig = obj.copy() + df = DataFrame(obj, dtype=dtype) + assert np.shares_memory(get_array(obj), get_array(df, 0)) + if using_copy_on_write: + assert not df._mgr._has_no_reference(0) + + with tm.assert_cow_warning(warn_copy_on_write): + df.iloc[0, 0] = data[-1] + if using_copy_on_write: + tm.assert_equal(obj, obj_orig) + + +@pytest.mark.parametrize("cons", [Series, Index]) +def test_dataframe_from_series_or_index_different_dtype(using_copy_on_write, cons): + obj = cons([1, 2], dtype="int64") + df = DataFrame(obj, dtype="int32") + assert not np.shares_memory(get_array(obj), get_array(df, 0)) + if using_copy_on_write: + assert df._mgr._has_no_reference(0) + + +def test_dataframe_from_series_infer_datetime(using_copy_on_write): + ser = Series([Timestamp("2019-12-31"), Timestamp("2020-12-31")], dtype=object) + with tm.assert_produces_warning(FutureWarning, match="Dtype inference"): + df = DataFrame(ser) + assert not np.shares_memory(get_array(ser), get_array(df, 0)) + if using_copy_on_write: + assert df._mgr._has_no_reference(0) + + +@pytest.mark.parametrize("index", [None, [0, 1, 2]]) +def test_dataframe_from_dict_of_series_with_dtype(index): + # Variant of above, but now passing a dtype that causes a copy + # -> need to ensure the result doesn't have refs set up to unnecessarily + # trigger a copy on write + s1 = Series([1.0, 2.0, 3.0]) + s2 = Series([4, 5, 6]) + df = DataFrame({"a": s1, "b": s2}, index=index, dtype="int64", copy=False) + + # df should own its memory, so mutating shouldn't trigger a copy + arr_before = get_array(df, "a") + assert not np.shares_memory(arr_before, get_array(s1)) + df.iloc[0, 0] = 100 + arr_after = get_array(df, "a") + assert np.shares_memory(arr_before, arr_after) + + +@pytest.mark.parametrize("copy", [False, None, True]) +def test_frame_from_numpy_array(using_copy_on_write, copy, using_array_manager): + arr = np.array([[1, 2], [3, 4]]) + df = DataFrame(arr, copy=copy) + + if ( + using_copy_on_write + and copy is not False + or copy is True + or (using_array_manager and copy is None) + ): + assert not np.shares_memory(get_array(df, 0), arr) + else: + assert np.shares_memory(get_array(df, 0), arr) + + +def test_dataframe_from_records_with_dataframe(using_copy_on_write, warn_copy_on_write): + df = DataFrame({"a": [1, 2, 3]}) + df_orig = df.copy() + with tm.assert_produces_warning(FutureWarning): + df2 = DataFrame.from_records(df) + if using_copy_on_write: + assert not df._mgr._has_no_reference(0) + assert np.shares_memory(get_array(df, "a"), get_array(df2, "a")) + with tm.assert_cow_warning(warn_copy_on_write): + df2.iloc[0, 0] = 100 + if using_copy_on_write: + tm.assert_frame_equal(df, df_orig) + else: + tm.assert_frame_equal(df, df2) + + +def test_frame_from_dict_of_index(using_copy_on_write): + idx = Index([1, 2, 3]) + expected = idx.copy(deep=True) + df = DataFrame({"a": idx}, copy=False) + assert np.shares_memory(get_array(df, "a"), idx._values) + if using_copy_on_write: + assert not df._mgr._has_no_reference(0) + + df.iloc[0, 0] = 100 + tm.assert_index_equal(idx, expected) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/copy_view/test_core_functionalities.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/copy_view/test_core_functionalities.py new file mode 100644 index 0000000000000000000000000000000000000000..8dc80c5cc0e0eadbe792e114d48593d95df17907 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/copy_view/test_core_functionalities.py @@ -0,0 +1,106 @@ +import numpy as np +import pytest + +from pandas import DataFrame +import pandas._testing as tm +from pandas.tests.copy_view.util import get_array + + +def test_assigning_to_same_variable_removes_references(using_copy_on_write): + df = DataFrame({"a": [1, 2, 3]}) + df = df.reset_index() + if using_copy_on_write: + assert df._mgr._has_no_reference(1) + arr = get_array(df, "a") + df.iloc[0, 1] = 100 # Write into a + + assert np.shares_memory(arr, get_array(df, "a")) + + +def test_setitem_dont_track_unnecessary_references(using_copy_on_write): + df = DataFrame({"a": [1, 2, 3], "b": 1, "c": 1}) + + df["b"] = 100 + arr = get_array(df, "a") + # We split the block in setitem, if we are not careful the new blocks will + # reference each other triggering a copy + df.iloc[0, 0] = 100 + assert np.shares_memory(arr, get_array(df, "a")) + + +def test_setitem_with_view_copies(using_copy_on_write, warn_copy_on_write): + df = DataFrame({"a": [1, 2, 3], "b": 1, "c": 1}) + view = df[:] + expected = df.copy() + + df["b"] = 100 + arr = get_array(df, "a") + with tm.assert_cow_warning(warn_copy_on_write): + df.iloc[0, 0] = 100 # Check that we correctly track reference + if using_copy_on_write: + assert not np.shares_memory(arr, get_array(df, "a")) + tm.assert_frame_equal(view, expected) + + +def test_setitem_with_view_invalidated_does_not_copy( + using_copy_on_write, warn_copy_on_write, request +): + df = DataFrame({"a": [1, 2, 3], "b": 1, "c": 1}) + view = df[:] + + df["b"] = 100 + arr = get_array(df, "a") + view = None # noqa: F841 + # TODO(CoW-warn) false positive? -> block gets split because of `df["b"] = 100` + # which introduces additional refs, even when those of `view` go out of scopes + with tm.assert_cow_warning(warn_copy_on_write): + df.iloc[0, 0] = 100 + if using_copy_on_write: + # Setitem split the block. Since the old block shared data with view + # all the new blocks are referencing view and each other. When view + # goes out of scope, they don't share data with any other block, + # so we should not trigger a copy + mark = pytest.mark.xfail( + reason="blk.delete does not track references correctly" + ) + request.applymarker(mark) + assert np.shares_memory(arr, get_array(df, "a")) + + +def test_out_of_scope(using_copy_on_write): + def func(): + df = DataFrame({"a": [1, 2], "b": 1.5, "c": 1}) + # create some subset + result = df[["a", "b"]] + return result + + result = func() + if using_copy_on_write: + assert not result._mgr.blocks[0].refs.has_reference() + assert not result._mgr.blocks[1].refs.has_reference() + + +def test_delete(using_copy_on_write): + df = DataFrame( + np.random.default_rng(2).standard_normal((4, 3)), columns=["a", "b", "c"] + ) + del df["b"] + if using_copy_on_write: + assert not df._mgr.blocks[0].refs.has_reference() + assert not df._mgr.blocks[1].refs.has_reference() + + df = df[["a"]] + if using_copy_on_write: + assert not df._mgr.blocks[0].refs.has_reference() + + +def test_delete_reference(using_copy_on_write): + df = DataFrame( + np.random.default_rng(2).standard_normal((4, 3)), columns=["a", "b", "c"] + ) + x = df[:] + del df["b"] + if using_copy_on_write: + assert df._mgr.blocks[0].refs.has_reference() + assert df._mgr.blocks[1].refs.has_reference() + assert x._mgr.blocks[0].refs.has_reference() diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/copy_view/test_indexing.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/copy_view/test_indexing.py new file mode 100644 index 0000000000000000000000000000000000000000..479fa148f994a74eb205e3fa19ba957504744a54 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/copy_view/test_indexing.py @@ -0,0 +1,1266 @@ +import numpy as np +import pytest + +from pandas.errors import SettingWithCopyWarning + +from pandas.core.dtypes.common import is_float_dtype + +import pandas as pd +from pandas import ( + DataFrame, + Series, +) +import pandas._testing as tm +from pandas.tests.copy_view.util import get_array + + +@pytest.fixture(params=["numpy", "nullable"]) +def backend(request): + if request.param == "numpy": + + def make_dataframe(*args, **kwargs): + return DataFrame(*args, **kwargs) + + def make_series(*args, **kwargs): + return Series(*args, **kwargs) + + elif request.param == "nullable": + + def make_dataframe(*args, **kwargs): + df = DataFrame(*args, **kwargs) + df_nullable = df.convert_dtypes() + # convert_dtypes will try to cast float to int if there is no loss in + # precision -> undo that change + for col in df.columns: + if is_float_dtype(df[col].dtype) and not is_float_dtype( + df_nullable[col].dtype + ): + df_nullable[col] = df_nullable[col].astype("Float64") + # copy final result to ensure we start with a fully self-owning DataFrame + return df_nullable.copy() + + def make_series(*args, **kwargs): + ser = Series(*args, **kwargs) + return ser.convert_dtypes().copy() + + return request.param, make_dataframe, make_series + + +# ----------------------------------------------------------------------------- +# Indexing operations taking subset + modifying the subset/parent + + +def test_subset_column_selection(backend, using_copy_on_write): + # Case: taking a subset of the columns of a DataFrame + # + afterwards modifying the subset + _, DataFrame, _ = backend + df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [0.1, 0.2, 0.3]}) + df_orig = df.copy() + + subset = df[["a", "c"]] + + if using_copy_on_write: + # the subset shares memory ... + assert np.shares_memory(get_array(subset, "a"), get_array(df, "a")) + # ... but uses CoW when being modified + subset.iloc[0, 0] = 0 + else: + assert not np.shares_memory(get_array(subset, "a"), get_array(df, "a")) + # INFO this no longer raise warning since pandas 1.4 + # with pd.option_context("chained_assignment", "warn"): + # with tm.assert_produces_warning(SettingWithCopyWarning): + subset.iloc[0, 0] = 0 + + assert not np.shares_memory(get_array(subset, "a"), get_array(df, "a")) + + expected = DataFrame({"a": [0, 2, 3], "c": [0.1, 0.2, 0.3]}) + tm.assert_frame_equal(subset, expected) + tm.assert_frame_equal(df, df_orig) + + +def test_subset_column_selection_modify_parent(backend, using_copy_on_write): + # Case: taking a subset of the columns of a DataFrame + # + afterwards modifying the parent + _, DataFrame, _ = backend + df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [0.1, 0.2, 0.3]}) + + subset = df[["a", "c"]] + + if using_copy_on_write: + # the subset shares memory ... + assert np.shares_memory(get_array(subset, "a"), get_array(df, "a")) + # ... but parent uses CoW parent when it is modified + df.iloc[0, 0] = 0 + + assert not np.shares_memory(get_array(subset, "a"), get_array(df, "a")) + if using_copy_on_write: + # different column/block still shares memory + assert np.shares_memory(get_array(subset, "c"), get_array(df, "c")) + + expected = DataFrame({"a": [1, 2, 3], "c": [0.1, 0.2, 0.3]}) + tm.assert_frame_equal(subset, expected) + + +def test_subset_row_slice(backend, using_copy_on_write, warn_copy_on_write): + # Case: taking a subset of the rows of a DataFrame using a slice + # + afterwards modifying the subset + _, DataFrame, _ = backend + df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [0.1, 0.2, 0.3]}) + df_orig = df.copy() + + subset = df[1:3] + subset._mgr._verify_integrity() + + assert np.shares_memory(get_array(subset, "a"), get_array(df, "a")) + + if using_copy_on_write: + subset.iloc[0, 0] = 0 + assert not np.shares_memory(get_array(subset, "a"), get_array(df, "a")) + + else: + # INFO this no longer raise warning since pandas 1.4 + # with pd.option_context("chained_assignment", "warn"): + # with tm.assert_produces_warning(SettingWithCopyWarning): + with tm.assert_cow_warning(warn_copy_on_write): + subset.iloc[0, 0] = 0 + + subset._mgr._verify_integrity() + + expected = DataFrame({"a": [0, 3], "b": [5, 6], "c": [0.2, 0.3]}, index=range(1, 3)) + tm.assert_frame_equal(subset, expected) + if using_copy_on_write: + # original parent dataframe is not modified (CoW) + tm.assert_frame_equal(df, df_orig) + else: + # original parent dataframe is actually updated + df_orig.iloc[1, 0] = 0 + tm.assert_frame_equal(df, df_orig) + + +@pytest.mark.parametrize( + "dtype", ["int64", "float64"], ids=["single-block", "mixed-block"] +) +def test_subset_column_slice( + backend, using_copy_on_write, warn_copy_on_write, using_array_manager, dtype +): + # Case: taking a subset of the columns of a DataFrame using a slice + # + afterwards modifying the subset + dtype_backend, DataFrame, _ = backend + single_block = ( + dtype == "int64" and dtype_backend == "numpy" + ) and not using_array_manager + df = DataFrame( + {"a": [1, 2, 3], "b": [4, 5, 6], "c": np.array([7, 8, 9], dtype=dtype)} + ) + df_orig = df.copy() + + subset = df.iloc[:, 1:] + subset._mgr._verify_integrity() + + if using_copy_on_write: + assert np.shares_memory(get_array(subset, "b"), get_array(df, "b")) + + subset.iloc[0, 0] = 0 + assert not np.shares_memory(get_array(subset, "b"), get_array(df, "b")) + elif warn_copy_on_write: + with tm.assert_cow_warning(single_block): + subset.iloc[0, 0] = 0 + else: + # we only get a warning in case of a single block + warn = SettingWithCopyWarning if single_block else None + with pd.option_context("chained_assignment", "warn"): + with tm.assert_produces_warning(warn): + subset.iloc[0, 0] = 0 + + expected = DataFrame({"b": [0, 5, 6], "c": np.array([7, 8, 9], dtype=dtype)}) + tm.assert_frame_equal(subset, expected) + # original parent dataframe is not modified (also not for BlockManager case, + # except for single block) + if not using_copy_on_write and (using_array_manager or single_block): + df_orig.iloc[0, 1] = 0 + tm.assert_frame_equal(df, df_orig) + else: + tm.assert_frame_equal(df, df_orig) + + +@pytest.mark.parametrize( + "dtype", ["int64", "float64"], ids=["single-block", "mixed-block"] +) +@pytest.mark.parametrize( + "row_indexer", + [slice(1, 2), np.array([False, True, True]), np.array([1, 2])], + ids=["slice", "mask", "array"], +) +@pytest.mark.parametrize( + "column_indexer", + [slice("b", "c"), np.array([False, True, True]), ["b", "c"]], + ids=["slice", "mask", "array"], +) +def test_subset_loc_rows_columns( + backend, + dtype, + row_indexer, + column_indexer, + using_array_manager, + using_copy_on_write, + warn_copy_on_write, +): + # Case: taking a subset of the rows+columns of a DataFrame using .loc + # + afterwards modifying the subset + # Generic test for several combinations of row/column indexers, not all + # of those could actually return a view / need CoW (so this test is not + # checking memory sharing, only ensuring subsequent mutation doesn't + # affect the parent dataframe) + dtype_backend, DataFrame, _ = backend + df = DataFrame( + {"a": [1, 2, 3], "b": [4, 5, 6], "c": np.array([7, 8, 9], dtype=dtype)} + ) + df_orig = df.copy() + + subset = df.loc[row_indexer, column_indexer] + + # a few corner cases _do_ actually modify the parent (with both row and column + # slice, and in case of ArrayManager or BlockManager with single block) + mutate_parent = ( + isinstance(row_indexer, slice) + and isinstance(column_indexer, slice) + and ( + using_array_manager + or ( + dtype == "int64" + and dtype_backend == "numpy" + and not using_copy_on_write + ) + ) + ) + + # modifying the subset never modifies the parent + with tm.assert_cow_warning(warn_copy_on_write and mutate_parent): + subset.iloc[0, 0] = 0 + + expected = DataFrame( + {"b": [0, 6], "c": np.array([8, 9], dtype=dtype)}, index=range(1, 3) + ) + tm.assert_frame_equal(subset, expected) + if mutate_parent: + df_orig.iloc[1, 1] = 0 + tm.assert_frame_equal(df, df_orig) + + +@pytest.mark.parametrize( + "dtype", ["int64", "float64"], ids=["single-block", "mixed-block"] +) +@pytest.mark.parametrize( + "row_indexer", + [slice(1, 3), np.array([False, True, True]), np.array([1, 2])], + ids=["slice", "mask", "array"], +) +@pytest.mark.parametrize( + "column_indexer", + [slice(1, 3), np.array([False, True, True]), [1, 2]], + ids=["slice", "mask", "array"], +) +def test_subset_iloc_rows_columns( + backend, + dtype, + row_indexer, + column_indexer, + using_array_manager, + using_copy_on_write, + warn_copy_on_write, +): + # Case: taking a subset of the rows+columns of a DataFrame using .iloc + # + afterwards modifying the subset + # Generic test for several combinations of row/column indexers, not all + # of those could actually return a view / need CoW (so this test is not + # checking memory sharing, only ensuring subsequent mutation doesn't + # affect the parent dataframe) + dtype_backend, DataFrame, _ = backend + df = DataFrame( + {"a": [1, 2, 3], "b": [4, 5, 6], "c": np.array([7, 8, 9], dtype=dtype)} + ) + df_orig = df.copy() + + subset = df.iloc[row_indexer, column_indexer] + + # a few corner cases _do_ actually modify the parent (with both row and column + # slice, and in case of ArrayManager or BlockManager with single block) + mutate_parent = ( + isinstance(row_indexer, slice) + and isinstance(column_indexer, slice) + and ( + using_array_manager + or ( + dtype == "int64" + and dtype_backend == "numpy" + and not using_copy_on_write + ) + ) + ) + + # modifying the subset never modifies the parent + with tm.assert_cow_warning(warn_copy_on_write and mutate_parent): + subset.iloc[0, 0] = 0 + + expected = DataFrame( + {"b": [0, 6], "c": np.array([8, 9], dtype=dtype)}, index=range(1, 3) + ) + tm.assert_frame_equal(subset, expected) + if mutate_parent: + df_orig.iloc[1, 1] = 0 + tm.assert_frame_equal(df, df_orig) + + +@pytest.mark.parametrize( + "indexer", + [slice(0, 2), np.array([True, True, False]), np.array([0, 1])], + ids=["slice", "mask", "array"], +) +def test_subset_set_with_row_indexer( + backend, indexer_si, indexer, using_copy_on_write, warn_copy_on_write +): + # Case: setting values with a row indexer on a viewing subset + # subset[indexer] = value and subset.iloc[indexer] = value + _, DataFrame, _ = backend + df = DataFrame({"a": [1, 2, 3, 4], "b": [4, 5, 6, 7], "c": [0.1, 0.2, 0.3, 0.4]}) + df_orig = df.copy() + subset = df[1:4] + + if ( + indexer_si is tm.setitem + and isinstance(indexer, np.ndarray) + and indexer.dtype == "int" + ): + pytest.skip("setitem with labels selects on columns") + + if using_copy_on_write: + indexer_si(subset)[indexer] = 0 + elif warn_copy_on_write: + with tm.assert_cow_warning(): + indexer_si(subset)[indexer] = 0 + else: + # INFO iloc no longer raises warning since pandas 1.4 + warn = SettingWithCopyWarning if indexer_si is tm.setitem else None + with pd.option_context("chained_assignment", "warn"): + with tm.assert_produces_warning(warn): + indexer_si(subset)[indexer] = 0 + + expected = DataFrame( + {"a": [0, 0, 4], "b": [0, 0, 7], "c": [0.0, 0.0, 0.4]}, index=range(1, 4) + ) + tm.assert_frame_equal(subset, expected) + if using_copy_on_write: + # original parent dataframe is not modified (CoW) + tm.assert_frame_equal(df, df_orig) + else: + # original parent dataframe is actually updated + df_orig[1:3] = 0 + tm.assert_frame_equal(df, df_orig) + + +def test_subset_set_with_mask(backend, using_copy_on_write, warn_copy_on_write): + # Case: setting values with a mask on a viewing subset: subset[mask] = value + _, DataFrame, _ = backend + df = DataFrame({"a": [1, 2, 3, 4], "b": [4, 5, 6, 7], "c": [0.1, 0.2, 0.3, 0.4]}) + df_orig = df.copy() + subset = df[1:4] + + mask = subset > 3 + + if using_copy_on_write: + subset[mask] = 0 + elif warn_copy_on_write: + with tm.assert_cow_warning(): + subset[mask] = 0 + else: + with pd.option_context("chained_assignment", "warn"): + with tm.assert_produces_warning(SettingWithCopyWarning): + subset[mask] = 0 + + expected = DataFrame( + {"a": [2, 3, 0], "b": [0, 0, 0], "c": [0.20, 0.3, 0.4]}, index=range(1, 4) + ) + tm.assert_frame_equal(subset, expected) + if using_copy_on_write: + # original parent dataframe is not modified (CoW) + tm.assert_frame_equal(df, df_orig) + else: + # original parent dataframe is actually updated + df_orig.loc[3, "a"] = 0 + df_orig.loc[1:3, "b"] = 0 + tm.assert_frame_equal(df, df_orig) + + +def test_subset_set_column(backend, using_copy_on_write, warn_copy_on_write): + # Case: setting a single column on a viewing subset -> subset[col] = value + dtype_backend, DataFrame, _ = backend + df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [0.1, 0.2, 0.3]}) + df_orig = df.copy() + subset = df[1:3] + + if dtype_backend == "numpy": + arr = np.array([10, 11], dtype="int64") + else: + arr = pd.array([10, 11], dtype="Int64") + + if using_copy_on_write or warn_copy_on_write: + subset["a"] = arr + else: + with pd.option_context("chained_assignment", "warn"): + with tm.assert_produces_warning(SettingWithCopyWarning): + subset["a"] = arr + + subset._mgr._verify_integrity() + expected = DataFrame( + {"a": [10, 11], "b": [5, 6], "c": [0.2, 0.3]}, index=range(1, 3) + ) + tm.assert_frame_equal(subset, expected) + tm.assert_frame_equal(df, df_orig) + + +@pytest.mark.parametrize( + "dtype", ["int64", "float64"], ids=["single-block", "mixed-block"] +) +def test_subset_set_column_with_loc( + backend, using_copy_on_write, warn_copy_on_write, using_array_manager, dtype +): + # Case: setting a single column with loc on a viewing subset + # -> subset.loc[:, col] = value + _, DataFrame, _ = backend + df = DataFrame( + {"a": [1, 2, 3], "b": [4, 5, 6], "c": np.array([7, 8, 9], dtype=dtype)} + ) + df_orig = df.copy() + subset = df[1:3] + + if using_copy_on_write: + subset.loc[:, "a"] = np.array([10, 11], dtype="int64") + elif warn_copy_on_write: + with tm.assert_cow_warning(): + subset.loc[:, "a"] = np.array([10, 11], dtype="int64") + else: + with pd.option_context("chained_assignment", "warn"): + with tm.assert_produces_warning( + None, + raise_on_extra_warnings=not using_array_manager, + ): + subset.loc[:, "a"] = np.array([10, 11], dtype="int64") + + subset._mgr._verify_integrity() + expected = DataFrame( + {"a": [10, 11], "b": [5, 6], "c": np.array([8, 9], dtype=dtype)}, + index=range(1, 3), + ) + tm.assert_frame_equal(subset, expected) + if using_copy_on_write: + # original parent dataframe is not modified (CoW) + tm.assert_frame_equal(df, df_orig) + else: + # original parent dataframe is actually updated + df_orig.loc[1:3, "a"] = np.array([10, 11], dtype="int64") + tm.assert_frame_equal(df, df_orig) + + +def test_subset_set_column_with_loc2( + backend, using_copy_on_write, warn_copy_on_write, using_array_manager +): + # Case: setting a single column with loc on a viewing subset + # -> subset.loc[:, col] = value + # separate test for case of DataFrame of a single column -> takes a separate + # code path + _, DataFrame, _ = backend + df = DataFrame({"a": [1, 2, 3]}) + df_orig = df.copy() + subset = df[1:3] + + if using_copy_on_write: + subset.loc[:, "a"] = 0 + elif warn_copy_on_write: + with tm.assert_cow_warning(): + subset.loc[:, "a"] = 0 + else: + with pd.option_context("chained_assignment", "warn"): + with tm.assert_produces_warning( + None, + raise_on_extra_warnings=not using_array_manager, + ): + subset.loc[:, "a"] = 0 + + subset._mgr._verify_integrity() + expected = DataFrame({"a": [0, 0]}, index=range(1, 3)) + tm.assert_frame_equal(subset, expected) + if using_copy_on_write: + # original parent dataframe is not modified (CoW) + tm.assert_frame_equal(df, df_orig) + else: + # original parent dataframe is actually updated + df_orig.loc[1:3, "a"] = 0 + tm.assert_frame_equal(df, df_orig) + + +@pytest.mark.parametrize( + "dtype", ["int64", "float64"], ids=["single-block", "mixed-block"] +) +def test_subset_set_columns(backend, using_copy_on_write, warn_copy_on_write, dtype): + # Case: setting multiple columns on a viewing subset + # -> subset[[col1, col2]] = value + dtype_backend, DataFrame, _ = backend + df = DataFrame( + {"a": [1, 2, 3], "b": [4, 5, 6], "c": np.array([7, 8, 9], dtype=dtype)} + ) + df_orig = df.copy() + subset = df[1:3] + + if using_copy_on_write or warn_copy_on_write: + subset[["a", "c"]] = 0 + else: + with pd.option_context("chained_assignment", "warn"): + with tm.assert_produces_warning(SettingWithCopyWarning): + subset[["a", "c"]] = 0 + + subset._mgr._verify_integrity() + if using_copy_on_write: + # first and third column should certainly have no references anymore + assert all(subset._mgr._has_no_reference(i) for i in [0, 2]) + expected = DataFrame({"a": [0, 0], "b": [5, 6], "c": [0, 0]}, index=range(1, 3)) + if dtype_backend == "nullable": + # there is not yet a global option, so overriding a column by setting a scalar + # defaults to numpy dtype even if original column was nullable + expected["a"] = expected["a"].astype("int64") + expected["c"] = expected["c"].astype("int64") + + tm.assert_frame_equal(subset, expected) + tm.assert_frame_equal(df, df_orig) + + +@pytest.mark.parametrize( + "indexer", + [slice("a", "b"), np.array([True, True, False]), ["a", "b"]], + ids=["slice", "mask", "array"], +) +def test_subset_set_with_column_indexer( + backend, indexer, using_copy_on_write, warn_copy_on_write +): + # Case: setting multiple columns with a column indexer on a viewing subset + # -> subset.loc[:, [col1, col2]] = value + _, DataFrame, _ = backend + df = DataFrame({"a": [1, 2, 3], "b": [0.1, 0.2, 0.3], "c": [4, 5, 6]}) + df_orig = df.copy() + subset = df[1:3] + + if using_copy_on_write: + subset.loc[:, indexer] = 0 + elif warn_copy_on_write: + with tm.assert_cow_warning(): + subset.loc[:, indexer] = 0 + else: + with pd.option_context("chained_assignment", "warn"): + # As of 2.0, this setitem attempts (successfully) to set values + # inplace, so the assignment is not chained. + subset.loc[:, indexer] = 0 + + subset._mgr._verify_integrity() + expected = DataFrame({"a": [0, 0], "b": [0.0, 0.0], "c": [5, 6]}, index=range(1, 3)) + tm.assert_frame_equal(subset, expected) + if using_copy_on_write: + tm.assert_frame_equal(df, df_orig) + else: + # pre-2.0, in the mixed case with BlockManager, only column "a" + # would be mutated in the parent frame. this changed with the + # enforcement of GH#45333 + df_orig.loc[1:2, ["a", "b"]] = 0 + tm.assert_frame_equal(df, df_orig) + + +@pytest.mark.parametrize( + "method", + [ + lambda df: df[["a", "b"]][0:2], + lambda df: df[0:2][["a", "b"]], + lambda df: df[["a", "b"]].iloc[0:2], + lambda df: df[["a", "b"]].loc[0:1], + lambda df: df[0:2].iloc[:, 0:2], + lambda df: df[0:2].loc[:, "a":"b"], # type: ignore[misc] + ], + ids=[ + "row-getitem-slice", + "column-getitem", + "row-iloc-slice", + "row-loc-slice", + "column-iloc-slice", + "column-loc-slice", + ], +) +@pytest.mark.parametrize( + "dtype", ["int64", "float64"], ids=["single-block", "mixed-block"] +) +def test_subset_chained_getitem( + request, + backend, + method, + dtype, + using_copy_on_write, + using_array_manager, + warn_copy_on_write, +): + # Case: creating a subset using multiple, chained getitem calls using views + # still needs to guarantee proper CoW behaviour + _, DataFrame, _ = backend + df = DataFrame( + {"a": [1, 2, 3], "b": [4, 5, 6], "c": np.array([7, 8, 9], dtype=dtype)} + ) + df_orig = df.copy() + + # when not using CoW, it depends on whether we have a single block or not + # and whether we are slicing the columns -> in that case we have a view + test_callspec = request.node.callspec.id + if not using_array_manager: + subset_is_view = test_callspec in ( + "numpy-single-block-column-iloc-slice", + "numpy-single-block-column-loc-slice", + ) + else: + # with ArrayManager, it doesn't matter whether we have + # single vs mixed block or numpy vs nullable dtypes + subset_is_view = test_callspec.endswith( + ("column-iloc-slice", "column-loc-slice") + ) + + # modify subset -> don't modify parent + subset = method(df) + + with tm.assert_cow_warning(warn_copy_on_write and subset_is_view): + subset.iloc[0, 0] = 0 + if using_copy_on_write or (not subset_is_view): + tm.assert_frame_equal(df, df_orig) + else: + assert df.iloc[0, 0] == 0 + + # modify parent -> don't modify subset + subset = method(df) + with tm.assert_cow_warning(warn_copy_on_write and subset_is_view): + df.iloc[0, 0] = 0 + expected = DataFrame({"a": [1, 2], "b": [4, 5]}) + if using_copy_on_write or not subset_is_view: + tm.assert_frame_equal(subset, expected) + else: + assert subset.iloc[0, 0] == 0 + + +@pytest.mark.parametrize( + "dtype", ["int64", "float64"], ids=["single-block", "mixed-block"] +) +def test_subset_chained_getitem_column( + backend, dtype, using_copy_on_write, warn_copy_on_write +): + # Case: creating a subset using multiple, chained getitem calls using views + # still needs to guarantee proper CoW behaviour + dtype_backend, DataFrame, Series = backend + df = DataFrame( + {"a": [1, 2, 3], "b": [4, 5, 6], "c": np.array([7, 8, 9], dtype=dtype)} + ) + df_orig = df.copy() + + # modify subset -> don't modify parent + subset = df[:]["a"][0:2] + df._clear_item_cache() + with tm.assert_cow_warning(warn_copy_on_write): + subset.iloc[0] = 0 + if using_copy_on_write: + tm.assert_frame_equal(df, df_orig) + else: + assert df.iloc[0, 0] == 0 + + # modify parent -> don't modify subset + subset = df[:]["a"][0:2] + df._clear_item_cache() + with tm.assert_cow_warning(warn_copy_on_write): + df.iloc[0, 0] = 0 + expected = Series([1, 2], name="a") + if using_copy_on_write: + tm.assert_series_equal(subset, expected) + else: + assert subset.iloc[0] == 0 + + +@pytest.mark.parametrize( + "method", + [ + lambda s: s["a":"c"]["a":"b"], # type: ignore[misc] + lambda s: s.iloc[0:3].iloc[0:2], + lambda s: s.loc["a":"c"].loc["a":"b"], # type: ignore[misc] + lambda s: s.loc["a":"c"] # type: ignore[misc] + .iloc[0:3] + .iloc[0:2] + .loc["a":"b"] # type: ignore[misc] + .iloc[0:1], + ], + ids=["getitem", "iloc", "loc", "long-chain"], +) +def test_subset_chained_getitem_series( + backend, method, using_copy_on_write, warn_copy_on_write +): + # Case: creating a subset using multiple, chained getitem calls using views + # still needs to guarantee proper CoW behaviour + _, _, Series = backend + s = Series([1, 2, 3], index=["a", "b", "c"]) + s_orig = s.copy() + + # modify subset -> don't modify parent + subset = method(s) + with tm.assert_cow_warning(warn_copy_on_write): + subset.iloc[0] = 0 + if using_copy_on_write: + tm.assert_series_equal(s, s_orig) + else: + assert s.iloc[0] == 0 + + # modify parent -> don't modify subset + subset = s.iloc[0:3].iloc[0:2] + with tm.assert_cow_warning(warn_copy_on_write): + s.iloc[0] = 0 + expected = Series([1, 2], index=["a", "b"]) + if using_copy_on_write: + tm.assert_series_equal(subset, expected) + else: + assert subset.iloc[0] == 0 + + +def test_subset_chained_single_block_row( + using_copy_on_write, using_array_manager, warn_copy_on_write +): + # not parametrizing this for dtype backend, since this explicitly tests single block + df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [7, 8, 9]}) + df_orig = df.copy() + + # modify subset -> don't modify parent + subset = df[:].iloc[0].iloc[0:2] + with tm.assert_cow_warning(warn_copy_on_write): + subset.iloc[0] = 0 + if using_copy_on_write or using_array_manager: + tm.assert_frame_equal(df, df_orig) + else: + assert df.iloc[0, 0] == 0 + + # modify parent -> don't modify subset + subset = df[:].iloc[0].iloc[0:2] + with tm.assert_cow_warning(warn_copy_on_write): + df.iloc[0, 0] = 0 + expected = Series([1, 4], index=["a", "b"], name=0) + if using_copy_on_write or using_array_manager: + tm.assert_series_equal(subset, expected) + else: + assert subset.iloc[0] == 0 + + +@pytest.mark.parametrize( + "method", + [ + lambda df: df[:], + lambda df: df.loc[:, :], + lambda df: df.loc[:], + lambda df: df.iloc[:, :], + lambda df: df.iloc[:], + ], + ids=["getitem", "loc", "loc-rows", "iloc", "iloc-rows"], +) +def test_null_slice(backend, method, using_copy_on_write, warn_copy_on_write): + # Case: also all variants of indexing with a null slice (:) should return + # new objects to ensure we correctly use CoW for the results + dtype_backend, DataFrame, _ = backend + df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [7, 8, 9]}) + df_orig = df.copy() + + df2 = method(df) + + # we always return new objects (shallow copy), regardless of CoW or not + assert df2 is not df + + # and those trigger CoW when mutated + with tm.assert_cow_warning(warn_copy_on_write): + df2.iloc[0, 0] = 0 + if using_copy_on_write: + tm.assert_frame_equal(df, df_orig) + else: + assert df.iloc[0, 0] == 0 + + +@pytest.mark.parametrize( + "method", + [ + lambda s: s[:], + lambda s: s.loc[:], + lambda s: s.iloc[:], + ], + ids=["getitem", "loc", "iloc"], +) +def test_null_slice_series(backend, method, using_copy_on_write, warn_copy_on_write): + _, _, Series = backend + s = Series([1, 2, 3], index=["a", "b", "c"]) + s_orig = s.copy() + + s2 = method(s) + + # we always return new objects, regardless of CoW or not + assert s2 is not s + + # and those trigger CoW when mutated + with tm.assert_cow_warning(warn_copy_on_write): + s2.iloc[0] = 0 + if using_copy_on_write: + tm.assert_series_equal(s, s_orig) + else: + assert s.iloc[0] == 0 + + +# TODO add more tests modifying the parent + + +# ----------------------------------------------------------------------------- +# Series -- Indexing operations taking subset + modifying the subset/parent + + +def test_series_getitem_slice(backend, using_copy_on_write, warn_copy_on_write): + # Case: taking a slice of a Series + afterwards modifying the subset + _, _, Series = backend + s = Series([1, 2, 3], index=["a", "b", "c"]) + s_orig = s.copy() + + subset = s[:] + assert np.shares_memory(get_array(subset), get_array(s)) + + with tm.assert_cow_warning(warn_copy_on_write): + subset.iloc[0] = 0 + + if using_copy_on_write: + assert not np.shares_memory(get_array(subset), get_array(s)) + + expected = Series([0, 2, 3], index=["a", "b", "c"]) + tm.assert_series_equal(subset, expected) + + if using_copy_on_write: + # original parent series is not modified (CoW) + tm.assert_series_equal(s, s_orig) + else: + # original parent series is actually updated + assert s.iloc[0] == 0 + + +def test_series_getitem_ellipsis(using_copy_on_write, warn_copy_on_write): + # Case: taking a view of a Series using Ellipsis + afterwards modifying the subset + s = Series([1, 2, 3]) + s_orig = s.copy() + + subset = s[...] + assert np.shares_memory(get_array(subset), get_array(s)) + + with tm.assert_cow_warning(warn_copy_on_write): + subset.iloc[0] = 0 + + if using_copy_on_write: + assert not np.shares_memory(get_array(subset), get_array(s)) + + expected = Series([0, 2, 3]) + tm.assert_series_equal(subset, expected) + + if using_copy_on_write: + # original parent series is not modified (CoW) + tm.assert_series_equal(s, s_orig) + else: + # original parent series is actually updated + assert s.iloc[0] == 0 + + +@pytest.mark.parametrize( + "indexer", + [slice(0, 2), np.array([True, True, False]), np.array([0, 1])], + ids=["slice", "mask", "array"], +) +def test_series_subset_set_with_indexer( + backend, indexer_si, indexer, using_copy_on_write, warn_copy_on_write +): + # Case: setting values in a viewing Series with an indexer + _, _, Series = backend + s = Series([1, 2, 3], index=["a", "b", "c"]) + s_orig = s.copy() + subset = s[:] + + warn = None + msg = "Series.__setitem__ treating keys as positions is deprecated" + if ( + indexer_si is tm.setitem + and isinstance(indexer, np.ndarray) + and indexer.dtype.kind == "i" + ): + warn = FutureWarning + if warn_copy_on_write: + with tm.assert_cow_warning(raise_on_extra_warnings=warn is not None): + indexer_si(subset)[indexer] = 0 + else: + with tm.assert_produces_warning(warn, match=msg): + indexer_si(subset)[indexer] = 0 + expected = Series([0, 0, 3], index=["a", "b", "c"]) + tm.assert_series_equal(subset, expected) + + if using_copy_on_write: + tm.assert_series_equal(s, s_orig) + else: + tm.assert_series_equal(s, expected) + + +# ----------------------------------------------------------------------------- +# del operator + + +def test_del_frame(backend, using_copy_on_write, warn_copy_on_write): + # Case: deleting a column with `del` on a viewing child dataframe should + # not modify parent + update the references + dtype_backend, DataFrame, _ = backend + df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [0.1, 0.2, 0.3]}) + df_orig = df.copy() + df2 = df[:] + + assert np.shares_memory(get_array(df, "a"), get_array(df2, "a")) + + del df2["b"] + + assert np.shares_memory(get_array(df, "a"), get_array(df2, "a")) + tm.assert_frame_equal(df, df_orig) + tm.assert_frame_equal(df2, df_orig[["a", "c"]]) + df2._mgr._verify_integrity() + + with tm.assert_cow_warning(warn_copy_on_write and dtype_backend == "numpy"): + df.loc[0, "b"] = 200 + assert np.shares_memory(get_array(df, "a"), get_array(df2, "a")) + df_orig = df.copy() + + with tm.assert_cow_warning(warn_copy_on_write): + df2.loc[0, "a"] = 100 + if using_copy_on_write: + # modifying child after deleting a column still doesn't update parent + tm.assert_frame_equal(df, df_orig) + else: + assert df.loc[0, "a"] == 100 + + +def test_del_series(backend): + _, _, Series = backend + s = Series([1, 2, 3], index=["a", "b", "c"]) + s_orig = s.copy() + s2 = s[:] + + assert np.shares_memory(get_array(s), get_array(s2)) + + del s2["a"] + + assert not np.shares_memory(get_array(s), get_array(s2)) + tm.assert_series_equal(s, s_orig) + tm.assert_series_equal(s2, s_orig[["b", "c"]]) + + # modifying s2 doesn't need copy on write (due to `del`, s2 is backed by new array) + values = s2.values + s2.loc["b"] = 100 + assert values[0] == 100 + + +# ----------------------------------------------------------------------------- +# Accessing column as Series + + +def test_column_as_series( + backend, using_copy_on_write, warn_copy_on_write, using_array_manager +): + # Case: selecting a single column now also uses Copy-on-Write + dtype_backend, DataFrame, Series = backend + df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [0.1, 0.2, 0.3]}) + df_orig = df.copy() + + s = df["a"] + + assert np.shares_memory(get_array(s, "a"), get_array(df, "a")) + + if using_copy_on_write or using_array_manager: + s[0] = 0 + else: + if warn_copy_on_write: + with tm.assert_cow_warning(): + s[0] = 0 + else: + warn = SettingWithCopyWarning if dtype_backend == "numpy" else None + with pd.option_context("chained_assignment", "warn"): + with tm.assert_produces_warning(warn): + s[0] = 0 + + expected = Series([0, 2, 3], name="a") + tm.assert_series_equal(s, expected) + if using_copy_on_write: + # assert not np.shares_memory(s.values, get_array(df, "a")) + tm.assert_frame_equal(df, df_orig) + # ensure cached series on getitem is not the changed series + tm.assert_series_equal(df["a"], df_orig["a"]) + else: + df_orig.iloc[0, 0] = 0 + tm.assert_frame_equal(df, df_orig) + + +def test_column_as_series_set_with_upcast( + backend, using_copy_on_write, using_array_manager, warn_copy_on_write +): + # Case: selecting a single column now also uses Copy-on-Write -> when + # setting a value causes an upcast, we don't need to update the parent + # DataFrame through the cache mechanism + dtype_backend, DataFrame, Series = backend + df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [0.1, 0.2, 0.3]}) + df_orig = df.copy() + + s = df["a"] + if dtype_backend == "nullable": + with tm.assert_cow_warning(warn_copy_on_write): + with pytest.raises(TypeError, match="Invalid value"): + s[0] = "foo" + expected = Series([1, 2, 3], name="a") + elif using_copy_on_write or warn_copy_on_write or using_array_manager: + # TODO(CoW-warn) assert the FutureWarning for CoW is also raised + with tm.assert_produces_warning(FutureWarning, match="incompatible dtype"): + s[0] = "foo" + expected = Series(["foo", 2, 3], dtype=object, name="a") + else: + with pd.option_context("chained_assignment", "warn"): + msg = "|".join( + [ + "A value is trying to be set on a copy of a slice from a DataFrame", + "Setting an item of incompatible dtype is deprecated", + ] + ) + with tm.assert_produces_warning( + (SettingWithCopyWarning, FutureWarning), match=msg + ): + s[0] = "foo" + expected = Series(["foo", 2, 3], dtype=object, name="a") + + tm.assert_series_equal(s, expected) + if using_copy_on_write: + tm.assert_frame_equal(df, df_orig) + # ensure cached series on getitem is not the changed series + tm.assert_series_equal(df["a"], df_orig["a"]) + else: + df_orig["a"] = expected + tm.assert_frame_equal(df, df_orig) + + +@pytest.mark.parametrize( + "method", + [ + lambda df: df["a"], + lambda df: df.loc[:, "a"], + lambda df: df.iloc[:, 0], + ], + ids=["getitem", "loc", "iloc"], +) +def test_column_as_series_no_item_cache( + request, + backend, + method, + using_copy_on_write, + warn_copy_on_write, + using_array_manager, +): + # Case: selecting a single column (which now also uses Copy-on-Write to protect + # the view) should always give a new object (i.e. not make use of a cache) + dtype_backend, DataFrame, _ = backend + df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [0.1, 0.2, 0.3]}) + df_orig = df.copy() + + s1 = method(df) + s2 = method(df) + + is_iloc = "iloc" in request.node.name + if using_copy_on_write or warn_copy_on_write or is_iloc: + assert s1 is not s2 + else: + assert s1 is s2 + + if using_copy_on_write or using_array_manager: + s1.iloc[0] = 0 + elif warn_copy_on_write: + with tm.assert_cow_warning(): + s1.iloc[0] = 0 + else: + warn = SettingWithCopyWarning if dtype_backend == "numpy" else None + with pd.option_context("chained_assignment", "warn"): + with tm.assert_produces_warning(warn): + s1.iloc[0] = 0 + + if using_copy_on_write: + tm.assert_series_equal(s2, df_orig["a"]) + tm.assert_frame_equal(df, df_orig) + else: + assert s2.iloc[0] == 0 + + +# TODO add tests for other indexing methods on the Series + + +def test_dataframe_add_column_from_series(backend, using_copy_on_write): + # Case: adding a new column to a DataFrame from an existing column/series + # -> delays copy under CoW + _, DataFrame, Series = backend + df = DataFrame({"a": [1, 2, 3], "b": [0.1, 0.2, 0.3]}) + + s = Series([10, 11, 12]) + df["new"] = s + if using_copy_on_write: + assert np.shares_memory(get_array(df, "new"), get_array(s)) + else: + assert not np.shares_memory(get_array(df, "new"), get_array(s)) + + # editing series -> doesn't modify column in frame + s[0] = 0 + expected = DataFrame({"a": [1, 2, 3], "b": [0.1, 0.2, 0.3], "new": [10, 11, 12]}) + tm.assert_frame_equal(df, expected) + + +@pytest.mark.parametrize("val", [100, "a"]) +@pytest.mark.parametrize( + "indexer_func, indexer", + [ + (tm.loc, (0, "a")), + (tm.iloc, (0, 0)), + (tm.loc, ([0], "a")), + (tm.iloc, ([0], 0)), + (tm.loc, (slice(None), "a")), + (tm.iloc, (slice(None), 0)), + ], +) +@pytest.mark.parametrize( + "col", [[0.1, 0.2, 0.3], [7, 8, 9]], ids=["mixed-block", "single-block"] +) +def test_set_value_copy_only_necessary_column( + using_copy_on_write, warn_copy_on_write, indexer_func, indexer, val, col +): + # When setting inplace, only copy column that is modified instead of the whole + # block (by splitting the block) + df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": col}) + df_orig = df.copy() + view = df[:] + + if val == "a" and not warn_copy_on_write: + with tm.assert_produces_warning( + FutureWarning, match="Setting an item of incompatible dtype is deprecated" + ): + indexer_func(df)[indexer] = val + if val == "a" and warn_copy_on_write: + with tm.assert_produces_warning( + FutureWarning, match="incompatible dtype|Setting a value on a view" + ): + indexer_func(df)[indexer] = val + else: + with tm.assert_cow_warning(warn_copy_on_write and val == 100): + indexer_func(df)[indexer] = val + + if using_copy_on_write: + assert np.shares_memory(get_array(df, "b"), get_array(view, "b")) + assert not np.shares_memory(get_array(df, "a"), get_array(view, "a")) + tm.assert_frame_equal(view, df_orig) + else: + assert np.shares_memory(get_array(df, "c"), get_array(view, "c")) + if val == "a": + assert not np.shares_memory(get_array(df, "a"), get_array(view, "a")) + else: + assert np.shares_memory(get_array(df, "a"), get_array(view, "a")) + + +def test_series_midx_slice(using_copy_on_write, warn_copy_on_write): + ser = Series([1, 2, 3], index=pd.MultiIndex.from_arrays([[1, 1, 2], [3, 4, 5]])) + ser_orig = ser.copy() + result = ser[1] + assert np.shares_memory(get_array(ser), get_array(result)) + with tm.assert_cow_warning(warn_copy_on_write): + result.iloc[0] = 100 + if using_copy_on_write: + tm.assert_series_equal(ser, ser_orig) + else: + expected = Series( + [100, 2, 3], index=pd.MultiIndex.from_arrays([[1, 1, 2], [3, 4, 5]]) + ) + tm.assert_series_equal(ser, expected) + + +def test_getitem_midx_slice( + using_copy_on_write, warn_copy_on_write, using_array_manager +): + df = DataFrame({("a", "x"): [1, 2], ("a", "y"): 1, ("b", "x"): 2}) + df_orig = df.copy() + new_df = df[("a",)] + + if using_copy_on_write: + assert not new_df._mgr._has_no_reference(0) + + if not using_array_manager: + assert np.shares_memory(get_array(df, ("a", "x")), get_array(new_df, "x")) + if using_copy_on_write: + new_df.iloc[0, 0] = 100 + tm.assert_frame_equal(df_orig, df) + else: + if warn_copy_on_write: + with tm.assert_cow_warning(): + new_df.iloc[0, 0] = 100 + else: + with pd.option_context("chained_assignment", "warn"): + with tm.assert_produces_warning(SettingWithCopyWarning): + new_df.iloc[0, 0] = 100 + assert df.iloc[0, 0] == 100 + + +def test_series_midx_tuples_slice(using_copy_on_write, warn_copy_on_write): + ser = Series( + [1, 2, 3], + index=pd.MultiIndex.from_tuples([((1, 2), 3), ((1, 2), 4), ((2, 3), 4)]), + ) + result = ser[(1, 2)] + assert np.shares_memory(get_array(ser), get_array(result)) + with tm.assert_cow_warning(warn_copy_on_write): + result.iloc[0] = 100 + if using_copy_on_write: + expected = Series( + [1, 2, 3], + index=pd.MultiIndex.from_tuples([((1, 2), 3), ((1, 2), 4), ((2, 3), 4)]), + ) + tm.assert_series_equal(ser, expected) + + +def test_midx_read_only_bool_indexer(): + # GH#56635 + def mklbl(prefix, n): + return [f"{prefix}{i}" for i in range(n)] + + idx = pd.MultiIndex.from_product( + [mklbl("A", 4), mklbl("B", 2), mklbl("C", 4), mklbl("D", 2)] + ) + cols = pd.MultiIndex.from_tuples( + [("a", "foo"), ("a", "bar"), ("b", "foo"), ("b", "bah")], names=["lvl0", "lvl1"] + ) + df = DataFrame(1, index=idx, columns=cols).sort_index().sort_index(axis=1) + + mask = df[("a", "foo")] == 1 + expected_mask = mask.copy() + result = df.loc[pd.IndexSlice[mask, :, ["C1", "C3"]], :] + expected = df.loc[pd.IndexSlice[:, :, ["C1", "C3"]], :] + tm.assert_frame_equal(result, expected) + tm.assert_series_equal(mask, expected_mask) + + +def test_loc_enlarging_with_dataframe(using_copy_on_write): + df = DataFrame({"a": [1, 2, 3]}) + rhs = DataFrame({"b": [1, 2, 3], "c": [4, 5, 6]}) + rhs_orig = rhs.copy() + df.loc[:, ["b", "c"]] = rhs + if using_copy_on_write: + assert np.shares_memory(get_array(df, "b"), get_array(rhs, "b")) + assert np.shares_memory(get_array(df, "c"), get_array(rhs, "c")) + assert not df._mgr._has_no_reference(1) + else: + assert not np.shares_memory(get_array(df, "b"), get_array(rhs, "b")) + + df.iloc[0, 1] = 100 + tm.assert_frame_equal(rhs, rhs_orig) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/copy_view/test_internals.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/copy_view/test_internals.py new file mode 100644 index 0000000000000000000000000000000000000000..a727331307d7e9086144aa8d27f70ffa83973620 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/copy_view/test_internals.py @@ -0,0 +1,151 @@ +import numpy as np +import pytest + +import pandas.util._test_decorators as td + +import pandas as pd +from pandas import DataFrame +import pandas._testing as tm +from pandas.tests.copy_view.util import get_array + + +@td.skip_array_manager_invalid_test +def test_consolidate(using_copy_on_write): + # create unconsolidated DataFrame + df = DataFrame({"a": [1, 2, 3], "b": [0.1, 0.2, 0.3]}) + df["c"] = [4, 5, 6] + + # take a viewing subset + subset = df[:] + + # each block of subset references a block of df + assert all(blk.refs.has_reference() for blk in subset._mgr.blocks) + + # consolidate the two int64 blocks + subset._consolidate_inplace() + + # the float64 block still references the parent one because it still a view + assert subset._mgr.blocks[0].refs.has_reference() + # equivalent of assert np.shares_memory(df["b"].values, subset["b"].values) + # but avoids caching df["b"] + assert np.shares_memory(get_array(df, "b"), get_array(subset, "b")) + + # the new consolidated int64 block does not reference another + assert not subset._mgr.blocks[1].refs.has_reference() + + # the parent dataframe now also only is linked for the float column + assert not df._mgr.blocks[0].refs.has_reference() + assert df._mgr.blocks[1].refs.has_reference() + assert not df._mgr.blocks[2].refs.has_reference() + + # and modifying subset still doesn't modify parent + if using_copy_on_write: + subset.iloc[0, 1] = 0.0 + assert not df._mgr.blocks[1].refs.has_reference() + assert df.loc[0, "b"] == 0.1 + + +@pytest.mark.single_cpu +@td.skip_array_manager_invalid_test +def test_switch_options(): + # ensure we can switch the value of the option within one session + # (assuming data is constructed after switching) + + # using the option_context to ensure we set back to global option value + # after running the test + with pd.option_context("mode.copy_on_write", False): + df = DataFrame({"a": [1, 2, 3], "b": [0.1, 0.2, 0.3]}) + subset = df[:] + subset.iloc[0, 0] = 0 + # df updated with CoW disabled + assert df.iloc[0, 0] == 0 + + pd.options.mode.copy_on_write = True + df = DataFrame({"a": [1, 2, 3], "b": [0.1, 0.2, 0.3]}) + subset = df[:] + subset.iloc[0, 0] = 0 + # df not updated with CoW enabled + assert df.iloc[0, 0] == 1 + + pd.options.mode.copy_on_write = False + df = DataFrame({"a": [1, 2, 3], "b": [0.1, 0.2, 0.3]}) + subset = df[:] + subset.iloc[0, 0] = 0 + # df updated with CoW disabled + assert df.iloc[0, 0] == 0 + + +@td.skip_array_manager_invalid_test +@pytest.mark.parametrize("dtype", [np.intp, np.int8]) +@pytest.mark.parametrize( + "locs, arr", + [ + ([0], np.array([-1, -2, -3])), + ([1], np.array([-1, -2, -3])), + ([5], np.array([-1, -2, -3])), + ([0, 1], np.array([[-1, -2, -3], [-4, -5, -6]]).T), + ([0, 2], np.array([[-1, -2, -3], [-4, -5, -6]]).T), + ([0, 1, 2], np.array([[-1, -2, -3], [-4, -5, -6], [-4, -5, -6]]).T), + ([1, 2], np.array([[-1, -2, -3], [-4, -5, -6]]).T), + ([1, 3], np.array([[-1, -2, -3], [-4, -5, -6]]).T), + ([1, 3], np.array([[-1, -2, -3], [-4, -5, -6]]).T), + ], +) +def test_iset_splits_blocks_inplace(using_copy_on_write, locs, arr, dtype): + # Nothing currently calls iset with + # more than 1 loc with inplace=True (only happens with inplace=False) + # but ensure that it works + df = DataFrame( + { + "a": [1, 2, 3], + "b": [4, 5, 6], + "c": [7, 8, 9], + "d": [10, 11, 12], + "e": [13, 14, 15], + "f": ["a", "b", "c"], + }, + ) + arr = arr.astype(dtype) + df_orig = df.copy() + df2 = df.copy(deep=None) # Trigger a CoW (if enabled, otherwise makes copy) + df2._mgr.iset(locs, arr, inplace=True) + + tm.assert_frame_equal(df, df_orig) + + if using_copy_on_write: + for i, col in enumerate(df.columns): + if i not in locs: + assert np.shares_memory(get_array(df, col), get_array(df2, col)) + else: + for col in df.columns: + assert not np.shares_memory(get_array(df, col), get_array(df2, col)) + + +def test_exponential_backoff(): + # GH#55518 + df = DataFrame({"a": [1, 2, 3]}) + for i in range(490): + df.copy(deep=False) + + assert len(df._mgr.blocks[0].refs.referenced_blocks) == 491 + + df = DataFrame({"a": [1, 2, 3]}) + dfs = [df.copy(deep=False) for i in range(510)] + + for i in range(20): + df.copy(deep=False) + assert len(df._mgr.blocks[0].refs.referenced_blocks) == 531 + assert df._mgr.blocks[0].refs.clear_counter == 1000 + + for i in range(500): + df.copy(deep=False) + + # Don't reduce since we still have over 500 objects alive + assert df._mgr.blocks[0].refs.clear_counter == 1000 + + dfs = dfs[:300] + for i in range(500): + df.copy(deep=False) + + # Reduce since there are less than 500 objects alive + assert df._mgr.blocks[0].refs.clear_counter == 500 diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/copy_view/test_interp_fillna.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/copy_view/test_interp_fillna.py new file mode 100644 index 0000000000000000000000000000000000000000..ddc5879a56d544a5bbcbc9ef72b35d20d6f3b91b --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/copy_view/test_interp_fillna.py @@ -0,0 +1,432 @@ +import numpy as np +import pytest + +from pandas import ( + NA, + ArrowDtype, + DataFrame, + Interval, + NaT, + Series, + Timestamp, + interval_range, + option_context, +) +import pandas._testing as tm +from pandas.tests.copy_view.util import get_array + + +@pytest.mark.parametrize("method", ["pad", "nearest", "linear"]) +def test_interpolate_no_op(using_copy_on_write, method): + df = DataFrame({"a": [1, 2]}) + df_orig = df.copy() + + warn = None + if method == "pad": + warn = FutureWarning + msg = "DataFrame.interpolate with method=pad is deprecated" + with tm.assert_produces_warning(warn, match=msg): + result = df.interpolate(method=method) + + if using_copy_on_write: + assert np.shares_memory(get_array(result, "a"), get_array(df, "a")) + else: + assert not np.shares_memory(get_array(result, "a"), get_array(df, "a")) + + result.iloc[0, 0] = 100 + + if using_copy_on_write: + assert not np.shares_memory(get_array(result, "a"), get_array(df, "a")) + tm.assert_frame_equal(df, df_orig) + + +@pytest.mark.parametrize("func", ["ffill", "bfill"]) +def test_interp_fill_functions(using_copy_on_write, func): + # Check that these takes the same code paths as interpolate + df = DataFrame({"a": [1, 2]}) + df_orig = df.copy() + + result = getattr(df, func)() + + if using_copy_on_write: + assert np.shares_memory(get_array(result, "a"), get_array(df, "a")) + else: + assert not np.shares_memory(get_array(result, "a"), get_array(df, "a")) + + result.iloc[0, 0] = 100 + + if using_copy_on_write: + assert not np.shares_memory(get_array(result, "a"), get_array(df, "a")) + tm.assert_frame_equal(df, df_orig) + + +@pytest.mark.parametrize("func", ["ffill", "bfill"]) +@pytest.mark.parametrize( + "vals", [[1, np.nan, 2], [Timestamp("2019-12-31"), NaT, Timestamp("2020-12-31")]] +) +def test_interpolate_triggers_copy(using_copy_on_write, vals, func): + df = DataFrame({"a": vals}) + result = getattr(df, func)() + + assert not np.shares_memory(get_array(result, "a"), get_array(df, "a")) + if using_copy_on_write: + # Check that we don't have references when triggering a copy + assert result._mgr._has_no_reference(0) + + +@pytest.mark.parametrize( + "vals", [[1, np.nan, 2], [Timestamp("2019-12-31"), NaT, Timestamp("2020-12-31")]] +) +def test_interpolate_inplace_no_reference_no_copy(using_copy_on_write, vals): + df = DataFrame({"a": vals}) + arr = get_array(df, "a") + df.interpolate(method="linear", inplace=True) + + assert np.shares_memory(arr, get_array(df, "a")) + if using_copy_on_write: + # Check that we don't have references when triggering a copy + assert df._mgr._has_no_reference(0) + + +@pytest.mark.parametrize( + "vals", [[1, np.nan, 2], [Timestamp("2019-12-31"), NaT, Timestamp("2020-12-31")]] +) +def test_interpolate_inplace_with_refs(using_copy_on_write, vals, warn_copy_on_write): + df = DataFrame({"a": [1, np.nan, 2]}) + df_orig = df.copy() + arr = get_array(df, "a") + view = df[:] + with tm.assert_cow_warning(warn_copy_on_write): + df.interpolate(method="linear", inplace=True) + + if using_copy_on_write: + # Check that copy was triggered in interpolate and that we don't + # have any references left + assert not np.shares_memory(arr, get_array(df, "a")) + tm.assert_frame_equal(df_orig, view) + assert df._mgr._has_no_reference(0) + assert view._mgr._has_no_reference(0) + else: + assert np.shares_memory(arr, get_array(df, "a")) + + +@pytest.mark.parametrize("func", ["ffill", "bfill"]) +@pytest.mark.parametrize("dtype", ["float64", "Float64"]) +def test_interp_fill_functions_inplace( + using_copy_on_write, func, warn_copy_on_write, dtype +): + # Check that these takes the same code paths as interpolate + df = DataFrame({"a": [1, np.nan, 2]}, dtype=dtype) + df_orig = df.copy() + arr = get_array(df, "a") + view = df[:] + + with tm.assert_cow_warning(warn_copy_on_write and dtype == "float64"): + getattr(df, func)(inplace=True) + + if using_copy_on_write: + # Check that copy was triggered in interpolate and that we don't + # have any references left + assert not np.shares_memory(arr, get_array(df, "a")) + tm.assert_frame_equal(df_orig, view) + assert df._mgr._has_no_reference(0) + assert view._mgr._has_no_reference(0) + else: + assert np.shares_memory(arr, get_array(df, "a")) is (dtype == "float64") + + +def test_interpolate_cleaned_fill_method(using_copy_on_write): + # Check that "method is set to None" case works correctly + df = DataFrame({"a": ["a", np.nan, "c"], "b": 1}) + df_orig = df.copy() + + msg = "DataFrame.interpolate with object dtype" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = df.interpolate(method="linear") + + if using_copy_on_write: + assert np.shares_memory(get_array(result, "a"), get_array(df, "a")) + else: + assert not np.shares_memory(get_array(result, "a"), get_array(df, "a")) + + result.iloc[0, 0] = Timestamp("2021-12-31") + + if using_copy_on_write: + assert not np.shares_memory(get_array(result, "a"), get_array(df, "a")) + tm.assert_frame_equal(df, df_orig) + + +def test_interpolate_object_convert_no_op(using_copy_on_write): + df = DataFrame({"a": ["a", "b", "c"], "b": 1}) + arr_a = get_array(df, "a") + msg = "DataFrame.interpolate with method=pad is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + df.interpolate(method="pad", inplace=True) + + # Now CoW makes a copy, it should not! + if using_copy_on_write: + assert df._mgr._has_no_reference(0) + assert np.shares_memory(arr_a, get_array(df, "a")) + + +def test_interpolate_object_convert_copies(using_copy_on_write): + df = DataFrame({"a": Series([1, 2], dtype=object), "b": 1}) + arr_a = get_array(df, "a") + msg = "DataFrame.interpolate with method=pad is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + df.interpolate(method="pad", inplace=True) + + if using_copy_on_write: + assert df._mgr._has_no_reference(0) + assert not np.shares_memory(arr_a, get_array(df, "a")) + + +def test_interpolate_downcast(using_copy_on_write): + df = DataFrame({"a": [1, np.nan, 2.5], "b": 1}) + arr_a = get_array(df, "a") + msg = "DataFrame.interpolate with method=pad is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + df.interpolate(method="pad", inplace=True, downcast="infer") + + if using_copy_on_write: + assert df._mgr._has_no_reference(0) + assert np.shares_memory(arr_a, get_array(df, "a")) + + +def test_interpolate_downcast_reference_triggers_copy(using_copy_on_write): + df = DataFrame({"a": [1, np.nan, 2.5], "b": 1}) + df_orig = df.copy() + arr_a = get_array(df, "a") + view = df[:] + msg = "DataFrame.interpolate with method=pad is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + df.interpolate(method="pad", inplace=True, downcast="infer") + + if using_copy_on_write: + assert df._mgr._has_no_reference(0) + assert not np.shares_memory(arr_a, get_array(df, "a")) + tm.assert_frame_equal(df_orig, view) + else: + tm.assert_frame_equal(df, view) + + +def test_fillna(using_copy_on_write): + df = DataFrame({"a": [1.5, np.nan], "b": 1}) + df_orig = df.copy() + + df2 = df.fillna(5.5) + if using_copy_on_write: + assert np.shares_memory(get_array(df, "b"), get_array(df2, "b")) + else: + assert not np.shares_memory(get_array(df, "b"), get_array(df2, "b")) + + df2.iloc[0, 1] = 100 + tm.assert_frame_equal(df_orig, df) + + +def test_fillna_dict(using_copy_on_write): + df = DataFrame({"a": [1.5, np.nan], "b": 1}) + df_orig = df.copy() + + df2 = df.fillna({"a": 100.5}) + if using_copy_on_write: + assert np.shares_memory(get_array(df, "b"), get_array(df2, "b")) + assert not np.shares_memory(get_array(df, "a"), get_array(df2, "a")) + else: + assert not np.shares_memory(get_array(df, "b"), get_array(df2, "b")) + + df2.iloc[0, 1] = 100 + tm.assert_frame_equal(df_orig, df) + + +@pytest.mark.parametrize("downcast", [None, False]) +def test_fillna_inplace(using_copy_on_write, downcast): + df = DataFrame({"a": [1.5, np.nan], "b": 1}) + arr_a = get_array(df, "a") + arr_b = get_array(df, "b") + + msg = "The 'downcast' keyword in fillna is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + df.fillna(5.5, inplace=True, downcast=downcast) + assert np.shares_memory(get_array(df, "a"), arr_a) + assert np.shares_memory(get_array(df, "b"), arr_b) + if using_copy_on_write: + assert df._mgr._has_no_reference(0) + assert df._mgr._has_no_reference(1) + + +def test_fillna_inplace_reference(using_copy_on_write, warn_copy_on_write): + df = DataFrame({"a": [1.5, np.nan], "b": 1}) + df_orig = df.copy() + arr_a = get_array(df, "a") + arr_b = get_array(df, "b") + view = df[:] + + with tm.assert_cow_warning(warn_copy_on_write): + df.fillna(5.5, inplace=True) + if using_copy_on_write: + assert not np.shares_memory(get_array(df, "a"), arr_a) + assert np.shares_memory(get_array(df, "b"), arr_b) + assert view._mgr._has_no_reference(0) + assert df._mgr._has_no_reference(0) + tm.assert_frame_equal(view, df_orig) + else: + assert np.shares_memory(get_array(df, "a"), arr_a) + assert np.shares_memory(get_array(df, "b"), arr_b) + expected = DataFrame({"a": [1.5, 5.5], "b": 1}) + tm.assert_frame_equal(df, expected) + + +def test_fillna_interval_inplace_reference(using_copy_on_write, warn_copy_on_write): + # Set dtype explicitly to avoid implicit cast when setting nan + ser = Series( + interval_range(start=0, end=5), name="a", dtype="interval[float64, right]" + ) + ser.iloc[1] = np.nan + + ser_orig = ser.copy() + view = ser[:] + with tm.assert_cow_warning(warn_copy_on_write): + ser.fillna(value=Interval(left=0, right=5), inplace=True) + + if using_copy_on_write: + assert not np.shares_memory( + get_array(ser, "a").left.values, get_array(view, "a").left.values + ) + tm.assert_series_equal(view, ser_orig) + else: + assert np.shares_memory( + get_array(ser, "a").left.values, get_array(view, "a").left.values + ) + + +def test_fillna_series_empty_arg(using_copy_on_write): + ser = Series([1, np.nan, 2]) + ser_orig = ser.copy() + result = ser.fillna({}) + + if using_copy_on_write: + assert np.shares_memory(get_array(ser), get_array(result)) + else: + assert not np.shares_memory(get_array(ser), get_array(result)) + + ser.iloc[0] = 100.5 + tm.assert_series_equal(ser_orig, result) + + +def test_fillna_series_empty_arg_inplace(using_copy_on_write): + ser = Series([1, np.nan, 2]) + arr = get_array(ser) + ser.fillna({}, inplace=True) + + assert np.shares_memory(get_array(ser), arr) + if using_copy_on_write: + assert ser._mgr._has_no_reference(0) + + +def test_fillna_ea_noop_shares_memory( + using_copy_on_write, any_numeric_ea_and_arrow_dtype +): + df = DataFrame({"a": [1, NA, 3], "b": 1}, dtype=any_numeric_ea_and_arrow_dtype) + df_orig = df.copy() + df2 = df.fillna(100) + + assert not np.shares_memory(get_array(df, "a"), get_array(df2, "a")) + + if using_copy_on_write: + assert np.shares_memory(get_array(df, "b"), get_array(df2, "b")) + assert not df2._mgr._has_no_reference(1) + elif isinstance(df.dtypes.iloc[0], ArrowDtype): + # arrow is immutable, so no-ops do not need to copy underlying array + assert np.shares_memory(get_array(df, "b"), get_array(df2, "b")) + else: + assert not np.shares_memory(get_array(df, "b"), get_array(df2, "b")) + + tm.assert_frame_equal(df_orig, df) + + df2.iloc[0, 1] = 100 + if using_copy_on_write: + assert not np.shares_memory(get_array(df, "b"), get_array(df2, "b")) + assert df2._mgr._has_no_reference(1) + assert df._mgr._has_no_reference(1) + tm.assert_frame_equal(df_orig, df) + + +def test_fillna_inplace_ea_noop_shares_memory( + using_copy_on_write, warn_copy_on_write, any_numeric_ea_and_arrow_dtype +): + df = DataFrame({"a": [1, NA, 3], "b": 1}, dtype=any_numeric_ea_and_arrow_dtype) + df_orig = df.copy() + view = df[:] + with tm.assert_cow_warning(warn_copy_on_write): + df.fillna(100, inplace=True) + + if isinstance(df["a"].dtype, ArrowDtype) or using_copy_on_write: + assert not np.shares_memory(get_array(df, "a"), get_array(view, "a")) + else: + # MaskedArray can actually respect inplace=True + assert np.shares_memory(get_array(df, "a"), get_array(view, "a")) + + assert np.shares_memory(get_array(df, "b"), get_array(view, "b")) + if using_copy_on_write: + assert not df._mgr._has_no_reference(1) + assert not view._mgr._has_no_reference(1) + + with tm.assert_cow_warning( + warn_copy_on_write and "pyarrow" not in any_numeric_ea_and_arrow_dtype + ): + df.iloc[0, 1] = 100 + if isinstance(df["a"].dtype, ArrowDtype) or using_copy_on_write: + tm.assert_frame_equal(df_orig, view) + else: + # we actually have a view + tm.assert_frame_equal(df, view) + + +def test_fillna_chained_assignment(using_copy_on_write): + df = DataFrame({"a": [1, np.nan, 2], "b": 1}) + df_orig = df.copy() + if using_copy_on_write: + with tm.raises_chained_assignment_error(): + df["a"].fillna(100, inplace=True) + tm.assert_frame_equal(df, df_orig) + + with tm.raises_chained_assignment_error(): + df[["a"]].fillna(100, inplace=True) + tm.assert_frame_equal(df, df_orig) + else: + with tm.assert_produces_warning(None): + with option_context("mode.chained_assignment", None): + df[["a"]].fillna(100, inplace=True) + + with tm.assert_produces_warning(None): + with option_context("mode.chained_assignment", None): + df[df.a > 5].fillna(100, inplace=True) + + with tm.assert_produces_warning(FutureWarning, match="inplace method"): + df["a"].fillna(100, inplace=True) + + +@pytest.mark.parametrize("func", ["interpolate", "ffill", "bfill"]) +def test_interpolate_chained_assignment(using_copy_on_write, func): + df = DataFrame({"a": [1, np.nan, 2], "b": 1}) + df_orig = df.copy() + if using_copy_on_write: + with tm.raises_chained_assignment_error(): + getattr(df["a"], func)(inplace=True) + tm.assert_frame_equal(df, df_orig) + + with tm.raises_chained_assignment_error(): + getattr(df[["a"]], func)(inplace=True) + tm.assert_frame_equal(df, df_orig) + else: + with tm.assert_produces_warning(FutureWarning, match="inplace method"): + getattr(df["a"], func)(inplace=True) + + with tm.assert_produces_warning(None): + with option_context("mode.chained_assignment", None): + getattr(df[["a"]], func)(inplace=True) + + with tm.assert_produces_warning(None): + with option_context("mode.chained_assignment", None): + getattr(df[df["a"] > 1], func)(inplace=True) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/copy_view/test_methods.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/copy_view/test_methods.py new file mode 100644 index 0000000000000000000000000000000000000000..5d1eefccbb1e723320da889f2874b79b12ce3d0e --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/copy_view/test_methods.py @@ -0,0 +1,2055 @@ +import numpy as np +import pytest + +from pandas.errors import SettingWithCopyWarning + +import pandas as pd +from pandas import ( + DataFrame, + Index, + MultiIndex, + Period, + Series, + Timestamp, + date_range, + option_context, + period_range, +) +import pandas._testing as tm +from pandas.tests.copy_view.util import get_array + + +def test_copy(using_copy_on_write): + df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [0.1, 0.2, 0.3]}) + df_copy = df.copy() + + # the deep copy by defaults takes a shallow copy of the Index + assert df_copy.index is not df.index + assert df_copy.columns is not df.columns + assert df_copy.index.is_(df.index) + assert df_copy.columns.is_(df.columns) + + # the deep copy doesn't share memory + assert not np.shares_memory(get_array(df_copy, "a"), get_array(df, "a")) + if using_copy_on_write: + assert not df_copy._mgr.blocks[0].refs.has_reference() + assert not df_copy._mgr.blocks[1].refs.has_reference() + + # mutating copy doesn't mutate original + df_copy.iloc[0, 0] = 0 + assert df.iloc[0, 0] == 1 + + +def test_copy_shallow(using_copy_on_write, warn_copy_on_write): + df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [0.1, 0.2, 0.3]}) + df_copy = df.copy(deep=False) + + # the shallow copy also makes a shallow copy of the index + if using_copy_on_write: + assert df_copy.index is not df.index + assert df_copy.columns is not df.columns + assert df_copy.index.is_(df.index) + assert df_copy.columns.is_(df.columns) + else: + assert df_copy.index is df.index + assert df_copy.columns is df.columns + + # the shallow copy still shares memory + assert np.shares_memory(get_array(df_copy, "a"), get_array(df, "a")) + if using_copy_on_write: + assert df_copy._mgr.blocks[0].refs.has_reference() + assert df_copy._mgr.blocks[1].refs.has_reference() + + if using_copy_on_write: + # mutating shallow copy doesn't mutate original + df_copy.iloc[0, 0] = 0 + assert df.iloc[0, 0] == 1 + # mutating triggered a copy-on-write -> no longer shares memory + assert not np.shares_memory(get_array(df_copy, "a"), get_array(df, "a")) + # but still shares memory for the other columns/blocks + assert np.shares_memory(get_array(df_copy, "c"), get_array(df, "c")) + else: + # mutating shallow copy does mutate original + with tm.assert_cow_warning(warn_copy_on_write): + df_copy.iloc[0, 0] = 0 + assert df.iloc[0, 0] == 0 + # and still shares memory + assert np.shares_memory(get_array(df_copy, "a"), get_array(df, "a")) + + +@pytest.mark.parametrize("copy", [True, None, False]) +@pytest.mark.parametrize( + "method", + [ + lambda df, copy: df.rename(columns=str.lower, copy=copy), + lambda df, copy: df.reindex(columns=["a", "c"], copy=copy), + lambda df, copy: df.reindex_like(df, copy=copy), + lambda df, copy: df.align(df, copy=copy)[0], + lambda df, copy: df.set_axis(["a", "b", "c"], axis="index", copy=copy), + lambda df, copy: df.rename_axis(index="test", copy=copy), + lambda df, copy: df.rename_axis(columns="test", copy=copy), + lambda df, copy: df.astype({"b": "int64"}, copy=copy), + # lambda df, copy: df.swaplevel(0, 0, copy=copy), + lambda df, copy: df.swapaxes(0, 0, copy=copy), + lambda df, copy: df.truncate(0, 5, copy=copy), + lambda df, copy: df.infer_objects(copy=copy), + lambda df, copy: df.to_timestamp(copy=copy), + lambda df, copy: df.to_period(freq="D", copy=copy), + lambda df, copy: df.tz_localize("US/Central", copy=copy), + lambda df, copy: df.tz_convert("US/Central", copy=copy), + lambda df, copy: df.set_flags(allows_duplicate_labels=False, copy=copy), + ], + ids=[ + "rename", + "reindex", + "reindex_like", + "align", + "set_axis", + "rename_axis0", + "rename_axis1", + "astype", + # "swaplevel", # only series + "swapaxes", + "truncate", + "infer_objects", + "to_timestamp", + "to_period", + "tz_localize", + "tz_convert", + "set_flags", + ], +) +def test_methods_copy_keyword( + request, method, copy, using_copy_on_write, using_array_manager +): + index = None + if "to_timestamp" in request.node.callspec.id: + index = period_range("2012-01-01", freq="D", periods=3) + elif "to_period" in request.node.callspec.id: + index = date_range("2012-01-01", freq="D", periods=3) + elif "tz_localize" in request.node.callspec.id: + index = date_range("2012-01-01", freq="D", periods=3) + elif "tz_convert" in request.node.callspec.id: + index = date_range("2012-01-01", freq="D", periods=3, tz="Europe/Brussels") + + df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [0.1, 0.2, 0.3]}, index=index) + + if "swapaxes" in request.node.callspec.id: + msg = "'DataFrame.swapaxes' is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + df2 = method(df, copy=copy) + else: + df2 = method(df, copy=copy) + + share_memory = using_copy_on_write or copy is False + + if request.node.callspec.id.startswith("reindex-"): + # TODO copy=False without CoW still returns a copy in this case + if not using_copy_on_write and not using_array_manager and copy is False: + share_memory = False + + if share_memory: + assert np.shares_memory(get_array(df2, "a"), get_array(df, "a")) + else: + assert not np.shares_memory(get_array(df2, "a"), get_array(df, "a")) + + +@pytest.mark.parametrize("copy", [True, None, False]) +@pytest.mark.parametrize( + "method", + [ + lambda ser, copy: ser.rename(index={0: 100}, copy=copy), + lambda ser, copy: ser.rename(None, copy=copy), + lambda ser, copy: ser.reindex(index=ser.index, copy=copy), + lambda ser, copy: ser.reindex_like(ser, copy=copy), + lambda ser, copy: ser.align(ser, copy=copy)[0], + lambda ser, copy: ser.set_axis(["a", "b", "c"], axis="index", copy=copy), + lambda ser, copy: ser.rename_axis(index="test", copy=copy), + lambda ser, copy: ser.astype("int64", copy=copy), + lambda ser, copy: ser.swaplevel(0, 1, copy=copy), + lambda ser, copy: ser.swapaxes(0, 0, copy=copy), + lambda ser, copy: ser.truncate(0, 5, copy=copy), + lambda ser, copy: ser.infer_objects(copy=copy), + lambda ser, copy: ser.to_timestamp(copy=copy), + lambda ser, copy: ser.to_period(freq="D", copy=copy), + lambda ser, copy: ser.tz_localize("US/Central", copy=copy), + lambda ser, copy: ser.tz_convert("US/Central", copy=copy), + lambda ser, copy: ser.set_flags(allows_duplicate_labels=False, copy=copy), + ], + ids=[ + "rename (dict)", + "rename", + "reindex", + "reindex_like", + "align", + "set_axis", + "rename_axis0", + "astype", + "swaplevel", + "swapaxes", + "truncate", + "infer_objects", + "to_timestamp", + "to_period", + "tz_localize", + "tz_convert", + "set_flags", + ], +) +def test_methods_series_copy_keyword(request, method, copy, using_copy_on_write): + index = None + if "to_timestamp" in request.node.callspec.id: + index = period_range("2012-01-01", freq="D", periods=3) + elif "to_period" in request.node.callspec.id: + index = date_range("2012-01-01", freq="D", periods=3) + elif "tz_localize" in request.node.callspec.id: + index = date_range("2012-01-01", freq="D", periods=3) + elif "tz_convert" in request.node.callspec.id: + index = date_range("2012-01-01", freq="D", periods=3, tz="Europe/Brussels") + elif "swaplevel" in request.node.callspec.id: + index = MultiIndex.from_arrays([[1, 2, 3], [4, 5, 6]]) + + ser = Series([1, 2, 3], index=index) + + if "swapaxes" in request.node.callspec.id: + msg = "'Series.swapaxes' is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + ser2 = method(ser, copy=copy) + else: + ser2 = method(ser, copy=copy) + + share_memory = using_copy_on_write or copy is False + + if share_memory: + assert np.shares_memory(get_array(ser2), get_array(ser)) + else: + assert not np.shares_memory(get_array(ser2), get_array(ser)) + + +@pytest.mark.parametrize("copy", [True, None, False]) +def test_transpose_copy_keyword(using_copy_on_write, copy, using_array_manager): + df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) + result = df.transpose(copy=copy) + share_memory = using_copy_on_write or copy is False or copy is None + share_memory = share_memory and not using_array_manager + + if share_memory: + assert np.shares_memory(get_array(df, "a"), get_array(result, 0)) + else: + assert not np.shares_memory(get_array(df, "a"), get_array(result, 0)) + + +# ----------------------------------------------------------------------------- +# DataFrame methods returning new DataFrame using shallow copy + + +def test_reset_index(using_copy_on_write): + # Case: resetting the index (i.e. adding a new column) + mutating the + # resulting dataframe + df = DataFrame( + {"a": [1, 2, 3], "b": [4, 5, 6], "c": [0.1, 0.2, 0.3]}, index=[10, 11, 12] + ) + df_orig = df.copy() + df2 = df.reset_index() + df2._mgr._verify_integrity() + + if using_copy_on_write: + # still shares memory (df2 is a shallow copy) + assert np.shares_memory(get_array(df2, "b"), get_array(df, "b")) + assert np.shares_memory(get_array(df2, "c"), get_array(df, "c")) + # mutating df2 triggers a copy-on-write for that column / block + df2.iloc[0, 2] = 0 + assert not np.shares_memory(get_array(df2, "b"), get_array(df, "b")) + if using_copy_on_write: + assert np.shares_memory(get_array(df2, "c"), get_array(df, "c")) + tm.assert_frame_equal(df, df_orig) + + +@pytest.mark.parametrize("index", [pd.RangeIndex(0, 2), Index([1, 2])]) +def test_reset_index_series_drop(using_copy_on_write, index): + ser = Series([1, 2], index=index) + ser_orig = ser.copy() + ser2 = ser.reset_index(drop=True) + if using_copy_on_write: + assert np.shares_memory(get_array(ser), get_array(ser2)) + assert not ser._mgr._has_no_reference(0) + else: + assert not np.shares_memory(get_array(ser), get_array(ser2)) + + ser2.iloc[0] = 100 + tm.assert_series_equal(ser, ser_orig) + + +def test_groupby_column_index_in_references(): + df = DataFrame( + {"A": ["a", "b", "c", "d"], "B": [1, 2, 3, 4], "C": ["a", "a", "b", "b"]} + ) + df = df.set_index("A") + key = df["C"] + result = df.groupby(key, observed=True).sum() + expected = df.groupby("C", observed=True).sum() + tm.assert_frame_equal(result, expected) + + +def test_rename_columns(using_copy_on_write): + # Case: renaming columns returns a new dataframe + # + afterwards modifying the result + df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [0.1, 0.2, 0.3]}) + df_orig = df.copy() + df2 = df.rename(columns=str.upper) + + if using_copy_on_write: + assert np.shares_memory(get_array(df2, "A"), get_array(df, "a")) + df2.iloc[0, 0] = 0 + assert not np.shares_memory(get_array(df2, "A"), get_array(df, "a")) + if using_copy_on_write: + assert np.shares_memory(get_array(df2, "C"), get_array(df, "c")) + expected = DataFrame({"A": [0, 2, 3], "B": [4, 5, 6], "C": [0.1, 0.2, 0.3]}) + tm.assert_frame_equal(df2, expected) + tm.assert_frame_equal(df, df_orig) + + +def test_rename_columns_modify_parent(using_copy_on_write): + # Case: renaming columns returns a new dataframe + # + afterwards modifying the original (parent) dataframe + df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [0.1, 0.2, 0.3]}) + df2 = df.rename(columns=str.upper) + df2_orig = df2.copy() + + if using_copy_on_write: + assert np.shares_memory(get_array(df2, "A"), get_array(df, "a")) + else: + assert not np.shares_memory(get_array(df2, "A"), get_array(df, "a")) + df.iloc[0, 0] = 0 + assert not np.shares_memory(get_array(df2, "A"), get_array(df, "a")) + if using_copy_on_write: + assert np.shares_memory(get_array(df2, "C"), get_array(df, "c")) + expected = DataFrame({"a": [0, 2, 3], "b": [4, 5, 6], "c": [0.1, 0.2, 0.3]}) + tm.assert_frame_equal(df, expected) + tm.assert_frame_equal(df2, df2_orig) + + +def test_pipe(using_copy_on_write): + df = DataFrame({"a": [1, 2, 3], "b": 1.5}) + df_orig = df.copy() + + def testfunc(df): + return df + + df2 = df.pipe(testfunc) + + assert np.shares_memory(get_array(df2, "a"), get_array(df, "a")) + + # mutating df2 triggers a copy-on-write for that column + df2.iloc[0, 0] = 0 + if using_copy_on_write: + tm.assert_frame_equal(df, df_orig) + assert not np.shares_memory(get_array(df2, "a"), get_array(df, "a")) + else: + expected = DataFrame({"a": [0, 2, 3], "b": 1.5}) + tm.assert_frame_equal(df, expected) + + assert np.shares_memory(get_array(df2, "a"), get_array(df, "a")) + assert np.shares_memory(get_array(df2, "b"), get_array(df, "b")) + + +def test_pipe_modify_df(using_copy_on_write): + df = DataFrame({"a": [1, 2, 3], "b": 1.5}) + df_orig = df.copy() + + def testfunc(df): + df.iloc[0, 0] = 100 + return df + + df2 = df.pipe(testfunc) + + assert np.shares_memory(get_array(df2, "b"), get_array(df, "b")) + + if using_copy_on_write: + tm.assert_frame_equal(df, df_orig) + assert not np.shares_memory(get_array(df2, "a"), get_array(df, "a")) + else: + expected = DataFrame({"a": [100, 2, 3], "b": 1.5}) + tm.assert_frame_equal(df, expected) + + assert np.shares_memory(get_array(df2, "a"), get_array(df, "a")) + assert np.shares_memory(get_array(df2, "b"), get_array(df, "b")) + + +def test_reindex_columns(using_copy_on_write): + # Case: reindexing the column returns a new dataframe + # + afterwards modifying the result + df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [0.1, 0.2, 0.3]}) + df_orig = df.copy() + df2 = df.reindex(columns=["a", "c"]) + + if using_copy_on_write: + # still shares memory (df2 is a shallow copy) + assert np.shares_memory(get_array(df2, "a"), get_array(df, "a")) + else: + assert not np.shares_memory(get_array(df2, "a"), get_array(df, "a")) + # mutating df2 triggers a copy-on-write for that column + df2.iloc[0, 0] = 0 + assert not np.shares_memory(get_array(df2, "a"), get_array(df, "a")) + if using_copy_on_write: + assert np.shares_memory(get_array(df2, "c"), get_array(df, "c")) + tm.assert_frame_equal(df, df_orig) + + +@pytest.mark.parametrize( + "index", + [ + lambda idx: idx, + lambda idx: idx.view(), + lambda idx: idx.copy(), + lambda idx: list(idx), + ], + ids=["identical", "view", "copy", "values"], +) +def test_reindex_rows(index, using_copy_on_write): + # Case: reindexing the rows with an index that matches the current index + # can use a shallow copy + df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [0.1, 0.2, 0.3]}) + df_orig = df.copy() + df2 = df.reindex(index=index(df.index)) + + if using_copy_on_write: + # still shares memory (df2 is a shallow copy) + assert np.shares_memory(get_array(df2, "a"), get_array(df, "a")) + else: + assert not np.shares_memory(get_array(df2, "a"), get_array(df, "a")) + # mutating df2 triggers a copy-on-write for that column + df2.iloc[0, 0] = 0 + assert not np.shares_memory(get_array(df2, "a"), get_array(df, "a")) + if using_copy_on_write: + assert np.shares_memory(get_array(df2, "c"), get_array(df, "c")) + tm.assert_frame_equal(df, df_orig) + + +def test_drop_on_column(using_copy_on_write): + df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [0.1, 0.2, 0.3]}) + df_orig = df.copy() + df2 = df.drop(columns="a") + df2._mgr._verify_integrity() + + if using_copy_on_write: + assert np.shares_memory(get_array(df2, "b"), get_array(df, "b")) + assert np.shares_memory(get_array(df2, "c"), get_array(df, "c")) + else: + assert not np.shares_memory(get_array(df2, "b"), get_array(df, "b")) + assert not np.shares_memory(get_array(df2, "c"), get_array(df, "c")) + df2.iloc[0, 0] = 0 + assert not np.shares_memory(get_array(df2, "b"), get_array(df, "b")) + if using_copy_on_write: + assert np.shares_memory(get_array(df2, "c"), get_array(df, "c")) + tm.assert_frame_equal(df, df_orig) + + +def test_select_dtypes(using_copy_on_write): + # Case: selecting columns using `select_dtypes()` returns a new dataframe + # + afterwards modifying the result + df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [0.1, 0.2, 0.3]}) + df_orig = df.copy() + df2 = df.select_dtypes("int64") + df2._mgr._verify_integrity() + + if using_copy_on_write: + assert np.shares_memory(get_array(df2, "a"), get_array(df, "a")) + else: + assert not np.shares_memory(get_array(df2, "a"), get_array(df, "a")) + + # mutating df2 triggers a copy-on-write for that column/block + df2.iloc[0, 0] = 0 + if using_copy_on_write: + assert not np.shares_memory(get_array(df2, "a"), get_array(df, "a")) + tm.assert_frame_equal(df, df_orig) + + +@pytest.mark.parametrize( + "filter_kwargs", [{"items": ["a"]}, {"like": "a"}, {"regex": "a"}] +) +def test_filter(using_copy_on_write, filter_kwargs): + # Case: selecting columns using `filter()` returns a new dataframe + # + afterwards modifying the result + df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [0.1, 0.2, 0.3]}) + df_orig = df.copy() + df2 = df.filter(**filter_kwargs) + if using_copy_on_write: + assert np.shares_memory(get_array(df2, "a"), get_array(df, "a")) + else: + assert not np.shares_memory(get_array(df2, "a"), get_array(df, "a")) + + # mutating df2 triggers a copy-on-write for that column/block + if using_copy_on_write: + df2.iloc[0, 0] = 0 + assert not np.shares_memory(get_array(df2, "a"), get_array(df, "a")) + tm.assert_frame_equal(df, df_orig) + + +def test_shift_no_op(using_copy_on_write): + df = DataFrame( + [[1, 2], [3, 4], [5, 6]], + index=date_range("2020-01-01", "2020-01-03"), + columns=["a", "b"], + ) + df_orig = df.copy() + df2 = df.shift(periods=0) + + if using_copy_on_write: + assert np.shares_memory(get_array(df2, "a"), get_array(df, "a")) + else: + assert not np.shares_memory(get_array(df2, "a"), get_array(df, "a")) + + df.iloc[0, 0] = 0 + if using_copy_on_write: + assert not np.shares_memory(get_array(df, "a"), get_array(df2, "a")) + assert np.shares_memory(get_array(df, "b"), get_array(df2, "b")) + tm.assert_frame_equal(df2, df_orig) + + +def test_shift_index(using_copy_on_write): + df = DataFrame( + [[1, 2], [3, 4], [5, 6]], + index=date_range("2020-01-01", "2020-01-03"), + columns=["a", "b"], + ) + df2 = df.shift(periods=1, axis=0) + + assert not np.shares_memory(get_array(df2, "a"), get_array(df, "a")) + + +def test_shift_rows_freq(using_copy_on_write): + df = DataFrame( + [[1, 2], [3, 4], [5, 6]], + index=date_range("2020-01-01", "2020-01-03"), + columns=["a", "b"], + ) + df_orig = df.copy() + df_orig.index = date_range("2020-01-02", "2020-01-04") + df2 = df.shift(periods=1, freq="1D") + + if using_copy_on_write: + assert np.shares_memory(get_array(df2, "a"), get_array(df, "a")) + else: + assert not np.shares_memory(get_array(df2, "a"), get_array(df, "a")) + + df.iloc[0, 0] = 0 + if using_copy_on_write: + assert not np.shares_memory(get_array(df, "a"), get_array(df2, "a")) + tm.assert_frame_equal(df2, df_orig) + + +def test_shift_columns(using_copy_on_write, warn_copy_on_write): + df = DataFrame( + [[1, 2], [3, 4], [5, 6]], columns=date_range("2020-01-01", "2020-01-02") + ) + df2 = df.shift(periods=1, axis=1) + + assert np.shares_memory(get_array(df2, "2020-01-02"), get_array(df, "2020-01-01")) + with tm.assert_cow_warning(warn_copy_on_write): + df.iloc[0, 0] = 0 + if using_copy_on_write: + assert not np.shares_memory( + get_array(df2, "2020-01-02"), get_array(df, "2020-01-01") + ) + expected = DataFrame( + [[np.nan, 1], [np.nan, 3], [np.nan, 5]], + columns=date_range("2020-01-01", "2020-01-02"), + ) + tm.assert_frame_equal(df2, expected) + + +def test_pop(using_copy_on_write, warn_copy_on_write): + df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [0.1, 0.2, 0.3]}) + df_orig = df.copy() + view_original = df[:] + result = df.pop("a") + + assert np.shares_memory(result.values, get_array(view_original, "a")) + assert np.shares_memory(get_array(df, "b"), get_array(view_original, "b")) + + if using_copy_on_write: + result.iloc[0] = 0 + assert not np.shares_memory(result.values, get_array(view_original, "a")) + with tm.assert_cow_warning(warn_copy_on_write): + df.iloc[0, 0] = 0 + if using_copy_on_write: + assert not np.shares_memory(get_array(df, "b"), get_array(view_original, "b")) + tm.assert_frame_equal(view_original, df_orig) + else: + expected = DataFrame({"a": [1, 2, 3], "b": [0, 5, 6], "c": [0.1, 0.2, 0.3]}) + tm.assert_frame_equal(view_original, expected) + + +@pytest.mark.parametrize( + "func", + [ + lambda x, y: x.align(y), + lambda x, y: x.align(y.a, axis=0), + lambda x, y: x.align(y.a.iloc[slice(0, 1)], axis=1), + ], +) +def test_align_frame(using_copy_on_write, func): + df = DataFrame({"a": [1, 2, 3], "b": "a"}) + df_orig = df.copy() + df_changed = df[["b", "a"]].copy() + df2, _ = func(df, df_changed) + + if using_copy_on_write: + assert np.shares_memory(get_array(df2, "a"), get_array(df, "a")) + else: + assert not np.shares_memory(get_array(df2, "a"), get_array(df, "a")) + + df2.iloc[0, 0] = 0 + if using_copy_on_write: + assert not np.shares_memory(get_array(df2, "a"), get_array(df, "a")) + tm.assert_frame_equal(df, df_orig) + + +def test_align_series(using_copy_on_write): + ser = Series([1, 2]) + ser_orig = ser.copy() + ser_other = ser.copy() + ser2, ser_other_result = ser.align(ser_other) + + if using_copy_on_write: + assert np.shares_memory(ser2.values, ser.values) + assert np.shares_memory(ser_other_result.values, ser_other.values) + else: + assert not np.shares_memory(ser2.values, ser.values) + assert not np.shares_memory(ser_other_result.values, ser_other.values) + + ser2.iloc[0] = 0 + ser_other_result.iloc[0] = 0 + if using_copy_on_write: + assert not np.shares_memory(ser2.values, ser.values) + assert not np.shares_memory(ser_other_result.values, ser_other.values) + tm.assert_series_equal(ser, ser_orig) + tm.assert_series_equal(ser_other, ser_orig) + + +def test_align_copy_false(using_copy_on_write): + df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) + df_orig = df.copy() + df2, df3 = df.align(df, copy=False) + + assert np.shares_memory(get_array(df, "b"), get_array(df2, "b")) + assert np.shares_memory(get_array(df, "a"), get_array(df2, "a")) + + if using_copy_on_write: + df2.loc[0, "a"] = 0 + tm.assert_frame_equal(df, df_orig) # Original is unchanged + + df3.loc[0, "a"] = 0 + tm.assert_frame_equal(df, df_orig) # Original is unchanged + + +def test_align_with_series_copy_false(using_copy_on_write): + df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) + ser = Series([1, 2, 3], name="x") + ser_orig = ser.copy() + df_orig = df.copy() + df2, ser2 = df.align(ser, copy=False, axis=0) + + assert np.shares_memory(get_array(df, "b"), get_array(df2, "b")) + assert np.shares_memory(get_array(df, "a"), get_array(df2, "a")) + assert np.shares_memory(get_array(ser, "x"), get_array(ser2, "x")) + + if using_copy_on_write: + df2.loc[0, "a"] = 0 + tm.assert_frame_equal(df, df_orig) # Original is unchanged + + ser2.loc[0] = 0 + tm.assert_series_equal(ser, ser_orig) # Original is unchanged + + +def test_to_frame(using_copy_on_write, warn_copy_on_write): + # Case: converting a Series to a DataFrame with to_frame + ser = Series([1, 2, 3]) + ser_orig = ser.copy() + + df = ser[:].to_frame() + + # currently this always returns a "view" + assert np.shares_memory(ser.values, get_array(df, 0)) + + with tm.assert_cow_warning(warn_copy_on_write): + df.iloc[0, 0] = 0 + + if using_copy_on_write: + # mutating df triggers a copy-on-write for that column + assert not np.shares_memory(ser.values, get_array(df, 0)) + tm.assert_series_equal(ser, ser_orig) + else: + # but currently select_dtypes() actually returns a view -> mutates parent + expected = ser_orig.copy() + expected.iloc[0] = 0 + tm.assert_series_equal(ser, expected) + + # modify original series -> don't modify dataframe + df = ser[:].to_frame() + with tm.assert_cow_warning(warn_copy_on_write): + ser.iloc[0] = 0 + + if using_copy_on_write: + tm.assert_frame_equal(df, ser_orig.to_frame()) + else: + expected = ser_orig.copy().to_frame() + expected.iloc[0, 0] = 0 + tm.assert_frame_equal(df, expected) + + +@pytest.mark.parametrize("ax", ["index", "columns"]) +def test_swapaxes_noop(using_copy_on_write, ax): + df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) + df_orig = df.copy() + msg = "'DataFrame.swapaxes' is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + df2 = df.swapaxes(ax, ax) + + if using_copy_on_write: + assert np.shares_memory(get_array(df2, "a"), get_array(df, "a")) + else: + assert not np.shares_memory(get_array(df2, "a"), get_array(df, "a")) + + # mutating df2 triggers a copy-on-write for that column/block + df2.iloc[0, 0] = 0 + if using_copy_on_write: + assert not np.shares_memory(get_array(df2, "a"), get_array(df, "a")) + tm.assert_frame_equal(df, df_orig) + + +def test_swapaxes_single_block(using_copy_on_write): + df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}, index=["x", "y", "z"]) + df_orig = df.copy() + msg = "'DataFrame.swapaxes' is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + df2 = df.swapaxes("index", "columns") + + if using_copy_on_write: + assert np.shares_memory(get_array(df2, "x"), get_array(df, "a")) + else: + assert not np.shares_memory(get_array(df2, "x"), get_array(df, "a")) + + # mutating df2 triggers a copy-on-write for that column/block + df2.iloc[0, 0] = 0 + if using_copy_on_write: + assert not np.shares_memory(get_array(df2, "x"), get_array(df, "a")) + tm.assert_frame_equal(df, df_orig) + + +def test_swapaxes_read_only_array(): + df = DataFrame({"a": [1, 2], "b": 3}) + msg = "'DataFrame.swapaxes' is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + df = df.swapaxes(axis1="index", axis2="columns") + df.iloc[0, 0] = 100 + expected = DataFrame({0: [100, 3], 1: [2, 3]}, index=["a", "b"]) + tm.assert_frame_equal(df, expected) + + +@pytest.mark.parametrize( + "method, idx", + [ + (lambda df: df.copy(deep=False).copy(deep=False), 0), + (lambda df: df.reset_index().reset_index(), 2), + (lambda df: df.rename(columns=str.upper).rename(columns=str.lower), 0), + (lambda df: df.copy(deep=False).select_dtypes(include="number"), 0), + ], + ids=["shallow-copy", "reset_index", "rename", "select_dtypes"], +) +def test_chained_methods(request, method, idx, using_copy_on_write, warn_copy_on_write): + df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [0.1, 0.2, 0.3]}) + df_orig = df.copy() + + # when not using CoW, only the copy() variant actually gives a view + df2_is_view = not using_copy_on_write and request.node.callspec.id == "shallow-copy" + + # modify df2 -> don't modify df + df2 = method(df) + with tm.assert_cow_warning(warn_copy_on_write and df2_is_view): + df2.iloc[0, idx] = 0 + if not df2_is_view: + tm.assert_frame_equal(df, df_orig) + + # modify df -> don't modify df2 + df2 = method(df) + with tm.assert_cow_warning(warn_copy_on_write and df2_is_view): + df.iloc[0, 0] = 0 + if not df2_is_view: + tm.assert_frame_equal(df2.iloc[:, idx:], df_orig) + + +@pytest.mark.parametrize("obj", [Series([1, 2], name="a"), DataFrame({"a": [1, 2]})]) +def test_to_timestamp(using_copy_on_write, obj): + obj.index = Index([Period("2012-1-1", freq="D"), Period("2012-1-2", freq="D")]) + + obj_orig = obj.copy() + obj2 = obj.to_timestamp() + + if using_copy_on_write: + assert np.shares_memory(get_array(obj2, "a"), get_array(obj, "a")) + else: + assert not np.shares_memory(get_array(obj2, "a"), get_array(obj, "a")) + + # mutating obj2 triggers a copy-on-write for that column / block + obj2.iloc[0] = 0 + assert not np.shares_memory(get_array(obj2, "a"), get_array(obj, "a")) + tm.assert_equal(obj, obj_orig) + + +@pytest.mark.parametrize("obj", [Series([1, 2], name="a"), DataFrame({"a": [1, 2]})]) +def test_to_period(using_copy_on_write, obj): + obj.index = Index([Timestamp("2019-12-31"), Timestamp("2020-12-31")]) + + obj_orig = obj.copy() + obj2 = obj.to_period(freq="Y") + + if using_copy_on_write: + assert np.shares_memory(get_array(obj2, "a"), get_array(obj, "a")) + else: + assert not np.shares_memory(get_array(obj2, "a"), get_array(obj, "a")) + + # mutating obj2 triggers a copy-on-write for that column / block + obj2.iloc[0] = 0 + assert not np.shares_memory(get_array(obj2, "a"), get_array(obj, "a")) + tm.assert_equal(obj, obj_orig) + + +def test_set_index(using_copy_on_write): + # GH 49473 + df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [0.1, 0.2, 0.3]}) + df_orig = df.copy() + df2 = df.set_index("a") + + if using_copy_on_write: + assert np.shares_memory(get_array(df2, "b"), get_array(df, "b")) + else: + assert not np.shares_memory(get_array(df2, "b"), get_array(df, "b")) + + # mutating df2 triggers a copy-on-write for that column / block + df2.iloc[0, 1] = 0 + assert not np.shares_memory(get_array(df2, "c"), get_array(df, "c")) + tm.assert_frame_equal(df, df_orig) + + +def test_set_index_mutating_parent_does_not_mutate_index(): + df = DataFrame({"a": [1, 2, 3], "b": 1}) + result = df.set_index("a") + expected = result.copy() + + df.iloc[0, 0] = 100 + tm.assert_frame_equal(result, expected) + + +def test_add_prefix(using_copy_on_write): + # GH 49473 + df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [0.1, 0.2, 0.3]}) + df_orig = df.copy() + df2 = df.add_prefix("CoW_") + + if using_copy_on_write: + assert np.shares_memory(get_array(df2, "CoW_a"), get_array(df, "a")) + df2.iloc[0, 0] = 0 + + assert not np.shares_memory(get_array(df2, "CoW_a"), get_array(df, "a")) + + if using_copy_on_write: + assert np.shares_memory(get_array(df2, "CoW_c"), get_array(df, "c")) + expected = DataFrame( + {"CoW_a": [0, 2, 3], "CoW_b": [4, 5, 6], "CoW_c": [0.1, 0.2, 0.3]} + ) + tm.assert_frame_equal(df2, expected) + tm.assert_frame_equal(df, df_orig) + + +def test_add_suffix(using_copy_on_write): + # GH 49473 + df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [0.1, 0.2, 0.3]}) + df_orig = df.copy() + df2 = df.add_suffix("_CoW") + if using_copy_on_write: + assert np.shares_memory(get_array(df2, "a_CoW"), get_array(df, "a")) + df2.iloc[0, 0] = 0 + assert not np.shares_memory(get_array(df2, "a_CoW"), get_array(df, "a")) + if using_copy_on_write: + assert np.shares_memory(get_array(df2, "c_CoW"), get_array(df, "c")) + expected = DataFrame( + {"a_CoW": [0, 2, 3], "b_CoW": [4, 5, 6], "c_CoW": [0.1, 0.2, 0.3]} + ) + tm.assert_frame_equal(df2, expected) + tm.assert_frame_equal(df, df_orig) + + +@pytest.mark.parametrize("axis, val", [(0, 5.5), (1, np.nan)]) +def test_dropna(using_copy_on_write, axis, val): + df = DataFrame({"a": [1, 2, 3], "b": [4, val, 6], "c": "d"}) + df_orig = df.copy() + df2 = df.dropna(axis=axis) + + if using_copy_on_write: + assert np.shares_memory(get_array(df2, "a"), get_array(df, "a")) + else: + assert not np.shares_memory(get_array(df2, "a"), get_array(df, "a")) + + df2.iloc[0, 0] = 0 + if using_copy_on_write: + assert not np.shares_memory(get_array(df2, "a"), get_array(df, "a")) + tm.assert_frame_equal(df, df_orig) + + +@pytest.mark.parametrize("val", [5, 5.5]) +def test_dropna_series(using_copy_on_write, val): + ser = Series([1, val, 4]) + ser_orig = ser.copy() + ser2 = ser.dropna() + + if using_copy_on_write: + assert np.shares_memory(ser2.values, ser.values) + else: + assert not np.shares_memory(ser2.values, ser.values) + + ser2.iloc[0] = 0 + if using_copy_on_write: + assert not np.shares_memory(ser2.values, ser.values) + tm.assert_series_equal(ser, ser_orig) + + +@pytest.mark.parametrize( + "method", + [ + lambda df: df.head(), + lambda df: df.head(2), + lambda df: df.tail(), + lambda df: df.tail(3), + ], +) +def test_head_tail(method, using_copy_on_write, warn_copy_on_write): + df = DataFrame({"a": [1, 2, 3], "b": [0.1, 0.2, 0.3]}) + df_orig = df.copy() + df2 = method(df) + df2._mgr._verify_integrity() + + if using_copy_on_write: + # We are explicitly deviating for CoW here to make an eager copy (avoids + # tracking references for very cheap ops) + assert not np.shares_memory(get_array(df2, "a"), get_array(df, "a")) + assert not np.shares_memory(get_array(df2, "b"), get_array(df, "b")) + + # modify df2 to trigger CoW for that block + with tm.assert_cow_warning(warn_copy_on_write): + df2.iloc[0, 0] = 0 + if using_copy_on_write: + assert not np.shares_memory(get_array(df2, "b"), get_array(df, "b")) + assert not np.shares_memory(get_array(df2, "a"), get_array(df, "a")) + else: + # without CoW enabled, head and tail return views. Mutating df2 also mutates df. + assert np.shares_memory(get_array(df2, "b"), get_array(df, "b")) + with tm.assert_cow_warning(warn_copy_on_write): + df2.iloc[0, 0] = 1 + tm.assert_frame_equal(df, df_orig) + + +def test_infer_objects(using_copy_on_write): + df = DataFrame({"a": [1, 2], "b": "c", "c": 1, "d": "x"}) + df_orig = df.copy() + df2 = df.infer_objects() + + if using_copy_on_write: + assert np.shares_memory(get_array(df2, "a"), get_array(df, "a")) + assert np.shares_memory(get_array(df2, "b"), get_array(df, "b")) + + else: + assert not np.shares_memory(get_array(df2, "a"), get_array(df, "a")) + assert not np.shares_memory(get_array(df2, "b"), get_array(df, "b")) + + df2.iloc[0, 0] = 0 + df2.iloc[0, 1] = "d" + if using_copy_on_write: + assert not np.shares_memory(get_array(df2, "a"), get_array(df, "a")) + assert not np.shares_memory(get_array(df2, "b"), get_array(df, "b")) + tm.assert_frame_equal(df, df_orig) + + +def test_infer_objects_no_reference(using_copy_on_write): + df = DataFrame( + { + "a": [1, 2], + "b": "c", + "c": 1, + "d": Series( + [Timestamp("2019-12-31"), Timestamp("2020-12-31")], dtype="object" + ), + "e": "b", + } + ) + df = df.infer_objects() + + arr_a = get_array(df, "a") + arr_b = get_array(df, "b") + arr_d = get_array(df, "d") + + df.iloc[0, 0] = 0 + df.iloc[0, 1] = "d" + df.iloc[0, 3] = Timestamp("2018-12-31") + if using_copy_on_write: + assert np.shares_memory(arr_a, get_array(df, "a")) + # TODO(CoW): Block splitting causes references here + assert not np.shares_memory(arr_b, get_array(df, "b")) + assert np.shares_memory(arr_d, get_array(df, "d")) + + +def test_infer_objects_reference(using_copy_on_write): + df = DataFrame( + { + "a": [1, 2], + "b": "c", + "c": 1, + "d": Series( + [Timestamp("2019-12-31"), Timestamp("2020-12-31")], dtype="object" + ), + } + ) + view = df[:] # noqa: F841 + df = df.infer_objects() + + arr_a = get_array(df, "a") + arr_b = get_array(df, "b") + arr_d = get_array(df, "d") + + df.iloc[0, 0] = 0 + df.iloc[0, 1] = "d" + df.iloc[0, 3] = Timestamp("2018-12-31") + if using_copy_on_write: + assert not np.shares_memory(arr_a, get_array(df, "a")) + assert not np.shares_memory(arr_b, get_array(df, "b")) + assert np.shares_memory(arr_d, get_array(df, "d")) + + +@pytest.mark.parametrize( + "kwargs", + [ + {"before": "a", "after": "b", "axis": 1}, + {"before": 0, "after": 1, "axis": 0}, + ], +) +def test_truncate(using_copy_on_write, kwargs): + df = DataFrame({"a": [1, 2, 3], "b": 1, "c": 2}) + df_orig = df.copy() + df2 = df.truncate(**kwargs) + df2._mgr._verify_integrity() + + if using_copy_on_write: + assert np.shares_memory(get_array(df2, "a"), get_array(df, "a")) + else: + assert not np.shares_memory(get_array(df2, "a"), get_array(df, "a")) + + df2.iloc[0, 0] = 0 + if using_copy_on_write: + assert not np.shares_memory(get_array(df2, "a"), get_array(df, "a")) + tm.assert_frame_equal(df, df_orig) + + +@pytest.mark.parametrize("method", ["assign", "drop_duplicates"]) +def test_assign_drop_duplicates(using_copy_on_write, method): + df = DataFrame({"a": [1, 2, 3]}) + df_orig = df.copy() + df2 = getattr(df, method)() + df2._mgr._verify_integrity() + + if using_copy_on_write: + assert np.shares_memory(get_array(df2, "a"), get_array(df, "a")) + else: + assert not np.shares_memory(get_array(df2, "a"), get_array(df, "a")) + + df2.iloc[0, 0] = 0 + if using_copy_on_write: + assert not np.shares_memory(get_array(df2, "a"), get_array(df, "a")) + tm.assert_frame_equal(df, df_orig) + + +@pytest.mark.parametrize("obj", [Series([1, 2]), DataFrame({"a": [1, 2]})]) +def test_take(using_copy_on_write, obj): + # Check that no copy is made when we take all rows in original order + obj_orig = obj.copy() + obj2 = obj.take([0, 1]) + + if using_copy_on_write: + assert np.shares_memory(obj2.values, obj.values) + else: + assert not np.shares_memory(obj2.values, obj.values) + + obj2.iloc[0] = 0 + if using_copy_on_write: + assert not np.shares_memory(obj2.values, obj.values) + tm.assert_equal(obj, obj_orig) + + +@pytest.mark.parametrize("obj", [Series([1, 2]), DataFrame({"a": [1, 2]})]) +def test_between_time(using_copy_on_write, obj): + obj.index = date_range("2018-04-09", periods=2, freq="1D20min") + obj_orig = obj.copy() + obj2 = obj.between_time("0:00", "1:00") + + if using_copy_on_write: + assert np.shares_memory(obj2.values, obj.values) + else: + assert not np.shares_memory(obj2.values, obj.values) + + obj2.iloc[0] = 0 + if using_copy_on_write: + assert not np.shares_memory(obj2.values, obj.values) + tm.assert_equal(obj, obj_orig) + + +def test_reindex_like(using_copy_on_write): + df = DataFrame({"a": [1, 2], "b": "a"}) + other = DataFrame({"b": "a", "a": [1, 2]}) + + df_orig = df.copy() + df2 = df.reindex_like(other) + + if using_copy_on_write: + assert np.shares_memory(get_array(df2, "a"), get_array(df, "a")) + else: + assert not np.shares_memory(get_array(df2, "a"), get_array(df, "a")) + + df2.iloc[0, 1] = 0 + if using_copy_on_write: + assert not np.shares_memory(get_array(df2, "a"), get_array(df, "a")) + tm.assert_frame_equal(df, df_orig) + + +def test_sort_index(using_copy_on_write): + # GH 49473 + ser = Series([1, 2, 3]) + ser_orig = ser.copy() + ser2 = ser.sort_index() + + if using_copy_on_write: + assert np.shares_memory(ser.values, ser2.values) + else: + assert not np.shares_memory(ser.values, ser2.values) + + # mutating ser triggers a copy-on-write for the column / block + ser2.iloc[0] = 0 + assert not np.shares_memory(ser2.values, ser.values) + tm.assert_series_equal(ser, ser_orig) + + +@pytest.mark.parametrize( + "obj, kwargs", + [(Series([1, 2, 3], name="a"), {}), (DataFrame({"a": [1, 2, 3]}), {"by": "a"})], +) +def test_sort_values(using_copy_on_write, obj, kwargs): + obj_orig = obj.copy() + obj2 = obj.sort_values(**kwargs) + + if using_copy_on_write: + assert np.shares_memory(get_array(obj2, "a"), get_array(obj, "a")) + else: + assert not np.shares_memory(get_array(obj2, "a"), get_array(obj, "a")) + + # mutating df triggers a copy-on-write for the column / block + obj2.iloc[0] = 0 + assert not np.shares_memory(get_array(obj2, "a"), get_array(obj, "a")) + tm.assert_equal(obj, obj_orig) + + +@pytest.mark.parametrize( + "obj, kwargs", + [(Series([1, 2, 3], name="a"), {}), (DataFrame({"a": [1, 2, 3]}), {"by": "a"})], +) +def test_sort_values_inplace(using_copy_on_write, obj, kwargs, warn_copy_on_write): + obj_orig = obj.copy() + view = obj[:] + obj.sort_values(inplace=True, **kwargs) + + assert np.shares_memory(get_array(obj, "a"), get_array(view, "a")) + + # mutating obj triggers a copy-on-write for the column / block + with tm.assert_cow_warning(warn_copy_on_write): + obj.iloc[0] = 0 + if using_copy_on_write: + assert not np.shares_memory(get_array(obj, "a"), get_array(view, "a")) + tm.assert_equal(view, obj_orig) + else: + assert np.shares_memory(get_array(obj, "a"), get_array(view, "a")) + + +@pytest.mark.parametrize("decimals", [-1, 0, 1]) +def test_round(using_copy_on_write, warn_copy_on_write, decimals): + df = DataFrame({"a": [1, 2], "b": "c"}) + df_orig = df.copy() + df2 = df.round(decimals=decimals) + + if using_copy_on_write: + assert np.shares_memory(get_array(df2, "b"), get_array(df, "b")) + # TODO: Make inplace by using out parameter of ndarray.round? + if decimals >= 0: + # Ensure lazy copy if no-op + assert np.shares_memory(get_array(df2, "a"), get_array(df, "a")) + else: + assert not np.shares_memory(get_array(df2, "a"), get_array(df, "a")) + else: + assert not np.shares_memory(get_array(df2, "b"), get_array(df, "b")) + assert not np.shares_memory(get_array(df2, "a"), get_array(df, "a")) + + df2.iloc[0, 1] = "d" + df2.iloc[0, 0] = 4 + if using_copy_on_write: + assert not np.shares_memory(get_array(df2, "b"), get_array(df, "b")) + assert not np.shares_memory(get_array(df2, "a"), get_array(df, "a")) + tm.assert_frame_equal(df, df_orig) + + +def test_reorder_levels(using_copy_on_write): + index = MultiIndex.from_tuples( + [(1, 1), (1, 2), (2, 1), (2, 2)], names=["one", "two"] + ) + df = DataFrame({"a": [1, 2, 3, 4]}, index=index) + df_orig = df.copy() + df2 = df.reorder_levels(order=["two", "one"]) + + if using_copy_on_write: + assert np.shares_memory(get_array(df2, "a"), get_array(df, "a")) + else: + assert not np.shares_memory(get_array(df2, "a"), get_array(df, "a")) + + df2.iloc[0, 0] = 0 + if using_copy_on_write: + assert not np.shares_memory(get_array(df2, "a"), get_array(df, "a")) + tm.assert_frame_equal(df, df_orig) + + +def test_series_reorder_levels(using_copy_on_write): + index = MultiIndex.from_tuples( + [(1, 1), (1, 2), (2, 1), (2, 2)], names=["one", "two"] + ) + ser = Series([1, 2, 3, 4], index=index) + ser_orig = ser.copy() + ser2 = ser.reorder_levels(order=["two", "one"]) + + if using_copy_on_write: + assert np.shares_memory(ser2.values, ser.values) + else: + assert not np.shares_memory(ser2.values, ser.values) + + ser2.iloc[0] = 0 + if using_copy_on_write: + assert not np.shares_memory(ser2.values, ser.values) + tm.assert_series_equal(ser, ser_orig) + + +@pytest.mark.parametrize("obj", [Series([1, 2, 3]), DataFrame({"a": [1, 2, 3]})]) +def test_swaplevel(using_copy_on_write, obj): + index = MultiIndex.from_tuples([(1, 1), (1, 2), (2, 1)], names=["one", "two"]) + obj.index = index + obj_orig = obj.copy() + obj2 = obj.swaplevel() + + if using_copy_on_write: + assert np.shares_memory(obj2.values, obj.values) + else: + assert not np.shares_memory(obj2.values, obj.values) + + obj2.iloc[0] = 0 + if using_copy_on_write: + assert not np.shares_memory(obj2.values, obj.values) + tm.assert_equal(obj, obj_orig) + + +def test_frame_set_axis(using_copy_on_write): + # GH 49473 + df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [0.1, 0.2, 0.3]}) + df_orig = df.copy() + df2 = df.set_axis(["a", "b", "c"], axis="index") + + if using_copy_on_write: + assert np.shares_memory(get_array(df2, "a"), get_array(df, "a")) + else: + assert not np.shares_memory(get_array(df2, "a"), get_array(df, "a")) + + # mutating df2 triggers a copy-on-write for that column / block + df2.iloc[0, 0] = 0 + assert not np.shares_memory(get_array(df2, "a"), get_array(df, "a")) + tm.assert_frame_equal(df, df_orig) + + +def test_series_set_axis(using_copy_on_write): + # GH 49473 + ser = Series([1, 2, 3]) + ser_orig = ser.copy() + ser2 = ser.set_axis(["a", "b", "c"], axis="index") + + if using_copy_on_write: + assert np.shares_memory(ser, ser2) + else: + assert not np.shares_memory(ser, ser2) + + # mutating ser triggers a copy-on-write for the column / block + ser2.iloc[0] = 0 + assert not np.shares_memory(ser2, ser) + tm.assert_series_equal(ser, ser_orig) + + +def test_set_flags(using_copy_on_write, warn_copy_on_write): + ser = Series([1, 2, 3]) + ser_orig = ser.copy() + ser2 = ser.set_flags(allows_duplicate_labels=False) + + assert np.shares_memory(ser, ser2) + + # mutating ser triggers a copy-on-write for the column / block + with tm.assert_cow_warning(warn_copy_on_write): + ser2.iloc[0] = 0 + if using_copy_on_write: + assert not np.shares_memory(ser2, ser) + tm.assert_series_equal(ser, ser_orig) + else: + assert np.shares_memory(ser2, ser) + expected = Series([0, 2, 3]) + tm.assert_series_equal(ser, expected) + + +@pytest.mark.parametrize("kwargs", [{"mapper": "test"}, {"index": "test"}]) +def test_rename_axis(using_copy_on_write, kwargs): + df = DataFrame({"a": [1, 2, 3, 4]}, index=Index([1, 2, 3, 4], name="a")) + df_orig = df.copy() + df2 = df.rename_axis(**kwargs) + + if using_copy_on_write: + assert np.shares_memory(get_array(df2, "a"), get_array(df, "a")) + else: + assert not np.shares_memory(get_array(df2, "a"), get_array(df, "a")) + + df2.iloc[0, 0] = 0 + if using_copy_on_write: + assert not np.shares_memory(get_array(df2, "a"), get_array(df, "a")) + tm.assert_frame_equal(df, df_orig) + + +@pytest.mark.parametrize( + "func, tz", [("tz_convert", "Europe/Berlin"), ("tz_localize", None)] +) +def test_tz_convert_localize(using_copy_on_write, func, tz): + # GH 49473 + ser = Series( + [1, 2], index=date_range(start="2014-08-01 09:00", freq="h", periods=2, tz=tz) + ) + ser_orig = ser.copy() + ser2 = getattr(ser, func)("US/Central") + + if using_copy_on_write: + assert np.shares_memory(ser.values, ser2.values) + else: + assert not np.shares_memory(ser.values, ser2.values) + + # mutating ser triggers a copy-on-write for the column / block + ser2.iloc[0] = 0 + assert not np.shares_memory(ser2.values, ser.values) + tm.assert_series_equal(ser, ser_orig) + + +def test_droplevel(using_copy_on_write): + # GH 49473 + index = MultiIndex.from_tuples([(1, 1), (1, 2), (2, 1)], names=["one", "two"]) + df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [7, 8, 9]}, index=index) + df_orig = df.copy() + df2 = df.droplevel(0) + + if using_copy_on_write: + assert np.shares_memory(get_array(df2, "c"), get_array(df, "c")) + assert np.shares_memory(get_array(df2, "a"), get_array(df, "a")) + else: + assert not np.shares_memory(get_array(df2, "c"), get_array(df, "c")) + assert not np.shares_memory(get_array(df2, "a"), get_array(df, "a")) + + # mutating df2 triggers a copy-on-write for that column / block + df2.iloc[0, 0] = 0 + + assert not np.shares_memory(get_array(df2, "a"), get_array(df, "a")) + if using_copy_on_write: + assert np.shares_memory(get_array(df2, "b"), get_array(df, "b")) + + tm.assert_frame_equal(df, df_orig) + + +def test_squeeze(using_copy_on_write, warn_copy_on_write): + df = DataFrame({"a": [1, 2, 3]}) + df_orig = df.copy() + series = df.squeeze() + + # Should share memory regardless of CoW since squeeze is just an iloc + assert np.shares_memory(series.values, get_array(df, "a")) + + # mutating squeezed df triggers a copy-on-write for that column/block + with tm.assert_cow_warning(warn_copy_on_write): + series.iloc[0] = 0 + if using_copy_on_write: + assert not np.shares_memory(series.values, get_array(df, "a")) + tm.assert_frame_equal(df, df_orig) + else: + # Without CoW the original will be modified + assert np.shares_memory(series.values, get_array(df, "a")) + assert df.loc[0, "a"] == 0 + + +def test_items(using_copy_on_write, warn_copy_on_write): + df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [7, 8, 9]}) + df_orig = df.copy() + + # Test this twice, since the second time, the item cache will be + # triggered, and we want to make sure it still works then. + for i in range(2): + for name, ser in df.items(): + assert np.shares_memory(get_array(ser, name), get_array(df, name)) + + # mutating df triggers a copy-on-write for that column / block + with tm.assert_cow_warning(warn_copy_on_write): + ser.iloc[0] = 0 + + if using_copy_on_write: + assert not np.shares_memory(get_array(ser, name), get_array(df, name)) + tm.assert_frame_equal(df, df_orig) + else: + # Original frame will be modified + assert df.loc[0, name] == 0 + + +@pytest.mark.parametrize("dtype", ["int64", "Int64"]) +def test_putmask(using_copy_on_write, dtype, warn_copy_on_write): + df = DataFrame({"a": [1, 2], "b": 1, "c": 2}, dtype=dtype) + view = df[:] + df_orig = df.copy() + with tm.assert_cow_warning(warn_copy_on_write): + df[df == df] = 5 + + if using_copy_on_write: + assert not np.shares_memory(get_array(view, "a"), get_array(df, "a")) + tm.assert_frame_equal(view, df_orig) + else: + # Without CoW the original will be modified + assert np.shares_memory(get_array(view, "a"), get_array(df, "a")) + assert view.iloc[0, 0] == 5 + + +@pytest.mark.parametrize("dtype", ["int64", "Int64"]) +def test_putmask_no_reference(using_copy_on_write, dtype): + df = DataFrame({"a": [1, 2], "b": 1, "c": 2}, dtype=dtype) + arr_a = get_array(df, "a") + df[df == df] = 5 + + if using_copy_on_write: + assert np.shares_memory(arr_a, get_array(df, "a")) + + +@pytest.mark.parametrize("dtype", ["float64", "Float64"]) +def test_putmask_aligns_rhs_no_reference(using_copy_on_write, dtype): + df = DataFrame({"a": [1.5, 2], "b": 1.5}, dtype=dtype) + arr_a = get_array(df, "a") + df[df == df] = DataFrame({"a": [5.5, 5]}) + + if using_copy_on_write: + assert np.shares_memory(arr_a, get_array(df, "a")) + + +@pytest.mark.parametrize( + "val, exp, warn", [(5.5, True, FutureWarning), (5, False, None)] +) +def test_putmask_dont_copy_some_blocks( + using_copy_on_write, val, exp, warn, warn_copy_on_write +): + df = DataFrame({"a": [1, 2], "b": 1, "c": 1.5}) + view = df[:] + df_orig = df.copy() + indexer = DataFrame( + [[True, False, False], [True, False, False]], columns=list("abc") + ) + if warn_copy_on_write: + with tm.assert_cow_warning(): + df[indexer] = val + else: + with tm.assert_produces_warning(warn, match="incompatible dtype"): + df[indexer] = val + + if using_copy_on_write: + assert not np.shares_memory(get_array(view, "a"), get_array(df, "a")) + # TODO(CoW): Could split blocks to avoid copying the whole block + assert np.shares_memory(get_array(view, "b"), get_array(df, "b")) is exp + assert np.shares_memory(get_array(view, "c"), get_array(df, "c")) + assert df._mgr._has_no_reference(1) is not exp + assert not df._mgr._has_no_reference(2) + tm.assert_frame_equal(view, df_orig) + elif val == 5: + # Without CoW the original will be modified, the other case upcasts, e.g. copy + assert np.shares_memory(get_array(view, "a"), get_array(df, "a")) + assert np.shares_memory(get_array(view, "c"), get_array(df, "c")) + assert view.iloc[0, 0] == 5 + + +@pytest.mark.parametrize("dtype", ["int64", "Int64"]) +@pytest.mark.parametrize( + "func", + [ + lambda ser: ser.where(ser > 0, 10), + lambda ser: ser.mask(ser <= 0, 10), + ], +) +def test_where_mask_noop(using_copy_on_write, dtype, func): + ser = Series([1, 2, 3], dtype=dtype) + ser_orig = ser.copy() + + result = func(ser) + + if using_copy_on_write: + assert np.shares_memory(get_array(ser), get_array(result)) + else: + assert not np.shares_memory(get_array(ser), get_array(result)) + + result.iloc[0] = 10 + if using_copy_on_write: + assert not np.shares_memory(get_array(ser), get_array(result)) + tm.assert_series_equal(ser, ser_orig) + + +@pytest.mark.parametrize("dtype", ["int64", "Int64"]) +@pytest.mark.parametrize( + "func", + [ + lambda ser: ser.where(ser < 0, 10), + lambda ser: ser.mask(ser >= 0, 10), + ], +) +def test_where_mask(using_copy_on_write, dtype, func): + ser = Series([1, 2, 3], dtype=dtype) + ser_orig = ser.copy() + + result = func(ser) + + assert not np.shares_memory(get_array(ser), get_array(result)) + tm.assert_series_equal(ser, ser_orig) + + +@pytest.mark.parametrize("dtype, val", [("int64", 10.5), ("Int64", 10)]) +@pytest.mark.parametrize( + "func", + [ + lambda df, val: df.where(df < 0, val), + lambda df, val: df.mask(df >= 0, val), + ], +) +def test_where_mask_noop_on_single_column(using_copy_on_write, dtype, val, func): + df = DataFrame({"a": [1, 2, 3], "b": [-4, -5, -6]}, dtype=dtype) + df_orig = df.copy() + + result = func(df, val) + + if using_copy_on_write: + assert np.shares_memory(get_array(df, "b"), get_array(result, "b")) + assert not np.shares_memory(get_array(df, "a"), get_array(result, "a")) + else: + assert not np.shares_memory(get_array(df, "b"), get_array(result, "b")) + + result.iloc[0, 1] = 10 + if using_copy_on_write: + assert not np.shares_memory(get_array(df, "b"), get_array(result, "b")) + tm.assert_frame_equal(df, df_orig) + + +@pytest.mark.parametrize("func", ["mask", "where"]) +def test_chained_where_mask(using_copy_on_write, func): + df = DataFrame({"a": [1, 4, 2], "b": 1}) + df_orig = df.copy() + if using_copy_on_write: + with tm.raises_chained_assignment_error(): + getattr(df["a"], func)(df["a"] > 2, 5, inplace=True) + tm.assert_frame_equal(df, df_orig) + + with tm.raises_chained_assignment_error(): + getattr(df[["a"]], func)(df["a"] > 2, 5, inplace=True) + tm.assert_frame_equal(df, df_orig) + else: + with tm.assert_produces_warning(FutureWarning, match="inplace method"): + getattr(df["a"], func)(df["a"] > 2, 5, inplace=True) + + with tm.assert_produces_warning(None): + with option_context("mode.chained_assignment", None): + getattr(df[["a"]], func)(df["a"] > 2, 5, inplace=True) + + with tm.assert_produces_warning(None): + with option_context("mode.chained_assignment", None): + getattr(df[df["a"] > 1], func)(df["a"] > 2, 5, inplace=True) + + +def test_asfreq_noop(using_copy_on_write): + df = DataFrame( + {"a": [0.0, None, 2.0, 3.0]}, + index=date_range("1/1/2000", periods=4, freq="min"), + ) + df_orig = df.copy() + df2 = df.asfreq(freq="min") + + if using_copy_on_write: + assert np.shares_memory(get_array(df2, "a"), get_array(df, "a")) + else: + assert not np.shares_memory(get_array(df2, "a"), get_array(df, "a")) + + # mutating df2 triggers a copy-on-write for that column / block + df2.iloc[0, 0] = 0 + + assert not np.shares_memory(get_array(df2, "a"), get_array(df, "a")) + tm.assert_frame_equal(df, df_orig) + + +def test_iterrows(using_copy_on_write): + df = DataFrame({"a": 0, "b": 1}, index=[1, 2, 3]) + df_orig = df.copy() + + for _, sub in df.iterrows(): + sub.iloc[0] = 100 + if using_copy_on_write: + tm.assert_frame_equal(df, df_orig) + + +def test_interpolate_creates_copy(using_copy_on_write, warn_copy_on_write): + # GH#51126 + df = DataFrame({"a": [1.5, np.nan, 3]}) + view = df[:] + expected = df.copy() + + with tm.assert_cow_warning(warn_copy_on_write): + df.ffill(inplace=True) + with tm.assert_cow_warning(warn_copy_on_write): + df.iloc[0, 0] = 100.5 + + if using_copy_on_write: + tm.assert_frame_equal(view, expected) + else: + expected = DataFrame({"a": [100.5, 1.5, 3]}) + tm.assert_frame_equal(view, expected) + + +def test_isetitem(using_copy_on_write): + df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [7, 8, 9]}) + df_orig = df.copy() + df2 = df.copy(deep=None) # Trigger a CoW + df2.isetitem(1, np.array([-1, -2, -3])) # This is inplace + + if using_copy_on_write: + assert np.shares_memory(get_array(df, "c"), get_array(df2, "c")) + assert np.shares_memory(get_array(df, "a"), get_array(df2, "a")) + else: + assert not np.shares_memory(get_array(df, "c"), get_array(df2, "c")) + assert not np.shares_memory(get_array(df, "a"), get_array(df2, "a")) + + df2.loc[0, "a"] = 0 + tm.assert_frame_equal(df, df_orig) # Original is unchanged + + if using_copy_on_write: + assert np.shares_memory(get_array(df, "c"), get_array(df2, "c")) + else: + assert not np.shares_memory(get_array(df, "c"), get_array(df2, "c")) + + +@pytest.mark.parametrize( + "dtype", ["int64", "float64"], ids=["single-block", "mixed-block"] +) +def test_isetitem_series(using_copy_on_write, dtype): + df = DataFrame({"a": [1, 2, 3], "b": np.array([4, 5, 6], dtype=dtype)}) + ser = Series([7, 8, 9]) + ser_orig = ser.copy() + df.isetitem(0, ser) + + if using_copy_on_write: + assert np.shares_memory(get_array(df, "a"), get_array(ser)) + assert not df._mgr._has_no_reference(0) + + # mutating dataframe doesn't update series + df.loc[0, "a"] = 0 + tm.assert_series_equal(ser, ser_orig) + + # mutating series doesn't update dataframe + df = DataFrame({"a": [1, 2, 3], "b": np.array([4, 5, 6], dtype=dtype)}) + ser = Series([7, 8, 9]) + df.isetitem(0, ser) + + ser.loc[0] = 0 + expected = DataFrame({"a": [7, 8, 9], "b": np.array([4, 5, 6], dtype=dtype)}) + tm.assert_frame_equal(df, expected) + + +def test_isetitem_frame(using_copy_on_write): + df = DataFrame({"a": [1, 2, 3], "b": 1, "c": 2}) + rhs = DataFrame({"a": [4, 5, 6], "b": 2}) + df.isetitem([0, 1], rhs) + if using_copy_on_write: + assert np.shares_memory(get_array(df, "a"), get_array(rhs, "a")) + assert np.shares_memory(get_array(df, "b"), get_array(rhs, "b")) + assert not df._mgr._has_no_reference(0) + else: + assert not np.shares_memory(get_array(df, "a"), get_array(rhs, "a")) + assert not np.shares_memory(get_array(df, "b"), get_array(rhs, "b")) + expected = df.copy() + rhs.iloc[0, 0] = 100 + rhs.iloc[0, 1] = 100 + tm.assert_frame_equal(df, expected) + + +@pytest.mark.parametrize("key", ["a", ["a"]]) +def test_get(using_copy_on_write, warn_copy_on_write, key): + df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) + df_orig = df.copy() + + result = df.get(key) + + if using_copy_on_write: + assert np.shares_memory(get_array(result, "a"), get_array(df, "a")) + result.iloc[0] = 0 + assert not np.shares_memory(get_array(result, "a"), get_array(df, "a")) + tm.assert_frame_equal(df, df_orig) + else: + # for non-CoW it depends on whether we got a Series or DataFrame if it + # is a view or copy or triggers a warning or not + if warn_copy_on_write: + warn = FutureWarning if isinstance(key, str) else None + else: + warn = SettingWithCopyWarning if isinstance(key, list) else None + with option_context("chained_assignment", "warn"): + with tm.assert_produces_warning(warn): + result.iloc[0] = 0 + + if isinstance(key, list): + tm.assert_frame_equal(df, df_orig) + else: + assert df.iloc[0, 0] == 0 + + +@pytest.mark.parametrize("axis, key", [(0, 0), (1, "a")]) +@pytest.mark.parametrize( + "dtype", ["int64", "float64"], ids=["single-block", "mixed-block"] +) +def test_xs( + using_copy_on_write, warn_copy_on_write, using_array_manager, axis, key, dtype +): + single_block = (dtype == "int64") and not using_array_manager + is_view = single_block or (using_array_manager and axis == 1) + df = DataFrame( + {"a": [1, 2, 3], "b": [4, 5, 6], "c": np.array([7, 8, 9], dtype=dtype)} + ) + df_orig = df.copy() + + result = df.xs(key, axis=axis) + + if axis == 1 or single_block: + assert np.shares_memory(get_array(df, "a"), get_array(result)) + elif using_copy_on_write: + assert result._mgr._has_no_reference(0) + + if using_copy_on_write or (is_view and not warn_copy_on_write): + result.iloc[0] = 0 + elif warn_copy_on_write: + with tm.assert_cow_warning(single_block or axis == 1): + result.iloc[0] = 0 + else: + with option_context("chained_assignment", "warn"): + with tm.assert_produces_warning(SettingWithCopyWarning): + result.iloc[0] = 0 + + if using_copy_on_write or (not single_block and axis == 0): + tm.assert_frame_equal(df, df_orig) + else: + assert df.iloc[0, 0] == 0 + + +@pytest.mark.parametrize("axis", [0, 1]) +@pytest.mark.parametrize("key, level", [("l1", 0), (2, 1)]) +def test_xs_multiindex( + using_copy_on_write, warn_copy_on_write, using_array_manager, key, level, axis +): + arr = np.arange(18).reshape(6, 3) + index = MultiIndex.from_product([["l1", "l2"], [1, 2, 3]], names=["lev1", "lev2"]) + df = DataFrame(arr, index=index, columns=list("abc")) + if axis == 1: + df = df.transpose().copy() + df_orig = df.copy() + + result = df.xs(key, level=level, axis=axis) + + if level == 0: + assert np.shares_memory( + get_array(df, df.columns[0]), get_array(result, result.columns[0]) + ) + + if warn_copy_on_write: + warn = FutureWarning if level == 0 else None + elif not using_copy_on_write and not using_array_manager: + warn = SettingWithCopyWarning + else: + warn = None + with option_context("chained_assignment", "warn"): + with tm.assert_produces_warning(warn): + result.iloc[0, 0] = 0 + + tm.assert_frame_equal(df, df_orig) + + +def test_update_frame(using_copy_on_write, warn_copy_on_write): + df1 = DataFrame({"a": [1.0, 2.0, 3.0], "b": [4.0, 5.0, 6.0]}) + df2 = DataFrame({"b": [100.0]}, index=[1]) + df1_orig = df1.copy() + view = df1[:] + + # TODO(CoW) better warning message? + with tm.assert_cow_warning(warn_copy_on_write): + df1.update(df2) + + expected = DataFrame({"a": [1.0, 2.0, 3.0], "b": [4.0, 100.0, 6.0]}) + tm.assert_frame_equal(df1, expected) + if using_copy_on_write: + # df1 is updated, but its view not + tm.assert_frame_equal(view, df1_orig) + assert np.shares_memory(get_array(df1, "a"), get_array(view, "a")) + assert not np.shares_memory(get_array(df1, "b"), get_array(view, "b")) + else: + tm.assert_frame_equal(view, expected) + + +def test_update_series(using_copy_on_write, warn_copy_on_write): + ser1 = Series([1.0, 2.0, 3.0]) + ser2 = Series([100.0], index=[1]) + ser1_orig = ser1.copy() + view = ser1[:] + + if warn_copy_on_write: + with tm.assert_cow_warning(): + ser1.update(ser2) + else: + ser1.update(ser2) + + expected = Series([1.0, 100.0, 3.0]) + tm.assert_series_equal(ser1, expected) + if using_copy_on_write: + # ser1 is updated, but its view not + tm.assert_series_equal(view, ser1_orig) + else: + tm.assert_series_equal(view, expected) + + +def test_update_chained_assignment(using_copy_on_write): + df = DataFrame({"a": [1, 2, 3]}) + ser2 = Series([100.0], index=[1]) + df_orig = df.copy() + if using_copy_on_write: + with tm.raises_chained_assignment_error(): + df["a"].update(ser2) + tm.assert_frame_equal(df, df_orig) + + with tm.raises_chained_assignment_error(): + df[["a"]].update(ser2.to_frame()) + tm.assert_frame_equal(df, df_orig) + else: + with tm.assert_produces_warning(FutureWarning, match="inplace method"): + df["a"].update(ser2) + + with tm.assert_produces_warning(None): + with option_context("mode.chained_assignment", None): + df[["a"]].update(ser2.to_frame()) + + with tm.assert_produces_warning(None): + with option_context("mode.chained_assignment", None): + df[df["a"] > 1].update(ser2.to_frame()) + + +def test_inplace_arithmetic_series(using_copy_on_write): + ser = Series([1, 2, 3]) + ser_orig = ser.copy() + data = get_array(ser) + ser *= 2 + if using_copy_on_write: + # https://github.com/pandas-dev/pandas/pull/55745 + # changed to NOT update inplace because there is no benefit (actual + # operation already done non-inplace). This was only for the optics + # of updating the backing array inplace, but we no longer want to make + # that guarantee + assert not np.shares_memory(get_array(ser), data) + tm.assert_numpy_array_equal(data, get_array(ser_orig)) + else: + assert np.shares_memory(get_array(ser), data) + tm.assert_numpy_array_equal(data, get_array(ser)) + + +def test_inplace_arithmetic_series_with_reference( + using_copy_on_write, warn_copy_on_write +): + ser = Series([1, 2, 3]) + ser_orig = ser.copy() + view = ser[:] + with tm.assert_cow_warning(warn_copy_on_write): + ser *= 2 + if using_copy_on_write: + assert not np.shares_memory(get_array(ser), get_array(view)) + tm.assert_series_equal(ser_orig, view) + else: + assert np.shares_memory(get_array(ser), get_array(view)) + + +@pytest.mark.parametrize("copy", [True, False]) +def test_transpose(using_copy_on_write, copy, using_array_manager): + df = DataFrame({"a": [1, 2, 3], "b": 1}) + df_orig = df.copy() + result = df.transpose(copy=copy) + + if not copy and not using_array_manager or using_copy_on_write: + assert np.shares_memory(get_array(df, "a"), get_array(result, 0)) + else: + assert not np.shares_memory(get_array(df, "a"), get_array(result, 0)) + + result.iloc[0, 0] = 100 + if using_copy_on_write: + tm.assert_frame_equal(df, df_orig) + + +def test_transpose_different_dtypes(using_copy_on_write): + df = DataFrame({"a": [1, 2, 3], "b": 1.5}) + df_orig = df.copy() + result = df.T + + assert not np.shares_memory(get_array(df, "a"), get_array(result, 0)) + result.iloc[0, 0] = 100 + if using_copy_on_write: + tm.assert_frame_equal(df, df_orig) + + +def test_transpose_ea_single_column(using_copy_on_write): + df = DataFrame({"a": [1, 2, 3]}, dtype="Int64") + result = df.T + + assert not np.shares_memory(get_array(df, "a"), get_array(result, 0)) + + +def test_transform_frame(using_copy_on_write, warn_copy_on_write): + df = DataFrame({"a": [1, 2, 3], "b": 1}) + df_orig = df.copy() + + def func(ser): + ser.iloc[0] = 100 + return ser + + with tm.assert_cow_warning(warn_copy_on_write): + df.transform(func) + if using_copy_on_write: + tm.assert_frame_equal(df, df_orig) + + +def test_transform_series(using_copy_on_write, warn_copy_on_write): + ser = Series([1, 2, 3]) + ser_orig = ser.copy() + + def func(ser): + ser.iloc[0] = 100 + return ser + + with tm.assert_cow_warning(warn_copy_on_write): + ser.transform(func) + if using_copy_on_write: + tm.assert_series_equal(ser, ser_orig) + + +def test_count_read_only_array(): + df = DataFrame({"a": [1, 2], "b": 3}) + result = df.count() + result.iloc[0] = 100 + expected = Series([100, 2], index=["a", "b"]) + tm.assert_series_equal(result, expected) + + +def test_series_view(using_copy_on_write, warn_copy_on_write): + ser = Series([1, 2, 3]) + ser_orig = ser.copy() + + with tm.assert_produces_warning(FutureWarning, match="is deprecated"): + ser2 = ser.view() + assert np.shares_memory(get_array(ser), get_array(ser2)) + if using_copy_on_write: + assert not ser2._mgr._has_no_reference(0) + + with tm.assert_cow_warning(warn_copy_on_write): + ser2.iloc[0] = 100 + + if using_copy_on_write: + tm.assert_series_equal(ser_orig, ser) + else: + expected = Series([100, 2, 3]) + tm.assert_series_equal(ser, expected) + + +def test_insert_series(using_copy_on_write): + df = DataFrame({"a": [1, 2, 3]}) + ser = Series([1, 2, 3]) + ser_orig = ser.copy() + df.insert(loc=1, value=ser, column="b") + if using_copy_on_write: + assert np.shares_memory(get_array(ser), get_array(df, "b")) + assert not df._mgr._has_no_reference(1) + else: + assert not np.shares_memory(get_array(ser), get_array(df, "b")) + + df.iloc[0, 1] = 100 + tm.assert_series_equal(ser, ser_orig) + + +def test_eval(using_copy_on_write): + df = DataFrame({"a": [1, 2, 3], "b": 1}) + df_orig = df.copy() + + result = df.eval("c = a+b") + if using_copy_on_write: + assert np.shares_memory(get_array(df, "a"), get_array(result, "a")) + else: + assert not np.shares_memory(get_array(df, "a"), get_array(result, "a")) + + result.iloc[0, 0] = 100 + tm.assert_frame_equal(df, df_orig) + + +def test_eval_inplace(using_copy_on_write, warn_copy_on_write): + df = DataFrame({"a": [1, 2, 3], "b": 1}) + df_orig = df.copy() + df_view = df[:] + + df.eval("c = a+b", inplace=True) + assert np.shares_memory(get_array(df, "a"), get_array(df_view, "a")) + + with tm.assert_cow_warning(warn_copy_on_write): + df.iloc[0, 0] = 100 + if using_copy_on_write: + tm.assert_frame_equal(df_view, df_orig) + + +def test_apply_modify_row(using_copy_on_write, warn_copy_on_write): + # Case: applying a function on each row as a Series object, where the + # function mutates the row object (which needs to trigger CoW if row is a view) + df = DataFrame({"A": [1, 2], "B": [3, 4]}) + df_orig = df.copy() + + def transform(row): + row["B"] = 100 + return row + + with tm.assert_cow_warning(warn_copy_on_write): + df.apply(transform, axis=1) + + if using_copy_on_write: + tm.assert_frame_equal(df, df_orig) + else: + assert df.loc[0, "B"] == 100 + + # row Series is a copy + df = DataFrame({"A": [1, 2], "B": ["b", "c"]}) + df_orig = df.copy() + + with tm.assert_produces_warning(None): + df.apply(transform, axis=1) + + tm.assert_frame_equal(df, df_orig) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/copy_view/test_replace.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/copy_view/test_replace.py new file mode 100644 index 0000000000000000000000000000000000000000..6d16bc308388359b69e07d77a5fef153b4eb248f --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/copy_view/test_replace.py @@ -0,0 +1,481 @@ +import numpy as np +import pytest + +from pandas import ( + Categorical, + DataFrame, + option_context, +) +import pandas._testing as tm +from pandas.tests.copy_view.util import get_array + + +@pytest.mark.parametrize( + "replace_kwargs", + [ + {"to_replace": {"a": 1, "b": 4}, "value": -1}, + # Test CoW splits blocks to avoid copying unchanged columns + {"to_replace": {"a": 1}, "value": -1}, + {"to_replace": {"b": 4}, "value": -1}, + {"to_replace": {"b": {4: 1}}}, + # TODO: Add these in a further optimization + # We would need to see which columns got replaced in the mask + # which could be expensive + # {"to_replace": {"b": 1}}, + # 1 + ], +) +def test_replace(using_copy_on_write, replace_kwargs): + df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": ["foo", "bar", "baz"]}) + df_orig = df.copy() + + df_replaced = df.replace(**replace_kwargs) + + if using_copy_on_write: + if (df_replaced["b"] == df["b"]).all(): + assert np.shares_memory(get_array(df_replaced, "b"), get_array(df, "b")) + assert np.shares_memory(get_array(df_replaced, "c"), get_array(df, "c")) + + # mutating squeezed df triggers a copy-on-write for that column/block + df_replaced.loc[0, "c"] = -1 + if using_copy_on_write: + assert not np.shares_memory(get_array(df_replaced, "c"), get_array(df, "c")) + + if "a" in replace_kwargs["to_replace"]: + arr = get_array(df_replaced, "a") + df_replaced.loc[0, "a"] = 100 + assert np.shares_memory(get_array(df_replaced, "a"), arr) + tm.assert_frame_equal(df, df_orig) + + +def test_replace_regex_inplace_refs(using_copy_on_write, warn_copy_on_write): + df = DataFrame({"a": ["aaa", "bbb"]}) + df_orig = df.copy() + view = df[:] + arr = get_array(df, "a") + with tm.assert_cow_warning(warn_copy_on_write): + df.replace(to_replace=r"^a.*$", value="new", inplace=True, regex=True) + if using_copy_on_write: + assert not np.shares_memory(arr, get_array(df, "a")) + assert df._mgr._has_no_reference(0) + tm.assert_frame_equal(view, df_orig) + else: + assert np.shares_memory(arr, get_array(df, "a")) + + +def test_replace_regex_inplace(using_copy_on_write): + df = DataFrame({"a": ["aaa", "bbb"]}) + arr = get_array(df, "a") + df.replace(to_replace=r"^a.*$", value="new", inplace=True, regex=True) + if using_copy_on_write: + assert df._mgr._has_no_reference(0) + assert np.shares_memory(arr, get_array(df, "a")) + + df_orig = df.copy() + df2 = df.replace(to_replace=r"^b.*$", value="new", regex=True) + tm.assert_frame_equal(df_orig, df) + assert not np.shares_memory(get_array(df2, "a"), get_array(df, "a")) + + +def test_replace_regex_inplace_no_op(using_copy_on_write): + df = DataFrame({"a": [1, 2]}) + arr = get_array(df, "a") + df.replace(to_replace=r"^a.$", value="new", inplace=True, regex=True) + if using_copy_on_write: + assert df._mgr._has_no_reference(0) + assert np.shares_memory(arr, get_array(df, "a")) + + df_orig = df.copy() + df2 = df.replace(to_replace=r"^x.$", value="new", regex=True) + tm.assert_frame_equal(df_orig, df) + if using_copy_on_write: + assert np.shares_memory(get_array(df2, "a"), get_array(df, "a")) + else: + assert not np.shares_memory(get_array(df2, "a"), get_array(df, "a")) + + +def test_replace_mask_all_false_second_block(using_copy_on_write): + df = DataFrame({"a": [1.5, 2, 3], "b": 100.5, "c": 1, "d": 2}) + df_orig = df.copy() + + df2 = df.replace(to_replace=1.5, value=55.5) + + if using_copy_on_write: + # TODO: Block splitting would allow us to avoid copying b + assert np.shares_memory(get_array(df, "c"), get_array(df2, "c")) + assert not np.shares_memory(get_array(df, "a"), get_array(df2, "a")) + + else: + assert not np.shares_memory(get_array(df, "c"), get_array(df2, "c")) + assert not np.shares_memory(get_array(df, "a"), get_array(df2, "a")) + + df2.loc[0, "c"] = 1 + tm.assert_frame_equal(df, df_orig) # Original is unchanged + + if using_copy_on_write: + assert not np.shares_memory(get_array(df, "c"), get_array(df2, "c")) + # TODO: This should split and not copy the whole block + # assert np.shares_memory(get_array(df, "d"), get_array(df2, "d")) + + +def test_replace_coerce_single_column(using_copy_on_write, using_array_manager): + df = DataFrame({"a": [1.5, 2, 3], "b": 100.5}) + df_orig = df.copy() + + df2 = df.replace(to_replace=1.5, value="a") + + if using_copy_on_write: + assert np.shares_memory(get_array(df, "b"), get_array(df2, "b")) + assert not np.shares_memory(get_array(df, "a"), get_array(df2, "a")) + + elif not using_array_manager: + assert np.shares_memory(get_array(df, "b"), get_array(df2, "b")) + assert not np.shares_memory(get_array(df, "a"), get_array(df2, "a")) + + if using_copy_on_write: + df2.loc[0, "b"] = 0.5 + tm.assert_frame_equal(df, df_orig) # Original is unchanged + assert not np.shares_memory(get_array(df, "b"), get_array(df2, "b")) + + +def test_replace_to_replace_wrong_dtype(using_copy_on_write): + df = DataFrame({"a": [1.5, 2, 3], "b": 100.5}) + df_orig = df.copy() + + df2 = df.replace(to_replace="xxx", value=1.5) + + if using_copy_on_write: + assert np.shares_memory(get_array(df, "b"), get_array(df2, "b")) + assert np.shares_memory(get_array(df, "a"), get_array(df2, "a")) + + else: + assert not np.shares_memory(get_array(df, "b"), get_array(df2, "b")) + assert not np.shares_memory(get_array(df, "a"), get_array(df2, "a")) + + df2.loc[0, "b"] = 0.5 + tm.assert_frame_equal(df, df_orig) # Original is unchanged + + if using_copy_on_write: + assert not np.shares_memory(get_array(df, "b"), get_array(df2, "b")) + + +def test_replace_list_categorical(using_copy_on_write): + df = DataFrame({"a": ["a", "b", "c"]}, dtype="category") + arr = get_array(df, "a") + msg = ( + r"The behavior of Series\.replace \(and DataFrame.replace\) " + "with CategoricalDtype" + ) + with tm.assert_produces_warning(FutureWarning, match=msg): + df.replace(["c"], value="a", inplace=True) + assert np.shares_memory(arr.codes, get_array(df, "a").codes) + if using_copy_on_write: + assert df._mgr._has_no_reference(0) + + df_orig = df.copy() + with tm.assert_produces_warning(FutureWarning, match=msg): + df2 = df.replace(["b"], value="a") + assert not np.shares_memory(arr.codes, get_array(df2, "a").codes) + + tm.assert_frame_equal(df, df_orig) + + +def test_replace_list_inplace_refs_categorical(using_copy_on_write): + df = DataFrame({"a": ["a", "b", "c"]}, dtype="category") + view = df[:] + df_orig = df.copy() + msg = ( + r"The behavior of Series\.replace \(and DataFrame.replace\) " + "with CategoricalDtype" + ) + with tm.assert_produces_warning(FutureWarning, match=msg): + df.replace(["c"], value="a", inplace=True) + if using_copy_on_write: + assert not np.shares_memory( + get_array(view, "a").codes, get_array(df, "a").codes + ) + tm.assert_frame_equal(df_orig, view) + else: + # This could be inplace + assert not np.shares_memory( + get_array(view, "a").codes, get_array(df, "a").codes + ) + + +@pytest.mark.parametrize("to_replace", [1.5, [1.5], []]) +def test_replace_inplace(using_copy_on_write, to_replace): + df = DataFrame({"a": [1.5, 2, 3]}) + arr_a = get_array(df, "a") + df.replace(to_replace=1.5, value=15.5, inplace=True) + + assert np.shares_memory(get_array(df, "a"), arr_a) + if using_copy_on_write: + assert df._mgr._has_no_reference(0) + + +@pytest.mark.parametrize("to_replace", [1.5, [1.5]]) +def test_replace_inplace_reference(using_copy_on_write, to_replace, warn_copy_on_write): + df = DataFrame({"a": [1.5, 2, 3]}) + arr_a = get_array(df, "a") + view = df[:] + with tm.assert_cow_warning(warn_copy_on_write): + df.replace(to_replace=to_replace, value=15.5, inplace=True) + + if using_copy_on_write: + assert not np.shares_memory(get_array(df, "a"), arr_a) + assert df._mgr._has_no_reference(0) + assert view._mgr._has_no_reference(0) + else: + assert np.shares_memory(get_array(df, "a"), arr_a) + + +@pytest.mark.parametrize("to_replace", ["a", 100.5]) +def test_replace_inplace_reference_no_op(using_copy_on_write, to_replace): + df = DataFrame({"a": [1.5, 2, 3]}) + arr_a = get_array(df, "a") + view = df[:] + df.replace(to_replace=to_replace, value=15.5, inplace=True) + + assert np.shares_memory(get_array(df, "a"), arr_a) + if using_copy_on_write: + assert not df._mgr._has_no_reference(0) + assert not view._mgr._has_no_reference(0) + + +@pytest.mark.parametrize("to_replace", [1, [1]]) +@pytest.mark.parametrize("val", [1, 1.5]) +def test_replace_categorical_inplace_reference(using_copy_on_write, val, to_replace): + df = DataFrame({"a": Categorical([1, 2, 3])}) + df_orig = df.copy() + arr_a = get_array(df, "a") + view = df[:] + msg = ( + r"The behavior of Series\.replace \(and DataFrame.replace\) " + "with CategoricalDtype" + ) + warn = FutureWarning if val == 1.5 else None + with tm.assert_produces_warning(warn, match=msg): + df.replace(to_replace=to_replace, value=val, inplace=True) + + if using_copy_on_write: + assert not np.shares_memory(get_array(df, "a").codes, arr_a.codes) + assert df._mgr._has_no_reference(0) + assert view._mgr._has_no_reference(0) + tm.assert_frame_equal(view, df_orig) + else: + assert np.shares_memory(get_array(df, "a").codes, arr_a.codes) + + +@pytest.mark.parametrize("val", [1, 1.5]) +def test_replace_categorical_inplace(using_copy_on_write, val): + df = DataFrame({"a": Categorical([1, 2, 3])}) + arr_a = get_array(df, "a") + msg = ( + r"The behavior of Series\.replace \(and DataFrame.replace\) " + "with CategoricalDtype" + ) + warn = FutureWarning if val == 1.5 else None + with tm.assert_produces_warning(warn, match=msg): + df.replace(to_replace=1, value=val, inplace=True) + + assert np.shares_memory(get_array(df, "a").codes, arr_a.codes) + if using_copy_on_write: + assert df._mgr._has_no_reference(0) + + expected = DataFrame({"a": Categorical([val, 2, 3])}) + tm.assert_frame_equal(df, expected) + + +@pytest.mark.parametrize("val", [1, 1.5]) +def test_replace_categorical(using_copy_on_write, val): + df = DataFrame({"a": Categorical([1, 2, 3])}) + df_orig = df.copy() + msg = ( + r"The behavior of Series\.replace \(and DataFrame.replace\) " + "with CategoricalDtype" + ) + warn = FutureWarning if val == 1.5 else None + with tm.assert_produces_warning(warn, match=msg): + df2 = df.replace(to_replace=1, value=val) + + if using_copy_on_write: + assert df._mgr._has_no_reference(0) + assert df2._mgr._has_no_reference(0) + assert not np.shares_memory(get_array(df, "a").codes, get_array(df2, "a").codes) + tm.assert_frame_equal(df, df_orig) + + arr_a = get_array(df2, "a").codes + df2.iloc[0, 0] = 2.0 + assert np.shares_memory(get_array(df2, "a").codes, arr_a) + + +@pytest.mark.parametrize("method", ["where", "mask"]) +def test_masking_inplace(using_copy_on_write, method, warn_copy_on_write): + df = DataFrame({"a": [1.5, 2, 3]}) + df_orig = df.copy() + arr_a = get_array(df, "a") + view = df[:] + + method = getattr(df, method) + if warn_copy_on_write: + with tm.assert_cow_warning(): + method(df["a"] > 1.6, -1, inplace=True) + else: + method(df["a"] > 1.6, -1, inplace=True) + + if using_copy_on_write: + assert not np.shares_memory(get_array(df, "a"), arr_a) + assert df._mgr._has_no_reference(0) + assert view._mgr._has_no_reference(0) + tm.assert_frame_equal(view, df_orig) + else: + assert np.shares_memory(get_array(df, "a"), arr_a) + + +def test_replace_empty_list(using_copy_on_write): + df = DataFrame({"a": [1, 2]}) + + df2 = df.replace([], []) + if using_copy_on_write: + assert np.shares_memory(get_array(df2, "a"), get_array(df, "a")) + assert not df._mgr._has_no_reference(0) + else: + assert not np.shares_memory(get_array(df2, "a"), get_array(df, "a")) + + arr_a = get_array(df, "a") + df.replace([], []) + if using_copy_on_write: + assert np.shares_memory(get_array(df, "a"), arr_a) + assert not df._mgr._has_no_reference(0) + assert not df2._mgr._has_no_reference(0) + + +@pytest.mark.parametrize("value", ["d", None]) +def test_replace_object_list_inplace(using_copy_on_write, value): + df = DataFrame({"a": ["a", "b", "c"]}) + arr = get_array(df, "a") + df.replace(["c"], value, inplace=True) + if using_copy_on_write or value is None: + assert np.shares_memory(arr, get_array(df, "a")) + else: + # This could be inplace + assert not np.shares_memory(arr, get_array(df, "a")) + if using_copy_on_write: + assert df._mgr._has_no_reference(0) + + +def test_replace_list_multiple_elements_inplace(using_copy_on_write): + df = DataFrame({"a": [1, 2, 3]}) + arr = get_array(df, "a") + df.replace([1, 2], 4, inplace=True) + if using_copy_on_write: + assert np.shares_memory(arr, get_array(df, "a")) + assert df._mgr._has_no_reference(0) + else: + assert np.shares_memory(arr, get_array(df, "a")) + + +def test_replace_list_none(using_copy_on_write): + df = DataFrame({"a": ["a", "b", "c"]}) + + df_orig = df.copy() + df2 = df.replace(["b"], value=None) + tm.assert_frame_equal(df, df_orig) + + assert not np.shares_memory(get_array(df, "a"), get_array(df2, "a")) + + +def test_replace_list_none_inplace_refs(using_copy_on_write, warn_copy_on_write): + df = DataFrame({"a": ["a", "b", "c"]}) + arr = get_array(df, "a") + df_orig = df.copy() + view = df[:] + with tm.assert_cow_warning(warn_copy_on_write): + df.replace(["a"], value=None, inplace=True) + if using_copy_on_write: + assert df._mgr._has_no_reference(0) + assert not np.shares_memory(arr, get_array(df, "a")) + tm.assert_frame_equal(df_orig, view) + else: + assert np.shares_memory(arr, get_array(df, "a")) + + +def test_replace_columnwise_no_op_inplace(using_copy_on_write): + df = DataFrame({"a": [1, 2, 3], "b": [1, 2, 3]}) + view = df[:] + df_orig = df.copy() + df.replace({"a": 10}, 100, inplace=True) + if using_copy_on_write: + assert np.shares_memory(get_array(view, "a"), get_array(df, "a")) + df.iloc[0, 0] = 100 + tm.assert_frame_equal(view, df_orig) + + +def test_replace_columnwise_no_op(using_copy_on_write): + df = DataFrame({"a": [1, 2, 3], "b": [1, 2, 3]}) + df_orig = df.copy() + df2 = df.replace({"a": 10}, 100) + if using_copy_on_write: + assert np.shares_memory(get_array(df2, "a"), get_array(df, "a")) + df2.iloc[0, 0] = 100 + tm.assert_frame_equal(df, df_orig) + + +def test_replace_chained_assignment(using_copy_on_write): + df = DataFrame({"a": [1, np.nan, 2], "b": 1}) + df_orig = df.copy() + if using_copy_on_write: + with tm.raises_chained_assignment_error(): + df["a"].replace(1, 100, inplace=True) + tm.assert_frame_equal(df, df_orig) + + with tm.raises_chained_assignment_error(): + df[["a"]].replace(1, 100, inplace=True) + tm.assert_frame_equal(df, df_orig) + else: + with tm.assert_produces_warning(None): + with option_context("mode.chained_assignment", None): + df[["a"]].replace(1, 100, inplace=True) + + with tm.assert_produces_warning(None): + with option_context("mode.chained_assignment", None): + df[df.a > 5].replace(1, 100, inplace=True) + + with tm.assert_produces_warning(FutureWarning, match="inplace method"): + df["a"].replace(1, 100, inplace=True) + + +def test_replace_listlike(using_copy_on_write): + df = DataFrame({"a": [1, 2, 3], "b": [1, 2, 3]}) + df_orig = df.copy() + + result = df.replace([200, 201], [11, 11]) + if using_copy_on_write: + assert np.shares_memory(get_array(result, "a"), get_array(df, "a")) + else: + assert not np.shares_memory(get_array(result, "a"), get_array(df, "a")) + + result.iloc[0, 0] = 100 + tm.assert_frame_equal(df, df) + + result = df.replace([200, 2], [10, 10]) + assert not np.shares_memory(get_array(df, "a"), get_array(result, "a")) + tm.assert_frame_equal(df, df_orig) + + +def test_replace_listlike_inplace(using_copy_on_write, warn_copy_on_write): + df = DataFrame({"a": [1, 2, 3], "b": [1, 2, 3]}) + arr = get_array(df, "a") + df.replace([200, 2], [10, 11], inplace=True) + assert np.shares_memory(get_array(df, "a"), arr) + + view = df[:] + df_orig = df.copy() + with tm.assert_cow_warning(warn_copy_on_write): + df.replace([200, 3], [10, 11], inplace=True) + if using_copy_on_write: + assert not np.shares_memory(get_array(df, "a"), arr) + tm.assert_frame_equal(view, df_orig) + else: + assert np.shares_memory(get_array(df, "a"), arr) + tm.assert_frame_equal(df, view) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/copy_view/test_util.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/copy_view/test_util.py new file mode 100644 index 0000000000000000000000000000000000000000..ff55330d70b28c5459a4c0915dd93c8640a91add --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/copy_view/test_util.py @@ -0,0 +1,14 @@ +import numpy as np + +from pandas import DataFrame +from pandas.tests.copy_view.util import get_array + + +def test_get_array_numpy(): + df = DataFrame({"a": [1, 2, 3]}) + assert np.shares_memory(get_array(df, "a"), get_array(df, "a")) + + +def test_get_array_masked(): + df = DataFrame({"a": [1, 2, 3]}, dtype="Int64") + assert np.shares_memory(get_array(df, "a"), get_array(df, "a")) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/copy_view/util.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/copy_view/util.py new file mode 100644 index 0000000000000000000000000000000000000000..969334424936559767b0bca87093acfec52f9763 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/copy_view/util.py @@ -0,0 +1,30 @@ +from pandas import ( + Categorical, + Index, + Series, +) +from pandas.core.arrays import BaseMaskedArray + + +def get_array(obj, col=None): + """ + Helper method to get array for a DataFrame column or a Series. + + Equivalent of df[col].values, but without going through normal getitem, + which triggers tracking references / CoW (and we might be testing that + this is done by some other operation). + """ + if isinstance(obj, Index): + arr = obj._values + elif isinstance(obj, Series) and (col is None or obj.name == col): + arr = obj._values + else: + assert col is not None + icol = obj.columns.get_loc(col) + assert isinstance(icol, int) + arr = obj._get_column_array(icol) + if isinstance(arr, BaseMaskedArray): + return arr._data + elif isinstance(arr, Categorical): + return arr + return getattr(arr, "_ndarray", arr) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/dtypes/__init__.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/dtypes/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/dtypes/test_common.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/dtypes/test_common.py new file mode 100644 index 0000000000000000000000000000000000000000..c34c97b6e4f0483994f6c81f6a2c470f84e9c488 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/dtypes/test_common.py @@ -0,0 +1,801 @@ +from __future__ import annotations + +import numpy as np +import pytest + +import pandas.util._test_decorators as td + +from pandas.core.dtypes.astype import astype_array +import pandas.core.dtypes.common as com +from pandas.core.dtypes.dtypes import ( + CategoricalDtype, + CategoricalDtypeType, + DatetimeTZDtype, + ExtensionDtype, + IntervalDtype, + PeriodDtype, +) +from pandas.core.dtypes.missing import isna + +import pandas as pd +import pandas._testing as tm +from pandas.api.types import pandas_dtype +from pandas.arrays import SparseArray + + +# EA & Actual Dtypes +def to_ea_dtypes(dtypes): + """convert list of string dtypes to EA dtype""" + return [getattr(pd, dt + "Dtype") for dt in dtypes] + + +def to_numpy_dtypes(dtypes): + """convert list of string dtypes to numpy dtype""" + return [getattr(np, dt) for dt in dtypes if isinstance(dt, str)] + + +class TestNumpyEADtype: + # Passing invalid dtype, both as a string or object, must raise TypeError + # Per issue GH15520 + @pytest.mark.parametrize("box", [pd.Timestamp, "pd.Timestamp", list]) + def test_invalid_dtype_error(self, box): + with pytest.raises(TypeError, match="not understood"): + com.pandas_dtype(box) + + @pytest.mark.parametrize( + "dtype", + [ + object, + "float64", + np.object_, + np.dtype("object"), + "O", + np.float64, + float, + np.dtype("float64"), + "object_", + ], + ) + def test_pandas_dtype_valid(self, dtype): + assert com.pandas_dtype(dtype) == dtype + + @pytest.mark.parametrize( + "dtype", ["M8[ns]", "m8[ns]", "object", "float64", "int64"] + ) + def test_numpy_dtype(self, dtype): + assert com.pandas_dtype(dtype) == np.dtype(dtype) + + def test_numpy_string_dtype(self): + # do not parse freq-like string as period dtype + assert com.pandas_dtype("U") == np.dtype("U") + assert com.pandas_dtype("S") == np.dtype("S") + + @pytest.mark.parametrize( + "dtype", + [ + "datetime64[ns, US/Eastern]", + "datetime64[ns, Asia/Tokyo]", + "datetime64[ns, UTC]", + # GH#33885 check that the M8 alias is understood + "M8[ns, US/Eastern]", + "M8[ns, Asia/Tokyo]", + "M8[ns, UTC]", + ], + ) + def test_datetimetz_dtype(self, dtype): + assert com.pandas_dtype(dtype) == DatetimeTZDtype.construct_from_string(dtype) + assert com.pandas_dtype(dtype) == dtype + + def test_categorical_dtype(self): + assert com.pandas_dtype("category") == CategoricalDtype() + + @pytest.mark.parametrize( + "dtype", + [ + "period[D]", + "period[3M]", + "period[us]", + "Period[D]", + "Period[3M]", + "Period[us]", + ], + ) + def test_period_dtype(self, dtype): + assert com.pandas_dtype(dtype) is not PeriodDtype(dtype) + assert com.pandas_dtype(dtype) == PeriodDtype(dtype) + assert com.pandas_dtype(dtype) == dtype + + +dtypes = { + "datetime_tz": com.pandas_dtype("datetime64[ns, US/Eastern]"), + "datetime": com.pandas_dtype("datetime64[ns]"), + "timedelta": com.pandas_dtype("timedelta64[ns]"), + "period": PeriodDtype("D"), + "integer": np.dtype(np.int64), + "float": np.dtype(np.float64), + "object": np.dtype(object), + "category": com.pandas_dtype("category"), + "string": pd.StringDtype(), +} + + +@pytest.mark.parametrize("name1,dtype1", list(dtypes.items()), ids=lambda x: str(x)) +@pytest.mark.parametrize("name2,dtype2", list(dtypes.items()), ids=lambda x: str(x)) +def test_dtype_equal(name1, dtype1, name2, dtype2): + # match equal to self, but not equal to other + assert com.is_dtype_equal(dtype1, dtype1) + if name1 != name2: + assert not com.is_dtype_equal(dtype1, dtype2) + + +@pytest.mark.parametrize("name,dtype", list(dtypes.items()), ids=lambda x: str(x)) +def test_pyarrow_string_import_error(name, dtype): + # GH-44276 + assert not com.is_dtype_equal(dtype, "string[pyarrow]") + + +@pytest.mark.parametrize( + "dtype1,dtype2", + [ + (np.int8, np.int64), + (np.int16, np.int64), + (np.int32, np.int64), + (np.float32, np.float64), + (PeriodDtype("D"), PeriodDtype("2D")), # PeriodType + ( + com.pandas_dtype("datetime64[ns, US/Eastern]"), + com.pandas_dtype("datetime64[ns, CET]"), + ), # Datetime + (None, None), # gh-15941: no exception should be raised. + ], +) +def test_dtype_equal_strict(dtype1, dtype2): + assert not com.is_dtype_equal(dtype1, dtype2) + + +def get_is_dtype_funcs(): + """ + Get all functions in pandas.core.dtypes.common that + begin with 'is_' and end with 'dtype' + + """ + fnames = [f for f in dir(com) if (f.startswith("is_") and f.endswith("dtype"))] + fnames.remove("is_string_or_object_np_dtype") # fastpath requires np.dtype obj + return [getattr(com, fname) for fname in fnames] + + +@pytest.mark.filterwarnings( + "ignore:is_categorical_dtype is deprecated:DeprecationWarning" +) +@pytest.mark.parametrize("func", get_is_dtype_funcs(), ids=lambda x: x.__name__) +def test_get_dtype_error_catch(func): + # see gh-15941 + # + # No exception should be raised. + + msg = f"{func.__name__} is deprecated" + warn = None + if ( + func is com.is_int64_dtype + or func is com.is_interval_dtype + or func is com.is_datetime64tz_dtype + or func is com.is_categorical_dtype + or func is com.is_period_dtype + ): + warn = DeprecationWarning + + with tm.assert_produces_warning(warn, match=msg): + assert not func(None) + + +def test_is_object(): + assert com.is_object_dtype(object) + assert com.is_object_dtype(np.array([], dtype=object)) + + assert not com.is_object_dtype(int) + assert not com.is_object_dtype(np.array([], dtype=int)) + assert not com.is_object_dtype([1, 2, 3]) + + +@pytest.mark.parametrize( + "check_scipy", [False, pytest.param(True, marks=td.skip_if_no("scipy"))] +) +def test_is_sparse(check_scipy): + msg = "is_sparse is deprecated" + with tm.assert_produces_warning(DeprecationWarning, match=msg): + assert com.is_sparse(SparseArray([1, 2, 3])) + + assert not com.is_sparse(np.array([1, 2, 3])) + + if check_scipy: + import scipy.sparse + + assert not com.is_sparse(scipy.sparse.bsr_matrix([1, 2, 3])) + + +def test_is_scipy_sparse(): + sp_sparse = pytest.importorskip("scipy.sparse") + + assert com.is_scipy_sparse(sp_sparse.bsr_matrix([1, 2, 3])) + + assert not com.is_scipy_sparse(SparseArray([1, 2, 3])) + + +def test_is_datetime64_dtype(): + assert not com.is_datetime64_dtype(object) + assert not com.is_datetime64_dtype([1, 2, 3]) + assert not com.is_datetime64_dtype(np.array([], dtype=int)) + + assert com.is_datetime64_dtype(np.datetime64) + assert com.is_datetime64_dtype(np.array([], dtype=np.datetime64)) + + +def test_is_datetime64tz_dtype(): + msg = "is_datetime64tz_dtype is deprecated" + with tm.assert_produces_warning(DeprecationWarning, match=msg): + assert not com.is_datetime64tz_dtype(object) + assert not com.is_datetime64tz_dtype([1, 2, 3]) + assert not com.is_datetime64tz_dtype(pd.DatetimeIndex([1, 2, 3])) + assert com.is_datetime64tz_dtype(pd.DatetimeIndex(["2000"], tz="US/Eastern")) + + +def test_custom_ea_kind_M_not_datetime64tz(): + # GH 34986 + class NotTZDtype(ExtensionDtype): + @property + def kind(self) -> str: + return "M" + + not_tz_dtype = NotTZDtype() + msg = "is_datetime64tz_dtype is deprecated" + with tm.assert_produces_warning(DeprecationWarning, match=msg): + assert not com.is_datetime64tz_dtype(not_tz_dtype) + assert not com.needs_i8_conversion(not_tz_dtype) + + +def test_is_timedelta64_dtype(): + assert not com.is_timedelta64_dtype(object) + assert not com.is_timedelta64_dtype(None) + assert not com.is_timedelta64_dtype([1, 2, 3]) + assert not com.is_timedelta64_dtype(np.array([], dtype=np.datetime64)) + assert not com.is_timedelta64_dtype("0 days") + assert not com.is_timedelta64_dtype("0 days 00:00:00") + assert not com.is_timedelta64_dtype(["0 days 00:00:00"]) + assert not com.is_timedelta64_dtype("NO DATE") + + assert com.is_timedelta64_dtype(np.timedelta64) + assert com.is_timedelta64_dtype(pd.Series([], dtype="timedelta64[ns]")) + assert com.is_timedelta64_dtype(pd.to_timedelta(["0 days", "1 days"])) + + +def test_is_period_dtype(): + msg = "is_period_dtype is deprecated" + with tm.assert_produces_warning(DeprecationWarning, match=msg): + assert not com.is_period_dtype(object) + assert not com.is_period_dtype([1, 2, 3]) + assert not com.is_period_dtype(pd.Period("2017-01-01")) + + assert com.is_period_dtype(PeriodDtype(freq="D")) + assert com.is_period_dtype(pd.PeriodIndex([], freq="Y")) + + +def test_is_interval_dtype(): + msg = "is_interval_dtype is deprecated" + with tm.assert_produces_warning(DeprecationWarning, match=msg): + assert not com.is_interval_dtype(object) + assert not com.is_interval_dtype([1, 2, 3]) + + assert com.is_interval_dtype(IntervalDtype()) + + interval = pd.Interval(1, 2, closed="right") + assert not com.is_interval_dtype(interval) + assert com.is_interval_dtype(pd.IntervalIndex([interval])) + + +def test_is_categorical_dtype(): + msg = "is_categorical_dtype is deprecated" + with tm.assert_produces_warning(DeprecationWarning, match=msg): + assert not com.is_categorical_dtype(object) + assert not com.is_categorical_dtype([1, 2, 3]) + + assert com.is_categorical_dtype(CategoricalDtype()) + assert com.is_categorical_dtype(pd.Categorical([1, 2, 3])) + assert com.is_categorical_dtype(pd.CategoricalIndex([1, 2, 3])) + + +@pytest.mark.parametrize( + "dtype, expected", + [ + (int, False), + (pd.Series([1, 2]), False), + (str, True), + (object, True), + (np.array(["a", "b"]), True), + (pd.StringDtype(), True), + (pd.Index([], dtype="O"), True), + ], +) +def test_is_string_dtype(dtype, expected): + # GH#54661 + + result = com.is_string_dtype(dtype) + assert result is expected + + +@pytest.mark.parametrize( + "data", + [[(0, 1), (1, 1)], pd.Categorical([1, 2, 3]), np.array([1, 2], dtype=object)], +) +def test_is_string_dtype_arraylike_with_object_elements_not_strings(data): + # GH 15585 + assert not com.is_string_dtype(pd.Series(data)) + + +def test_is_string_dtype_nullable(nullable_string_dtype): + assert com.is_string_dtype(pd.array(["a", "b"], dtype=nullable_string_dtype)) + + +integer_dtypes: list = [] + + +@pytest.mark.parametrize( + "dtype", + integer_dtypes + + [pd.Series([1, 2])] + + tm.ALL_INT_NUMPY_DTYPES + + to_numpy_dtypes(tm.ALL_INT_NUMPY_DTYPES) + + tm.ALL_INT_EA_DTYPES + + to_ea_dtypes(tm.ALL_INT_EA_DTYPES), +) +def test_is_integer_dtype(dtype): + assert com.is_integer_dtype(dtype) + + +@pytest.mark.parametrize( + "dtype", + [ + str, + float, + np.datetime64, + np.timedelta64, + pd.Index([1, 2.0]), + np.array(["a", "b"]), + np.array([], dtype=np.timedelta64), + ], +) +def test_is_not_integer_dtype(dtype): + assert not com.is_integer_dtype(dtype) + + +signed_integer_dtypes: list = [] + + +@pytest.mark.parametrize( + "dtype", + signed_integer_dtypes + + [pd.Series([1, 2])] + + tm.SIGNED_INT_NUMPY_DTYPES + + to_numpy_dtypes(tm.SIGNED_INT_NUMPY_DTYPES) + + tm.SIGNED_INT_EA_DTYPES + + to_ea_dtypes(tm.SIGNED_INT_EA_DTYPES), +) +def test_is_signed_integer_dtype(dtype): + assert com.is_integer_dtype(dtype) + + +@pytest.mark.parametrize( + "dtype", + [ + str, + float, + np.datetime64, + np.timedelta64, + pd.Index([1, 2.0]), + np.array(["a", "b"]), + np.array([], dtype=np.timedelta64), + ] + + tm.UNSIGNED_INT_NUMPY_DTYPES + + to_numpy_dtypes(tm.UNSIGNED_INT_NUMPY_DTYPES) + + tm.UNSIGNED_INT_EA_DTYPES + + to_ea_dtypes(tm.UNSIGNED_INT_EA_DTYPES), +) +def test_is_not_signed_integer_dtype(dtype): + assert not com.is_signed_integer_dtype(dtype) + + +unsigned_integer_dtypes: list = [] + + +@pytest.mark.parametrize( + "dtype", + unsigned_integer_dtypes + + [pd.Series([1, 2], dtype=np.uint32)] + + tm.UNSIGNED_INT_NUMPY_DTYPES + + to_numpy_dtypes(tm.UNSIGNED_INT_NUMPY_DTYPES) + + tm.UNSIGNED_INT_EA_DTYPES + + to_ea_dtypes(tm.UNSIGNED_INT_EA_DTYPES), +) +def test_is_unsigned_integer_dtype(dtype): + assert com.is_unsigned_integer_dtype(dtype) + + +@pytest.mark.parametrize( + "dtype", + [ + str, + float, + np.datetime64, + np.timedelta64, + pd.Index([1, 2.0]), + np.array(["a", "b"]), + np.array([], dtype=np.timedelta64), + ] + + tm.SIGNED_INT_NUMPY_DTYPES + + to_numpy_dtypes(tm.SIGNED_INT_NUMPY_DTYPES) + + tm.SIGNED_INT_EA_DTYPES + + to_ea_dtypes(tm.SIGNED_INT_EA_DTYPES), +) +def test_is_not_unsigned_integer_dtype(dtype): + assert not com.is_unsigned_integer_dtype(dtype) + + +@pytest.mark.parametrize( + "dtype", [np.int64, np.array([1, 2], dtype=np.int64), "Int64", pd.Int64Dtype] +) +def test_is_int64_dtype(dtype): + msg = "is_int64_dtype is deprecated" + with tm.assert_produces_warning(DeprecationWarning, match=msg): + assert com.is_int64_dtype(dtype) + + +def test_type_comparison_with_numeric_ea_dtype(any_numeric_ea_dtype): + # GH#43038 + assert pandas_dtype(any_numeric_ea_dtype) == any_numeric_ea_dtype + + +def test_type_comparison_with_real_numpy_dtype(any_real_numpy_dtype): + # GH#43038 + assert pandas_dtype(any_real_numpy_dtype) == any_real_numpy_dtype + + +def test_type_comparison_with_signed_int_ea_dtype_and_signed_int_numpy_dtype( + any_signed_int_ea_dtype, any_signed_int_numpy_dtype +): + # GH#43038 + assert not pandas_dtype(any_signed_int_ea_dtype) == any_signed_int_numpy_dtype + + +@pytest.mark.parametrize( + "dtype", + [ + str, + float, + np.int32, + np.uint64, + pd.Index([1, 2.0]), + np.array(["a", "b"]), + np.array([1, 2], dtype=np.uint32), + "int8", + "Int8", + pd.Int8Dtype, + ], +) +def test_is_not_int64_dtype(dtype): + msg = "is_int64_dtype is deprecated" + with tm.assert_produces_warning(DeprecationWarning, match=msg): + assert not com.is_int64_dtype(dtype) + + +def test_is_datetime64_any_dtype(): + assert not com.is_datetime64_any_dtype(int) + assert not com.is_datetime64_any_dtype(str) + assert not com.is_datetime64_any_dtype(np.array([1, 2])) + assert not com.is_datetime64_any_dtype(np.array(["a", "b"])) + + assert com.is_datetime64_any_dtype(np.datetime64) + assert com.is_datetime64_any_dtype(np.array([], dtype=np.datetime64)) + assert com.is_datetime64_any_dtype(DatetimeTZDtype("ns", "US/Eastern")) + assert com.is_datetime64_any_dtype( + pd.DatetimeIndex([1, 2, 3], dtype="datetime64[ns]") + ) + + +def test_is_datetime64_ns_dtype(): + assert not com.is_datetime64_ns_dtype(int) + assert not com.is_datetime64_ns_dtype(str) + assert not com.is_datetime64_ns_dtype(np.datetime64) + assert not com.is_datetime64_ns_dtype(np.array([1, 2])) + assert not com.is_datetime64_ns_dtype(np.array(["a", "b"])) + assert not com.is_datetime64_ns_dtype(np.array([], dtype=np.datetime64)) + + # This datetime array has the wrong unit (ps instead of ns) + assert not com.is_datetime64_ns_dtype(np.array([], dtype="datetime64[ps]")) + + assert com.is_datetime64_ns_dtype(DatetimeTZDtype("ns", "US/Eastern")) + assert com.is_datetime64_ns_dtype( + pd.DatetimeIndex([1, 2, 3], dtype=np.dtype("datetime64[ns]")) + ) + + # non-nano dt64tz + assert not com.is_datetime64_ns_dtype(DatetimeTZDtype("us", "US/Eastern")) + + +def test_is_timedelta64_ns_dtype(): + assert not com.is_timedelta64_ns_dtype(np.dtype("m8[ps]")) + assert not com.is_timedelta64_ns_dtype(np.array([1, 2], dtype=np.timedelta64)) + + assert com.is_timedelta64_ns_dtype(np.dtype("m8[ns]")) + assert com.is_timedelta64_ns_dtype(np.array([1, 2], dtype="m8[ns]")) + + +def test_is_numeric_v_string_like(): + assert not com.is_numeric_v_string_like(np.array([1]), 1) + assert not com.is_numeric_v_string_like(np.array([1]), np.array([2])) + assert not com.is_numeric_v_string_like(np.array(["foo"]), np.array(["foo"])) + + assert com.is_numeric_v_string_like(np.array([1]), "foo") + assert com.is_numeric_v_string_like(np.array([1, 2]), np.array(["foo"])) + assert com.is_numeric_v_string_like(np.array(["foo"]), np.array([1, 2])) + + +def test_needs_i8_conversion(): + assert not com.needs_i8_conversion(str) + assert not com.needs_i8_conversion(np.int64) + assert not com.needs_i8_conversion(pd.Series([1, 2])) + assert not com.needs_i8_conversion(np.array(["a", "b"])) + + assert not com.needs_i8_conversion(np.datetime64) + assert com.needs_i8_conversion(np.dtype(np.datetime64)) + assert not com.needs_i8_conversion(pd.Series([], dtype="timedelta64[ns]")) + assert com.needs_i8_conversion(pd.Series([], dtype="timedelta64[ns]").dtype) + assert not com.needs_i8_conversion(pd.DatetimeIndex(["2000"], tz="US/Eastern")) + assert com.needs_i8_conversion(pd.DatetimeIndex(["2000"], tz="US/Eastern").dtype) + + +def test_is_numeric_dtype(): + assert not com.is_numeric_dtype(str) + assert not com.is_numeric_dtype(np.datetime64) + assert not com.is_numeric_dtype(np.timedelta64) + assert not com.is_numeric_dtype(np.array(["a", "b"])) + assert not com.is_numeric_dtype(np.array([], dtype=np.timedelta64)) + + assert com.is_numeric_dtype(int) + assert com.is_numeric_dtype(float) + assert com.is_numeric_dtype(np.uint64) + assert com.is_numeric_dtype(pd.Series([1, 2])) + assert com.is_numeric_dtype(pd.Index([1, 2.0])) + + class MyNumericDType(ExtensionDtype): + @property + def type(self): + return str + + @property + def name(self): + raise NotImplementedError + + @classmethod + def construct_array_type(cls): + raise NotImplementedError + + def _is_numeric(self) -> bool: + return True + + assert com.is_numeric_dtype(MyNumericDType()) + + +def test_is_any_real_numeric_dtype(): + assert not com.is_any_real_numeric_dtype(str) + assert not com.is_any_real_numeric_dtype(bool) + assert not com.is_any_real_numeric_dtype(complex) + assert not com.is_any_real_numeric_dtype(object) + assert not com.is_any_real_numeric_dtype(np.datetime64) + assert not com.is_any_real_numeric_dtype(np.array(["a", "b", complex(1, 2)])) + assert not com.is_any_real_numeric_dtype(pd.DataFrame([complex(1, 2), True])) + + assert com.is_any_real_numeric_dtype(int) + assert com.is_any_real_numeric_dtype(float) + assert com.is_any_real_numeric_dtype(np.array([1, 2.5])) + + +def test_is_float_dtype(): + assert not com.is_float_dtype(str) + assert not com.is_float_dtype(int) + assert not com.is_float_dtype(pd.Series([1, 2])) + assert not com.is_float_dtype(np.array(["a", "b"])) + + assert com.is_float_dtype(float) + assert com.is_float_dtype(pd.Index([1, 2.0])) + + +def test_is_bool_dtype(): + assert not com.is_bool_dtype(int) + assert not com.is_bool_dtype(str) + assert not com.is_bool_dtype(pd.Series([1, 2])) + assert not com.is_bool_dtype(pd.Series(["a", "b"], dtype="category")) + assert not com.is_bool_dtype(np.array(["a", "b"])) + assert not com.is_bool_dtype(pd.Index(["a", "b"])) + assert not com.is_bool_dtype("Int64") + + assert com.is_bool_dtype(bool) + assert com.is_bool_dtype(np.bool_) + assert com.is_bool_dtype(pd.Series([True, False], dtype="category")) + assert com.is_bool_dtype(np.array([True, False])) + assert com.is_bool_dtype(pd.Index([True, False])) + + assert com.is_bool_dtype(pd.BooleanDtype()) + assert com.is_bool_dtype(pd.array([True, False, None], dtype="boolean")) + assert com.is_bool_dtype("boolean") + + +def test_is_bool_dtype_numpy_error(): + # GH39010 + assert not com.is_bool_dtype("0 - Name") + + +@pytest.mark.parametrize( + "check_scipy", [False, pytest.param(True, marks=td.skip_if_no("scipy"))] +) +def test_is_extension_array_dtype(check_scipy): + assert not com.is_extension_array_dtype([1, 2, 3]) + assert not com.is_extension_array_dtype(np.array([1, 2, 3])) + assert not com.is_extension_array_dtype(pd.DatetimeIndex([1, 2, 3])) + + cat = pd.Categorical([1, 2, 3]) + assert com.is_extension_array_dtype(cat) + assert com.is_extension_array_dtype(pd.Series(cat)) + assert com.is_extension_array_dtype(SparseArray([1, 2, 3])) + assert com.is_extension_array_dtype(pd.DatetimeIndex(["2000"], tz="US/Eastern")) + + dtype = DatetimeTZDtype("ns", tz="US/Eastern") + s = pd.Series([], dtype=dtype) + assert com.is_extension_array_dtype(s) + + if check_scipy: + import scipy.sparse + + assert not com.is_extension_array_dtype(scipy.sparse.bsr_matrix([1, 2, 3])) + + +def test_is_complex_dtype(): + assert not com.is_complex_dtype(int) + assert not com.is_complex_dtype(str) + assert not com.is_complex_dtype(pd.Series([1, 2])) + assert not com.is_complex_dtype(np.array(["a", "b"])) + + assert com.is_complex_dtype(np.complex128) + assert com.is_complex_dtype(complex) + assert com.is_complex_dtype(np.array([1 + 1j, 5])) + + +@pytest.mark.parametrize( + "input_param,result", + [ + (int, np.dtype(int)), + ("int32", np.dtype("int32")), + (float, np.dtype(float)), + ("float64", np.dtype("float64")), + (np.dtype("float64"), np.dtype("float64")), + (str, np.dtype(str)), + (pd.Series([1, 2], dtype=np.dtype("int16")), np.dtype("int16")), + (pd.Series(["a", "b"], dtype=object), np.dtype(object)), + (pd.Index([1, 2]), np.dtype("int64")), + (pd.Index(["a", "b"], dtype=object), np.dtype(object)), + ("category", "category"), + (pd.Categorical(["a", "b"]).dtype, CategoricalDtype(["a", "b"])), + (pd.Categorical(["a", "b"]), CategoricalDtype(["a", "b"])), + (pd.CategoricalIndex(["a", "b"]).dtype, CategoricalDtype(["a", "b"])), + (pd.CategoricalIndex(["a", "b"]), CategoricalDtype(["a", "b"])), + (CategoricalDtype(), CategoricalDtype()), + (pd.DatetimeIndex([1, 2]), np.dtype("=M8[ns]")), + (pd.DatetimeIndex([1, 2]).dtype, np.dtype("=M8[ns]")), + (" df.two.sum() + + with tm.assert_produces_warning(None): + # successfully modify column in place + # this should not raise a warning + df.one += 1 + assert df.one.iloc[0] == 2 + + with tm.assert_produces_warning(None): + # successfully add an attribute to a series + # this should not raise a warning + df.two.not_an_index = [1, 2] + + with tm.assert_produces_warning(UserWarning): + # warn when setting column to nonexistent name + df.four = df.two + 2 + assert df.four.sum() > df.two.sum() diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/dtypes/test_inference.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/dtypes/test_inference.py new file mode 100644 index 0000000000000000000000000000000000000000..0567be737c681282d162225d10a0849e476a579a --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/dtypes/test_inference.py @@ -0,0 +1,2047 @@ +""" +These the test the public routines exposed in types/common.py +related to inference and not otherwise tested in types/test_common.py + +""" +import collections +from collections import namedtuple +from collections.abc import Iterator +from datetime import ( + date, + datetime, + time, + timedelta, +) +from decimal import Decimal +from fractions import Fraction +from io import StringIO +import itertools +from numbers import Number +import re +import sys +from typing import ( + Generic, + TypeVar, +) + +import numpy as np +import pytest +import pytz + +from pandas._libs import ( + lib, + missing as libmissing, + ops as libops, +) +from pandas.compat.numpy import np_version_gt2 + +from pandas.core.dtypes import inference +from pandas.core.dtypes.cast import find_result_type +from pandas.core.dtypes.common import ( + ensure_int32, + is_bool, + is_complex, + is_datetime64_any_dtype, + is_datetime64_dtype, + is_datetime64_ns_dtype, + is_datetime64tz_dtype, + is_float, + is_integer, + is_number, + is_scalar, + is_scipy_sparse, + is_timedelta64_dtype, + is_timedelta64_ns_dtype, +) + +import pandas as pd +from pandas import ( + Categorical, + DataFrame, + DateOffset, + DatetimeIndex, + Index, + Interval, + Period, + PeriodIndex, + Series, + Timedelta, + TimedeltaIndex, + Timestamp, +) +import pandas._testing as tm +from pandas.core.arrays import ( + BooleanArray, + FloatingArray, + IntegerArray, +) + + +@pytest.fixture(params=[True, False], ids=str) +def coerce(request): + return request.param + + +class MockNumpyLikeArray: + """ + A class which is numpy-like (e.g. Pint's Quantity) but not actually numpy + + The key is that it is not actually a numpy array so + ``util.is_array(mock_numpy_like_array_instance)`` returns ``False``. Other + important properties are that the class defines a :meth:`__iter__` method + (so that ``isinstance(abc.Iterable)`` returns ``True``) and has a + :meth:`ndim` property, as pandas special-cases 0-dimensional arrays in some + cases. + + We expect pandas to behave with respect to such duck arrays exactly as + with real numpy arrays. In particular, a 0-dimensional duck array is *NOT* + a scalar (`is_scalar(np.array(1)) == False`), but it is not list-like either. + """ + + def __init__(self, values) -> None: + self._values = values + + def __iter__(self) -> Iterator: + iter_values = iter(self._values) + + def it_outer(): + yield from iter_values + + return it_outer() + + def __len__(self) -> int: + return len(self._values) + + def __array__(self, dtype=None, copy=None): + return np.asarray(self._values, dtype=dtype) + + @property + def ndim(self): + return self._values.ndim + + @property + def dtype(self): + return self._values.dtype + + @property + def size(self): + return self._values.size + + @property + def shape(self): + return self._values.shape + + +# collect all objects to be tested for list-like-ness; use tuples of objects, +# whether they are list-like or not (special casing for sets), and their ID +ll_params = [ + ([1], True, "list"), + ([], True, "list-empty"), + ((1,), True, "tuple"), + ((), True, "tuple-empty"), + ({"a": 1}, True, "dict"), + ({}, True, "dict-empty"), + ({"a", 1}, "set", "set"), + (set(), "set", "set-empty"), + (frozenset({"a", 1}), "set", "frozenset"), + (frozenset(), "set", "frozenset-empty"), + (iter([1, 2]), True, "iterator"), + (iter([]), True, "iterator-empty"), + ((x for x in [1, 2]), True, "generator"), + ((_ for _ in []), True, "generator-empty"), + (Series([1]), True, "Series"), + (Series([], dtype=object), True, "Series-empty"), + # Series.str will still raise a TypeError if iterated + (Series(["a"]).str, True, "StringMethods"), + (Series([], dtype="O").str, True, "StringMethods-empty"), + (Index([1]), True, "Index"), + (Index([]), True, "Index-empty"), + (DataFrame([[1]]), True, "DataFrame"), + (DataFrame(), True, "DataFrame-empty"), + (np.ndarray((2,) * 1), True, "ndarray-1d"), + (np.array([]), True, "ndarray-1d-empty"), + (np.ndarray((2,) * 2), True, "ndarray-2d"), + (np.array([[]]), True, "ndarray-2d-empty"), + (np.ndarray((2,) * 3), True, "ndarray-3d"), + (np.array([[[]]]), True, "ndarray-3d-empty"), + (np.ndarray((2,) * 4), True, "ndarray-4d"), + (np.array([[[[]]]]), True, "ndarray-4d-empty"), + (np.array(2), False, "ndarray-0d"), + (MockNumpyLikeArray(np.ndarray((2,) * 1)), True, "duck-ndarray-1d"), + (MockNumpyLikeArray(np.array([])), True, "duck-ndarray-1d-empty"), + (MockNumpyLikeArray(np.ndarray((2,) * 2)), True, "duck-ndarray-2d"), + (MockNumpyLikeArray(np.array([[]])), True, "duck-ndarray-2d-empty"), + (MockNumpyLikeArray(np.ndarray((2,) * 3)), True, "duck-ndarray-3d"), + (MockNumpyLikeArray(np.array([[[]]])), True, "duck-ndarray-3d-empty"), + (MockNumpyLikeArray(np.ndarray((2,) * 4)), True, "duck-ndarray-4d"), + (MockNumpyLikeArray(np.array([[[[]]]])), True, "duck-ndarray-4d-empty"), + (MockNumpyLikeArray(np.array(2)), False, "duck-ndarray-0d"), + (1, False, "int"), + (b"123", False, "bytes"), + (b"", False, "bytes-empty"), + ("123", False, "string"), + ("", False, "string-empty"), + (str, False, "string-type"), + (object(), False, "object"), + (np.nan, False, "NaN"), + (None, False, "None"), +] +objs, expected, ids = zip(*ll_params) + + +@pytest.fixture(params=zip(objs, expected), ids=ids) +def maybe_list_like(request): + return request.param + + +def test_is_list_like(maybe_list_like): + obj, expected = maybe_list_like + expected = True if expected == "set" else expected + assert inference.is_list_like(obj) == expected + + +def test_is_list_like_disallow_sets(maybe_list_like): + obj, expected = maybe_list_like + expected = False if expected == "set" else expected + assert inference.is_list_like(obj, allow_sets=False) == expected + + +def test_is_list_like_recursion(): + # GH 33721 + # interpreter would crash with SIGABRT + def list_like(): + inference.is_list_like([]) + list_like() + + rec_limit = sys.getrecursionlimit() + try: + # Limit to avoid stack overflow on Windows CI + sys.setrecursionlimit(100) + with tm.external_error_raised(RecursionError): + list_like() + finally: + sys.setrecursionlimit(rec_limit) + + +def test_is_list_like_iter_is_none(): + # GH 43373 + # is_list_like was yielding false positives with __iter__ == None + class NotListLike: + def __getitem__(self, item): + return self + + __iter__ = None + + assert not inference.is_list_like(NotListLike()) + + +def test_is_list_like_generic(): + # GH 49649 + # is_list_like was yielding false positives for Generic classes in python 3.11 + T = TypeVar("T") + + class MyDataFrame(DataFrame, Generic[T]): + ... + + tstc = MyDataFrame[int] + tst = MyDataFrame[int]({"x": [1, 2, 3]}) + + assert not inference.is_list_like(tstc) + assert isinstance(tst, DataFrame) + assert inference.is_list_like(tst) + + +def test_is_sequence(): + is_seq = inference.is_sequence + assert is_seq((1, 2)) + assert is_seq([1, 2]) + assert not is_seq("abcd") + assert not is_seq(np.int64) + + class A: + def __getitem__(self, item): + return 1 + + assert not is_seq(A()) + + +def test_is_array_like(): + assert inference.is_array_like(Series([], dtype=object)) + assert inference.is_array_like(Series([1, 2])) + assert inference.is_array_like(np.array(["a", "b"])) + assert inference.is_array_like(Index(["2016-01-01"])) + assert inference.is_array_like(np.array([2, 3])) + assert inference.is_array_like(MockNumpyLikeArray(np.array([2, 3]))) + + class DtypeList(list): + dtype = "special" + + assert inference.is_array_like(DtypeList()) + + assert not inference.is_array_like([1, 2, 3]) + assert not inference.is_array_like(()) + assert not inference.is_array_like("foo") + assert not inference.is_array_like(123) + + +@pytest.mark.parametrize( + "inner", + [ + [], + [1], + (1,), + (1, 2), + {"a": 1}, + {1, "a"}, + Series([1]), + Series([], dtype=object), + Series(["a"]).str, + (x for x in range(5)), + ], +) +@pytest.mark.parametrize("outer", [list, Series, np.array, tuple]) +def test_is_nested_list_like_passes(inner, outer): + result = outer([inner for _ in range(5)]) + assert inference.is_list_like(result) + + +@pytest.mark.parametrize( + "obj", + [ + "abc", + [], + [1], + (1,), + ["a"], + "a", + {"a"}, + [1, 2, 3], + Series([1]), + DataFrame({"A": [1]}), + ([1, 2] for _ in range(5)), + ], +) +def test_is_nested_list_like_fails(obj): + assert not inference.is_nested_list_like(obj) + + +@pytest.mark.parametrize("ll", [{}, {"A": 1}, Series([1]), collections.defaultdict()]) +def test_is_dict_like_passes(ll): + assert inference.is_dict_like(ll) + + +@pytest.mark.parametrize( + "ll", + [ + "1", + 1, + [1, 2], + (1, 2), + range(2), + Index([1]), + dict, + collections.defaultdict, + Series, + ], +) +def test_is_dict_like_fails(ll): + assert not inference.is_dict_like(ll) + + +@pytest.mark.parametrize("has_keys", [True, False]) +@pytest.mark.parametrize("has_getitem", [True, False]) +@pytest.mark.parametrize("has_contains", [True, False]) +def test_is_dict_like_duck_type(has_keys, has_getitem, has_contains): + class DictLike: + def __init__(self, d) -> None: + self.d = d + + if has_keys: + + def keys(self): + return self.d.keys() + + if has_getitem: + + def __getitem__(self, key): + return self.d.__getitem__(key) + + if has_contains: + + def __contains__(self, key) -> bool: + return self.d.__contains__(key) + + d = DictLike({1: 2}) + result = inference.is_dict_like(d) + expected = has_keys and has_getitem and has_contains + + assert result is expected + + +def test_is_file_like(): + class MockFile: + pass + + is_file = inference.is_file_like + + data = StringIO("data") + assert is_file(data) + + # No read / write attributes + # No iterator attributes + m = MockFile() + assert not is_file(m) + + MockFile.write = lambda self: 0 + + # Write attribute but not an iterator + m = MockFile() + assert not is_file(m) + + # gh-16530: Valid iterator just means we have the + # __iter__ attribute for our purposes. + MockFile.__iter__ = lambda self: self + + # Valid write-only file + m = MockFile() + assert is_file(m) + + del MockFile.write + MockFile.read = lambda self: 0 + + # Valid read-only file + m = MockFile() + assert is_file(m) + + # Iterator but no read / write attributes + data = [1, 2, 3] + assert not is_file(data) + + +test_tuple = collections.namedtuple("test_tuple", ["a", "b", "c"]) + + +@pytest.mark.parametrize("ll", [test_tuple(1, 2, 3)]) +def test_is_names_tuple_passes(ll): + assert inference.is_named_tuple(ll) + + +@pytest.mark.parametrize("ll", [(1, 2, 3), "a", Series({"pi": 3.14})]) +def test_is_names_tuple_fails(ll): + assert not inference.is_named_tuple(ll) + + +def test_is_hashable(): + # all new-style classes are hashable by default + class HashableClass: + pass + + class UnhashableClass1: + __hash__ = None + + class UnhashableClass2: + def __hash__(self): + raise TypeError("Not hashable") + + hashable = (1, 3.14, np.float64(3.14), "a", (), (1,), HashableClass()) + not_hashable = ([], UnhashableClass1()) + abc_hashable_not_really_hashable = (([],), UnhashableClass2()) + + for i in hashable: + assert inference.is_hashable(i) + for i in not_hashable: + assert not inference.is_hashable(i) + for i in abc_hashable_not_really_hashable: + assert not inference.is_hashable(i) + + # numpy.array is no longer collections.abc.Hashable as of + # https://github.com/numpy/numpy/pull/5326, just test + # is_hashable() + assert not inference.is_hashable(np.array([])) + + +@pytest.mark.parametrize("ll", [re.compile("ad")]) +def test_is_re_passes(ll): + assert inference.is_re(ll) + + +@pytest.mark.parametrize("ll", ["x", 2, 3, object()]) +def test_is_re_fails(ll): + assert not inference.is_re(ll) + + +@pytest.mark.parametrize( + "ll", [r"a", "x", r"asdf", re.compile("adsf"), r"\u2233\s*", re.compile(r"")] +) +def test_is_recompilable_passes(ll): + assert inference.is_re_compilable(ll) + + +@pytest.mark.parametrize("ll", [1, [], object()]) +def test_is_recompilable_fails(ll): + assert not inference.is_re_compilable(ll) + + +class TestInference: + @pytest.mark.parametrize( + "arr", + [ + np.array(list("abc"), dtype="S1"), + np.array(list("abc"), dtype="S1").astype(object), + [b"a", np.nan, b"c"], + ], + ) + def test_infer_dtype_bytes(self, arr): + result = lib.infer_dtype(arr, skipna=True) + assert result == "bytes" + + @pytest.mark.parametrize( + "value, expected", + [ + (float("inf"), True), + (np.inf, True), + (-np.inf, False), + (1, False), + ("a", False), + ], + ) + def test_isposinf_scalar(self, value, expected): + # GH 11352 + result = libmissing.isposinf_scalar(value) + assert result is expected + + @pytest.mark.parametrize( + "value, expected", + [ + (float("-inf"), True), + (-np.inf, True), + (np.inf, False), + (1, False), + ("a", False), + ], + ) + def test_isneginf_scalar(self, value, expected): + result = libmissing.isneginf_scalar(value) + assert result is expected + + @pytest.mark.parametrize( + "convert_to_masked_nullable, exp", + [ + ( + True, + BooleanArray( + np.array([True, False], dtype="bool"), np.array([False, True]) + ), + ), + (False, np.array([True, np.nan], dtype="object")), + ], + ) + def test_maybe_convert_nullable_boolean(self, convert_to_masked_nullable, exp): + # GH 40687 + arr = np.array([True, np.nan], dtype=object) + result = libops.maybe_convert_bool( + arr, set(), convert_to_masked_nullable=convert_to_masked_nullable + ) + if convert_to_masked_nullable: + tm.assert_extension_array_equal(BooleanArray(*result), exp) + else: + result = result[0] + tm.assert_numpy_array_equal(result, exp) + + @pytest.mark.parametrize("convert_to_masked_nullable", [True, False]) + @pytest.mark.parametrize("coerce_numeric", [True, False]) + @pytest.mark.parametrize( + "infinity", ["inf", "inF", "iNf", "Inf", "iNF", "InF", "INf", "INF"] + ) + @pytest.mark.parametrize("prefix", ["", "-", "+"]) + def test_maybe_convert_numeric_infinities( + self, coerce_numeric, infinity, prefix, convert_to_masked_nullable + ): + # see gh-13274 + result, _ = lib.maybe_convert_numeric( + np.array([prefix + infinity], dtype=object), + na_values={"", "NULL", "nan"}, + coerce_numeric=coerce_numeric, + convert_to_masked_nullable=convert_to_masked_nullable, + ) + expected = np.array([np.inf if prefix in ["", "+"] else -np.inf]) + tm.assert_numpy_array_equal(result, expected) + + @pytest.mark.parametrize("convert_to_masked_nullable", [True, False]) + def test_maybe_convert_numeric_infinities_raises(self, convert_to_masked_nullable): + msg = "Unable to parse string" + with pytest.raises(ValueError, match=msg): + lib.maybe_convert_numeric( + np.array(["foo_inf"], dtype=object), + na_values={"", "NULL", "nan"}, + coerce_numeric=False, + convert_to_masked_nullable=convert_to_masked_nullable, + ) + + @pytest.mark.parametrize("convert_to_masked_nullable", [True, False]) + def test_maybe_convert_numeric_post_floatify_nan( + self, coerce, convert_to_masked_nullable + ): + # see gh-13314 + data = np.array(["1.200", "-999.000", "4.500"], dtype=object) + expected = np.array([1.2, np.nan, 4.5], dtype=np.float64) + nan_values = {-999, -999.0} + + out = lib.maybe_convert_numeric( + data, + nan_values, + coerce, + convert_to_masked_nullable=convert_to_masked_nullable, + ) + if convert_to_masked_nullable: + expected = FloatingArray(expected, np.isnan(expected)) + tm.assert_extension_array_equal(expected, FloatingArray(*out)) + else: + out = out[0] + tm.assert_numpy_array_equal(out, expected) + + def test_convert_infs(self): + arr = np.array(["inf", "inf", "inf"], dtype="O") + result, _ = lib.maybe_convert_numeric(arr, set(), False) + assert result.dtype == np.float64 + + arr = np.array(["-inf", "-inf", "-inf"], dtype="O") + result, _ = lib.maybe_convert_numeric(arr, set(), False) + assert result.dtype == np.float64 + + def test_scientific_no_exponent(self): + # See PR 12215 + arr = np.array(["42E", "2E", "99e", "6e"], dtype="O") + result, _ = lib.maybe_convert_numeric(arr, set(), False, True) + assert np.all(np.isnan(result)) + + def test_convert_non_hashable(self): + # GH13324 + # make sure that we are handing non-hashables + arr = np.array([[10.0, 2], 1.0, "apple"], dtype=object) + result, _ = lib.maybe_convert_numeric(arr, set(), False, True) + tm.assert_numpy_array_equal(result, np.array([np.nan, 1.0, np.nan])) + + def test_convert_numeric_uint64(self): + arr = np.array([2**63], dtype=object) + exp = np.array([2**63], dtype=np.uint64) + tm.assert_numpy_array_equal(lib.maybe_convert_numeric(arr, set())[0], exp) + + arr = np.array([str(2**63)], dtype=object) + exp = np.array([2**63], dtype=np.uint64) + tm.assert_numpy_array_equal(lib.maybe_convert_numeric(arr, set())[0], exp) + + arr = np.array([np.uint64(2**63)], dtype=object) + exp = np.array([2**63], dtype=np.uint64) + tm.assert_numpy_array_equal(lib.maybe_convert_numeric(arr, set())[0], exp) + + @pytest.mark.parametrize( + "arr", + [ + np.array([2**63, np.nan], dtype=object), + np.array([str(2**63), np.nan], dtype=object), + np.array([np.nan, 2**63], dtype=object), + np.array([np.nan, str(2**63)], dtype=object), + ], + ) + def test_convert_numeric_uint64_nan(self, coerce, arr): + expected = arr.astype(float) if coerce else arr.copy() + result, _ = lib.maybe_convert_numeric(arr, set(), coerce_numeric=coerce) + tm.assert_almost_equal(result, expected) + + @pytest.mark.parametrize("convert_to_masked_nullable", [True, False]) + def test_convert_numeric_uint64_nan_values( + self, coerce, convert_to_masked_nullable + ): + arr = np.array([2**63, 2**63 + 1], dtype=object) + na_values = {2**63} + + expected = ( + np.array([np.nan, 2**63 + 1], dtype=float) if coerce else arr.copy() + ) + result = lib.maybe_convert_numeric( + arr, + na_values, + coerce_numeric=coerce, + convert_to_masked_nullable=convert_to_masked_nullable, + ) + if convert_to_masked_nullable and coerce: + expected = IntegerArray( + np.array([0, 2**63 + 1], dtype="u8"), + np.array([True, False], dtype="bool"), + ) + result = IntegerArray(*result) + else: + result = result[0] # discard mask + tm.assert_almost_equal(result, expected) + + @pytest.mark.parametrize( + "case", + [ + np.array([2**63, -1], dtype=object), + np.array([str(2**63), -1], dtype=object), + np.array([str(2**63), str(-1)], dtype=object), + np.array([-1, 2**63], dtype=object), + np.array([-1, str(2**63)], dtype=object), + np.array([str(-1), str(2**63)], dtype=object), + ], + ) + @pytest.mark.parametrize("convert_to_masked_nullable", [True, False]) + def test_convert_numeric_int64_uint64( + self, case, coerce, convert_to_masked_nullable + ): + expected = case.astype(float) if coerce else case.copy() + result, _ = lib.maybe_convert_numeric( + case, + set(), + coerce_numeric=coerce, + convert_to_masked_nullable=convert_to_masked_nullable, + ) + + tm.assert_almost_equal(result, expected) + + @pytest.mark.parametrize("convert_to_masked_nullable", [True, False]) + def test_convert_numeric_string_uint64(self, convert_to_masked_nullable): + # GH32394 + result = lib.maybe_convert_numeric( + np.array(["uint64"], dtype=object), + set(), + coerce_numeric=True, + convert_to_masked_nullable=convert_to_masked_nullable, + ) + if convert_to_masked_nullable: + result = FloatingArray(*result) + else: + result = result[0] + assert np.isnan(result) + + @pytest.mark.parametrize("value", [-(2**63) - 1, 2**64]) + def test_convert_int_overflow(self, value): + # see gh-18584 + arr = np.array([value], dtype=object) + result = lib.maybe_convert_objects(arr) + tm.assert_numpy_array_equal(arr, result) + + @pytest.mark.parametrize("val", [None, np.nan, float("nan")]) + @pytest.mark.parametrize("dtype", ["M8[ns]", "m8[ns]"]) + def test_maybe_convert_objects_nat_inference(self, val, dtype): + dtype = np.dtype(dtype) + vals = np.array([pd.NaT, val], dtype=object) + result = lib.maybe_convert_objects( + vals, + convert_non_numeric=True, + dtype_if_all_nat=dtype, + ) + assert result.dtype == dtype + assert np.isnat(result).all() + + result = lib.maybe_convert_objects( + vals[::-1], + convert_non_numeric=True, + dtype_if_all_nat=dtype, + ) + assert result.dtype == dtype + assert np.isnat(result).all() + + @pytest.mark.parametrize( + "value, expected_dtype", + [ + # see gh-4471 + ([2**63], np.uint64), + # NumPy bug: can't compare uint64 to int64, as that + # results in both casting to float64, so we should + # make sure that this function is robust against it + ([np.uint64(2**63)], np.uint64), + ([2, -1], np.int64), + ([2**63, -1], object), + # GH#47294 + ([np.uint8(1)], np.uint8), + ([np.uint16(1)], np.uint16), + ([np.uint32(1)], np.uint32), + ([np.uint64(1)], np.uint64), + ([np.uint8(2), np.uint16(1)], np.uint16), + ([np.uint32(2), np.uint16(1)], np.uint32), + ([np.uint32(2), -1], object), + ([np.uint32(2), 1], np.uint64), + ([np.uint32(2), np.int32(1)], object), + ], + ) + def test_maybe_convert_objects_uint(self, value, expected_dtype): + arr = np.array(value, dtype=object) + exp = np.array(value, dtype=expected_dtype) + tm.assert_numpy_array_equal(lib.maybe_convert_objects(arr), exp) + + def test_maybe_convert_objects_datetime(self): + # GH27438 + arr = np.array( + [np.datetime64("2000-01-01"), np.timedelta64(1, "s")], dtype=object + ) + exp = arr.copy() + out = lib.maybe_convert_objects(arr, convert_non_numeric=True) + tm.assert_numpy_array_equal(out, exp) + + arr = np.array([pd.NaT, np.timedelta64(1, "s")], dtype=object) + exp = np.array([np.timedelta64("NaT"), np.timedelta64(1, "s")], dtype="m8[ns]") + out = lib.maybe_convert_objects(arr, convert_non_numeric=True) + tm.assert_numpy_array_equal(out, exp) + + # with convert_non_numeric=True, the nan is a valid NA value for td64 + arr = np.array([np.timedelta64(1, "s"), np.nan], dtype=object) + exp = exp[::-1] + out = lib.maybe_convert_objects(arr, convert_non_numeric=True) + tm.assert_numpy_array_equal(out, exp) + + def test_maybe_convert_objects_dtype_if_all_nat(self): + arr = np.array([pd.NaT, pd.NaT], dtype=object) + out = lib.maybe_convert_objects(arr, convert_non_numeric=True) + # no dtype_if_all_nat passed -> we dont guess + tm.assert_numpy_array_equal(out, arr) + + out = lib.maybe_convert_objects( + arr, + convert_non_numeric=True, + dtype_if_all_nat=np.dtype("timedelta64[ns]"), + ) + exp = np.array(["NaT", "NaT"], dtype="timedelta64[ns]") + tm.assert_numpy_array_equal(out, exp) + + out = lib.maybe_convert_objects( + arr, + convert_non_numeric=True, + dtype_if_all_nat=np.dtype("datetime64[ns]"), + ) + exp = np.array(["NaT", "NaT"], dtype="datetime64[ns]") + tm.assert_numpy_array_equal(out, exp) + + def test_maybe_convert_objects_dtype_if_all_nat_invalid(self): + # we accept datetime64[ns], timedelta64[ns], and EADtype + arr = np.array([pd.NaT, pd.NaT], dtype=object) + + with pytest.raises(ValueError, match="int64"): + lib.maybe_convert_objects( + arr, + convert_non_numeric=True, + dtype_if_all_nat=np.dtype("int64"), + ) + + @pytest.mark.parametrize("dtype", ["datetime64[ns]", "timedelta64[ns]"]) + def test_maybe_convert_objects_datetime_overflow_safe(self, dtype): + stamp = datetime(2363, 10, 4) # Enterprise-D launch date + if dtype == "timedelta64[ns]": + stamp = stamp - datetime(1970, 1, 1) + arr = np.array([stamp], dtype=object) + + out = lib.maybe_convert_objects(arr, convert_non_numeric=True) + # no OutOfBoundsDatetime/OutOfBoundsTimedeltas + tm.assert_numpy_array_equal(out, arr) + + def test_maybe_convert_objects_mixed_datetimes(self): + ts = Timestamp("now") + vals = [ts, ts.to_pydatetime(), ts.to_datetime64(), pd.NaT, np.nan, None] + + for data in itertools.permutations(vals): + data = np.array(list(data), dtype=object) + expected = DatetimeIndex(data)._data._ndarray + result = lib.maybe_convert_objects(data, convert_non_numeric=True) + tm.assert_numpy_array_equal(result, expected) + + def test_maybe_convert_objects_timedelta64_nat(self): + obj = np.timedelta64("NaT", "ns") + arr = np.array([obj], dtype=object) + assert arr[0] is obj + + result = lib.maybe_convert_objects(arr, convert_non_numeric=True) + + expected = np.array([obj], dtype="m8[ns]") + tm.assert_numpy_array_equal(result, expected) + + @pytest.mark.parametrize( + "exp", + [ + IntegerArray(np.array([2, 0], dtype="i8"), np.array([False, True])), + IntegerArray(np.array([2, 0], dtype="int64"), np.array([False, True])), + ], + ) + def test_maybe_convert_objects_nullable_integer(self, exp): + # GH27335 + arr = np.array([2, np.nan], dtype=object) + result = lib.maybe_convert_objects(arr, convert_to_nullable_dtype=True) + + tm.assert_extension_array_equal(result, exp) + + @pytest.mark.parametrize( + "dtype, val", [("int64", 1), ("uint64", np.iinfo(np.int64).max + 1)] + ) + def test_maybe_convert_objects_nullable_none(self, dtype, val): + # GH#50043 + arr = np.array([val, None, 3], dtype="object") + result = lib.maybe_convert_objects(arr, convert_to_nullable_dtype=True) + expected = IntegerArray( + np.array([val, 0, 3], dtype=dtype), np.array([False, True, False]) + ) + tm.assert_extension_array_equal(result, expected) + + @pytest.mark.parametrize( + "convert_to_masked_nullable, exp", + [ + (True, IntegerArray(np.array([2, 0], dtype="i8"), np.array([False, True]))), + (False, np.array([2, np.nan], dtype="float64")), + ], + ) + def test_maybe_convert_numeric_nullable_integer( + self, convert_to_masked_nullable, exp + ): + # GH 40687 + arr = np.array([2, np.nan], dtype=object) + result = lib.maybe_convert_numeric( + arr, set(), convert_to_masked_nullable=convert_to_masked_nullable + ) + if convert_to_masked_nullable: + result = IntegerArray(*result) + tm.assert_extension_array_equal(result, exp) + else: + result = result[0] + tm.assert_numpy_array_equal(result, exp) + + @pytest.mark.parametrize( + "convert_to_masked_nullable, exp", + [ + ( + True, + FloatingArray( + np.array([2.0, 0.0], dtype="float64"), np.array([False, True]) + ), + ), + (False, np.array([2.0, np.nan], dtype="float64")), + ], + ) + def test_maybe_convert_numeric_floating_array( + self, convert_to_masked_nullable, exp + ): + # GH 40687 + arr = np.array([2.0, np.nan], dtype=object) + result = lib.maybe_convert_numeric( + arr, set(), convert_to_masked_nullable=convert_to_masked_nullable + ) + if convert_to_masked_nullable: + tm.assert_extension_array_equal(FloatingArray(*result), exp) + else: + result = result[0] + tm.assert_numpy_array_equal(result, exp) + + def test_maybe_convert_objects_bool_nan(self): + # GH32146 + ind = Index([True, False, np.nan], dtype=object) + exp = np.array([True, False, np.nan], dtype=object) + out = lib.maybe_convert_objects(ind.values, safe=1) + tm.assert_numpy_array_equal(out, exp) + + def test_maybe_convert_objects_nullable_boolean(self): + # GH50047 + arr = np.array([True, False], dtype=object) + exp = np.array([True, False]) + out = lib.maybe_convert_objects(arr, convert_to_nullable_dtype=True) + tm.assert_numpy_array_equal(out, exp) + + arr = np.array([True, False, pd.NaT], dtype=object) + exp = np.array([True, False, pd.NaT], dtype=object) + out = lib.maybe_convert_objects(arr, convert_to_nullable_dtype=True) + tm.assert_numpy_array_equal(out, exp) + + @pytest.mark.parametrize("val", [None, np.nan]) + def test_maybe_convert_objects_nullable_boolean_na(self, val): + # GH50047 + arr = np.array([True, False, val], dtype=object) + exp = BooleanArray( + np.array([True, False, False]), np.array([False, False, True]) + ) + out = lib.maybe_convert_objects(arr, convert_to_nullable_dtype=True) + tm.assert_extension_array_equal(out, exp) + + @pytest.mark.parametrize( + "data0", + [ + True, + 1, + 1.0, + 1.0 + 1.0j, + np.int8(1), + np.int16(1), + np.int32(1), + np.int64(1), + np.float16(1), + np.float32(1), + np.float64(1), + np.complex64(1), + np.complex128(1), + ], + ) + @pytest.mark.parametrize( + "data1", + [ + True, + 1, + 1.0, + 1.0 + 1.0j, + np.int8(1), + np.int16(1), + np.int32(1), + np.int64(1), + np.float16(1), + np.float32(1), + np.float64(1), + np.complex64(1), + np.complex128(1), + ], + ) + def test_maybe_convert_objects_itemsize(self, data0, data1): + # GH 40908 + data = [data0, data1] + arr = np.array(data, dtype="object") + + common_kind = np.result_type(type(data0), type(data1)).kind + kind0 = "python" if not hasattr(data0, "dtype") else data0.dtype.kind + kind1 = "python" if not hasattr(data1, "dtype") else data1.dtype.kind + if kind0 != "python" and kind1 != "python": + kind = common_kind + itemsize = max(data0.dtype.itemsize, data1.dtype.itemsize) + elif is_bool(data0) or is_bool(data1): + kind = "bool" if (is_bool(data0) and is_bool(data1)) else "object" + itemsize = "" + elif is_complex(data0) or is_complex(data1): + kind = common_kind + itemsize = 16 + else: + kind = common_kind + itemsize = 8 + + expected = np.array(data, dtype=f"{kind}{itemsize}") + result = lib.maybe_convert_objects(arr) + tm.assert_numpy_array_equal(result, expected) + + def test_mixed_dtypes_remain_object_array(self): + # GH14956 + arr = np.array([datetime(2015, 1, 1, tzinfo=pytz.utc), 1], dtype=object) + result = lib.maybe_convert_objects(arr, convert_non_numeric=True) + tm.assert_numpy_array_equal(result, arr) + + @pytest.mark.parametrize( + "idx", + [ + pd.IntervalIndex.from_breaks(range(5), closed="both"), + pd.period_range("2016-01-01", periods=3, freq="D"), + ], + ) + def test_maybe_convert_objects_ea(self, idx): + result = lib.maybe_convert_objects( + np.array(idx, dtype=object), + convert_non_numeric=True, + ) + tm.assert_extension_array_equal(result, idx._data) + + +class TestTypeInference: + # Dummy class used for testing with Python objects + class Dummy: + pass + + def test_inferred_dtype_fixture(self, any_skipna_inferred_dtype): + # see pandas/conftest.py + inferred_dtype, values = any_skipna_inferred_dtype + + # make sure the inferred dtype of the fixture is as requested + assert inferred_dtype == lib.infer_dtype(values, skipna=True) + + @pytest.mark.parametrize("skipna", [True, False]) + def test_length_zero(self, skipna): + result = lib.infer_dtype(np.array([], dtype="i4"), skipna=skipna) + assert result == "integer" + + result = lib.infer_dtype([], skipna=skipna) + assert result == "empty" + + # GH 18004 + arr = np.array([np.array([], dtype=object), np.array([], dtype=object)]) + result = lib.infer_dtype(arr, skipna=skipna) + assert result == "empty" + + def test_integers(self): + arr = np.array([1, 2, 3, np.int64(4), np.int32(5)], dtype="O") + result = lib.infer_dtype(arr, skipna=True) + assert result == "integer" + + arr = np.array([1, 2, 3, np.int64(4), np.int32(5), "foo"], dtype="O") + result = lib.infer_dtype(arr, skipna=True) + assert result == "mixed-integer" + + arr = np.array([1, 2, 3, 4, 5], dtype="i4") + result = lib.infer_dtype(arr, skipna=True) + assert result == "integer" + + @pytest.mark.parametrize( + "arr, skipna", + [ + (np.array([1, 2, np.nan, np.nan, 3], dtype="O"), False), + (np.array([1, 2, np.nan, np.nan, 3], dtype="O"), True), + (np.array([1, 2, 3, np.int64(4), np.int32(5), np.nan], dtype="O"), False), + (np.array([1, 2, 3, np.int64(4), np.int32(5), np.nan], dtype="O"), True), + ], + ) + def test_integer_na(self, arr, skipna): + # GH 27392 + result = lib.infer_dtype(arr, skipna=skipna) + expected = "integer" if skipna else "integer-na" + assert result == expected + + def test_infer_dtype_skipna_default(self): + # infer_dtype `skipna` default deprecated in GH#24050, + # changed to True in GH#29876 + arr = np.array([1, 2, 3, np.nan], dtype=object) + + result = lib.infer_dtype(arr) + assert result == "integer" + + def test_bools(self): + arr = np.array([True, False, True, True, True], dtype="O") + result = lib.infer_dtype(arr, skipna=True) + assert result == "boolean" + + arr = np.array([np.bool_(True), np.bool_(False)], dtype="O") + result = lib.infer_dtype(arr, skipna=True) + assert result == "boolean" + + arr = np.array([True, False, True, "foo"], dtype="O") + result = lib.infer_dtype(arr, skipna=True) + assert result == "mixed" + + arr = np.array([True, False, True], dtype=bool) + result = lib.infer_dtype(arr, skipna=True) + assert result == "boolean" + + arr = np.array([True, np.nan, False], dtype="O") + result = lib.infer_dtype(arr, skipna=True) + assert result == "boolean" + + result = lib.infer_dtype(arr, skipna=False) + assert result == "mixed" + + def test_floats(self): + arr = np.array([1.0, 2.0, 3.0, np.float64(4), np.float32(5)], dtype="O") + result = lib.infer_dtype(arr, skipna=True) + assert result == "floating" + + arr = np.array([1, 2, 3, np.float64(4), np.float32(5), "foo"], dtype="O") + result = lib.infer_dtype(arr, skipna=True) + assert result == "mixed-integer" + + arr = np.array([1, 2, 3, 4, 5], dtype="f4") + result = lib.infer_dtype(arr, skipna=True) + assert result == "floating" + + arr = np.array([1, 2, 3, 4, 5], dtype="f8") + result = lib.infer_dtype(arr, skipna=True) + assert result == "floating" + + def test_decimals(self): + # GH15690 + arr = np.array([Decimal(1), Decimal(2), Decimal(3)]) + result = lib.infer_dtype(arr, skipna=True) + assert result == "decimal" + + arr = np.array([1.0, 2.0, Decimal(3)]) + result = lib.infer_dtype(arr, skipna=True) + assert result == "mixed" + + result = lib.infer_dtype(arr[::-1], skipna=True) + assert result == "mixed" + + arr = np.array([Decimal(1), Decimal("NaN"), Decimal(3)]) + result = lib.infer_dtype(arr, skipna=True) + assert result == "decimal" + + arr = np.array([Decimal(1), np.nan, Decimal(3)], dtype="O") + result = lib.infer_dtype(arr, skipna=True) + assert result == "decimal" + + # complex is compatible with nan, so skipna has no effect + @pytest.mark.parametrize("skipna", [True, False]) + def test_complex(self, skipna): + # gets cast to complex on array construction + arr = np.array([1.0, 2.0, 1 + 1j]) + result = lib.infer_dtype(arr, skipna=skipna) + assert result == "complex" + + arr = np.array([1.0, 2.0, 1 + 1j], dtype="O") + result = lib.infer_dtype(arr, skipna=skipna) + assert result == "mixed" + + result = lib.infer_dtype(arr[::-1], skipna=skipna) + assert result == "mixed" + + # gets cast to complex on array construction + arr = np.array([1, np.nan, 1 + 1j]) + result = lib.infer_dtype(arr, skipna=skipna) + assert result == "complex" + + arr = np.array([1.0, np.nan, 1 + 1j], dtype="O") + result = lib.infer_dtype(arr, skipna=skipna) + assert result == "mixed" + + # complex with nans stays complex + arr = np.array([1 + 1j, np.nan, 3 + 3j], dtype="O") + result = lib.infer_dtype(arr, skipna=skipna) + assert result == "complex" + + # test smaller complex dtype; will pass through _try_infer_map fastpath + arr = np.array([1 + 1j, np.nan, 3 + 3j], dtype=np.complex64) + result = lib.infer_dtype(arr, skipna=skipna) + assert result == "complex" + + def test_string(self): + pass + + def test_unicode(self): + arr = ["a", np.nan, "c"] + result = lib.infer_dtype(arr, skipna=False) + # This currently returns "mixed", but it's not clear that's optimal. + # This could also return "string" or "mixed-string" + assert result == "mixed" + + # even though we use skipna, we are only skipping those NAs that are + # considered matching by is_string_array + arr = ["a", np.nan, "c"] + result = lib.infer_dtype(arr, skipna=True) + assert result == "string" + + arr = ["a", pd.NA, "c"] + result = lib.infer_dtype(arr, skipna=True) + assert result == "string" + + arr = ["a", pd.NaT, "c"] + result = lib.infer_dtype(arr, skipna=True) + assert result == "mixed" + + arr = ["a", "c"] + result = lib.infer_dtype(arr, skipna=False) + assert result == "string" + + @pytest.mark.parametrize( + "dtype, missing, skipna, expected", + [ + (float, np.nan, False, "floating"), + (float, np.nan, True, "floating"), + (object, np.nan, False, "floating"), + (object, np.nan, True, "empty"), + (object, None, False, "mixed"), + (object, None, True, "empty"), + ], + ) + @pytest.mark.parametrize("box", [Series, np.array]) + def test_object_empty(self, box, missing, dtype, skipna, expected): + # GH 23421 + arr = box([missing, missing], dtype=dtype) + + result = lib.infer_dtype(arr, skipna=skipna) + assert result == expected + + def test_datetime(self): + dates = [datetime(2012, 1, x) for x in range(1, 20)] + index = Index(dates) + assert index.inferred_type == "datetime64" + + def test_infer_dtype_datetime64(self): + arr = np.array( + [np.datetime64("2011-01-01"), np.datetime64("2011-01-01")], dtype=object + ) + assert lib.infer_dtype(arr, skipna=True) == "datetime64" + + @pytest.mark.parametrize("na_value", [pd.NaT, np.nan]) + def test_infer_dtype_datetime64_with_na(self, na_value): + # starts with nan + arr = np.array([na_value, np.datetime64("2011-01-02")]) + assert lib.infer_dtype(arr, skipna=True) == "datetime64" + + arr = np.array([na_value, np.datetime64("2011-01-02"), na_value]) + assert lib.infer_dtype(arr, skipna=True) == "datetime64" + + @pytest.mark.parametrize( + "arr", + [ + np.array( + [np.timedelta64("nat"), np.datetime64("2011-01-02")], dtype=object + ), + np.array( + [np.datetime64("2011-01-02"), np.timedelta64("nat")], dtype=object + ), + np.array([np.datetime64("2011-01-01"), Timestamp("2011-01-02")]), + np.array([Timestamp("2011-01-02"), np.datetime64("2011-01-01")]), + np.array([np.nan, Timestamp("2011-01-02"), 1.1]), + np.array([np.nan, "2011-01-01", Timestamp("2011-01-02")], dtype=object), + np.array([np.datetime64("nat"), np.timedelta64(1, "D")], dtype=object), + np.array([np.timedelta64(1, "D"), np.datetime64("nat")], dtype=object), + ], + ) + def test_infer_datetimelike_dtype_mixed(self, arr): + assert lib.infer_dtype(arr, skipna=False) == "mixed" + + def test_infer_dtype_mixed_integer(self): + arr = np.array([np.nan, Timestamp("2011-01-02"), 1]) + assert lib.infer_dtype(arr, skipna=True) == "mixed-integer" + + @pytest.mark.parametrize( + "arr", + [ + np.array([Timestamp("2011-01-01"), Timestamp("2011-01-02")]), + np.array([datetime(2011, 1, 1), datetime(2012, 2, 1)]), + np.array([datetime(2011, 1, 1), Timestamp("2011-01-02")]), + ], + ) + def test_infer_dtype_datetime(self, arr): + assert lib.infer_dtype(arr, skipna=True) == "datetime" + + @pytest.mark.parametrize("na_value", [pd.NaT, np.nan]) + @pytest.mark.parametrize( + "time_stamp", [Timestamp("2011-01-01"), datetime(2011, 1, 1)] + ) + def test_infer_dtype_datetime_with_na(self, na_value, time_stamp): + # starts with nan + arr = np.array([na_value, time_stamp]) + assert lib.infer_dtype(arr, skipna=True) == "datetime" + + arr = np.array([na_value, time_stamp, na_value]) + assert lib.infer_dtype(arr, skipna=True) == "datetime" + + @pytest.mark.parametrize( + "arr", + [ + np.array([Timedelta("1 days"), Timedelta("2 days")]), + np.array([np.timedelta64(1, "D"), np.timedelta64(2, "D")], dtype=object), + np.array([timedelta(1), timedelta(2)]), + ], + ) + def test_infer_dtype_timedelta(self, arr): + assert lib.infer_dtype(arr, skipna=True) == "timedelta" + + @pytest.mark.parametrize("na_value", [pd.NaT, np.nan]) + @pytest.mark.parametrize( + "delta", [Timedelta("1 days"), np.timedelta64(1, "D"), timedelta(1)] + ) + def test_infer_dtype_timedelta_with_na(self, na_value, delta): + # starts with nan + arr = np.array([na_value, delta]) + assert lib.infer_dtype(arr, skipna=True) == "timedelta" + + arr = np.array([na_value, delta, na_value]) + assert lib.infer_dtype(arr, skipna=True) == "timedelta" + + def test_infer_dtype_period(self): + # GH 13664 + arr = np.array([Period("2011-01", freq="D"), Period("2011-02", freq="D")]) + assert lib.infer_dtype(arr, skipna=True) == "period" + + # non-homogeneous freqs -> mixed + arr = np.array([Period("2011-01", freq="D"), Period("2011-02", freq="M")]) + assert lib.infer_dtype(arr, skipna=True) == "mixed" + + @pytest.mark.parametrize("klass", [pd.array, Series, Index]) + @pytest.mark.parametrize("skipna", [True, False]) + def test_infer_dtype_period_array(self, klass, skipna): + # https://github.com/pandas-dev/pandas/issues/23553 + values = klass( + [ + Period("2011-01-01", freq="D"), + Period("2011-01-02", freq="D"), + pd.NaT, + ] + ) + assert lib.infer_dtype(values, skipna=skipna) == "period" + + # periods but mixed freq + values = klass( + [ + Period("2011-01-01", freq="D"), + Period("2011-01-02", freq="M"), + pd.NaT, + ] + ) + # with pd.array this becomes NumpyExtensionArray which ends up + # as "unknown-array" + exp = "unknown-array" if klass is pd.array else "mixed" + assert lib.infer_dtype(values, skipna=skipna) == exp + + def test_infer_dtype_period_mixed(self): + arr = np.array( + [Period("2011-01", freq="M"), np.datetime64("nat")], dtype=object + ) + assert lib.infer_dtype(arr, skipna=False) == "mixed" + + arr = np.array( + [np.datetime64("nat"), Period("2011-01", freq="M")], dtype=object + ) + assert lib.infer_dtype(arr, skipna=False) == "mixed" + + @pytest.mark.parametrize("na_value", [pd.NaT, np.nan]) + def test_infer_dtype_period_with_na(self, na_value): + # starts with nan + arr = np.array([na_value, Period("2011-01", freq="D")]) + assert lib.infer_dtype(arr, skipna=True) == "period" + + arr = np.array([na_value, Period("2011-01", freq="D"), na_value]) + assert lib.infer_dtype(arr, skipna=True) == "period" + + def test_infer_dtype_all_nan_nat_like(self): + arr = np.array([np.nan, np.nan]) + assert lib.infer_dtype(arr, skipna=True) == "floating" + + # nan and None mix are result in mixed + arr = np.array([np.nan, np.nan, None]) + assert lib.infer_dtype(arr, skipna=True) == "empty" + assert lib.infer_dtype(arr, skipna=False) == "mixed" + + arr = np.array([None, np.nan, np.nan]) + assert lib.infer_dtype(arr, skipna=True) == "empty" + assert lib.infer_dtype(arr, skipna=False) == "mixed" + + # pd.NaT + arr = np.array([pd.NaT]) + assert lib.infer_dtype(arr, skipna=False) == "datetime" + + arr = np.array([pd.NaT, np.nan]) + assert lib.infer_dtype(arr, skipna=False) == "datetime" + + arr = np.array([np.nan, pd.NaT]) + assert lib.infer_dtype(arr, skipna=False) == "datetime" + + arr = np.array([np.nan, pd.NaT, np.nan]) + assert lib.infer_dtype(arr, skipna=False) == "datetime" + + arr = np.array([None, pd.NaT, None]) + assert lib.infer_dtype(arr, skipna=False) == "datetime" + + # np.datetime64(nat) + arr = np.array([np.datetime64("nat")]) + assert lib.infer_dtype(arr, skipna=False) == "datetime64" + + for n in [np.nan, pd.NaT, None]: + arr = np.array([n, np.datetime64("nat"), n]) + assert lib.infer_dtype(arr, skipna=False) == "datetime64" + + arr = np.array([pd.NaT, n, np.datetime64("nat"), n]) + assert lib.infer_dtype(arr, skipna=False) == "datetime64" + + arr = np.array([np.timedelta64("nat")], dtype=object) + assert lib.infer_dtype(arr, skipna=False) == "timedelta" + + for n in [np.nan, pd.NaT, None]: + arr = np.array([n, np.timedelta64("nat"), n]) + assert lib.infer_dtype(arr, skipna=False) == "timedelta" + + arr = np.array([pd.NaT, n, np.timedelta64("nat"), n]) + assert lib.infer_dtype(arr, skipna=False) == "timedelta" + + # datetime / timedelta mixed + arr = np.array([pd.NaT, np.datetime64("nat"), np.timedelta64("nat"), np.nan]) + assert lib.infer_dtype(arr, skipna=False) == "mixed" + + arr = np.array([np.timedelta64("nat"), np.datetime64("nat")], dtype=object) + assert lib.infer_dtype(arr, skipna=False) == "mixed" + + def test_is_datetimelike_array_all_nan_nat_like(self): + arr = np.array([np.nan, pd.NaT, np.datetime64("nat")]) + assert lib.is_datetime_array(arr) + assert lib.is_datetime64_array(arr) + assert not lib.is_timedelta_or_timedelta64_array(arr) + + arr = np.array([np.nan, pd.NaT, np.timedelta64("nat")]) + assert not lib.is_datetime_array(arr) + assert not lib.is_datetime64_array(arr) + assert lib.is_timedelta_or_timedelta64_array(arr) + + arr = np.array([np.nan, pd.NaT, np.datetime64("nat"), np.timedelta64("nat")]) + assert not lib.is_datetime_array(arr) + assert not lib.is_datetime64_array(arr) + assert not lib.is_timedelta_or_timedelta64_array(arr) + + arr = np.array([np.nan, pd.NaT]) + assert lib.is_datetime_array(arr) + assert lib.is_datetime64_array(arr) + assert lib.is_timedelta_or_timedelta64_array(arr) + + arr = np.array([np.nan, np.nan], dtype=object) + assert not lib.is_datetime_array(arr) + assert not lib.is_datetime64_array(arr) + assert not lib.is_timedelta_or_timedelta64_array(arr) + + assert lib.is_datetime_with_singletz_array( + np.array( + [ + Timestamp("20130101", tz="US/Eastern"), + Timestamp("20130102", tz="US/Eastern"), + ], + dtype=object, + ) + ) + assert not lib.is_datetime_with_singletz_array( + np.array( + [ + Timestamp("20130101", tz="US/Eastern"), + Timestamp("20130102", tz="CET"), + ], + dtype=object, + ) + ) + + @pytest.mark.parametrize( + "func", + [ + "is_datetime_array", + "is_datetime64_array", + "is_bool_array", + "is_timedelta_or_timedelta64_array", + "is_date_array", + "is_time_array", + "is_interval_array", + ], + ) + def test_other_dtypes_for_array(self, func): + func = getattr(lib, func) + arr = np.array(["foo", "bar"]) + assert not func(arr) + assert not func(arr.reshape(2, 1)) + + arr = np.array([1, 2]) + assert not func(arr) + assert not func(arr.reshape(2, 1)) + + def test_date(self): + dates = [date(2012, 1, day) for day in range(1, 20)] + index = Index(dates) + assert index.inferred_type == "date" + + dates = [date(2012, 1, day) for day in range(1, 20)] + [np.nan] + result = lib.infer_dtype(dates, skipna=False) + assert result == "mixed" + + result = lib.infer_dtype(dates, skipna=True) + assert result == "date" + + @pytest.mark.parametrize( + "values", + [ + [date(2020, 1, 1), Timestamp("2020-01-01")], + [Timestamp("2020-01-01"), date(2020, 1, 1)], + [date(2020, 1, 1), pd.NaT], + [pd.NaT, date(2020, 1, 1)], + ], + ) + @pytest.mark.parametrize("skipna", [True, False]) + def test_infer_dtype_date_order_invariant(self, values, skipna): + # https://github.com/pandas-dev/pandas/issues/33741 + result = lib.infer_dtype(values, skipna=skipna) + assert result == "date" + + def test_is_numeric_array(self): + assert lib.is_float_array(np.array([1, 2.0])) + assert lib.is_float_array(np.array([1, 2.0, np.nan])) + assert not lib.is_float_array(np.array([1, 2])) + + assert lib.is_integer_array(np.array([1, 2])) + assert not lib.is_integer_array(np.array([1, 2.0])) + + def test_is_string_array(self): + # We should only be accepting pd.NA, np.nan, + # other floating point nans e.g. float('nan')] + # when skipna is True. + assert lib.is_string_array(np.array(["foo", "bar"])) + assert not lib.is_string_array( + np.array(["foo", "bar", pd.NA], dtype=object), skipna=False + ) + assert lib.is_string_array( + np.array(["foo", "bar", pd.NA], dtype=object), skipna=True + ) + # we allow NaN/None in the StringArray constructor, so its allowed here + assert lib.is_string_array( + np.array(["foo", "bar", None], dtype=object), skipna=True + ) + assert lib.is_string_array( + np.array(["foo", "bar", np.nan], dtype=object), skipna=True + ) + # But not e.g. datetimelike or Decimal NAs + assert not lib.is_string_array( + np.array(["foo", "bar", pd.NaT], dtype=object), skipna=True + ) + assert not lib.is_string_array( + np.array(["foo", "bar", np.datetime64("NaT")], dtype=object), skipna=True + ) + assert not lib.is_string_array( + np.array(["foo", "bar", Decimal("NaN")], dtype=object), skipna=True + ) + + assert not lib.is_string_array( + np.array(["foo", "bar", None], dtype=object), skipna=False + ) + assert not lib.is_string_array( + np.array(["foo", "bar", np.nan], dtype=object), skipna=False + ) + assert not lib.is_string_array(np.array([1, 2])) + + def test_to_object_array_tuples(self): + r = (5, 6) + values = [r] + lib.to_object_array_tuples(values) + + # make sure record array works + record = namedtuple("record", "x y") + r = record(5, 6) + values = [r] + lib.to_object_array_tuples(values) + + def test_object(self): + # GH 7431 + # cannot infer more than this as only a single element + arr = np.array([None], dtype="O") + result = lib.infer_dtype(arr, skipna=False) + assert result == "mixed" + result = lib.infer_dtype(arr, skipna=True) + assert result == "empty" + + def test_to_object_array_width(self): + # see gh-13320 + rows = [[1, 2, 3], [4, 5, 6]] + + expected = np.array(rows, dtype=object) + out = lib.to_object_array(rows) + tm.assert_numpy_array_equal(out, expected) + + expected = np.array(rows, dtype=object) + out = lib.to_object_array(rows, min_width=1) + tm.assert_numpy_array_equal(out, expected) + + expected = np.array( + [[1, 2, 3, None, None], [4, 5, 6, None, None]], dtype=object + ) + out = lib.to_object_array(rows, min_width=5) + tm.assert_numpy_array_equal(out, expected) + + def test_is_period(self): + # GH#55264 + msg = "is_period is deprecated and will be removed in a future version" + with tm.assert_produces_warning(FutureWarning, match=msg): + assert lib.is_period(Period("2011-01", freq="M")) + assert not lib.is_period(PeriodIndex(["2011-01"], freq="M")) + assert not lib.is_period(Timestamp("2011-01")) + assert not lib.is_period(1) + assert not lib.is_period(np.nan) + + def test_is_interval(self): + # GH#55264 + msg = "is_interval is deprecated and will be removed in a future version" + item = Interval(1, 2) + with tm.assert_produces_warning(FutureWarning, match=msg): + assert lib.is_interval(item) + assert not lib.is_interval(pd.IntervalIndex([item])) + assert not lib.is_interval(pd.IntervalIndex([item])._engine) + + def test_categorical(self): + # GH 8974 + arr = Categorical(list("abc")) + result = lib.infer_dtype(arr, skipna=True) + assert result == "categorical" + + result = lib.infer_dtype(Series(arr), skipna=True) + assert result == "categorical" + + arr = Categorical(list("abc"), categories=["cegfab"], ordered=True) + result = lib.infer_dtype(arr, skipna=True) + assert result == "categorical" + + result = lib.infer_dtype(Series(arr), skipna=True) + assert result == "categorical" + + @pytest.mark.parametrize("asobject", [True, False]) + def test_interval(self, asobject): + idx = pd.IntervalIndex.from_breaks(range(5), closed="both") + if asobject: + idx = idx.astype(object) + + inferred = lib.infer_dtype(idx, skipna=False) + assert inferred == "interval" + + inferred = lib.infer_dtype(idx._data, skipna=False) + assert inferred == "interval" + + inferred = lib.infer_dtype(Series(idx, dtype=idx.dtype), skipna=False) + assert inferred == "interval" + + @pytest.mark.parametrize("value", [Timestamp(0), Timedelta(0), 0, 0.0]) + def test_interval_mismatched_closed(self, value): + first = Interval(value, value, closed="left") + second = Interval(value, value, closed="right") + + # if closed match, we should infer "interval" + arr = np.array([first, first], dtype=object) + assert lib.infer_dtype(arr, skipna=False) == "interval" + + # if closed dont match, we should _not_ get "interval" + arr2 = np.array([first, second], dtype=object) + assert lib.infer_dtype(arr2, skipna=False) == "mixed" + + def test_interval_mismatched_subtype(self): + first = Interval(0, 1, closed="left") + second = Interval(Timestamp(0), Timestamp(1), closed="left") + third = Interval(Timedelta(0), Timedelta(1), closed="left") + + arr = np.array([first, second]) + assert lib.infer_dtype(arr, skipna=False) == "mixed" + + arr = np.array([second, third]) + assert lib.infer_dtype(arr, skipna=False) == "mixed" + + arr = np.array([first, third]) + assert lib.infer_dtype(arr, skipna=False) == "mixed" + + # float vs int subdtype are compatible + flt_interval = Interval(1.5, 2.5, closed="left") + arr = np.array([first, flt_interval], dtype=object) + assert lib.infer_dtype(arr, skipna=False) == "interval" + + @pytest.mark.parametrize("klass", [pd.array, Series]) + @pytest.mark.parametrize("skipna", [True, False]) + @pytest.mark.parametrize("data", [["a", "b", "c"], ["a", "b", pd.NA]]) + def test_string_dtype(self, data, skipna, klass, nullable_string_dtype): + # StringArray + val = klass(data, dtype=nullable_string_dtype) + inferred = lib.infer_dtype(val, skipna=skipna) + assert inferred == "string" + + @pytest.mark.parametrize("klass", [pd.array, Series]) + @pytest.mark.parametrize("skipna", [True, False]) + @pytest.mark.parametrize("data", [[True, False, True], [True, False, pd.NA]]) + def test_boolean_dtype(self, data, skipna, klass): + # BooleanArray + val = klass(data, dtype="boolean") + inferred = lib.infer_dtype(val, skipna=skipna) + assert inferred == "boolean" + + +class TestNumberScalar: + def test_is_number(self): + assert is_number(True) + assert is_number(1) + assert is_number(1.1) + assert is_number(1 + 3j) + assert is_number(np.int64(1)) + assert is_number(np.float64(1.1)) + assert is_number(np.complex128(1 + 3j)) + assert is_number(np.nan) + + assert not is_number(None) + assert not is_number("x") + assert not is_number(datetime(2011, 1, 1)) + assert not is_number(np.datetime64("2011-01-01")) + assert not is_number(Timestamp("2011-01-01")) + assert not is_number(Timestamp("2011-01-01", tz="US/Eastern")) + assert not is_number(timedelta(1000)) + assert not is_number(Timedelta("1 days")) + + # questionable + assert not is_number(np.bool_(False)) + assert is_number(np.timedelta64(1, "D")) + + def test_is_bool(self): + assert is_bool(True) + assert is_bool(False) + assert is_bool(np.bool_(False)) + + assert not is_bool(1) + assert not is_bool(1.1) + assert not is_bool(1 + 3j) + assert not is_bool(np.int64(1)) + assert not is_bool(np.float64(1.1)) + assert not is_bool(np.complex128(1 + 3j)) + assert not is_bool(np.nan) + assert not is_bool(None) + assert not is_bool("x") + assert not is_bool(datetime(2011, 1, 1)) + assert not is_bool(np.datetime64("2011-01-01")) + assert not is_bool(Timestamp("2011-01-01")) + assert not is_bool(Timestamp("2011-01-01", tz="US/Eastern")) + assert not is_bool(timedelta(1000)) + assert not is_bool(np.timedelta64(1, "D")) + assert not is_bool(Timedelta("1 days")) + + def test_is_integer(self): + assert is_integer(1) + assert is_integer(np.int64(1)) + + assert not is_integer(True) + assert not is_integer(1.1) + assert not is_integer(1 + 3j) + assert not is_integer(False) + assert not is_integer(np.bool_(False)) + assert not is_integer(np.float64(1.1)) + assert not is_integer(np.complex128(1 + 3j)) + assert not is_integer(np.nan) + assert not is_integer(None) + assert not is_integer("x") + assert not is_integer(datetime(2011, 1, 1)) + assert not is_integer(np.datetime64("2011-01-01")) + assert not is_integer(Timestamp("2011-01-01")) + assert not is_integer(Timestamp("2011-01-01", tz="US/Eastern")) + assert not is_integer(timedelta(1000)) + assert not is_integer(Timedelta("1 days")) + assert not is_integer(np.timedelta64(1, "D")) + + def test_is_float(self): + assert is_float(1.1) + assert is_float(np.float64(1.1)) + assert is_float(np.nan) + + assert not is_float(True) + assert not is_float(1) + assert not is_float(1 + 3j) + assert not is_float(False) + assert not is_float(np.bool_(False)) + assert not is_float(np.int64(1)) + assert not is_float(np.complex128(1 + 3j)) + assert not is_float(None) + assert not is_float("x") + assert not is_float(datetime(2011, 1, 1)) + assert not is_float(np.datetime64("2011-01-01")) + assert not is_float(Timestamp("2011-01-01")) + assert not is_float(Timestamp("2011-01-01", tz="US/Eastern")) + assert not is_float(timedelta(1000)) + assert not is_float(np.timedelta64(1, "D")) + assert not is_float(Timedelta("1 days")) + + def test_is_datetime_dtypes(self): + ts = pd.date_range("20130101", periods=3) + tsa = pd.date_range("20130101", periods=3, tz="US/Eastern") + + msg = "is_datetime64tz_dtype is deprecated" + + assert is_datetime64_dtype("datetime64") + assert is_datetime64_dtype("datetime64[ns]") + assert is_datetime64_dtype(ts) + assert not is_datetime64_dtype(tsa) + + assert not is_datetime64_ns_dtype("datetime64") + assert is_datetime64_ns_dtype("datetime64[ns]") + assert is_datetime64_ns_dtype(ts) + assert is_datetime64_ns_dtype(tsa) + + assert is_datetime64_any_dtype("datetime64") + assert is_datetime64_any_dtype("datetime64[ns]") + assert is_datetime64_any_dtype(ts) + assert is_datetime64_any_dtype(tsa) + + with tm.assert_produces_warning(DeprecationWarning, match=msg): + assert not is_datetime64tz_dtype("datetime64") + assert not is_datetime64tz_dtype("datetime64[ns]") + assert not is_datetime64tz_dtype(ts) + assert is_datetime64tz_dtype(tsa) + + @pytest.mark.parametrize("tz", ["US/Eastern", "UTC"]) + def test_is_datetime_dtypes_with_tz(self, tz): + dtype = f"datetime64[ns, {tz}]" + assert not is_datetime64_dtype(dtype) + + msg = "is_datetime64tz_dtype is deprecated" + with tm.assert_produces_warning(DeprecationWarning, match=msg): + assert is_datetime64tz_dtype(dtype) + assert is_datetime64_ns_dtype(dtype) + assert is_datetime64_any_dtype(dtype) + + def test_is_timedelta(self): + assert is_timedelta64_dtype("timedelta64") + assert is_timedelta64_dtype("timedelta64[ns]") + assert not is_timedelta64_ns_dtype("timedelta64") + assert is_timedelta64_ns_dtype("timedelta64[ns]") + + tdi = TimedeltaIndex([1e14, 2e14], dtype="timedelta64[ns]") + assert is_timedelta64_dtype(tdi) + assert is_timedelta64_ns_dtype(tdi) + assert is_timedelta64_ns_dtype(tdi.astype("timedelta64[ns]")) + + assert not is_timedelta64_ns_dtype(Index([], dtype=np.float64)) + assert not is_timedelta64_ns_dtype(Index([], dtype=np.int64)) + + +class TestIsScalar: + def test_is_scalar_builtin_scalars(self): + assert is_scalar(None) + assert is_scalar(True) + assert is_scalar(False) + assert is_scalar(Fraction()) + assert is_scalar(0.0) + assert is_scalar(1) + assert is_scalar(complex(2)) + assert is_scalar(float("NaN")) + assert is_scalar(np.nan) + assert is_scalar("foobar") + assert is_scalar(b"foobar") + assert is_scalar(datetime(2014, 1, 1)) + assert is_scalar(date(2014, 1, 1)) + assert is_scalar(time(12, 0)) + assert is_scalar(timedelta(hours=1)) + assert is_scalar(pd.NaT) + assert is_scalar(pd.NA) + + def test_is_scalar_builtin_nonscalars(self): + assert not is_scalar({}) + assert not is_scalar([]) + assert not is_scalar([1]) + assert not is_scalar(()) + assert not is_scalar((1,)) + assert not is_scalar(slice(None)) + assert not is_scalar(Ellipsis) + + def test_is_scalar_numpy_array_scalars(self): + assert is_scalar(np.int64(1)) + assert is_scalar(np.float64(1.0)) + assert is_scalar(np.int32(1)) + assert is_scalar(np.complex64(2)) + assert is_scalar(np.object_("foobar")) + assert is_scalar(np.str_("foobar")) + assert is_scalar(np.bytes_(b"foobar")) + assert is_scalar(np.datetime64("2014-01-01")) + assert is_scalar(np.timedelta64(1, "h")) + + @pytest.mark.parametrize( + "zerodim", + [ + np.array(1), + np.array("foobar"), + np.array(np.datetime64("2014-01-01")), + np.array(np.timedelta64(1, "h")), + np.array(np.datetime64("NaT")), + ], + ) + def test_is_scalar_numpy_zerodim_arrays(self, zerodim): + assert not is_scalar(zerodim) + assert is_scalar(lib.item_from_zerodim(zerodim)) + + @pytest.mark.parametrize("arr", [np.array([]), np.array([[]])]) + def test_is_scalar_numpy_arrays(self, arr): + assert not is_scalar(arr) + assert not is_scalar(MockNumpyLikeArray(arr)) + + def test_is_scalar_pandas_scalars(self): + assert is_scalar(Timestamp("2014-01-01")) + assert is_scalar(Timedelta(hours=1)) + assert is_scalar(Period("2014-01-01")) + assert is_scalar(Interval(left=0, right=1)) + assert is_scalar(DateOffset(days=1)) + assert is_scalar(pd.offsets.Minute(3)) + + def test_is_scalar_pandas_containers(self): + assert not is_scalar(Series(dtype=object)) + assert not is_scalar(Series([1])) + assert not is_scalar(DataFrame()) + assert not is_scalar(DataFrame([[1]])) + assert not is_scalar(Index([])) + assert not is_scalar(Index([1])) + assert not is_scalar(Categorical([])) + assert not is_scalar(DatetimeIndex([])._data) + assert not is_scalar(TimedeltaIndex([])._data) + assert not is_scalar(DatetimeIndex([])._data.to_period("D")) + assert not is_scalar(pd.array([1, 2, 3])) + + def test_is_scalar_number(self): + # Number() is not recognied by PyNumber_Check, so by extension + # is not recognized by is_scalar, but instances of non-abstract + # subclasses are. + + class Numeric(Number): + def __init__(self, value) -> None: + self.value = value + + def __int__(self) -> int: + return self.value + + num = Numeric(1) + assert is_scalar(num) + + +@pytest.mark.parametrize("unit", ["ms", "us", "ns"]) +def test_datetimeindex_from_empty_datetime64_array(unit): + idx = DatetimeIndex(np.array([], dtype=f"datetime64[{unit}]")) + assert len(idx) == 0 + + +def test_nan_to_nat_conversions(): + df = DataFrame( + {"A": np.asarray(range(10), dtype="float64"), "B": Timestamp("20010101")} + ) + df.iloc[3:6, :] = np.nan + result = df.loc[4, "B"] + assert result is pd.NaT + + s = df["B"].copy() + s[8:9] = np.nan + assert s[8] is pd.NaT + + +@pytest.mark.filterwarnings("ignore::PendingDeprecationWarning") +def test_is_scipy_sparse(spmatrix): + pytest.importorskip("scipy") + assert is_scipy_sparse(spmatrix([[0, 1]])) + assert not is_scipy_sparse(np.array([1])) + + +def test_ensure_int32(): + values = np.arange(10, dtype=np.int32) + result = ensure_int32(values) + assert result.dtype == np.int32 + + values = np.arange(10, dtype=np.int64) + result = ensure_int32(values) + assert result.dtype == np.int32 + + +@pytest.mark.parametrize( + "right,result", + [ + (0, np.uint8), + (-1, np.int16), + (300, np.uint16), + # For floats, we just upcast directly to float64 instead of trying to + # find a smaller floating dtype + (300.0, np.uint16), # for integer floats, we convert them to ints + (300.1, np.float64), + (np.int16(300), np.int16 if np_version_gt2 else np.uint16), + ], +) +def test_find_result_type_uint_int(right, result): + left_dtype = np.dtype("uint8") + assert find_result_type(left_dtype, right) == result + + +@pytest.mark.parametrize( + "right,result", + [ + (0, np.int8), + (-1, np.int8), + (300, np.int16), + # For floats, we just upcast directly to float64 instead of trying to + # find a smaller floating dtype + (300.0, np.int16), # for integer floats, we convert them to ints + (300.1, np.float64), + (np.int16(300), np.int16), + ], +) +def test_find_result_type_int_int(right, result): + left_dtype = np.dtype("int8") + assert find_result_type(left_dtype, right) == result + + +@pytest.mark.parametrize( + "right,result", + [ + (300.0, np.float64), + (np.float32(300), np.float32), + ], +) +def test_find_result_type_floats(right, result): + left_dtype = np.dtype("float16") + assert find_result_type(left_dtype, right) == result diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/dtypes/test_missing.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/dtypes/test_missing.py new file mode 100644 index 0000000000000000000000000000000000000000..e1f8d8eca2537bdbedd0ba128dff22e63c3c4534 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/dtypes/test_missing.py @@ -0,0 +1,923 @@ +from contextlib import nullcontext +from datetime import datetime +from decimal import Decimal + +import numpy as np +import pytest + +from pandas._config import config as cf + +from pandas._libs import missing as libmissing +from pandas._libs.tslibs import iNaT +from pandas.compat.numpy import np_version_gte1p25 + +from pandas.core.dtypes.common import ( + is_float, + is_scalar, + pandas_dtype, +) +from pandas.core.dtypes.dtypes import ( + CategoricalDtype, + DatetimeTZDtype, + IntervalDtype, + PeriodDtype, +) +from pandas.core.dtypes.missing import ( + array_equivalent, + is_valid_na_for_dtype, + isna, + isnull, + na_value_for_dtype, + notna, + notnull, +) + +import pandas as pd +from pandas import ( + DatetimeIndex, + Index, + NaT, + Series, + TimedeltaIndex, + date_range, + period_range, +) +import pandas._testing as tm + +fix_now = pd.Timestamp("2021-01-01") +fix_utcnow = pd.Timestamp("2021-01-01", tz="UTC") + + +@pytest.mark.parametrize("notna_f", [notna, notnull]) +def test_notna_notnull(notna_f): + assert notna_f(1.0) + assert not notna_f(None) + assert not notna_f(np.nan) + + msg = "use_inf_as_na option is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + with cf.option_context("mode.use_inf_as_na", False): + assert notna_f(np.inf) + assert notna_f(-np.inf) + + arr = np.array([1.5, np.inf, 3.5, -np.inf]) + result = notna_f(arr) + assert result.all() + + with tm.assert_produces_warning(FutureWarning, match=msg): + with cf.option_context("mode.use_inf_as_na", True): + assert not notna_f(np.inf) + assert not notna_f(-np.inf) + + arr = np.array([1.5, np.inf, 3.5, -np.inf]) + result = notna_f(arr) + assert result.sum() == 2 + + +@pytest.mark.parametrize("null_func", [notna, notnull, isna, isnull]) +@pytest.mark.parametrize( + "ser", + [ + Series( + [str(i) for i in range(5)], + index=Index([str(i) for i in range(5)], dtype=object), + dtype=object, + ), + Series(range(5), date_range("2020-01-01", periods=5)), + Series(range(5), period_range("2020-01-01", periods=5)), + ], +) +def test_null_check_is_series(null_func, ser): + msg = "use_inf_as_na option is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + with cf.option_context("mode.use_inf_as_na", False): + assert isinstance(null_func(ser), Series) + + +class TestIsNA: + def test_0d_array(self): + assert isna(np.array(np.nan)) + assert not isna(np.array(0.0)) + assert not isna(np.array(0)) + # test object dtype + assert isna(np.array(np.nan, dtype=object)) + assert not isna(np.array(0.0, dtype=object)) + assert not isna(np.array(0, dtype=object)) + + @pytest.mark.parametrize("shape", [(4, 0), (4,)]) + def test_empty_object(self, shape): + arr = np.empty(shape=shape, dtype=object) + result = isna(arr) + expected = np.ones(shape=shape, dtype=bool) + tm.assert_numpy_array_equal(result, expected) + + @pytest.mark.parametrize("isna_f", [isna, isnull]) + def test_isna_isnull(self, isna_f): + assert not isna_f(1.0) + assert isna_f(None) + assert isna_f(np.nan) + assert float("nan") + assert not isna_f(np.inf) + assert not isna_f(-np.inf) + + # type + assert not isna_f(type(Series(dtype=object))) + assert not isna_f(type(Series(dtype=np.float64))) + assert not isna_f(type(pd.DataFrame())) + + @pytest.mark.parametrize("isna_f", [isna, isnull]) + @pytest.mark.parametrize( + "data", + [ + np.arange(4, dtype=float), + [0.0, 1.0, 0.0, 1.0], + Series(list("abcd"), dtype=object), + date_range("2020-01-01", periods=4), + ], + ) + @pytest.mark.parametrize( + "index", + [ + date_range("2020-01-01", periods=4), + range(4), + period_range("2020-01-01", periods=4), + ], + ) + def test_isna_isnull_frame(self, isna_f, data, index): + # frame + df = pd.DataFrame(data, index=index) + result = isna_f(df) + expected = df.apply(isna_f) + tm.assert_frame_equal(result, expected) + + def test_isna_lists(self): + result = isna([[False]]) + exp = np.array([[False]]) + tm.assert_numpy_array_equal(result, exp) + + result = isna([[1], [2]]) + exp = np.array([[False], [False]]) + tm.assert_numpy_array_equal(result, exp) + + # list of strings / unicode + result = isna(["foo", "bar"]) + exp = np.array([False, False]) + tm.assert_numpy_array_equal(result, exp) + + result = isna(["foo", "bar"]) + exp = np.array([False, False]) + tm.assert_numpy_array_equal(result, exp) + + # GH20675 + result = isna([np.nan, "world"]) + exp = np.array([True, False]) + tm.assert_numpy_array_equal(result, exp) + + def test_isna_nat(self): + result = isna([NaT]) + exp = np.array([True]) + tm.assert_numpy_array_equal(result, exp) + + result = isna(np.array([NaT], dtype=object)) + exp = np.array([True]) + tm.assert_numpy_array_equal(result, exp) + + def test_isna_numpy_nat(self): + arr = np.array( + [ + NaT, + np.datetime64("NaT"), + np.timedelta64("NaT"), + np.datetime64("NaT", "s"), + ] + ) + result = isna(arr) + expected = np.array([True] * 4) + tm.assert_numpy_array_equal(result, expected) + + def test_isna_datetime(self): + assert not isna(datetime.now()) + assert notna(datetime.now()) + + idx = date_range("1/1/1990", periods=20) + exp = np.ones(len(idx), dtype=bool) + tm.assert_numpy_array_equal(notna(idx), exp) + + idx = np.asarray(idx) + idx[0] = iNaT + idx = DatetimeIndex(idx) + mask = isna(idx) + assert mask[0] + exp = np.array([True] + [False] * (len(idx) - 1), dtype=bool) + tm.assert_numpy_array_equal(mask, exp) + + # GH 9129 + pidx = idx.to_period(freq="M") + mask = isna(pidx) + assert mask[0] + exp = np.array([True] + [False] * (len(idx) - 1), dtype=bool) + tm.assert_numpy_array_equal(mask, exp) + + mask = isna(pidx[1:]) + exp = np.zeros(len(mask), dtype=bool) + tm.assert_numpy_array_equal(mask, exp) + + def test_isna_old_datetimelike(self): + # isna_old should work for dt64tz, td64, and period, not just tznaive + dti = date_range("2016-01-01", periods=3) + dta = dti._data + dta[-1] = NaT + expected = np.array([False, False, True], dtype=bool) + + objs = [dta, dta.tz_localize("US/Eastern"), dta - dta, dta.to_period("D")] + + for obj in objs: + msg = "use_inf_as_na option is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + with cf.option_context("mode.use_inf_as_na", True): + result = isna(obj) + + tm.assert_numpy_array_equal(result, expected) + + @pytest.mark.parametrize( + "value, expected", + [ + (np.complex128(np.nan), True), + (np.float64(1), False), + (np.array([1, 1 + 0j, np.nan, 3]), np.array([False, False, True, False])), + ( + np.array([1, 1 + 0j, np.nan, 3], dtype=object), + np.array([False, False, True, False]), + ), + ( + np.array([1, 1 + 0j, np.nan, 3]).astype(object), + np.array([False, False, True, False]), + ), + ], + ) + def test_complex(self, value, expected): + result = isna(value) + if is_scalar(result): + assert result is expected + else: + tm.assert_numpy_array_equal(result, expected) + + def test_datetime_other_units(self): + idx = DatetimeIndex(["2011-01-01", "NaT", "2011-01-02"]) + exp = np.array([False, True, False]) + tm.assert_numpy_array_equal(isna(idx), exp) + tm.assert_numpy_array_equal(notna(idx), ~exp) + tm.assert_numpy_array_equal(isna(idx.values), exp) + tm.assert_numpy_array_equal(notna(idx.values), ~exp) + + @pytest.mark.parametrize( + "dtype", + [ + "datetime64[D]", + "datetime64[h]", + "datetime64[m]", + "datetime64[s]", + "datetime64[ms]", + "datetime64[us]", + "datetime64[ns]", + ], + ) + def test_datetime_other_units_astype(self, dtype): + idx = DatetimeIndex(["2011-01-01", "NaT", "2011-01-02"]) + values = idx.values.astype(dtype) + + exp = np.array([False, True, False]) + tm.assert_numpy_array_equal(isna(values), exp) + tm.assert_numpy_array_equal(notna(values), ~exp) + + exp = Series([False, True, False]) + s = Series(values) + tm.assert_series_equal(isna(s), exp) + tm.assert_series_equal(notna(s), ~exp) + s = Series(values, dtype=object) + tm.assert_series_equal(isna(s), exp) + tm.assert_series_equal(notna(s), ~exp) + + def test_timedelta_other_units(self): + idx = TimedeltaIndex(["1 days", "NaT", "2 days"]) + exp = np.array([False, True, False]) + tm.assert_numpy_array_equal(isna(idx), exp) + tm.assert_numpy_array_equal(notna(idx), ~exp) + tm.assert_numpy_array_equal(isna(idx.values), exp) + tm.assert_numpy_array_equal(notna(idx.values), ~exp) + + @pytest.mark.parametrize( + "dtype", + [ + "timedelta64[D]", + "timedelta64[h]", + "timedelta64[m]", + "timedelta64[s]", + "timedelta64[ms]", + "timedelta64[us]", + "timedelta64[ns]", + ], + ) + def test_timedelta_other_units_dtype(self, dtype): + idx = TimedeltaIndex(["1 days", "NaT", "2 days"]) + values = idx.values.astype(dtype) + + exp = np.array([False, True, False]) + tm.assert_numpy_array_equal(isna(values), exp) + tm.assert_numpy_array_equal(notna(values), ~exp) + + exp = Series([False, True, False]) + s = Series(values) + tm.assert_series_equal(isna(s), exp) + tm.assert_series_equal(notna(s), ~exp) + s = Series(values, dtype=object) + tm.assert_series_equal(isna(s), exp) + tm.assert_series_equal(notna(s), ~exp) + + def test_period(self): + idx = pd.PeriodIndex(["2011-01", "NaT", "2012-01"], freq="M") + exp = np.array([False, True, False]) + tm.assert_numpy_array_equal(isna(idx), exp) + tm.assert_numpy_array_equal(notna(idx), ~exp) + + exp = Series([False, True, False]) + s = Series(idx) + tm.assert_series_equal(isna(s), exp) + tm.assert_series_equal(notna(s), ~exp) + s = Series(idx, dtype=object) + tm.assert_series_equal(isna(s), exp) + tm.assert_series_equal(notna(s), ~exp) + + def test_decimal(self): + # scalars GH#23530 + a = Decimal(1.0) + assert isna(a) is False + assert notna(a) is True + + b = Decimal("NaN") + assert isna(b) is True + assert notna(b) is False + + # array + arr = np.array([a, b]) + expected = np.array([False, True]) + result = isna(arr) + tm.assert_numpy_array_equal(result, expected) + + result = notna(arr) + tm.assert_numpy_array_equal(result, ~expected) + + # series + ser = Series(arr) + expected = Series(expected) + result = isna(ser) + tm.assert_series_equal(result, expected) + + result = notna(ser) + tm.assert_series_equal(result, ~expected) + + # index + idx = Index(arr) + expected = np.array([False, True]) + result = isna(idx) + tm.assert_numpy_array_equal(result, expected) + + result = notna(idx) + tm.assert_numpy_array_equal(result, ~expected) + + +@pytest.mark.parametrize("dtype_equal", [True, False]) +def test_array_equivalent(dtype_equal): + assert array_equivalent( + np.array([np.nan, np.nan]), np.array([np.nan, np.nan]), dtype_equal=dtype_equal + ) + assert array_equivalent( + np.array([np.nan, 1, np.nan]), + np.array([np.nan, 1, np.nan]), + dtype_equal=dtype_equal, + ) + assert array_equivalent( + np.array([np.nan, None], dtype="object"), + np.array([np.nan, None], dtype="object"), + dtype_equal=dtype_equal, + ) + # Check the handling of nested arrays in array_equivalent_object + assert array_equivalent( + np.array([np.array([np.nan, None], dtype="object"), None], dtype="object"), + np.array([np.array([np.nan, None], dtype="object"), None], dtype="object"), + dtype_equal=dtype_equal, + ) + assert array_equivalent( + np.array([np.nan, 1 + 1j], dtype="complex"), + np.array([np.nan, 1 + 1j], dtype="complex"), + dtype_equal=dtype_equal, + ) + assert not array_equivalent( + np.array([np.nan, 1 + 1j], dtype="complex"), + np.array([np.nan, 1 + 2j], dtype="complex"), + dtype_equal=dtype_equal, + ) + assert not array_equivalent( + np.array([np.nan, 1, np.nan]), + np.array([np.nan, 2, np.nan]), + dtype_equal=dtype_equal, + ) + assert not array_equivalent( + np.array(["a", "b", "c", "d"]), np.array(["e", "e"]), dtype_equal=dtype_equal + ) + assert array_equivalent( + Index([0, np.nan]), Index([0, np.nan]), dtype_equal=dtype_equal + ) + assert not array_equivalent( + Index([0, np.nan]), Index([1, np.nan]), dtype_equal=dtype_equal + ) + + +@pytest.mark.parametrize("dtype_equal", [True, False]) +def test_array_equivalent_tdi(dtype_equal): + assert array_equivalent( + TimedeltaIndex([0, np.nan]), + TimedeltaIndex([0, np.nan]), + dtype_equal=dtype_equal, + ) + assert not array_equivalent( + TimedeltaIndex([0, np.nan]), + TimedeltaIndex([1, np.nan]), + dtype_equal=dtype_equal, + ) + + +@pytest.mark.parametrize("dtype_equal", [True, False]) +def test_array_equivalent_dti(dtype_equal): + assert array_equivalent( + DatetimeIndex([0, np.nan]), DatetimeIndex([0, np.nan]), dtype_equal=dtype_equal + ) + assert not array_equivalent( + DatetimeIndex([0, np.nan]), DatetimeIndex([1, np.nan]), dtype_equal=dtype_equal + ) + + dti1 = DatetimeIndex([0, np.nan], tz="US/Eastern") + dti2 = DatetimeIndex([0, np.nan], tz="CET") + dti3 = DatetimeIndex([1, np.nan], tz="US/Eastern") + + assert array_equivalent( + dti1, + dti1, + dtype_equal=dtype_equal, + ) + assert not array_equivalent( + dti1, + dti3, + dtype_equal=dtype_equal, + ) + # The rest are not dtype_equal + assert not array_equivalent(DatetimeIndex([0, np.nan]), dti1) + assert array_equivalent( + dti2, + dti1, + ) + + assert not array_equivalent(DatetimeIndex([0, np.nan]), TimedeltaIndex([0, np.nan])) + + +@pytest.mark.parametrize( + "val", [1, 1.1, 1 + 1j, True, "abc", [1, 2], (1, 2), {1, 2}, {"a": 1}, None] +) +def test_array_equivalent_series(val): + arr = np.array([1, 2]) + msg = "elementwise comparison failed" + cm = ( + # stacklevel is chosen to make sense when called from .equals + tm.assert_produces_warning(FutureWarning, match=msg, check_stacklevel=False) + if isinstance(val, str) and not np_version_gte1p25 + else nullcontext() + ) + with cm: + assert not array_equivalent(Series([arr, arr]), Series([arr, val])) + + +def test_array_equivalent_array_mismatched_shape(): + # to trigger the motivating bug, the first N elements of the arrays need + # to match + first = np.array([1, 2, 3]) + second = np.array([1, 2]) + + left = Series([first, "a"], dtype=object) + right = Series([second, "a"], dtype=object) + assert not array_equivalent(left, right) + + +def test_array_equivalent_array_mismatched_dtype(): + # same shape, different dtype can still be equivalent + first = np.array([1, 2], dtype=np.float64) + second = np.array([1, 2]) + + left = Series([first, "a"], dtype=object) + right = Series([second, "a"], dtype=object) + assert array_equivalent(left, right) + + +def test_array_equivalent_different_dtype_but_equal(): + # Unclear if this is exposed anywhere in the public-facing API + assert array_equivalent(np.array([1, 2]), np.array([1.0, 2.0])) + + +@pytest.mark.parametrize( + "lvalue, rvalue", + [ + # There are 3 variants for each of lvalue and rvalue. We include all + # three for the tz-naive `now` and exclude the datetim64 variant + # for utcnow because it drops tzinfo. + (fix_now, fix_utcnow), + (fix_now.to_datetime64(), fix_utcnow), + (fix_now.to_pydatetime(), fix_utcnow), + (fix_now, fix_utcnow), + (fix_now.to_datetime64(), fix_utcnow.to_pydatetime()), + (fix_now.to_pydatetime(), fix_utcnow.to_pydatetime()), + ], +) +def test_array_equivalent_tzawareness(lvalue, rvalue): + # we shouldn't raise if comparing tzaware and tznaive datetimes + left = np.array([lvalue], dtype=object) + right = np.array([rvalue], dtype=object) + + assert not array_equivalent(left, right, strict_nan=True) + assert not array_equivalent(left, right, strict_nan=False) + + +def test_array_equivalent_compat(): + # see gh-13388 + m = np.array([(1, 2), (3, 4)], dtype=[("a", int), ("b", float)]) + n = np.array([(1, 2), (3, 4)], dtype=[("a", int), ("b", float)]) + assert array_equivalent(m, n, strict_nan=True) + assert array_equivalent(m, n, strict_nan=False) + + m = np.array([(1, 2), (3, 4)], dtype=[("a", int), ("b", float)]) + n = np.array([(1, 2), (4, 3)], dtype=[("a", int), ("b", float)]) + assert not array_equivalent(m, n, strict_nan=True) + assert not array_equivalent(m, n, strict_nan=False) + + m = np.array([(1, 2), (3, 4)], dtype=[("a", int), ("b", float)]) + n = np.array([(1, 2), (3, 4)], dtype=[("b", int), ("a", float)]) + assert not array_equivalent(m, n, strict_nan=True) + assert not array_equivalent(m, n, strict_nan=False) + + +@pytest.mark.parametrize("dtype", ["O", "S", "U"]) +def test_array_equivalent_str(dtype): + assert array_equivalent( + np.array(["A", "B"], dtype=dtype), np.array(["A", "B"], dtype=dtype) + ) + assert not array_equivalent( + np.array(["A", "B"], dtype=dtype), np.array(["A", "X"], dtype=dtype) + ) + + +@pytest.mark.parametrize("strict_nan", [True, False]) +def test_array_equivalent_nested(strict_nan): + # reached in groupby aggregations, make sure we use np.any when checking + # if the comparison is truthy + left = np.array([np.array([50, 70, 90]), np.array([20, 30])], dtype=object) + right = np.array([np.array([50, 70, 90]), np.array([20, 30])], dtype=object) + + assert array_equivalent(left, right, strict_nan=strict_nan) + assert not array_equivalent(left, right[::-1], strict_nan=strict_nan) + + left = np.empty(2, dtype=object) + left[:] = [np.array([50, 70, 90]), np.array([20, 30, 40])] + right = np.empty(2, dtype=object) + right[:] = [np.array([50, 70, 90]), np.array([20, 30, 40])] + assert array_equivalent(left, right, strict_nan=strict_nan) + assert not array_equivalent(left, right[::-1], strict_nan=strict_nan) + + left = np.array([np.array([50, 50, 50]), np.array([40, 40])], dtype=object) + right = np.array([50, 40]) + assert not array_equivalent(left, right, strict_nan=strict_nan) + + +@pytest.mark.filterwarnings("ignore:elementwise comparison failed:DeprecationWarning") +@pytest.mark.parametrize("strict_nan", [True, False]) +def test_array_equivalent_nested2(strict_nan): + # more than one level of nesting + left = np.array( + [ + np.array([np.array([50, 70]), np.array([90])], dtype=object), + np.array([np.array([20, 30])], dtype=object), + ], + dtype=object, + ) + right = np.array( + [ + np.array([np.array([50, 70]), np.array([90])], dtype=object), + np.array([np.array([20, 30])], dtype=object), + ], + dtype=object, + ) + assert array_equivalent(left, right, strict_nan=strict_nan) + assert not array_equivalent(left, right[::-1], strict_nan=strict_nan) + + left = np.array([np.array([np.array([50, 50, 50])], dtype=object)], dtype=object) + right = np.array([50]) + assert not array_equivalent(left, right, strict_nan=strict_nan) + + +@pytest.mark.parametrize("strict_nan", [True, False]) +def test_array_equivalent_nested_list(strict_nan): + left = np.array([[50, 70, 90], [20, 30]], dtype=object) + right = np.array([[50, 70, 90], [20, 30]], dtype=object) + + assert array_equivalent(left, right, strict_nan=strict_nan) + assert not array_equivalent(left, right[::-1], strict_nan=strict_nan) + + left = np.array([[50, 50, 50], [40, 40]], dtype=object) + right = np.array([50, 40]) + assert not array_equivalent(left, right, strict_nan=strict_nan) + + +@pytest.mark.filterwarnings("ignore:elementwise comparison failed:DeprecationWarning") +@pytest.mark.xfail(reason="failing") +@pytest.mark.parametrize("strict_nan", [True, False]) +def test_array_equivalent_nested_mixed_list(strict_nan): + # mixed arrays / lists in left and right + # https://github.com/pandas-dev/pandas/issues/50360 + left = np.array([np.array([1, 2, 3]), np.array([4, 5])], dtype=object) + right = np.array([[1, 2, 3], [4, 5]], dtype=object) + + assert array_equivalent(left, right, strict_nan=strict_nan) + assert not array_equivalent(left, right[::-1], strict_nan=strict_nan) + + # multiple levels of nesting + left = np.array( + [ + np.array([np.array([1, 2, 3]), np.array([4, 5])], dtype=object), + np.array([np.array([6]), np.array([7, 8]), np.array([9])], dtype=object), + ], + dtype=object, + ) + right = np.array([[[1, 2, 3], [4, 5]], [[6], [7, 8], [9]]], dtype=object) + assert array_equivalent(left, right, strict_nan=strict_nan) + assert not array_equivalent(left, right[::-1], strict_nan=strict_nan) + + # same-length lists + subarr = np.empty(2, dtype=object) + subarr[:] = [ + np.array([None, "b"], dtype=object), + np.array(["c", "d"], dtype=object), + ] + left = np.array([subarr, None], dtype=object) + right = np.array([[[None, "b"], ["c", "d"]], None], dtype=object) + assert array_equivalent(left, right, strict_nan=strict_nan) + assert not array_equivalent(left, right[::-1], strict_nan=strict_nan) + + +@pytest.mark.xfail(reason="failing") +@pytest.mark.parametrize("strict_nan", [True, False]) +def test_array_equivalent_nested_dicts(strict_nan): + left = np.array([{"f1": 1, "f2": np.array(["a", "b"], dtype=object)}], dtype=object) + right = np.array( + [{"f1": 1, "f2": np.array(["a", "b"], dtype=object)}], dtype=object + ) + assert array_equivalent(left, right, strict_nan=strict_nan) + assert not array_equivalent(left, right[::-1], strict_nan=strict_nan) + + right2 = np.array([{"f1": 1, "f2": ["a", "b"]}], dtype=object) + assert array_equivalent(left, right2, strict_nan=strict_nan) + assert not array_equivalent(left, right2[::-1], strict_nan=strict_nan) + + +def test_array_equivalent_index_with_tuples(): + # GH#48446 + idx1 = Index(np.array([(pd.NA, 4), (1, 1)], dtype="object")) + idx2 = Index(np.array([(1, 1), (pd.NA, 4)], dtype="object")) + assert not array_equivalent(idx1, idx2) + assert not idx1.equals(idx2) + assert not array_equivalent(idx2, idx1) + assert not idx2.equals(idx1) + + idx1 = Index(np.array([(4, pd.NA), (1, 1)], dtype="object")) + idx2 = Index(np.array([(1, 1), (4, pd.NA)], dtype="object")) + assert not array_equivalent(idx1, idx2) + assert not idx1.equals(idx2) + assert not array_equivalent(idx2, idx1) + assert not idx2.equals(idx1) + + +@pytest.mark.parametrize( + "dtype, na_value", + [ + # Datetime-like + (np.dtype("M8[ns]"), np.datetime64("NaT", "ns")), + (np.dtype("m8[ns]"), np.timedelta64("NaT", "ns")), + (DatetimeTZDtype.construct_from_string("datetime64[ns, US/Eastern]"), NaT), + (PeriodDtype("M"), NaT), + # Integer + ("u1", 0), + ("u2", 0), + ("u4", 0), + ("u8", 0), + ("i1", 0), + ("i2", 0), + ("i4", 0), + ("i8", 0), + # Bool + ("bool", False), + # Float + ("f2", np.nan), + ("f4", np.nan), + ("f8", np.nan), + # Object + ("O", np.nan), + # Interval + (IntervalDtype(), np.nan), + ], +) +def test_na_value_for_dtype(dtype, na_value): + result = na_value_for_dtype(pandas_dtype(dtype)) + # identify check doesn't work for datetime64/timedelta64("NaT") bc they + # are not singletons + assert result is na_value or ( + isna(result) and isna(na_value) and type(result) is type(na_value) + ) + + +class TestNAObj: + def _check_behavior(self, arr, expected): + result = libmissing.isnaobj(arr) + tm.assert_numpy_array_equal(result, expected) + result = libmissing.isnaobj(arr, inf_as_na=True) + tm.assert_numpy_array_equal(result, expected) + + arr = np.atleast_2d(arr) + expected = np.atleast_2d(expected) + + result = libmissing.isnaobj(arr) + tm.assert_numpy_array_equal(result, expected) + result = libmissing.isnaobj(arr, inf_as_na=True) + tm.assert_numpy_array_equal(result, expected) + + # Test fortran order + arr = arr.copy(order="F") + result = libmissing.isnaobj(arr) + tm.assert_numpy_array_equal(result, expected) + result = libmissing.isnaobj(arr, inf_as_na=True) + tm.assert_numpy_array_equal(result, expected) + + def test_basic(self): + arr = np.array([1, None, "foo", -5.1, NaT, np.nan]) + expected = np.array([False, True, False, False, True, True]) + + self._check_behavior(arr, expected) + + def test_non_obj_dtype(self): + arr = np.array([1, 3, np.nan, 5], dtype=float) + expected = np.array([False, False, True, False]) + + self._check_behavior(arr, expected) + + def test_empty_arr(self): + arr = np.array([]) + expected = np.array([], dtype=bool) + + self._check_behavior(arr, expected) + + def test_empty_str_inp(self): + arr = np.array([""]) # empty but not na + expected = np.array([False]) + + self._check_behavior(arr, expected) + + def test_empty_like(self): + # see gh-13717: no segfaults! + arr = np.empty_like([None]) + expected = np.array([True]) + + self._check_behavior(arr, expected) + + +m8_units = ["as", "ps", "ns", "us", "ms", "s", "m", "h", "D", "W", "M", "Y"] + +na_vals = ( + [ + None, + NaT, + float("NaN"), + complex("NaN"), + np.nan, + np.float64("NaN"), + np.float32("NaN"), + np.complex64(np.nan), + np.complex128(np.nan), + np.datetime64("NaT"), + np.timedelta64("NaT"), + ] + + [np.datetime64("NaT", unit) for unit in m8_units] + + [np.timedelta64("NaT", unit) for unit in m8_units] +) + +inf_vals = [ + float("inf"), + float("-inf"), + complex("inf"), + complex("-inf"), + np.inf, + -np.inf, +] + +int_na_vals = [ + # Values that match iNaT, which we treat as null in specific cases + np.int64(NaT._value), + int(NaT._value), +] + +sometimes_na_vals = [Decimal("NaN")] + +never_na_vals = [ + # float/complex values that when viewed as int64 match iNaT + -0.0, + np.float64("-0.0"), + -0j, + np.complex64(-0j), +] + + +class TestLibMissing: + @pytest.mark.parametrize("func", [libmissing.checknull, isna]) + @pytest.mark.parametrize( + "value", na_vals + sometimes_na_vals # type: ignore[operator] + ) + def test_checknull_na_vals(self, func, value): + assert func(value) + + @pytest.mark.parametrize("func", [libmissing.checknull, isna]) + @pytest.mark.parametrize("value", inf_vals) + def test_checknull_inf_vals(self, func, value): + assert not func(value) + + @pytest.mark.parametrize("func", [libmissing.checknull, isna]) + @pytest.mark.parametrize("value", int_na_vals) + def test_checknull_intna_vals(self, func, value): + assert not func(value) + + @pytest.mark.parametrize("func", [libmissing.checknull, isna]) + @pytest.mark.parametrize("value", never_na_vals) + def test_checknull_never_na_vals(self, func, value): + assert not func(value) + + @pytest.mark.parametrize( + "value", na_vals + sometimes_na_vals # type: ignore[operator] + ) + def test_checknull_old_na_vals(self, value): + assert libmissing.checknull(value, inf_as_na=True) + + @pytest.mark.parametrize("value", inf_vals) + def test_checknull_old_inf_vals(self, value): + assert libmissing.checknull(value, inf_as_na=True) + + @pytest.mark.parametrize("value", int_na_vals) + def test_checknull_old_intna_vals(self, value): + assert not libmissing.checknull(value, inf_as_na=True) + + @pytest.mark.parametrize("value", int_na_vals) + def test_checknull_old_never_na_vals(self, value): + assert not libmissing.checknull(value, inf_as_na=True) + + def test_is_matching_na(self, nulls_fixture, nulls_fixture2): + left = nulls_fixture + right = nulls_fixture2 + + assert libmissing.is_matching_na(left, left) + + if left is right: + assert libmissing.is_matching_na(left, right) + elif is_float(left) and is_float(right): + # np.nan vs float("NaN") we consider as matching + assert libmissing.is_matching_na(left, right) + elif type(left) is type(right): + # e.g. both Decimal("NaN") + assert libmissing.is_matching_na(left, right) + else: + assert not libmissing.is_matching_na(left, right) + + def test_is_matching_na_nan_matches_none(self): + assert not libmissing.is_matching_na(None, np.nan) + assert not libmissing.is_matching_na(np.nan, None) + + assert libmissing.is_matching_na(None, np.nan, nan_matches_none=True) + assert libmissing.is_matching_na(np.nan, None, nan_matches_none=True) + + +class TestIsValidNAForDtype: + def test_is_valid_na_for_dtype_interval(self): + dtype = IntervalDtype("int64", "left") + assert not is_valid_na_for_dtype(NaT, dtype) + + dtype = IntervalDtype("datetime64[ns]", "both") + assert not is_valid_na_for_dtype(NaT, dtype) + + def test_is_valid_na_for_dtype_categorical(self): + dtype = CategoricalDtype(categories=[0, 1, 2]) + assert is_valid_na_for_dtype(np.nan, dtype) + + assert not is_valid_na_for_dtype(NaT, dtype) + assert not is_valid_na_for_dtype(np.datetime64("NaT", "ns"), dtype) + assert not is_valid_na_for_dtype(np.timedelta64("NaT", "ns"), dtype) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1cff3a9c86eb014d3f371466689a6f5c8de530b9 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/__pycache__/common.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/__pycache__/common.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..93ffc0f5c35955dc8b6e27d19d0838564332aa79 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/__pycache__/common.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/__pycache__/conftest.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/__pycache__/conftest.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..74c35cce4c5cc9fad0de60e42c8afb9f4c34cd3c Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/__pycache__/conftest.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/__pycache__/test_alter_axes.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/__pycache__/test_alter_axes.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..5d44f9de64102f53fb7359616b52de2732f33cec Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/__pycache__/test_alter_axes.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/__pycache__/test_api.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/__pycache__/test_api.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..986ed20d7adf3195c2c484d027001f9d5413e913 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/__pycache__/test_api.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/__pycache__/test_arrow_interface.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/__pycache__/test_arrow_interface.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d154d94c1a4ef945b38d101e69aa552620a30d7c Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/__pycache__/test_arrow_interface.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/__pycache__/test_block_internals.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/__pycache__/test_block_internals.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f28420862ec41f0c86a63510e215822d1a06918c Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/__pycache__/test_block_internals.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/__pycache__/test_cumulative.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/__pycache__/test_cumulative.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..220d9f66a3b0bd56a7fee5d79d413f8aed4f3eaa Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/__pycache__/test_cumulative.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/__pycache__/test_iteration.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/__pycache__/test_iteration.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..209aad3ee7d447997417b527c0f5b7fe8dc4582f Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/__pycache__/test_iteration.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/__pycache__/test_logical_ops.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/__pycache__/test_logical_ops.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..61b60f836a57a5621488642576363a01c2dc45fd Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/__pycache__/test_logical_ops.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/__pycache__/test_nonunique_indexes.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/__pycache__/test_nonunique_indexes.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..2a1d3e9bc93687937064a92b7c139f6323cc65af Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/__pycache__/test_nonunique_indexes.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/__pycache__/test_npfuncs.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/__pycache__/test_npfuncs.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..54249a632c2ade752fe5a7f830dec47c942419dc Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/__pycache__/test_npfuncs.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/__pycache__/test_query_eval.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/__pycache__/test_query_eval.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a8aeab6e2e1605abac5a270d0e951476d670cdd9 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/__pycache__/test_query_eval.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/__pycache__/test_repr.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/__pycache__/test_repr.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..8922ad5fbf6139972e6dc02089804f29029d6e4d Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/__pycache__/test_repr.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/__pycache__/test_subclass.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/__pycache__/test_subclass.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c5b3471659a18f82e82a6a8fc93e6f04bb337b9c Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/__pycache__/test_subclass.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/__pycache__/test_ufunc.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/__pycache__/test_ufunc.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..38ebb638322e4fd1003e56b461d2ebddf62916b1 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/__pycache__/test_ufunc.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/__pycache__/test_unary.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/__pycache__/test_unary.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..4874ba1f0e0bab80517801d7991479095c9961fe Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/__pycache__/test_unary.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/__pycache__/test_validate.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/__pycache__/test_validate.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..127a17a435743c478770df3a27d9296ec8ea0268 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/__pycache__/test_validate.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/constructors/__init__.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/constructors/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/constructors/test_from_dict.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/constructors/test_from_dict.py new file mode 100644 index 0000000000000000000000000000000000000000..60a8e688b3b8adc495c7b6e6ddb47532df131852 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/constructors/test_from_dict.py @@ -0,0 +1,228 @@ +from collections import OrderedDict + +import numpy as np +import pytest + +from pandas._config import using_pyarrow_string_dtype + +from pandas import ( + DataFrame, + Index, + MultiIndex, + RangeIndex, + Series, +) +import pandas._testing as tm + + +class TestFromDict: + # Note: these tests are specific to the from_dict method, not for + # passing dictionaries to DataFrame.__init__ + + def test_constructor_list_of_odicts(self): + data = [ + OrderedDict([["a", 1.5], ["b", 3], ["c", 4], ["d", 6]]), + OrderedDict([["a", 1.5], ["b", 3], ["d", 6]]), + OrderedDict([["a", 1.5], ["d", 6]]), + OrderedDict(), + OrderedDict([["a", 1.5], ["b", 3], ["c", 4]]), + OrderedDict([["b", 3], ["c", 4], ["d", 6]]), + ] + + result = DataFrame(data) + expected = DataFrame.from_dict( + dict(zip(range(len(data)), data)), orient="index" + ) + tm.assert_frame_equal(result, expected.reindex(result.index)) + + def test_constructor_single_row(self): + data = [OrderedDict([["a", 1.5], ["b", 3], ["c", 4], ["d", 6]])] + + result = DataFrame(data) + expected = DataFrame.from_dict(dict(zip([0], data)), orient="index").reindex( + result.index + ) + tm.assert_frame_equal(result, expected) + + @pytest.mark.skipif( + using_pyarrow_string_dtype(), reason="columns inferring logic broken" + ) + def test_constructor_list_of_series(self): + data = [ + OrderedDict([["a", 1.5], ["b", 3.0], ["c", 4.0]]), + OrderedDict([["a", 1.5], ["b", 3.0], ["c", 6.0]]), + ] + sdict = OrderedDict(zip(["x", "y"], data)) + idx = Index(["a", "b", "c"]) + + # all named + data2 = [ + Series([1.5, 3, 4], idx, dtype="O", name="x"), + Series([1.5, 3, 6], idx, name="y"), + ] + result = DataFrame(data2) + expected = DataFrame.from_dict(sdict, orient="index") + tm.assert_frame_equal(result, expected) + + # some unnamed + data2 = [ + Series([1.5, 3, 4], idx, dtype="O", name="x"), + Series([1.5, 3, 6], idx), + ] + result = DataFrame(data2) + + sdict = OrderedDict(zip(["x", "Unnamed 0"], data)) + expected = DataFrame.from_dict(sdict, orient="index") + tm.assert_frame_equal(result, expected) + + # none named + data = [ + OrderedDict([["a", 1.5], ["b", 3], ["c", 4], ["d", 6]]), + OrderedDict([["a", 1.5], ["b", 3], ["d", 6]]), + OrderedDict([["a", 1.5], ["d", 6]]), + OrderedDict(), + OrderedDict([["a", 1.5], ["b", 3], ["c", 4]]), + OrderedDict([["b", 3], ["c", 4], ["d", 6]]), + ] + data = [Series(d) for d in data] + + result = DataFrame(data) + sdict = OrderedDict(zip(range(len(data)), data)) + expected = DataFrame.from_dict(sdict, orient="index") + tm.assert_frame_equal(result, expected.reindex(result.index)) + + result2 = DataFrame(data, index=np.arange(6, dtype=np.int64)) + tm.assert_frame_equal(result, result2) + + result = DataFrame([Series(dtype=object)]) + expected = DataFrame(index=[0]) + tm.assert_frame_equal(result, expected) + + data = [ + OrderedDict([["a", 1.5], ["b", 3.0], ["c", 4.0]]), + OrderedDict([["a", 1.5], ["b", 3.0], ["c", 6.0]]), + ] + sdict = OrderedDict(zip(range(len(data)), data)) + + idx = Index(["a", "b", "c"]) + data2 = [Series([1.5, 3, 4], idx, dtype="O"), Series([1.5, 3, 6], idx)] + result = DataFrame(data2) + expected = DataFrame.from_dict(sdict, orient="index") + tm.assert_frame_equal(result, expected) + + def test_constructor_orient(self, float_string_frame): + data_dict = float_string_frame.T._series + recons = DataFrame.from_dict(data_dict, orient="index") + expected = float_string_frame.reindex(index=recons.index) + tm.assert_frame_equal(recons, expected) + + # dict of sequence + a = {"hi": [32, 3, 3], "there": [3, 5, 3]} + rs = DataFrame.from_dict(a, orient="index") + xp = DataFrame.from_dict(a).T.reindex(list(a.keys())) + tm.assert_frame_equal(rs, xp) + + def test_constructor_from_ordered_dict(self): + # GH#8425 + a = OrderedDict( + [ + ("one", OrderedDict([("col_a", "foo1"), ("col_b", "bar1")])), + ("two", OrderedDict([("col_a", "foo2"), ("col_b", "bar2")])), + ("three", OrderedDict([("col_a", "foo3"), ("col_b", "bar3")])), + ] + ) + expected = DataFrame.from_dict(a, orient="columns").T + result = DataFrame.from_dict(a, orient="index") + tm.assert_frame_equal(result, expected) + + def test_from_dict_columns_parameter(self): + # GH#18529 + # Test new columns parameter for from_dict that was added to make + # from_items(..., orient='index', columns=[...]) easier to replicate + result = DataFrame.from_dict( + OrderedDict([("A", [1, 2]), ("B", [4, 5])]), + orient="index", + columns=["one", "two"], + ) + expected = DataFrame([[1, 2], [4, 5]], index=["A", "B"], columns=["one", "two"]) + tm.assert_frame_equal(result, expected) + + msg = "cannot use columns parameter with orient='columns'" + with pytest.raises(ValueError, match=msg): + DataFrame.from_dict( + {"A": [1, 2], "B": [4, 5]}, + orient="columns", + columns=["one", "two"], + ) + with pytest.raises(ValueError, match=msg): + DataFrame.from_dict({"A": [1, 2], "B": [4, 5]}, columns=["one", "two"]) + + @pytest.mark.parametrize( + "data_dict, orient, expected", + [ + ({}, "index", RangeIndex(0)), + ( + [{("a",): 1}, {("a",): 2}], + "columns", + Index([("a",)], tupleize_cols=False), + ), + ( + [OrderedDict([(("a",), 1), (("b",), 2)])], + "columns", + Index([("a",), ("b",)], tupleize_cols=False), + ), + ([{("a", "b"): 1}], "columns", Index([("a", "b")], tupleize_cols=False)), + ], + ) + def test_constructor_from_dict_tuples(self, data_dict, orient, expected): + # GH#16769 + df = DataFrame.from_dict(data_dict, orient) + result = df.columns + tm.assert_index_equal(result, expected) + + def test_frame_dict_constructor_empty_series(self): + s1 = Series( + [1, 2, 3, 4], index=MultiIndex.from_tuples([(1, 2), (1, 3), (2, 2), (2, 4)]) + ) + s2 = Series( + [1, 2, 3, 4], index=MultiIndex.from_tuples([(1, 2), (1, 3), (3, 2), (3, 4)]) + ) + s3 = Series(dtype=object) + + # it works! + DataFrame({"foo": s1, "bar": s2, "baz": s3}) + DataFrame.from_dict({"foo": s1, "baz": s3, "bar": s2}) + + def test_from_dict_scalars_requires_index(self): + msg = "If using all scalar values, you must pass an index" + with pytest.raises(ValueError, match=msg): + DataFrame.from_dict(OrderedDict([("b", 8), ("a", 5), ("a", 6)])) + + def test_from_dict_orient_invalid(self): + msg = ( + "Expected 'index', 'columns' or 'tight' for orient parameter. " + "Got 'abc' instead" + ) + with pytest.raises(ValueError, match=msg): + DataFrame.from_dict({"foo": 1, "baz": 3, "bar": 2}, orient="abc") + + def test_from_dict_order_with_single_column(self): + data = { + "alpha": { + "value2": 123, + "value1": 532, + "animal": 222, + "plant": False, + "name": "test", + } + } + result = DataFrame.from_dict( + data, + orient="columns", + ) + expected = DataFrame( + [[123], [532], [222], [False], ["test"]], + index=["value2", "value1", "animal", "plant", "name"], + columns=["alpha"], + ) + tm.assert_frame_equal(result, expected) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/constructors/test_from_records.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/constructors/test_from_records.py new file mode 100644 index 0000000000000000000000000000000000000000..3622571f1365d5a00cb4c84d45a5c169f4ff54d2 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/constructors/test_from_records.py @@ -0,0 +1,505 @@ +from collections.abc import Iterator +from datetime import datetime +from decimal import Decimal + +import numpy as np +import pytest +import pytz + +from pandas._config import using_pyarrow_string_dtype + +from pandas.compat import is_platform_little_endian + +from pandas import ( + CategoricalIndex, + DataFrame, + Index, + Interval, + RangeIndex, + Series, + date_range, +) +import pandas._testing as tm + + +class TestFromRecords: + def test_from_records_dt64tz_frame(self): + # GH#51162 don't lose tz when calling from_records with DataFrame input + dti = date_range("2016-01-01", periods=10, tz="US/Pacific") + df = DataFrame({i: dti for i in range(4)}) + with tm.assert_produces_warning(FutureWarning): + res = DataFrame.from_records(df) + tm.assert_frame_equal(res, df) + + def test_from_records_with_datetimes(self): + # this may fail on certain platforms because of a numpy issue + # related GH#6140 + if not is_platform_little_endian(): + pytest.skip("known failure of test on non-little endian") + + # construction with a null in a recarray + # GH#6140 + expected = DataFrame({"EXPIRY": [datetime(2005, 3, 1, 0, 0), None]}) + + arrdata = [np.array([datetime(2005, 3, 1, 0, 0), None])] + dtypes = [("EXPIRY", " None: + self.args = args + + def __getitem__(self, i): + return self.args[i] + + def __iter__(self) -> Iterator: + return iter(self.args) + + recs = [Record(1, 2, 3), Record(4, 5, 6), Record(7, 8, 9)] + tups = [tuple(rec) for rec in recs] + + result = DataFrame.from_records(recs) + expected = DataFrame.from_records(tups) + tm.assert_frame_equal(result, expected) + + def test_from_records_len0_with_columns(self): + # GH#2633 + result = DataFrame.from_records([], index="foo", columns=["foo", "bar"]) + expected = Index(["bar"]) + + assert len(result) == 0 + assert result.index.name == "foo" + tm.assert_index_equal(result.columns, expected) + + def test_from_records_series_list_dict(self): + # GH#27358 + expected = DataFrame([[{"a": 1, "b": 2}, {"a": 3, "b": 4}]]).T + data = Series([[{"a": 1, "b": 2}], [{"a": 3, "b": 4}]]) + result = DataFrame.from_records(data) + tm.assert_frame_equal(result, expected) + + def test_from_records_series_categorical_index(self): + # GH#32805 + index = CategoricalIndex( + [Interval(-20, -10), Interval(-10, 0), Interval(0, 10)] + ) + series_of_dicts = Series([{"a": 1}, {"a": 2}, {"b": 3}], index=index) + frame = DataFrame.from_records(series_of_dicts, index=index) + expected = DataFrame( + {"a": [1, 2, np.nan], "b": [np.nan, np.nan, 3]}, index=index + ) + tm.assert_frame_equal(frame, expected) + + def test_frame_from_records_utc(self): + rec = {"datum": 1.5, "begin_time": datetime(2006, 4, 27, tzinfo=pytz.utc)} + + # it works + DataFrame.from_records([rec], index="begin_time") + + def test_from_records_to_records(self): + # from numpy documentation + arr = np.zeros((2,), dtype=("i4,f4,S10")) + arr[:] = [(1, 2.0, "Hello"), (2, 3.0, "World")] + + DataFrame.from_records(arr) + + index = Index(np.arange(len(arr))[::-1]) + indexed_frame = DataFrame.from_records(arr, index=index) + tm.assert_index_equal(indexed_frame.index, index) + + # without names, it should go to last ditch + arr2 = np.zeros((2, 3)) + tm.assert_frame_equal(DataFrame.from_records(arr2), DataFrame(arr2)) + + # wrong length + msg = "|".join( + [ + r"Length of values \(2\) does not match length of index \(1\)", + ] + ) + with pytest.raises(ValueError, match=msg): + DataFrame.from_records(arr, index=index[:-1]) + + indexed_frame = DataFrame.from_records(arr, index="f1") + + # what to do? + records = indexed_frame.to_records() + assert len(records.dtype.names) == 3 + + records = indexed_frame.to_records(index=False) + assert len(records.dtype.names) == 2 + assert "index" not in records.dtype.names + + def test_from_records_nones(self): + tuples = [(1, 2, None, 3), (1, 2, None, 3), (None, 2, 5, 3)] + + df = DataFrame.from_records(tuples, columns=["a", "b", "c", "d"]) + assert np.isnan(df["c"][0]) + + def test_from_records_iterator(self): + arr = np.array( + [(1.0, 1.0, 2, 2), (3.0, 3.0, 4, 4), (5.0, 5.0, 6, 6), (7.0, 7.0, 8, 8)], + dtype=[ + ("x", np.float64), + ("u", np.float32), + ("y", np.int64), + ("z", np.int32), + ], + ) + df = DataFrame.from_records(iter(arr), nrows=2) + xp = DataFrame( + { + "x": np.array([1.0, 3.0], dtype=np.float64), + "u": np.array([1.0, 3.0], dtype=np.float32), + "y": np.array([2, 4], dtype=np.int64), + "z": np.array([2, 4], dtype=np.int32), + } + ) + tm.assert_frame_equal(df.reindex_like(xp), xp) + + # no dtypes specified here, so just compare with the default + arr = [(1.0, 2), (3.0, 4), (5.0, 6), (7.0, 8)] + df = DataFrame.from_records(iter(arr), columns=["x", "y"], nrows=2) + tm.assert_frame_equal(df, xp.reindex(columns=["x", "y"]), check_dtype=False) + + def test_from_records_tuples_generator(self): + def tuple_generator(length): + for i in range(length): + letters = "ABCDEFGHIJKLMNOPQRSTUVWXYZ" + yield (i, letters[i % len(letters)], i / length) + + columns_names = ["Integer", "String", "Float"] + columns = [ + [i[j] for i in tuple_generator(10)] for j in range(len(columns_names)) + ] + data = {"Integer": columns[0], "String": columns[1], "Float": columns[2]} + expected = DataFrame(data, columns=columns_names) + + generator = tuple_generator(10) + result = DataFrame.from_records(generator, columns=columns_names) + tm.assert_frame_equal(result, expected) + + def test_from_records_lists_generator(self): + def list_generator(length): + for i in range(length): + letters = "ABCDEFGHIJKLMNOPQRSTUVWXYZ" + yield [i, letters[i % len(letters)], i / length] + + columns_names = ["Integer", "String", "Float"] + columns = [ + [i[j] for i in list_generator(10)] for j in range(len(columns_names)) + ] + data = {"Integer": columns[0], "String": columns[1], "Float": columns[2]} + expected = DataFrame(data, columns=columns_names) + + generator = list_generator(10) + result = DataFrame.from_records(generator, columns=columns_names) + tm.assert_frame_equal(result, expected) + + def test_from_records_columns_not_modified(self): + tuples = [(1, 2, 3), (1, 2, 3), (2, 5, 3)] + + columns = ["a", "b", "c"] + original_columns = list(columns) + + DataFrame.from_records(tuples, columns=columns, index="a") + + assert columns == original_columns + + def test_from_records_decimal(self): + tuples = [(Decimal("1.5"),), (Decimal("2.5"),), (None,)] + + df = DataFrame.from_records(tuples, columns=["a"]) + assert df["a"].dtype == object + + df = DataFrame.from_records(tuples, columns=["a"], coerce_float=True) + assert df["a"].dtype == np.float64 + assert np.isnan(df["a"].values[-1]) + + def test_from_records_duplicates(self): + result = DataFrame.from_records([(1, 2, 3), (4, 5, 6)], columns=["a", "b", "a"]) + + expected = DataFrame([(1, 2, 3), (4, 5, 6)], columns=["a", "b", "a"]) + + tm.assert_frame_equal(result, expected) + + def test_from_records_set_index_name(self): + def create_dict(order_id): + return { + "order_id": order_id, + "quantity": np.random.default_rng(2).integers(1, 10), + "price": np.random.default_rng(2).integers(1, 10), + } + + documents = [create_dict(i) for i in range(10)] + # demo missing data + documents.append({"order_id": 10, "quantity": 5}) + + result = DataFrame.from_records(documents, index="order_id") + assert result.index.name == "order_id" + + # MultiIndex + result = DataFrame.from_records(documents, index=["order_id", "quantity"]) + assert result.index.names == ("order_id", "quantity") + + def test_from_records_misc_brokenness(self): + # GH#2179 + + data = {1: ["foo"], 2: ["bar"]} + + result = DataFrame.from_records(data, columns=["a", "b"]) + exp = DataFrame(data, columns=["a", "b"]) + tm.assert_frame_equal(result, exp) + + # overlap in index/index_names + + data = {"a": [1, 2, 3], "b": [4, 5, 6]} + + result = DataFrame.from_records(data, index=["a", "b", "c"]) + exp = DataFrame(data, index=["a", "b", "c"]) + tm.assert_frame_equal(result, exp) + + def test_from_records_misc_brokenness2(self): + # GH#2623 + rows = [] + rows.append([datetime(2010, 1, 1), 1]) + rows.append([datetime(2010, 1, 2), "hi"]) # test col upconverts to obj + result = DataFrame.from_records(rows, columns=["date", "test"]) + expected = DataFrame( + {"date": [row[0] for row in rows], "test": [row[1] for row in rows]} + ) + tm.assert_frame_equal(result, expected) + assert result.dtypes["test"] == np.dtype(object) + + def test_from_records_misc_brokenness3(self): + rows = [] + rows.append([datetime(2010, 1, 1), 1]) + rows.append([datetime(2010, 1, 2), 1]) + result = DataFrame.from_records(rows, columns=["date", "test"]) + expected = DataFrame( + {"date": [row[0] for row in rows], "test": [row[1] for row in rows]} + ) + tm.assert_frame_equal(result, expected) + + def test_from_records_empty(self): + # GH#3562 + result = DataFrame.from_records([], columns=["a", "b", "c"]) + expected = DataFrame(columns=["a", "b", "c"]) + tm.assert_frame_equal(result, expected) + + result = DataFrame.from_records([], columns=["a", "b", "b"]) + expected = DataFrame(columns=["a", "b", "b"]) + tm.assert_frame_equal(result, expected) + + def test_from_records_empty_with_nonempty_fields_gh3682(self): + a = np.array([(1, 2)], dtype=[("id", np.int64), ("value", np.int64)]) + df = DataFrame.from_records(a, index="id") + + ex_index = Index([1], name="id") + expected = DataFrame({"value": [2]}, index=ex_index, columns=["value"]) + tm.assert_frame_equal(df, expected) + + b = a[:0] + df2 = DataFrame.from_records(b, index="id") + tm.assert_frame_equal(df2, df.iloc[:0]) + + def test_from_records_empty2(self): + # GH#42456 + dtype = [("prop", int)] + shape = (0, len(dtype)) + arr = np.empty(shape, dtype=dtype) + + result = DataFrame.from_records(arr) + expected = DataFrame({"prop": np.array([], dtype=int)}) + tm.assert_frame_equal(result, expected) + + alt = DataFrame(arr) + tm.assert_frame_equal(alt, expected) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/indexing/__init__.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/indexing/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/indexing/test_coercion.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/indexing/test_coercion.py new file mode 100644 index 0000000000000000000000000000000000000000..ba0d8613b62287942f7fffeac96efe2abd0c9cd9 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/indexing/test_coercion.py @@ -0,0 +1,199 @@ +""" +Tests for values coercion in setitem-like operations on DataFrame. + +For the most part, these should be multi-column DataFrames, otherwise +we would share the tests with Series. +""" +import numpy as np +import pytest + +import pandas as pd +from pandas import ( + DataFrame, + MultiIndex, + NaT, + Series, + Timestamp, + date_range, +) +import pandas._testing as tm + + +class TestDataFrameSetitemCoercion: + @pytest.mark.parametrize("consolidate", [True, False]) + def test_loc_setitem_multiindex_columns(self, consolidate): + # GH#18415 Setting values in a single column preserves dtype, + # while setting them in multiple columns did unwanted cast. + + # Note that A here has 2 blocks, below we do the same thing + # with a consolidated frame. + A = DataFrame(np.zeros((6, 5), dtype=np.float32)) + A = pd.concat([A, A], axis=1, keys=[1, 2]) + if consolidate: + A = A._consolidate() + + A.loc[2:3, (1, slice(2, 3))] = np.ones((2, 2), dtype=np.float32) + assert (A.dtypes == np.float32).all() + + A.loc[0:5, (1, slice(2, 3))] = np.ones((6, 2), dtype=np.float32) + + assert (A.dtypes == np.float32).all() + + A.loc[:, (1, slice(2, 3))] = np.ones((6, 2), dtype=np.float32) + assert (A.dtypes == np.float32).all() + + # TODO: i think this isn't about MultiIndex and could be done with iloc? + + +def test_37477(): + # fixed by GH#45121 + orig = DataFrame({"A": [1, 2, 3], "B": [3, 4, 5]}) + expected = DataFrame({"A": [1, 2, 3], "B": [3, 1.2, 5]}) + + df = orig.copy() + with tm.assert_produces_warning( + FutureWarning, match="Setting an item of incompatible dtype" + ): + df.at[1, "B"] = 1.2 + tm.assert_frame_equal(df, expected) + + df = orig.copy() + with tm.assert_produces_warning( + FutureWarning, match="Setting an item of incompatible dtype" + ): + df.loc[1, "B"] = 1.2 + tm.assert_frame_equal(df, expected) + + df = orig.copy() + with tm.assert_produces_warning( + FutureWarning, match="Setting an item of incompatible dtype" + ): + df.iat[1, 1] = 1.2 + tm.assert_frame_equal(df, expected) + + df = orig.copy() + with tm.assert_produces_warning( + FutureWarning, match="Setting an item of incompatible dtype" + ): + df.iloc[1, 1] = 1.2 + tm.assert_frame_equal(df, expected) + + +def test_6942(indexer_al): + # check that the .at __setitem__ after setting "Live" actually sets the data + start = Timestamp("2014-04-01") + t1 = Timestamp("2014-04-23 12:42:38.883082") + t2 = Timestamp("2014-04-24 01:33:30.040039") + + dti = date_range(start, periods=1) + orig = DataFrame(index=dti, columns=["timenow", "Live"]) + + df = orig.copy() + indexer_al(df)[start, "timenow"] = t1 + + df["Live"] = True + + df.at[start, "timenow"] = t2 + assert df.iloc[0, 0] == t2 + + +def test_26395(indexer_al): + # .at case fixed by GH#45121 (best guess) + df = DataFrame(index=["A", "B", "C"]) + df["D"] = 0 + + indexer_al(df)["C", "D"] = 2 + expected = DataFrame({"D": [0, 0, 2]}, index=["A", "B", "C"], dtype=np.int64) + tm.assert_frame_equal(df, expected) + + with tm.assert_produces_warning( + FutureWarning, match="Setting an item of incompatible dtype" + ): + indexer_al(df)["C", "D"] = 44.5 + expected = DataFrame({"D": [0, 0, 44.5]}, index=["A", "B", "C"], dtype=np.float64) + tm.assert_frame_equal(df, expected) + + with tm.assert_produces_warning( + FutureWarning, match="Setting an item of incompatible dtype" + ): + indexer_al(df)["C", "D"] = "hello" + expected = DataFrame({"D": [0, 0, "hello"]}, index=["A", "B", "C"], dtype=object) + tm.assert_frame_equal(df, expected) + + +@pytest.mark.xfail(reason="unwanted upcast") +def test_15231(): + df = DataFrame([[1, 2], [3, 4]], columns=["a", "b"]) + df.loc[2] = Series({"a": 5, "b": 6}) + assert (df.dtypes == np.int64).all() + + df.loc[3] = Series({"a": 7}) + + # df["a"] doesn't have any NaNs, should not have been cast + exp_dtypes = Series([np.int64, np.float64], dtype=object, index=["a", "b"]) + tm.assert_series_equal(df.dtypes, exp_dtypes) + + +def test_iloc_setitem_unnecesssary_float_upcasting(): + # GH#12255 + df = DataFrame( + { + 0: np.array([1, 3], dtype=np.float32), + 1: np.array([2, 4], dtype=np.float32), + 2: ["a", "b"], + } + ) + orig = df.copy() + + values = df[0].values.reshape(2, 1) + df.iloc[:, 0:1] = values + + tm.assert_frame_equal(df, orig) + + +@pytest.mark.xfail(reason="unwanted casting to dt64") +def test_12499(): + # TODO: OP in GH#12499 used np.datetim64("NaT") instead of pd.NaT, + # which has consequences for the expected df["two"] (though i think at + # the time it might not have because of a separate bug). See if it makes + # a difference which one we use here. + ts = Timestamp("2016-03-01 03:13:22.98986", tz="UTC") + + data = [{"one": 0, "two": ts}] + orig = DataFrame(data) + df = orig.copy() + df.loc[1] = [np.nan, NaT] + + expected = DataFrame( + {"one": [0, np.nan], "two": Series([ts, NaT], dtype="datetime64[ns, UTC]")} + ) + tm.assert_frame_equal(df, expected) + + data = [{"one": 0, "two": ts}] + df = orig.copy() + df.loc[1, :] = [np.nan, NaT] + tm.assert_frame_equal(df, expected) + + +def test_20476(): + mi = MultiIndex.from_product([["A", "B"], ["a", "b", "c"]]) + df = DataFrame(-1, index=range(3), columns=mi) + filler = DataFrame([[1, 2, 3.0]] * 3, index=range(3), columns=["a", "b", "c"]) + df["A"] = filler + + expected = DataFrame( + { + 0: [1, 1, 1], + 1: [2, 2, 2], + 2: [3.0, 3.0, 3.0], + 3: [-1, -1, -1], + 4: [-1, -1, -1], + 5: [-1, -1, -1], + } + ) + expected.columns = mi + exp_dtypes = Series( + [np.dtype(np.int64)] * 2 + [np.dtype(np.float64)] + [np.dtype(np.int64)] * 3, + index=mi, + ) + tm.assert_series_equal(df.dtypes, exp_dtypes) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/indexing/test_delitem.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/indexing/test_delitem.py new file mode 100644 index 0000000000000000000000000000000000000000..daec991b7a8dbf8de0221f041e767cb9ce58ae29 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/indexing/test_delitem.py @@ -0,0 +1,60 @@ +import re + +import numpy as np +import pytest + +from pandas import ( + DataFrame, + MultiIndex, +) + + +class TestDataFrameDelItem: + def test_delitem(self, float_frame): + del float_frame["A"] + assert "A" not in float_frame + + def test_delitem_multiindex(self): + midx = MultiIndex.from_product([["A", "B"], [1, 2]]) + df = DataFrame(np.random.default_rng(2).standard_normal((4, 4)), columns=midx) + assert len(df.columns) == 4 + assert ("A",) in df.columns + assert "A" in df.columns + + result = df["A"] + assert isinstance(result, DataFrame) + del df["A"] + + assert len(df.columns) == 2 + + # A still in the levels, BUT get a KeyError if trying + # to delete + assert ("A",) not in df.columns + with pytest.raises(KeyError, match=re.escape("('A',)")): + del df[("A",)] + + # behavior of dropped/deleted MultiIndex levels changed from + # GH 2770 to GH 19027: MultiIndex no longer '.__contains__' + # levels which are dropped/deleted + assert "A" not in df.columns + with pytest.raises(KeyError, match=re.escape("('A',)")): + del df["A"] + + def test_delitem_corner(self, float_frame): + f = float_frame.copy() + del f["D"] + assert len(f.columns) == 3 + with pytest.raises(KeyError, match=r"^'D'$"): + del f["D"] + del f["B"] + assert len(f.columns) == 2 + + def test_delitem_col_still_multiindex(self): + arrays = [["a", "b", "c", "top"], ["", "", "", "OD"], ["", "", "", "wx"]] + + tuples = sorted(zip(*arrays)) + index = MultiIndex.from_tuples(tuples) + + df = DataFrame(np.random.default_rng(2).standard_normal((3, 4)), columns=index) + del df[("a", "", "")] + assert isinstance(df.columns, MultiIndex) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/indexing/test_get.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/indexing/test_get.py new file mode 100644 index 0000000000000000000000000000000000000000..5f2651eec683c10097fb623728048b64778c87e8 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/indexing/test_get.py @@ -0,0 +1,27 @@ +import pytest + +from pandas import DataFrame +import pandas._testing as tm + + +class TestGet: + def test_get(self, float_frame): + b = float_frame.get("B") + tm.assert_series_equal(b, float_frame["B"]) + + assert float_frame.get("foo") is None + tm.assert_series_equal( + float_frame.get("foo", float_frame["B"]), float_frame["B"] + ) + + @pytest.mark.parametrize( + "df", + [ + DataFrame(), + DataFrame(columns=list("AB")), + DataFrame(columns=list("AB"), index=range(3)), + ], + ) + def test_get_none(self, df): + # see gh-5652 + assert df.get(None) is None diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/indexing/test_get_value.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/indexing/test_get_value.py new file mode 100644 index 0000000000000000000000000000000000000000..65a1c64a1578ad0cadd9ed6470ab60a2087ffec5 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/indexing/test_get_value.py @@ -0,0 +1,22 @@ +import pytest + +from pandas import ( + DataFrame, + MultiIndex, +) + + +class TestGetValue: + def test_get_set_value_no_partial_indexing(self): + # partial w/ MultiIndex raise exception + index = MultiIndex.from_tuples([(0, 1), (0, 2), (1, 1), (1, 2)]) + df = DataFrame(index=index, columns=range(4)) + with pytest.raises(KeyError, match=r"^0$"): + df._get_value(0, 1) + + def test_get_value(self, float_frame): + for idx in float_frame.index: + for col in float_frame.columns: + result = float_frame._get_value(idx, col) + expected = float_frame[col][idx] + assert result == expected diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/indexing/test_getitem.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/indexing/test_getitem.py new file mode 100644 index 0000000000000000000000000000000000000000..a36b0c0e850b3d0994349065cc5390c9f40cbf60 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/indexing/test_getitem.py @@ -0,0 +1,472 @@ +import re + +import numpy as np +import pytest + +from pandas import ( + Categorical, + CategoricalDtype, + CategoricalIndex, + DataFrame, + DateOffset, + DatetimeIndex, + Index, + MultiIndex, + Series, + Timestamp, + concat, + date_range, + get_dummies, + period_range, +) +import pandas._testing as tm +from pandas.core.arrays import SparseArray + + +class TestGetitem: + def test_getitem_unused_level_raises(self): + # GH#20410 + mi = MultiIndex( + levels=[["a_lot", "onlyone", "notevenone"], [1970, ""]], + codes=[[1, 0], [1, 0]], + ) + df = DataFrame(-1, index=range(3), columns=mi) + + with pytest.raises(KeyError, match="notevenone"): + df["notevenone"] + + def test_getitem_periodindex(self): + rng = period_range("1/1/2000", periods=5) + df = DataFrame(np.random.default_rng(2).standard_normal((10, 5)), columns=rng) + + ts = df[rng[0]] + tm.assert_series_equal(ts, df.iloc[:, 0]) + + ts = df["1/1/2000"] + tm.assert_series_equal(ts, df.iloc[:, 0]) + + def test_getitem_list_of_labels_categoricalindex_cols(self): + # GH#16115 + cats = Categorical([Timestamp("12-31-1999"), Timestamp("12-31-2000")]) + + expected = DataFrame([[1, 0], [0, 1]], dtype="bool", index=[0, 1], columns=cats) + dummies = get_dummies(cats) + result = dummies[list(dummies.columns)] + tm.assert_frame_equal(result, expected) + + def test_getitem_sparse_column_return_type_and_dtype(self): + # https://github.com/pandas-dev/pandas/issues/23559 + data = SparseArray([0, 1]) + df = DataFrame({"A": data}) + expected = Series(data, name="A") + result = df["A"] + tm.assert_series_equal(result, expected) + + # Also check iloc and loc while we're here + result = df.iloc[:, 0] + tm.assert_series_equal(result, expected) + + result = df.loc[:, "A"] + tm.assert_series_equal(result, expected) + + def test_getitem_string_columns(self): + # GH#46185 + df = DataFrame([[1, 2]], columns=Index(["A", "B"], dtype="string")) + result = df.A + expected = df["A"] + tm.assert_series_equal(result, expected) + + +class TestGetitemListLike: + def test_getitem_list_missing_key(self): + # GH#13822, incorrect error string with non-unique columns when missing + # column is accessed + df = DataFrame({"x": [1.0], "y": [2.0], "z": [3.0]}) + df.columns = ["x", "x", "z"] + + # Check that we get the correct value in the KeyError + with pytest.raises(KeyError, match=r"\['y'\] not in index"): + df[["x", "y", "z"]] + + def test_getitem_list_duplicates(self): + # GH#1943 + df = DataFrame( + np.random.default_rng(2).standard_normal((4, 4)), columns=list("AABC") + ) + df.columns.name = "foo" + + result = df[["B", "C"]] + assert result.columns.name == "foo" + + expected = df.iloc[:, 2:] + tm.assert_frame_equal(result, expected) + + def test_getitem_dupe_cols(self): + df = DataFrame([[1, 2, 3], [4, 5, 6]], columns=["a", "a", "b"]) + msg = "\"None of [Index(['baf'], dtype=" + with pytest.raises(KeyError, match=re.escape(msg)): + df[["baf"]] + + @pytest.mark.parametrize( + "idx_type", + [ + list, + iter, + Index, + set, + lambda keys: dict(zip(keys, range(len(keys)))), + lambda keys: dict(zip(keys, range(len(keys)))).keys(), + ], + ids=["list", "iter", "Index", "set", "dict", "dict_keys"], + ) + @pytest.mark.parametrize("levels", [1, 2]) + def test_getitem_listlike(self, idx_type, levels, float_frame): + # GH#21294 + + if levels == 1: + frame, missing = float_frame, "food" + else: + # MultiIndex columns + frame = DataFrame( + np.random.default_rng(2).standard_normal((8, 3)), + columns=Index( + [("foo", "bar"), ("baz", "qux"), ("peek", "aboo")], + name=("sth", "sth2"), + ), + ) + missing = ("good", "food") + + keys = [frame.columns[1], frame.columns[0]] + idx = idx_type(keys) + idx_check = list(idx_type(keys)) + + if isinstance(idx, (set, dict)): + with pytest.raises(TypeError, match="as an indexer is not supported"): + frame[idx] + + return + else: + result = frame[idx] + + expected = frame.loc[:, idx_check] + expected.columns.names = frame.columns.names + + tm.assert_frame_equal(result, expected) + + idx = idx_type(keys + [missing]) + with pytest.raises(KeyError, match="not in index"): + frame[idx] + + def test_getitem_iloc_generator(self): + # GH#39614 + df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) + indexer = (x for x in [1, 2]) + result = df.iloc[indexer] + expected = DataFrame({"a": [2, 3], "b": [5, 6]}, index=[1, 2]) + tm.assert_frame_equal(result, expected) + + def test_getitem_iloc_two_dimensional_generator(self): + df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) + indexer = (x for x in [1, 2]) + result = df.iloc[indexer, 1] + expected = Series([5, 6], name="b", index=[1, 2]) + tm.assert_series_equal(result, expected) + + def test_getitem_iloc_dateoffset_days(self): + # GH 46671 + df = DataFrame( + list(range(10)), + index=date_range("01-01-2022", periods=10, freq=DateOffset(days=1)), + ) + result = df.loc["2022-01-01":"2022-01-03"] + expected = DataFrame( + [0, 1, 2], + index=DatetimeIndex( + ["2022-01-01", "2022-01-02", "2022-01-03"], + dtype="datetime64[ns]", + freq=DateOffset(days=1), + ), + ) + tm.assert_frame_equal(result, expected) + + df = DataFrame( + list(range(10)), + index=date_range( + "01-01-2022", periods=10, freq=DateOffset(days=1, hours=2) + ), + ) + result = df.loc["2022-01-01":"2022-01-03"] + expected = DataFrame( + [0, 1, 2], + index=DatetimeIndex( + ["2022-01-01 00:00:00", "2022-01-02 02:00:00", "2022-01-03 04:00:00"], + dtype="datetime64[ns]", + freq=DateOffset(days=1, hours=2), + ), + ) + tm.assert_frame_equal(result, expected) + + df = DataFrame( + list(range(10)), + index=date_range("01-01-2022", periods=10, freq=DateOffset(minutes=3)), + ) + result = df.loc["2022-01-01":"2022-01-03"] + tm.assert_frame_equal(result, df) + + +class TestGetitemCallable: + def test_getitem_callable(self, float_frame): + # GH#12533 + result = float_frame[lambda x: "A"] + expected = float_frame.loc[:, "A"] + tm.assert_series_equal(result, expected) + + result = float_frame[lambda x: ["A", "B"]] + expected = float_frame.loc[:, ["A", "B"]] + tm.assert_frame_equal(result, float_frame.loc[:, ["A", "B"]]) + + df = float_frame[:3] + result = df[lambda x: [True, False, True]] + expected = float_frame.iloc[[0, 2], :] + tm.assert_frame_equal(result, expected) + + def test_loc_multiindex_columns_one_level(self): + # GH#29749 + df = DataFrame([[1, 2]], columns=[["a", "b"]]) + expected = DataFrame([1], columns=[["a"]]) + + result = df["a"] + tm.assert_frame_equal(result, expected) + + result = df.loc[:, "a"] + tm.assert_frame_equal(result, expected) + + +class TestGetitemBooleanMask: + def test_getitem_bool_mask_categorical_index(self): + df3 = DataFrame( + { + "A": np.arange(6, dtype="int64"), + }, + index=CategoricalIndex( + [1, 1, 2, 1, 3, 2], + dtype=CategoricalDtype([3, 2, 1], ordered=True), + name="B", + ), + ) + df4 = DataFrame( + { + "A": np.arange(6, dtype="int64"), + }, + index=CategoricalIndex( + [1, 1, 2, 1, 3, 2], + dtype=CategoricalDtype([3, 2, 1], ordered=False), + name="B", + ), + ) + + result = df3[df3.index == "a"] + expected = df3.iloc[[]] + tm.assert_frame_equal(result, expected) + + result = df4[df4.index == "a"] + expected = df4.iloc[[]] + tm.assert_frame_equal(result, expected) + + result = df3[df3.index == 1] + expected = df3.iloc[[0, 1, 3]] + tm.assert_frame_equal(result, expected) + + result = df4[df4.index == 1] + expected = df4.iloc[[0, 1, 3]] + tm.assert_frame_equal(result, expected) + + # since we have an ordered categorical + + # CategoricalIndex([1, 1, 2, 1, 3, 2], + # categories=[3, 2, 1], + # ordered=True, + # name='B') + result = df3[df3.index < 2] + expected = df3.iloc[[4]] + tm.assert_frame_equal(result, expected) + + result = df3[df3.index > 1] + expected = df3.iloc[[]] + tm.assert_frame_equal(result, expected) + + # unordered + # cannot be compared + + # CategoricalIndex([1, 1, 2, 1, 3, 2], + # categories=[3, 2, 1], + # ordered=False, + # name='B') + msg = "Unordered Categoricals can only compare equality or not" + with pytest.raises(TypeError, match=msg): + df4[df4.index < 2] + with pytest.raises(TypeError, match=msg): + df4[df4.index > 1] + + @pytest.mark.parametrize( + "data1,data2,expected_data", + ( + ( + [[1, 2], [3, 4]], + [[0.5, 6], [7, 8]], + [[np.nan, 3.0], [np.nan, 4.0], [np.nan, 7.0], [6.0, 8.0]], + ), + ( + [[1, 2], [3, 4]], + [[5, 6], [7, 8]], + [[np.nan, 3.0], [np.nan, 4.0], [5, 7], [6, 8]], + ), + ), + ) + def test_getitem_bool_mask_duplicate_columns_mixed_dtypes( + self, + data1, + data2, + expected_data, + ): + # GH#31954 + + df1 = DataFrame(np.array(data1)) + df2 = DataFrame(np.array(data2)) + df = concat([df1, df2], axis=1) + + result = df[df > 2] + + exdict = {i: np.array(col) for i, col in enumerate(expected_data)} + expected = DataFrame(exdict).rename(columns={2: 0, 3: 1}) + tm.assert_frame_equal(result, expected) + + @pytest.fixture + def df_dup_cols(self): + dups = ["A", "A", "C", "D"] + df = DataFrame(np.arange(12).reshape(3, 4), columns=dups, dtype="float64") + return df + + def test_getitem_boolean_frame_unaligned_with_duplicate_columns(self, df_dup_cols): + # `df.A > 6` is a DataFrame with a different shape from df + + # boolean with the duplicate raises + df = df_dup_cols + msg = "cannot reindex on an axis with duplicate labels" + with pytest.raises(ValueError, match=msg): + df[df.A > 6] + + def test_getitem_boolean_series_with_duplicate_columns(self, df_dup_cols): + # boolean indexing + # GH#4879 + df = DataFrame( + np.arange(12).reshape(3, 4), columns=["A", "B", "C", "D"], dtype="float64" + ) + expected = df[df.C > 6] + expected.columns = df_dup_cols.columns + + df = df_dup_cols + result = df[df.C > 6] + + tm.assert_frame_equal(result, expected) + + def test_getitem_boolean_frame_with_duplicate_columns(self, df_dup_cols): + # where + df = DataFrame( + np.arange(12).reshape(3, 4), columns=["A", "B", "C", "D"], dtype="float64" + ) + # `df > 6` is a DataFrame with the same shape+alignment as df + expected = df[df > 6] + expected.columns = df_dup_cols.columns + + df = df_dup_cols + result = df[df > 6] + + tm.assert_frame_equal(result, expected) + + def test_getitem_empty_frame_with_boolean(self): + # Test for issue GH#11859 + + df = DataFrame() + df2 = df[df > 0] + tm.assert_frame_equal(df, df2) + + def test_getitem_returns_view_when_column_is_unique_in_df( + self, using_copy_on_write, warn_copy_on_write + ): + # GH#45316 + df = DataFrame([[1, 2, 3], [4, 5, 6]], columns=["a", "a", "b"]) + df_orig = df.copy() + view = df["b"] + with tm.assert_cow_warning(warn_copy_on_write): + view.loc[:] = 100 + if using_copy_on_write: + expected = df_orig + else: + expected = DataFrame([[1, 2, 100], [4, 5, 100]], columns=["a", "a", "b"]) + tm.assert_frame_equal(df, expected) + + def test_getitem_frozenset_unique_in_column(self): + # GH#41062 + df = DataFrame([[1, 2, 3, 4]], columns=[frozenset(["KEY"]), "B", "C", "C"]) + result = df[frozenset(["KEY"])] + expected = Series([1], name=frozenset(["KEY"])) + tm.assert_series_equal(result, expected) + + +class TestGetitemSlice: + def test_getitem_slice_float64(self, frame_or_series): + values = np.arange(10.0, 50.0, 2) + index = Index(values) + + start, end = values[[5, 15]] + + data = np.random.default_rng(2).standard_normal((20, 3)) + if frame_or_series is not DataFrame: + data = data[:, 0] + + obj = frame_or_series(data, index=index) + + result = obj[start:end] + expected = obj.iloc[5:16] + tm.assert_equal(result, expected) + + result = obj.loc[start:end] + tm.assert_equal(result, expected) + + def test_getitem_datetime_slice(self): + # GH#43223 + df = DataFrame( + {"a": 0}, + index=DatetimeIndex( + [ + "11.01.2011 22:00", + "11.01.2011 23:00", + "12.01.2011 00:00", + "2011-01-13 00:00", + ] + ), + ) + with pytest.raises( + KeyError, match="Value based partial slicing on non-monotonic" + ): + df["2011-01-01":"2011-11-01"] + + def test_getitem_slice_same_dim_only_one_axis(self): + # GH#54622 + df = DataFrame(np.random.default_rng(2).standard_normal((10, 8))) + result = df.iloc[(slice(None, None, 2),)] + assert result.shape == (5, 8) + expected = df.iloc[slice(None, None, 2), slice(None)] + tm.assert_frame_equal(result, expected) + + +class TestGetitemDeprecatedIndexers: + @pytest.mark.parametrize("key", [{"a", "b"}, {"a": "a"}]) + def test_getitem_dict_and_set_deprecated(self, key): + # GH#42825 enforced in 2.0 + df = DataFrame( + [[1, 2], [3, 4]], columns=MultiIndex.from_tuples([("a", 1), ("b", 2)]) + ) + with pytest.raises(TypeError, match="as an indexer is not supported"): + df[key] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/indexing/test_indexing.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/indexing/test_indexing.py new file mode 100644 index 0000000000000000000000000000000000000000..22d9c7f26a57ceb3876e2e30cef11017cb8b24b0 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/indexing/test_indexing.py @@ -0,0 +1,2024 @@ +from collections import namedtuple +from datetime import ( + datetime, + timedelta, +) +from decimal import Decimal +import re + +import numpy as np +import pytest + +from pandas._libs import iNaT +from pandas.errors import ( + InvalidIndexError, + PerformanceWarning, + SettingWithCopyError, +) +import pandas.util._test_decorators as td + +from pandas.core.dtypes.common import is_integer + +import pandas as pd +from pandas import ( + Categorical, + DataFrame, + DatetimeIndex, + Index, + MultiIndex, + Series, + Timestamp, + date_range, + isna, + notna, + to_datetime, +) +import pandas._testing as tm + +# We pass through a TypeError raised by numpy +_slice_msg = "slice indices must be integers or None or have an __index__ method" + + +class TestDataFrameIndexing: + def test_getitem(self, float_frame): + # Slicing + sl = float_frame[:20] + assert len(sl.index) == 20 + + # Column access + for _, series in sl.items(): + assert len(series.index) == 20 + tm.assert_index_equal(series.index, sl.index) + + for key, _ in float_frame._series.items(): + assert float_frame[key] is not None + + assert "random" not in float_frame + with pytest.raises(KeyError, match="random"): + float_frame["random"] + + def test_getitem_numeric_should_not_fallback_to_positional(self, any_numeric_dtype): + # GH51053 + dtype = any_numeric_dtype + idx = Index([1, 0, 1], dtype=dtype) + df = DataFrame([[1, 2, 3], [4, 5, 6]], columns=idx) + result = df[1] + expected = DataFrame([[1, 3], [4, 6]], columns=Index([1, 1], dtype=dtype)) + tm.assert_frame_equal(result, expected, check_exact=True) + + def test_getitem2(self, float_frame): + df = float_frame.copy() + df["$10"] = np.random.default_rng(2).standard_normal(len(df)) + + ad = np.random.default_rng(2).standard_normal(len(df)) + df["@awesome_domain"] = ad + + with pytest.raises(KeyError, match=re.escape("'df[\"$10\"]'")): + df.__getitem__('df["$10"]') + + res = df["@awesome_domain"] + tm.assert_numpy_array_equal(ad, res.values) + + def test_setitem_numeric_should_not_fallback_to_positional(self, any_numeric_dtype): + # GH51053 + dtype = any_numeric_dtype + idx = Index([1, 0, 1], dtype=dtype) + df = DataFrame([[1, 2, 3], [4, 5, 6]], columns=idx) + df[1] = 10 + expected = DataFrame([[10, 2, 10], [10, 5, 10]], columns=idx) + tm.assert_frame_equal(df, expected, check_exact=True) + + def test_setitem_list(self, float_frame): + float_frame["E"] = "foo" + data = float_frame[["A", "B"]] + float_frame[["B", "A"]] = data + + tm.assert_series_equal(float_frame["B"], data["A"], check_names=False) + tm.assert_series_equal(float_frame["A"], data["B"], check_names=False) + + msg = "Columns must be same length as key" + with pytest.raises(ValueError, match=msg): + data[["A"]] = float_frame[["A", "B"]] + newcolumndata = range(len(data.index) - 1) + msg = ( + rf"Length of values \({len(newcolumndata)}\) " + rf"does not match length of index \({len(data)}\)" + ) + with pytest.raises(ValueError, match=msg): + data["A"] = newcolumndata + + def test_setitem_list2(self): + df = DataFrame(0, index=range(3), columns=["tt1", "tt2"], dtype=int) + df.loc[1, ["tt1", "tt2"]] = [1, 2] + + result = df.loc[df.index[1], ["tt1", "tt2"]] + expected = Series([1, 2], df.columns, dtype=int, name=1) + tm.assert_series_equal(result, expected) + + df["tt1"] = df["tt2"] = "0" + df.loc[df.index[1], ["tt1", "tt2"]] = ["1", "2"] + result = df.loc[df.index[1], ["tt1", "tt2"]] + expected = Series(["1", "2"], df.columns, name=1) + tm.assert_series_equal(result, expected) + + def test_getitem_boolean(self, mixed_float_frame, mixed_int_frame, datetime_frame): + # boolean indexing + d = datetime_frame.index[10] + indexer = datetime_frame.index > d + indexer_obj = indexer.astype(object) + + subindex = datetime_frame.index[indexer] + subframe = datetime_frame[indexer] + + tm.assert_index_equal(subindex, subframe.index) + with pytest.raises(ValueError, match="Item wrong length"): + datetime_frame[indexer[:-1]] + + subframe_obj = datetime_frame[indexer_obj] + tm.assert_frame_equal(subframe_obj, subframe) + + with pytest.raises(ValueError, match="Boolean array expected"): + datetime_frame[datetime_frame] + + # test that Series work + indexer_obj = Series(indexer_obj, datetime_frame.index) + + subframe_obj = datetime_frame[indexer_obj] + tm.assert_frame_equal(subframe_obj, subframe) + + # test that Series indexers reindex + # we are producing a warning that since the passed boolean + # key is not the same as the given index, we will reindex + # not sure this is really necessary + with tm.assert_produces_warning(UserWarning): + indexer_obj = indexer_obj.reindex(datetime_frame.index[::-1]) + subframe_obj = datetime_frame[indexer_obj] + tm.assert_frame_equal(subframe_obj, subframe) + + # test df[df > 0] + for df in [ + datetime_frame, + mixed_float_frame, + mixed_int_frame, + ]: + data = df._get_numeric_data() + bif = df[df > 0] + bifw = DataFrame( + {c: np.where(data[c] > 0, data[c], np.nan) for c in data.columns}, + index=data.index, + columns=data.columns, + ) + + # add back other columns to compare + for c in df.columns: + if c not in bifw: + bifw[c] = df[c] + bifw = bifw.reindex(columns=df.columns) + + tm.assert_frame_equal(bif, bifw, check_dtype=False) + for c in df.columns: + if bif[c].dtype != bifw[c].dtype: + assert bif[c].dtype == df[c].dtype + + def test_getitem_boolean_casting(self, datetime_frame): + # don't upcast if we don't need to + df = datetime_frame.copy() + df["E"] = 1 + df["E"] = df["E"].astype("int32") + df["E1"] = df["E"].copy() + df["F"] = 1 + df["F"] = df["F"].astype("int64") + df["F1"] = df["F"].copy() + + casted = df[df > 0] + result = casted.dtypes + expected = Series( + [np.dtype("float64")] * 4 + + [np.dtype("int32")] * 2 + + [np.dtype("int64")] * 2, + index=["A", "B", "C", "D", "E", "E1", "F", "F1"], + ) + tm.assert_series_equal(result, expected) + + # int block splitting + df.loc[df.index[1:3], ["E1", "F1"]] = 0 + casted = df[df > 0] + result = casted.dtypes + expected = Series( + [np.dtype("float64")] * 4 + + [np.dtype("int32")] + + [np.dtype("float64")] + + [np.dtype("int64")] + + [np.dtype("float64")], + index=["A", "B", "C", "D", "E", "E1", "F", "F1"], + ) + tm.assert_series_equal(result, expected) + + @pytest.mark.parametrize( + "lst", [[True, False, True], [True, True, True], [False, False, False]] + ) + def test_getitem_boolean_list(self, lst): + df = DataFrame(np.arange(12).reshape(3, 4)) + result = df[lst] + expected = df.loc[df.index[lst]] + tm.assert_frame_equal(result, expected) + + def test_getitem_boolean_iadd(self): + arr = np.random.default_rng(2).standard_normal((5, 5)) + + df = DataFrame(arr.copy(), columns=["A", "B", "C", "D", "E"]) + + df[df < 0] += 1 + arr[arr < 0] += 1 + + tm.assert_almost_equal(df.values, arr) + + def test_boolean_index_empty_corner(self): + # #2096 + blah = DataFrame(np.empty([0, 1]), columns=["A"], index=DatetimeIndex([])) + + # both of these should succeed trivially + k = np.array([], bool) + + blah[k] + blah[k] = 0 + + def test_getitem_ix_mixed_integer(self): + df = DataFrame( + np.random.default_rng(2).standard_normal((4, 3)), + index=[1, 10, "C", "E"], + columns=[1, 2, 3], + ) + + result = df.iloc[:-1] + expected = df.loc[df.index[:-1]] + tm.assert_frame_equal(result, expected) + + result = df.loc[[1, 10]] + expected = df.loc[Index([1, 10])] + tm.assert_frame_equal(result, expected) + + def test_getitem_ix_mixed_integer2(self): + # 11320 + df = DataFrame( + { + "rna": (1.5, 2.2, 3.2, 4.5), + -1000: [11, 21, 36, 40], + 0: [10, 22, 43, 34], + 1000: [0, 10, 20, 30], + }, + columns=["rna", -1000, 0, 1000], + ) + result = df[[1000]] + expected = df.iloc[:, [3]] + tm.assert_frame_equal(result, expected) + result = df[[-1000]] + expected = df.iloc[:, [1]] + tm.assert_frame_equal(result, expected) + + def test_getattr(self, float_frame): + tm.assert_series_equal(float_frame.A, float_frame["A"]) + msg = "'DataFrame' object has no attribute 'NONEXISTENT_NAME'" + with pytest.raises(AttributeError, match=msg): + float_frame.NONEXISTENT_NAME + + def test_setattr_column(self): + df = DataFrame({"foobar": 1}, index=range(10)) + + df.foobar = 5 + assert (df.foobar == 5).all() + + def test_setitem( + self, float_frame, using_copy_on_write, warn_copy_on_write, using_infer_string + ): + # not sure what else to do here + series = float_frame["A"][::2] + float_frame["col5"] = series + assert "col5" in float_frame + + assert len(series) == 15 + assert len(float_frame) == 30 + + exp = np.ravel(np.column_stack((series.values, [np.nan] * 15))) + exp = Series(exp, index=float_frame.index, name="col5") + tm.assert_series_equal(float_frame["col5"], exp) + + series = float_frame["A"] + float_frame["col6"] = series + tm.assert_series_equal(series, float_frame["col6"], check_names=False) + + # set ndarray + arr = np.random.default_rng(2).standard_normal(len(float_frame)) + float_frame["col9"] = arr + assert (float_frame["col9"] == arr).all() + + float_frame["col7"] = 5 + assert (float_frame["col7"] == 5).all() + + float_frame["col0"] = 3.14 + assert (float_frame["col0"] == 3.14).all() + + float_frame["col8"] = "foo" + assert (float_frame["col8"] == "foo").all() + + # this is partially a view (e.g. some blocks are view) + # so raise/warn + smaller = float_frame[:2] + + msg = r"\nA value is trying to be set on a copy of a slice from a DataFrame" + if using_copy_on_write or warn_copy_on_write: + # With CoW, adding a new column doesn't raise a warning + smaller["col10"] = ["1", "2"] + else: + with pytest.raises(SettingWithCopyError, match=msg): + smaller["col10"] = ["1", "2"] + + if using_infer_string: + assert smaller["col10"].dtype == "string" + else: + assert smaller["col10"].dtype == np.object_ + assert (smaller["col10"] == ["1", "2"]).all() + + def test_setitem2(self): + # dtype changing GH4204 + df = DataFrame([[0, 0]]) + df.iloc[0] = np.nan + expected = DataFrame([[np.nan, np.nan]]) + tm.assert_frame_equal(df, expected) + + df = DataFrame([[0, 0]]) + df.loc[0] = np.nan + tm.assert_frame_equal(df, expected) + + def test_setitem_boolean(self, float_frame): + df = float_frame.copy() + values = float_frame.values.copy() + + df[df["A"] > 0] = 4 + values[values[:, 0] > 0] = 4 + tm.assert_almost_equal(df.values, values) + + # test that column reindexing works + series = df["A"] == 4 + series = series.reindex(df.index[::-1]) + df[series] = 1 + values[values[:, 0] == 4] = 1 + tm.assert_almost_equal(df.values, values) + + df[df > 0] = 5 + values[values > 0] = 5 + tm.assert_almost_equal(df.values, values) + + df[df == 5] = 0 + values[values == 5] = 0 + tm.assert_almost_equal(df.values, values) + + # a df that needs alignment first + df[df[:-1] < 0] = 2 + np.putmask(values[:-1], values[:-1] < 0, 2) + tm.assert_almost_equal(df.values, values) + + # indexed with same shape but rows-reversed df + df[df[::-1] == 2] = 3 + values[values == 2] = 3 + tm.assert_almost_equal(df.values, values) + + msg = "Must pass DataFrame or 2-d ndarray with boolean values only" + with pytest.raises(TypeError, match=msg): + df[df * 0] = 2 + + # index with DataFrame + df_orig = df.copy() + mask = df > np.abs(df) + df[df > np.abs(df)] = np.nan + values = df_orig.values.copy() + values[mask.values] = np.nan + expected = DataFrame(values, index=df_orig.index, columns=df_orig.columns) + tm.assert_frame_equal(df, expected) + + # set from DataFrame + df[df > np.abs(df)] = df * 2 + np.putmask(values, mask.values, df.values * 2) + expected = DataFrame(values, index=df_orig.index, columns=df_orig.columns) + tm.assert_frame_equal(df, expected) + + def test_setitem_cast(self, float_frame): + float_frame["D"] = float_frame["D"].astype("i8") + assert float_frame["D"].dtype == np.int64 + + # #669, should not cast? + # this is now set to int64, which means a replacement of the column to + # the value dtype (and nothing to do with the existing dtype) + float_frame["B"] = 0 + assert float_frame["B"].dtype == np.int64 + + # cast if pass array of course + float_frame["B"] = np.arange(len(float_frame)) + assert issubclass(float_frame["B"].dtype.type, np.integer) + + float_frame["foo"] = "bar" + float_frame["foo"] = 0 + assert float_frame["foo"].dtype == np.int64 + + float_frame["foo"] = "bar" + float_frame["foo"] = 2.5 + assert float_frame["foo"].dtype == np.float64 + + float_frame["something"] = 0 + assert float_frame["something"].dtype == np.int64 + float_frame["something"] = 2 + assert float_frame["something"].dtype == np.int64 + float_frame["something"] = 2.5 + assert float_frame["something"].dtype == np.float64 + + def test_setitem_corner(self, float_frame, using_infer_string): + # corner case + df = DataFrame({"B": [1.0, 2.0, 3.0], "C": ["a", "b", "c"]}, index=np.arange(3)) + del df["B"] + df["B"] = [1.0, 2.0, 3.0] + assert "B" in df + assert len(df.columns) == 2 + + df["A"] = "beginning" + df["E"] = "foo" + df["D"] = "bar" + df[datetime.now()] = "date" + df[datetime.now()] = 5.0 + + # what to do when empty frame with index + dm = DataFrame(index=float_frame.index) + dm["A"] = "foo" + dm["B"] = "bar" + assert len(dm.columns) == 2 + assert dm.values.dtype == np.object_ + + # upcast + dm["C"] = 1 + assert dm["C"].dtype == np.int64 + + dm["E"] = 1.0 + assert dm["E"].dtype == np.float64 + + # set existing column + dm["A"] = "bar" + assert "bar" == dm["A"].iloc[0] + + dm = DataFrame(index=np.arange(3)) + dm["A"] = 1 + dm["foo"] = "bar" + del dm["foo"] + dm["foo"] = "bar" + if using_infer_string: + assert dm["foo"].dtype == "string" + else: + assert dm["foo"].dtype == np.object_ + + dm["coercible"] = ["1", "2", "3"] + if using_infer_string: + assert dm["coercible"].dtype == "string" + else: + assert dm["coercible"].dtype == np.object_ + + def test_setitem_corner2(self): + data = { + "title": ["foobar", "bar", "foobar"] + ["foobar"] * 17, + "cruft": np.random.default_rng(2).random(20), + } + + df = DataFrame(data) + ix = df[df["title"] == "bar"].index + + df.loc[ix, ["title"]] = "foobar" + df.loc[ix, ["cruft"]] = 0 + + assert df.loc[1, "title"] == "foobar" + assert df.loc[1, "cruft"] == 0 + + def test_setitem_ambig(self, using_infer_string): + # Difficulties with mixed-type data + # Created as float type + dm = DataFrame(index=range(3), columns=range(3)) + + coercable_series = Series([Decimal(1) for _ in range(3)], index=range(3)) + uncoercable_series = Series(["foo", "bzr", "baz"], index=range(3)) + + dm[0] = np.ones(3) + assert len(dm.columns) == 3 + + dm[1] = coercable_series + assert len(dm.columns) == 3 + + dm[2] = uncoercable_series + assert len(dm.columns) == 3 + if using_infer_string: + assert dm[2].dtype == "string" + else: + assert dm[2].dtype == np.object_ + + def test_setitem_None(self, float_frame, using_infer_string): + # GH #766 + float_frame[None] = float_frame["A"] + key = None if not using_infer_string else np.nan + tm.assert_series_equal( + float_frame.iloc[:, -1], float_frame["A"], check_names=False + ) + tm.assert_series_equal( + float_frame.loc[:, key], float_frame["A"], check_names=False + ) + tm.assert_series_equal(float_frame[key], float_frame["A"], check_names=False) + + def test_loc_setitem_boolean_mask_allfalse(self): + # GH 9596 + df = DataFrame( + {"a": ["1", "2", "3"], "b": ["11", "22", "33"], "c": ["111", "222", "333"]} + ) + + result = df.copy() + result.loc[result.b.isna(), "a"] = result.a.copy() + tm.assert_frame_equal(result, df) + + def test_getitem_fancy_slice_integers_step(self): + df = DataFrame(np.random.default_rng(2).standard_normal((10, 5))) + + # this is OK + df.iloc[:8:2] + df.iloc[:8:2] = np.nan + assert isna(df.iloc[:8:2]).values.all() + + def test_getitem_setitem_integer_slice_keyerrors(self): + df = DataFrame( + np.random.default_rng(2).standard_normal((10, 5)), index=range(0, 20, 2) + ) + + # this is OK + cp = df.copy() + cp.iloc[4:10] = 0 + assert (cp.iloc[4:10] == 0).values.all() + + # so is this + cp = df.copy() + cp.iloc[3:11] = 0 + assert (cp.iloc[3:11] == 0).values.all() + + result = df.iloc[2:6] + result2 = df.loc[3:11] + expected = df.reindex([4, 6, 8, 10]) + + tm.assert_frame_equal(result, expected) + tm.assert_frame_equal(result2, expected) + + # non-monotonic, raise KeyError + df2 = df.iloc[list(range(5)) + list(range(5, 10))[::-1]] + with pytest.raises(KeyError, match=r"^3$"): + df2.loc[3:11] + with pytest.raises(KeyError, match=r"^3$"): + df2.loc[3:11] = 0 + + @td.skip_array_manager_invalid_test # already covered in test_iloc_col_slice_view + def test_fancy_getitem_slice_mixed( + self, float_frame, float_string_frame, using_copy_on_write, warn_copy_on_write + ): + sliced = float_string_frame.iloc[:, -3:] + assert sliced["D"].dtype == np.float64 + + # get view with single block + # setting it triggers setting with copy + original = float_frame.copy() + sliced = float_frame.iloc[:, -3:] + + assert np.shares_memory(sliced["C"]._values, float_frame["C"]._values) + + with tm.assert_cow_warning(warn_copy_on_write): + sliced.loc[:, "C"] = 4.0 + if not using_copy_on_write: + assert (float_frame["C"] == 4).all() + + # with the enforcement of GH#45333 in 2.0, this remains a view + np.shares_memory(sliced["C"]._values, float_frame["C"]._values) + else: + tm.assert_frame_equal(float_frame, original) + + def test_getitem_setitem_non_ix_labels(self): + df = DataFrame(range(20), index=date_range("2020-01-01", periods=20)) + + start, end = df.index[[5, 10]] + + result = df.loc[start:end] + result2 = df[start:end] + expected = df[5:11] + tm.assert_frame_equal(result, expected) + tm.assert_frame_equal(result2, expected) + + result = df.copy() + result.loc[start:end] = 0 + result2 = df.copy() + result2[start:end] = 0 + expected = df.copy() + expected[5:11] = 0 + tm.assert_frame_equal(result, expected) + tm.assert_frame_equal(result2, expected) + + def test_ix_multi_take(self): + df = DataFrame(np.random.default_rng(2).standard_normal((3, 2))) + rs = df.loc[df.index == 0, :] + xp = df.reindex([0]) + tm.assert_frame_equal(rs, xp) + + # GH#1321 + df = DataFrame(np.random.default_rng(2).standard_normal((3, 2))) + rs = df.loc[df.index == 0, df.columns == 1] + xp = df.reindex(index=[0], columns=[1]) + tm.assert_frame_equal(rs, xp) + + def test_getitem_fancy_scalar(self, float_frame): + f = float_frame + ix = f.loc + + # individual value + for col in f.columns: + ts = f[col] + for idx in f.index[::5]: + assert ix[idx, col] == ts[idx] + + @td.skip_array_manager_invalid_test # TODO(ArrayManager) rewrite not using .values + def test_setitem_fancy_scalar(self, float_frame): + f = float_frame + expected = float_frame.copy() + ix = f.loc + + # individual value + for j, col in enumerate(f.columns): + f[col] + for idx in f.index[::5]: + i = f.index.get_loc(idx) + val = np.random.default_rng(2).standard_normal() + expected.iloc[i, j] = val + + ix[idx, col] = val + tm.assert_frame_equal(f, expected) + + def test_getitem_fancy_boolean(self, float_frame): + f = float_frame + ix = f.loc + + expected = f.reindex(columns=["B", "D"]) + result = ix[:, [False, True, False, True]] + tm.assert_frame_equal(result, expected) + + expected = f.reindex(index=f.index[5:10], columns=["B", "D"]) + result = ix[f.index[5:10], [False, True, False, True]] + tm.assert_frame_equal(result, expected) + + boolvec = f.index > f.index[7] + expected = f.reindex(index=f.index[boolvec]) + result = ix[boolvec] + tm.assert_frame_equal(result, expected) + result = ix[boolvec, :] + tm.assert_frame_equal(result, expected) + + result = ix[boolvec, f.columns[2:]] + expected = f.reindex(index=f.index[boolvec], columns=["C", "D"]) + tm.assert_frame_equal(result, expected) + + @td.skip_array_manager_invalid_test # TODO(ArrayManager) rewrite not using .values + def test_setitem_fancy_boolean(self, float_frame): + # from 2d, set with booleans + frame = float_frame.copy() + expected = float_frame.copy() + values = expected.values.copy() + + mask = frame["A"] > 0 + frame.loc[mask] = 0.0 + values[mask.values] = 0.0 + expected = DataFrame(values, index=expected.index, columns=expected.columns) + tm.assert_frame_equal(frame, expected) + + frame = float_frame.copy() + expected = float_frame.copy() + values = expected.values.copy() + frame.loc[mask, ["A", "B"]] = 0.0 + values[mask.values, :2] = 0.0 + expected = DataFrame(values, index=expected.index, columns=expected.columns) + tm.assert_frame_equal(frame, expected) + + def test_getitem_fancy_ints(self, float_frame): + result = float_frame.iloc[[1, 4, 7]] + expected = float_frame.loc[float_frame.index[[1, 4, 7]]] + tm.assert_frame_equal(result, expected) + + result = float_frame.iloc[:, [2, 0, 1]] + expected = float_frame.loc[:, float_frame.columns[[2, 0, 1]]] + tm.assert_frame_equal(result, expected) + + def test_getitem_setitem_boolean_misaligned(self, float_frame): + # boolean index misaligned labels + mask = float_frame["A"][::-1] > 1 + + result = float_frame.loc[mask] + expected = float_frame.loc[mask[::-1]] + tm.assert_frame_equal(result, expected) + + cp = float_frame.copy() + expected = float_frame.copy() + cp.loc[mask] = 0 + expected.loc[mask] = 0 + tm.assert_frame_equal(cp, expected) + + def test_getitem_setitem_boolean_multi(self): + df = DataFrame(np.random.default_rng(2).standard_normal((3, 2))) + + # get + k1 = np.array([True, False, True]) + k2 = np.array([False, True]) + result = df.loc[k1, k2] + expected = df.loc[[0, 2], [1]] + tm.assert_frame_equal(result, expected) + + expected = df.copy() + df.loc[np.array([True, False, True]), np.array([False, True])] = 5 + expected.loc[[0, 2], [1]] = 5 + tm.assert_frame_equal(df, expected) + + def test_getitem_setitem_float_labels(self, using_array_manager): + index = Index([1.5, 2, 3, 4, 5]) + df = DataFrame(np.random.default_rng(2).standard_normal((5, 5)), index=index) + + result = df.loc[1.5:4] + expected = df.reindex([1.5, 2, 3, 4]) + tm.assert_frame_equal(result, expected) + assert len(result) == 4 + + result = df.loc[4:5] + expected = df.reindex([4, 5]) # reindex with int + tm.assert_frame_equal(result, expected, check_index_type=False) + assert len(result) == 2 + + result = df.loc[4:5] + expected = df.reindex([4.0, 5.0]) # reindex with float + tm.assert_frame_equal(result, expected) + assert len(result) == 2 + + # loc_float changes this to work properly + result = df.loc[1:2] + expected = df.iloc[0:2] + tm.assert_frame_equal(result, expected) + + expected = df.iloc[0:2] + msg = r"The behavior of obj\[i:j\] with a float-dtype index" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = df[1:2] + tm.assert_frame_equal(result, expected) + + # #2727 + index = Index([1.0, 2.5, 3.5, 4.5, 5.0]) + df = DataFrame(np.random.default_rng(2).standard_normal((5, 5)), index=index) + + # positional slicing only via iloc! + msg = ( + "cannot do positional indexing on Index with " + r"these indexers \[1.0\] of type float" + ) + with pytest.raises(TypeError, match=msg): + df.iloc[1.0:5] + + result = df.iloc[4:5] + expected = df.reindex([5.0]) + tm.assert_frame_equal(result, expected) + assert len(result) == 1 + + cp = df.copy() + + with pytest.raises(TypeError, match=_slice_msg): + cp.iloc[1.0:5] = 0 + + with pytest.raises(TypeError, match=msg): + result = cp.iloc[1.0:5] == 0 + + assert result.values.all() + assert (cp.iloc[0:1] == df.iloc[0:1]).values.all() + + cp = df.copy() + cp.iloc[4:5] = 0 + assert (cp.iloc[4:5] == 0).values.all() + assert (cp.iloc[0:4] == df.iloc[0:4]).values.all() + + # float slicing + result = df.loc[1.0:5] + expected = df + tm.assert_frame_equal(result, expected) + assert len(result) == 5 + + result = df.loc[1.1:5] + expected = df.reindex([2.5, 3.5, 4.5, 5.0]) + tm.assert_frame_equal(result, expected) + assert len(result) == 4 + + result = df.loc[4.51:5] + expected = df.reindex([5.0]) + tm.assert_frame_equal(result, expected) + assert len(result) == 1 + + result = df.loc[1.0:5.0] + expected = df.reindex([1.0, 2.5, 3.5, 4.5, 5.0]) + tm.assert_frame_equal(result, expected) + assert len(result) == 5 + + cp = df.copy() + cp.loc[1.0:5.0] = 0 + result = cp.loc[1.0:5.0] + assert (result == 0).values.all() + + def test_setitem_single_column_mixed_datetime(self): + df = DataFrame( + np.random.default_rng(2).standard_normal((5, 3)), + index=["a", "b", "c", "d", "e"], + columns=["foo", "bar", "baz"], + ) + + df["timestamp"] = Timestamp("20010102") + + # check our dtypes + result = df.dtypes + expected = Series( + [np.dtype("float64")] * 3 + [np.dtype("datetime64[s]")], + index=["foo", "bar", "baz", "timestamp"], + ) + tm.assert_series_equal(result, expected) + + # GH#16674 iNaT is treated as an integer when given by the user + with tm.assert_produces_warning( + FutureWarning, match="Setting an item of incompatible dtype" + ): + df.loc["b", "timestamp"] = iNaT + assert not isna(df.loc["b", "timestamp"]) + assert df["timestamp"].dtype == np.object_ + assert df.loc["b", "timestamp"] == iNaT + + # allow this syntax (as of GH#3216) + df.loc["c", "timestamp"] = np.nan + assert isna(df.loc["c", "timestamp"]) + + # allow this syntax + df.loc["d", :] = np.nan + assert not isna(df.loc["c", :]).all() + + def test_setitem_mixed_datetime(self): + # GH 9336 + expected = DataFrame( + { + "a": [0, 0, 0, 0, 13, 14], + "b": [ + datetime(2012, 1, 1), + 1, + "x", + "y", + datetime(2013, 1, 1), + datetime(2014, 1, 1), + ], + } + ) + df = DataFrame(0, columns=list("ab"), index=range(6)) + df["b"] = pd.NaT + df.loc[0, "b"] = datetime(2012, 1, 1) + with tm.assert_produces_warning( + FutureWarning, match="Setting an item of incompatible dtype" + ): + df.loc[1, "b"] = 1 + df.loc[[2, 3], "b"] = "x", "y" + A = np.array( + [ + [13, np.datetime64("2013-01-01T00:00:00")], + [14, np.datetime64("2014-01-01T00:00:00")], + ] + ) + df.loc[[4, 5], ["a", "b"]] = A + tm.assert_frame_equal(df, expected) + + def test_setitem_frame_float(self, float_frame): + piece = float_frame.loc[float_frame.index[:2], ["A", "B"]] + float_frame.loc[float_frame.index[-2] :, ["A", "B"]] = piece.values + result = float_frame.loc[float_frame.index[-2:], ["A", "B"]].values + expected = piece.values + tm.assert_almost_equal(result, expected) + + def test_setitem_frame_mixed(self, float_string_frame): + # GH 3216 + + # already aligned + f = float_string_frame.copy() + piece = DataFrame( + [[1.0, 2.0], [3.0, 4.0]], index=f.index[0:2], columns=["A", "B"] + ) + key = (f.index[slice(None, 2)], ["A", "B"]) + f.loc[key] = piece + tm.assert_almost_equal(f.loc[f.index[0:2], ["A", "B"]].values, piece.values) + + def test_setitem_frame_mixed_rows_unaligned(self, float_string_frame): + # GH#3216 rows unaligned + f = float_string_frame.copy() + piece = DataFrame( + [[1.0, 2.0], [3.0, 4.0], [5.0, 6.0], [7.0, 8.0]], + index=list(f.index[0:2]) + ["foo", "bar"], + columns=["A", "B"], + ) + key = (f.index[slice(None, 2)], ["A", "B"]) + f.loc[key] = piece + tm.assert_almost_equal( + f.loc[f.index[0:2:], ["A", "B"]].values, piece.values[0:2] + ) + + def test_setitem_frame_mixed_key_unaligned(self, float_string_frame): + # GH#3216 key is unaligned with values + f = float_string_frame.copy() + piece = f.loc[f.index[:2], ["A"]] + piece.index = f.index[-2:] + key = (f.index[slice(-2, None)], ["A", "B"]) + f.loc[key] = piece + piece["B"] = np.nan + tm.assert_almost_equal(f.loc[f.index[-2:], ["A", "B"]].values, piece.values) + + def test_setitem_frame_mixed_ndarray(self, float_string_frame): + # GH#3216 ndarray + f = float_string_frame.copy() + piece = float_string_frame.loc[f.index[:2], ["A", "B"]] + key = (f.index[slice(-2, None)], ["A", "B"]) + f.loc[key] = piece.values + tm.assert_almost_equal(f.loc[f.index[-2:], ["A", "B"]].values, piece.values) + + def test_setitem_frame_upcast(self): + # needs upcasting + df = DataFrame([[1, 2, "foo"], [3, 4, "bar"]], columns=["A", "B", "C"]) + df2 = df.copy() + with tm.assert_produces_warning(FutureWarning, match="incompatible dtype"): + df2.loc[:, ["A", "B"]] = df.loc[:, ["A", "B"]] + 0.5 + expected = df.reindex(columns=["A", "B"]) + expected += 0.5 + expected["C"] = df["C"] + tm.assert_frame_equal(df2, expected) + + def test_setitem_frame_align(self, float_frame): + piece = float_frame.loc[float_frame.index[:2], ["A", "B"]] + piece.index = float_frame.index[-2:] + piece.columns = ["A", "B"] + float_frame.loc[float_frame.index[-2:], ["A", "B"]] = piece + result = float_frame.loc[float_frame.index[-2:], ["A", "B"]].values + expected = piece.values + tm.assert_almost_equal(result, expected) + + def test_getitem_setitem_ix_duplicates(self): + # #1201 + df = DataFrame( + np.random.default_rng(2).standard_normal((5, 3)), + index=["foo", "foo", "bar", "baz", "bar"], + ) + + result = df.loc["foo"] + expected = df[:2] + tm.assert_frame_equal(result, expected) + + result = df.loc["bar"] + expected = df.iloc[[2, 4]] + tm.assert_frame_equal(result, expected) + + result = df.loc["baz"] + expected = df.iloc[3] + tm.assert_series_equal(result, expected) + + def test_getitem_ix_boolean_duplicates_multiple(self): + # #1201 + df = DataFrame( + np.random.default_rng(2).standard_normal((5, 3)), + index=["foo", "foo", "bar", "baz", "bar"], + ) + + result = df.loc[["bar"]] + exp = df.iloc[[2, 4]] + tm.assert_frame_equal(result, exp) + + result = df.loc[df[1] > 0] + exp = df[df[1] > 0] + tm.assert_frame_equal(result, exp) + + result = df.loc[df[0] > 0] + exp = df[df[0] > 0] + tm.assert_frame_equal(result, exp) + + @pytest.mark.parametrize("bool_value", [True, False]) + def test_getitem_setitem_ix_bool_keyerror(self, bool_value): + # #2199 + df = DataFrame({"a": [1, 2, 3]}) + message = f"{bool_value}: boolean label can not be used without a boolean index" + with pytest.raises(KeyError, match=message): + df.loc[bool_value] + + msg = "cannot use a single bool to index into setitem" + with pytest.raises(KeyError, match=msg): + df.loc[bool_value] = 0 + + # TODO: rename? remove? + def test_single_element_ix_dont_upcast(self, float_frame): + float_frame["E"] = 1 + assert issubclass(float_frame["E"].dtype.type, (int, np.integer)) + + result = float_frame.loc[float_frame.index[5], "E"] + assert is_integer(result) + + # GH 11617 + df = DataFrame({"a": [1.23]}) + df["b"] = 666 + + result = df.loc[0, "b"] + assert is_integer(result) + + expected = Series([666], [0], name="b") + result = df.loc[[0], "b"] + tm.assert_series_equal(result, expected) + + def test_iloc_callable_tuple_return_value(self): + # GH53769 + df = DataFrame(np.arange(40).reshape(10, 4), index=range(0, 20, 2)) + msg = "callable with iloc" + with tm.assert_produces_warning(FutureWarning, match=msg): + df.iloc[lambda _: (0,)] + with tm.assert_produces_warning(FutureWarning, match=msg): + df.iloc[lambda _: (0,)] = 1 + + def test_iloc_row(self): + df = DataFrame( + np.random.default_rng(2).standard_normal((10, 4)), index=range(0, 20, 2) + ) + + result = df.iloc[1] + exp = df.loc[2] + tm.assert_series_equal(result, exp) + + result = df.iloc[2] + exp = df.loc[4] + tm.assert_series_equal(result, exp) + + # slice + result = df.iloc[slice(4, 8)] + expected = df.loc[8:14] + tm.assert_frame_equal(result, expected) + + # list of integers + result = df.iloc[[1, 2, 4, 6]] + expected = df.reindex(df.index[[1, 2, 4, 6]]) + tm.assert_frame_equal(result, expected) + + def test_iloc_row_slice_view(self, using_copy_on_write, warn_copy_on_write): + df = DataFrame( + np.random.default_rng(2).standard_normal((10, 4)), index=range(0, 20, 2) + ) + original = df.copy() + + # verify slice is view + # setting it makes it raise/warn + subset = df.iloc[slice(4, 8)] + + assert np.shares_memory(df[2], subset[2]) + + exp_col = original[2].copy() + with tm.assert_cow_warning(warn_copy_on_write): + subset.loc[:, 2] = 0.0 + if not using_copy_on_write: + exp_col._values[4:8] = 0.0 + + # With the enforcement of GH#45333 in 2.0, this remains a view + assert np.shares_memory(df[2], subset[2]) + tm.assert_series_equal(df[2], exp_col) + + def test_iloc_col(self): + df = DataFrame( + np.random.default_rng(2).standard_normal((4, 10)), columns=range(0, 20, 2) + ) + + result = df.iloc[:, 1] + exp = df.loc[:, 2] + tm.assert_series_equal(result, exp) + + result = df.iloc[:, 2] + exp = df.loc[:, 4] + tm.assert_series_equal(result, exp) + + # slice + result = df.iloc[:, slice(4, 8)] + expected = df.loc[:, 8:14] + tm.assert_frame_equal(result, expected) + + # list of integers + result = df.iloc[:, [1, 2, 4, 6]] + expected = df.reindex(columns=df.columns[[1, 2, 4, 6]]) + tm.assert_frame_equal(result, expected) + + def test_iloc_col_slice_view( + self, using_array_manager, using_copy_on_write, warn_copy_on_write + ): + df = DataFrame( + np.random.default_rng(2).standard_normal((4, 10)), columns=range(0, 20, 2) + ) + original = df.copy() + subset = df.iloc[:, slice(4, 8)] + + if not using_array_manager and not using_copy_on_write: + # verify slice is view + assert np.shares_memory(df[8]._values, subset[8]._values) + + with tm.assert_cow_warning(warn_copy_on_write): + subset.loc[:, 8] = 0.0 + + assert (df[8] == 0).all() + + # with the enforcement of GH#45333 in 2.0, this remains a view + assert np.shares_memory(df[8]._values, subset[8]._values) + else: + if using_copy_on_write: + # verify slice is view + assert np.shares_memory(df[8]._values, subset[8]._values) + subset[8] = 0.0 + # subset changed + assert (subset[8] == 0).all() + # but df itself did not change (setitem replaces full column) + tm.assert_frame_equal(df, original) + + def test_loc_duplicates(self): + # gh-17105 + + # insert a duplicate element to the index + trange = date_range( + start=Timestamp(year=2017, month=1, day=1), + end=Timestamp(year=2017, month=1, day=5), + ) + + trange = trange.insert(loc=5, item=Timestamp(year=2017, month=1, day=5)) + + df = DataFrame(0, index=trange, columns=["A", "B"]) + bool_idx = np.array([False, False, False, False, False, True]) + + # assignment + df.loc[trange[bool_idx], "A"] = 6 + + expected = DataFrame( + {"A": [0, 0, 0, 0, 6, 6], "B": [0, 0, 0, 0, 0, 0]}, index=trange + ) + tm.assert_frame_equal(df, expected) + + # in-place + df = DataFrame(0, index=trange, columns=["A", "B"]) + df.loc[trange[bool_idx], "A"] += 6 + tm.assert_frame_equal(df, expected) + + def test_setitem_with_unaligned_tz_aware_datetime_column(self): + # GH 12981 + # Assignment of unaligned offset-aware datetime series. + # Make sure timezone isn't lost + column = Series(date_range("2015-01-01", periods=3, tz="utc"), name="dates") + df = DataFrame({"dates": column}) + df["dates"] = column[[1, 0, 2]] + tm.assert_series_equal(df["dates"], column) + + df = DataFrame({"dates": column}) + df.loc[[0, 1, 2], "dates"] = column[[1, 0, 2]] + tm.assert_series_equal(df["dates"], column) + + def test_loc_setitem_datetimelike_with_inference(self): + # GH 7592 + # assignment of timedeltas with NaT + + one_hour = timedelta(hours=1) + df = DataFrame(index=date_range("20130101", periods=4)) + df["A"] = np.array([1 * one_hour] * 4, dtype="m8[ns]") + df.loc[:, "B"] = np.array([2 * one_hour] * 4, dtype="m8[ns]") + df.loc[df.index[:3], "C"] = np.array([3 * one_hour] * 3, dtype="m8[ns]") + df.loc[:, "D"] = np.array([4 * one_hour] * 4, dtype="m8[ns]") + df.loc[df.index[:3], "E"] = np.array([5 * one_hour] * 3, dtype="m8[ns]") + df["F"] = np.timedelta64("NaT") + df.loc[df.index[:-1], "F"] = np.array([6 * one_hour] * 3, dtype="m8[ns]") + df.loc[df.index[-3] :, "G"] = date_range("20130101", periods=3) + df["H"] = np.datetime64("NaT") + result = df.dtypes + expected = Series( + [np.dtype("timedelta64[ns]")] * 6 + [np.dtype("datetime64[ns]")] * 2, + index=list("ABCDEFGH"), + ) + tm.assert_series_equal(result, expected) + + def test_getitem_boolean_indexing_mixed(self): + df = DataFrame( + { + 0: {35: np.nan, 40: np.nan, 43: np.nan, 49: np.nan, 50: np.nan}, + 1: { + 35: np.nan, + 40: 0.32632316859446198, + 43: np.nan, + 49: 0.32632316859446198, + 50: 0.39114724480578139, + }, + 2: { + 35: np.nan, + 40: np.nan, + 43: 0.29012581014105987, + 49: np.nan, + 50: np.nan, + }, + 3: {35: np.nan, 40: np.nan, 43: np.nan, 49: np.nan, 50: np.nan}, + 4: { + 35: 0.34215328467153283, + 40: np.nan, + 43: np.nan, + 49: np.nan, + 50: np.nan, + }, + "y": {35: 0, 40: 0, 43: 0, 49: 0, 50: 1}, + } + ) + + # mixed int/float ok + df2 = df.copy() + df2[df2 > 0.3] = 1 + expected = df.copy() + expected.loc[40, 1] = 1 + expected.loc[49, 1] = 1 + expected.loc[50, 1] = 1 + expected.loc[35, 4] = 1 + tm.assert_frame_equal(df2, expected) + + df["foo"] = "test" + msg = "not supported between instances|unorderable types" + + with pytest.raises(TypeError, match=msg): + df[df > 0.3] = 1 + + def test_type_error_multiindex(self): + # See gh-12218 + mi = MultiIndex.from_product([["x", "y"], [0, 1]], names=[None, "c"]) + dg = DataFrame( + [[1, 1, 2, 2], [3, 3, 4, 4]], columns=mi, index=Index([0, 1], name="i") + ) + with pytest.raises(InvalidIndexError, match="slice"): + dg[:, 0] + + index = Index(range(2), name="i") + columns = MultiIndex( + levels=[["x", "y"], [0, 1]], codes=[[0, 1], [0, 0]], names=[None, "c"] + ) + expected = DataFrame([[1, 2], [3, 4]], columns=columns, index=index) + + result = dg.loc[:, (slice(None), 0)] + tm.assert_frame_equal(result, expected) + + name = ("x", 0) + index = Index(range(2), name="i") + expected = Series([1, 3], index=index, name=name) + + result = dg["x", 0] + tm.assert_series_equal(result, expected) + + def test_getitem_interval_index_partial_indexing(self): + # GH#36490 + df = DataFrame( + np.ones((3, 4)), columns=pd.IntervalIndex.from_breaks(np.arange(5)) + ) + + expected = df.iloc[:, 0] + + res = df[0.5] + tm.assert_series_equal(res, expected) + + res = df.loc[:, 0.5] + tm.assert_series_equal(res, expected) + + def test_setitem_array_as_cell_value(self): + # GH#43422 + df = DataFrame(columns=["a", "b"], dtype=object) + df.loc[0] = {"a": np.zeros((2,)), "b": np.zeros((2, 2))} + expected = DataFrame({"a": [np.zeros((2,))], "b": [np.zeros((2, 2))]}) + tm.assert_frame_equal(df, expected) + + def test_iloc_setitem_nullable_2d_values(self): + df = DataFrame({"A": [1, 2, 3]}, dtype="Int64") + orig = df.copy() + + df.loc[:] = df.values[:, ::-1] + tm.assert_frame_equal(df, orig) + + df.loc[:] = pd.core.arrays.NumpyExtensionArray(df.values[:, ::-1]) + tm.assert_frame_equal(df, orig) + + df.iloc[:] = df.iloc[:, :].copy() + tm.assert_frame_equal(df, orig) + + def test_getitem_segfault_with_empty_like_object(self): + # GH#46848 + df = DataFrame(np.empty((1, 1), dtype=object)) + df[0] = np.empty_like(df[0]) + # this produces the segfault + df[[0]] + + @pytest.mark.filterwarnings("ignore:Setting a value on a view:FutureWarning") + @pytest.mark.parametrize( + "null", [pd.NaT, pd.NaT.to_numpy("M8[ns]"), pd.NaT.to_numpy("m8[ns]")] + ) + def test_setting_mismatched_na_into_nullable_fails( + self, null, any_numeric_ea_dtype + ): + # GH#44514 don't cast mismatched nulls to pd.NA + df = DataFrame({"A": [1, 2, 3]}, dtype=any_numeric_ea_dtype) + ser = df["A"].copy() + arr = ser._values + + msg = "|".join( + [ + r"timedelta64\[ns\] cannot be converted to (Floating|Integer)Dtype", + r"datetime64\[ns\] cannot be converted to (Floating|Integer)Dtype", + "'values' contains non-numeric NA", + r"Invalid value '.*' for dtype (U?Int|Float)\d{1,2}", + ] + ) + with pytest.raises(TypeError, match=msg): + arr[0] = null + + with pytest.raises(TypeError, match=msg): + arr[:2] = [null, null] + + with pytest.raises(TypeError, match=msg): + ser[0] = null + + with pytest.raises(TypeError, match=msg): + ser[:2] = [null, null] + + with pytest.raises(TypeError, match=msg): + ser.iloc[0] = null + + with pytest.raises(TypeError, match=msg): + ser.iloc[:2] = [null, null] + + with pytest.raises(TypeError, match=msg): + df.iloc[0, 0] = null + + with pytest.raises(TypeError, match=msg): + df.iloc[:2, 0] = [null, null] + + # Multi-Block + df2 = df.copy() + df2["B"] = ser.copy() + with pytest.raises(TypeError, match=msg): + df2.iloc[0, 0] = null + + with pytest.raises(TypeError, match=msg): + df2.iloc[:2, 0] = [null, null] + + def test_loc_expand_empty_frame_keep_index_name(self): + # GH#45621 + df = DataFrame(columns=["b"], index=Index([], name="a")) + df.loc[0] = 1 + expected = DataFrame({"b": [1]}, index=Index([0], name="a")) + tm.assert_frame_equal(df, expected) + + def test_loc_expand_empty_frame_keep_midx_names(self): + # GH#46317 + df = DataFrame( + columns=["d"], index=MultiIndex.from_tuples([], names=["a", "b", "c"]) + ) + df.loc[(1, 2, 3)] = "foo" + expected = DataFrame( + {"d": ["foo"]}, + index=MultiIndex.from_tuples([(1, 2, 3)], names=["a", "b", "c"]), + ) + tm.assert_frame_equal(df, expected) + + @pytest.mark.parametrize( + "val, idxr", + [ + ("x", "a"), + ("x", ["a"]), + (1, "a"), + (1, ["a"]), + ], + ) + def test_loc_setitem_rhs_frame(self, idxr, val): + # GH#47578 + df = DataFrame({"a": [1, 2]}) + + with tm.assert_produces_warning( + FutureWarning, match="Setting an item of incompatible dtype" + ): + df.loc[:, idxr] = DataFrame({"a": [val, 11]}, index=[1, 2]) + expected = DataFrame({"a": [np.nan, val]}) + tm.assert_frame_equal(df, expected) + + @td.skip_array_manager_invalid_test + def test_iloc_setitem_enlarge_no_warning(self, warn_copy_on_write): + # GH#47381 + df = DataFrame(columns=["a", "b"]) + expected = df.copy() + view = df[:] + df.iloc[:, 0] = np.array([1, 2], dtype=np.float64) + tm.assert_frame_equal(view, expected) + + def test_loc_internals_not_updated_correctly(self): + # GH#47867 all steps are necessary to reproduce the initial bug + df = DataFrame( + {"bool_col": True, "a": 1, "b": 2.5}, + index=MultiIndex.from_arrays([[1, 2], [1, 2]], names=["idx1", "idx2"]), + ) + idx = [(1, 1)] + + df["c"] = 3 + df.loc[idx, "c"] = 0 + + df.loc[idx, "c"] + df.loc[idx, ["a", "b"]] + + df.loc[idx, "c"] = 15 + result = df.loc[idx, "c"] + expected = df = Series( + 15, + index=MultiIndex.from_arrays([[1], [1]], names=["idx1", "idx2"]), + name="c", + ) + tm.assert_series_equal(result, expected) + + @pytest.mark.parametrize("val", [None, [None], pd.NA, [pd.NA]]) + def test_iloc_setitem_string_list_na(self, val): + # GH#45469 + df = DataFrame({"a": ["a", "b", "c"]}, dtype="string") + df.iloc[[0], :] = val + expected = DataFrame({"a": [pd.NA, "b", "c"]}, dtype="string") + tm.assert_frame_equal(df, expected) + + @pytest.mark.parametrize("val", [None, pd.NA]) + def test_iloc_setitem_string_na(self, val): + # GH#45469 + df = DataFrame({"a": ["a", "b", "c"]}, dtype="string") + df.iloc[0, :] = val + expected = DataFrame({"a": [pd.NA, "b", "c"]}, dtype="string") + tm.assert_frame_equal(df, expected) + + @pytest.mark.parametrize("func", [list, Series, np.array]) + def test_iloc_setitem_ea_null_slice_length_one_list(self, func): + # GH#48016 + df = DataFrame({"a": [1, 2, 3]}, dtype="Int64") + df.iloc[:, func([0])] = 5 + expected = DataFrame({"a": [5, 5, 5]}, dtype="Int64") + tm.assert_frame_equal(df, expected) + + def test_loc_named_tuple_for_midx(self): + # GH#48124 + df = DataFrame( + index=MultiIndex.from_product( + [["A", "B"], ["a", "b", "c"]], names=["first", "second"] + ) + ) + indexer_tuple = namedtuple("Indexer", df.index.names) + idxr = indexer_tuple(first="A", second=["a", "b"]) + result = df.loc[idxr, :] + expected = DataFrame( + index=MultiIndex.from_tuples( + [("A", "a"), ("A", "b")], names=["first", "second"] + ) + ) + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize("indexer", [["a"], "a"]) + @pytest.mark.parametrize("col", [{}, {"b": 1}]) + def test_set_2d_casting_date_to_int(self, col, indexer): + # GH#49159 + df = DataFrame( + {"a": [Timestamp("2022-12-29"), Timestamp("2022-12-30")], **col}, + ) + df.loc[[1], indexer] = df["a"] + pd.Timedelta(days=1) + expected = DataFrame( + {"a": [Timestamp("2022-12-29"), Timestamp("2022-12-31")], **col}, + ) + tm.assert_frame_equal(df, expected) + + @pytest.mark.parametrize("col", [{}, {"name": "a"}]) + def test_loc_setitem_reordering_with_all_true_indexer(self, col): + # GH#48701 + n = 17 + df = DataFrame({**col, "x": range(n), "y": range(n)}) + expected = df.copy() + df.loc[n * [True], ["x", "y"]] = df[["x", "y"]] + tm.assert_frame_equal(df, expected) + + def test_loc_rhs_empty_warning(self): + # GH48480 + df = DataFrame(columns=["a", "b"]) + expected = df.copy() + rhs = DataFrame(columns=["a"]) + with tm.assert_produces_warning(None): + df.loc[:, "a"] = rhs + tm.assert_frame_equal(df, expected) + + def test_iloc_ea_series_indexer(self): + # GH#49521 + df = DataFrame([[0, 1, 2, 3, 4], [5, 6, 7, 8, 9]]) + indexer = Series([0, 1], dtype="Int64") + row_indexer = Series([1], dtype="Int64") + result = df.iloc[row_indexer, indexer] + expected = DataFrame([[5, 6]], index=[1]) + tm.assert_frame_equal(result, expected) + + result = df.iloc[row_indexer.values, indexer.values] + tm.assert_frame_equal(result, expected) + + def test_iloc_ea_series_indexer_with_na(self): + # GH#49521 + df = DataFrame([[0, 1, 2, 3, 4], [5, 6, 7, 8, 9]]) + indexer = Series([0, pd.NA], dtype="Int64") + msg = "cannot convert" + with pytest.raises(ValueError, match=msg): + df.iloc[:, indexer] + with pytest.raises(ValueError, match=msg): + df.iloc[:, indexer.values] + + @pytest.mark.parametrize("indexer", [True, (True,)]) + @pytest.mark.parametrize("dtype", [bool, "boolean"]) + def test_loc_bool_multiindex(self, dtype, indexer): + # GH#47687 + midx = MultiIndex.from_arrays( + [ + Series([True, True, False, False], dtype=dtype), + Series([True, False, True, False], dtype=dtype), + ], + names=["a", "b"], + ) + df = DataFrame({"c": [1, 2, 3, 4]}, index=midx) + with tm.maybe_produces_warning(PerformanceWarning, isinstance(indexer, tuple)): + result = df.loc[indexer] + expected = DataFrame( + {"c": [1, 2]}, index=Index([True, False], name="b", dtype=dtype) + ) + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize("utc", [False, True]) + @pytest.mark.parametrize("indexer", ["date", ["date"]]) + def test_loc_datetime_assignment_dtype_does_not_change(self, utc, indexer): + # GH#49837 + df = DataFrame( + { + "date": to_datetime( + [datetime(2022, 1, 20), datetime(2022, 1, 22)], utc=utc + ), + "update": [True, False], + } + ) + expected = df.copy(deep=True) + + update_df = df[df["update"]] + + df.loc[df["update"], indexer] = update_df["date"] + + tm.assert_frame_equal(df, expected) + + @pytest.mark.parametrize("indexer, idx", [(tm.loc, 1), (tm.iloc, 2)]) + def test_setitem_value_coercing_dtypes(self, indexer, idx): + # GH#50467 + df = DataFrame([["1", np.nan], ["2", np.nan], ["3", np.nan]], dtype=object) + rhs = DataFrame([[1, np.nan], [2, np.nan]]) + indexer(df)[:idx, :] = rhs + expected = DataFrame([[1, np.nan], [2, np.nan], ["3", np.nan]], dtype=object) + tm.assert_frame_equal(df, expected) + + +class TestDataFrameIndexingUInt64: + def test_setitem(self): + df = DataFrame( + {"A": np.arange(3), "B": [2**63, 2**63 + 5, 2**63 + 10]}, + dtype=np.uint64, + ) + idx = df["A"].rename("foo") + + # setitem + assert "C" not in df.columns + df["C"] = idx + tm.assert_series_equal(df["C"], Series(idx, name="C")) + + assert "D" not in df.columns + df["D"] = "foo" + df["D"] = idx + tm.assert_series_equal(df["D"], Series(idx, name="D")) + del df["D"] + + # With NaN: because uint64 has no NaN element, + # the column should be cast to object. + df2 = df.copy() + with tm.assert_produces_warning(FutureWarning, match="incompatible dtype"): + df2.iloc[1, 1] = pd.NaT + df2.iloc[1, 2] = pd.NaT + result = df2["B"] + tm.assert_series_equal(notna(result), Series([True, False, True], name="B")) + tm.assert_series_equal( + df2.dtypes, + Series( + [np.dtype("uint64"), np.dtype("O"), np.dtype("O")], + index=["A", "B", "C"], + ), + ) + + +def test_object_casting_indexing_wraps_datetimelike(using_array_manager): + # GH#31649, check the indexing methods all the way down the stack + df = DataFrame( + { + "A": [1, 2], + "B": date_range("2000", periods=2), + "C": pd.timedelta_range("1 Day", periods=2), + } + ) + + ser = df.loc[0] + assert isinstance(ser.values[1], Timestamp) + assert isinstance(ser.values[2], pd.Timedelta) + + ser = df.iloc[0] + assert isinstance(ser.values[1], Timestamp) + assert isinstance(ser.values[2], pd.Timedelta) + + ser = df.xs(0, axis=0) + assert isinstance(ser.values[1], Timestamp) + assert isinstance(ser.values[2], pd.Timedelta) + + if using_array_manager: + # remainder of the test checking BlockManager internals + return + + mgr = df._mgr + mgr._rebuild_blknos_and_blklocs() + arr = mgr.fast_xs(0).array + assert isinstance(arr[1], Timestamp) + assert isinstance(arr[2], pd.Timedelta) + + blk = mgr.blocks[mgr.blknos[1]] + assert blk.dtype == "M8[ns]" # we got the right block + val = blk.iget((0, 0)) + assert isinstance(val, Timestamp) + + blk = mgr.blocks[mgr.blknos[2]] + assert blk.dtype == "m8[ns]" # we got the right block + val = blk.iget((0, 0)) + assert isinstance(val, pd.Timedelta) + + +msg1 = r"Cannot setitem on a Categorical with a new category( \(.*\))?, set the" +msg2 = "Cannot set a Categorical with another, without identical categories" + + +class TestLocILocDataFrameCategorical: + @pytest.fixture + def orig(self): + cats = Categorical(["a", "a", "a", "a", "a", "a", "a"], categories=["a", "b"]) + idx = Index(["h", "i", "j", "k", "l", "m", "n"]) + values = [1, 1, 1, 1, 1, 1, 1] + orig = DataFrame({"cats": cats, "values": values}, index=idx) + return orig + + @pytest.fixture + def exp_single_row(self): + # The expected values if we change a single row + cats1 = Categorical(["a", "a", "b", "a", "a", "a", "a"], categories=["a", "b"]) + idx1 = Index(["h", "i", "j", "k", "l", "m", "n"]) + values1 = [1, 1, 2, 1, 1, 1, 1] + exp_single_row = DataFrame({"cats": cats1, "values": values1}, index=idx1) + return exp_single_row + + @pytest.fixture + def exp_multi_row(self): + # assign multiple rows (mixed values) (-> array) -> exp_multi_row + # changed multiple rows + cats2 = Categorical(["a", "a", "b", "b", "a", "a", "a"], categories=["a", "b"]) + idx2 = Index(["h", "i", "j", "k", "l", "m", "n"]) + values2 = [1, 1, 2, 2, 1, 1, 1] + exp_multi_row = DataFrame({"cats": cats2, "values": values2}, index=idx2) + return exp_multi_row + + @pytest.fixture + def exp_parts_cats_col(self): + # changed part of the cats column + cats3 = Categorical(["a", "a", "b", "b", "a", "a", "a"], categories=["a", "b"]) + idx3 = Index(["h", "i", "j", "k", "l", "m", "n"]) + values3 = [1, 1, 1, 1, 1, 1, 1] + exp_parts_cats_col = DataFrame({"cats": cats3, "values": values3}, index=idx3) + return exp_parts_cats_col + + @pytest.fixture + def exp_single_cats_value(self): + # changed single value in cats col + cats4 = Categorical(["a", "a", "b", "a", "a", "a", "a"], categories=["a", "b"]) + idx4 = Index(["h", "i", "j", "k", "l", "m", "n"]) + values4 = [1, 1, 1, 1, 1, 1, 1] + exp_single_cats_value = DataFrame( + {"cats": cats4, "values": values4}, index=idx4 + ) + return exp_single_cats_value + + @pytest.mark.parametrize("indexer", [tm.loc, tm.iloc]) + def test_loc_iloc_setitem_list_of_lists(self, orig, exp_multi_row, indexer): + # - assign multiple rows (mixed values) -> exp_multi_row + df = orig.copy() + + key = slice(2, 4) + if indexer is tm.loc: + key = slice("j", "k") + + indexer(df)[key, :] = [["b", 2], ["b", 2]] + tm.assert_frame_equal(df, exp_multi_row) + + df = orig.copy() + with pytest.raises(TypeError, match=msg1): + indexer(df)[key, :] = [["c", 2], ["c", 2]] + + @pytest.mark.parametrize("indexer", [tm.loc, tm.iloc, tm.at, tm.iat]) + def test_loc_iloc_at_iat_setitem_single_value_in_categories( + self, orig, exp_single_cats_value, indexer + ): + # - assign a single value -> exp_single_cats_value + df = orig.copy() + + key = (2, 0) + if indexer in [tm.loc, tm.at]: + key = (df.index[2], df.columns[0]) + + # "b" is among the categories for df["cat"}] + indexer(df)[key] = "b" + tm.assert_frame_equal(df, exp_single_cats_value) + + # "c" is not among the categories for df["cat"] + with pytest.raises(TypeError, match=msg1): + indexer(df)[key] = "c" + + @pytest.mark.parametrize("indexer", [tm.loc, tm.iloc]) + def test_loc_iloc_setitem_mask_single_value_in_categories( + self, orig, exp_single_cats_value, indexer + ): + # mask with single True + df = orig.copy() + + mask = df.index == "j" + key = 0 + if indexer is tm.loc: + key = df.columns[key] + + indexer(df)[mask, key] = "b" + tm.assert_frame_equal(df, exp_single_cats_value) + + @pytest.mark.parametrize("indexer", [tm.loc, tm.iloc]) + def test_loc_iloc_setitem_full_row_non_categorical_rhs( + self, orig, exp_single_row, indexer + ): + # - assign a complete row (mixed values) -> exp_single_row + df = orig.copy() + + key = 2 + if indexer is tm.loc: + key = df.index[2] + + # not categorical dtype, but "b" _is_ among the categories for df["cat"] + indexer(df)[key, :] = ["b", 2] + tm.assert_frame_equal(df, exp_single_row) + + # "c" is not among the categories for df["cat"] + with pytest.raises(TypeError, match=msg1): + indexer(df)[key, :] = ["c", 2] + + @pytest.mark.parametrize("indexer", [tm.loc, tm.iloc]) + def test_loc_iloc_setitem_partial_col_categorical_rhs( + self, orig, exp_parts_cats_col, indexer + ): + # assign a part of a column with dtype == categorical -> + # exp_parts_cats_col + df = orig.copy() + + key = (slice(2, 4), 0) + if indexer is tm.loc: + key = (slice("j", "k"), df.columns[0]) + + # same categories as we currently have in df["cats"] + compat = Categorical(["b", "b"], categories=["a", "b"]) + indexer(df)[key] = compat + tm.assert_frame_equal(df, exp_parts_cats_col) + + # categories do not match df["cat"]'s, but "b" is among them + semi_compat = Categorical(list("bb"), categories=list("abc")) + with pytest.raises(TypeError, match=msg2): + # different categories but holdable values + # -> not sure if this should fail or pass + indexer(df)[key] = semi_compat + + # categories do not match df["cat"]'s, and "c" is not among them + incompat = Categorical(list("cc"), categories=list("abc")) + with pytest.raises(TypeError, match=msg2): + # different values + indexer(df)[key] = incompat + + @pytest.mark.parametrize("indexer", [tm.loc, tm.iloc]) + def test_loc_iloc_setitem_non_categorical_rhs( + self, orig, exp_parts_cats_col, indexer + ): + # assign a part of a column with dtype != categorical -> exp_parts_cats_col + df = orig.copy() + + key = (slice(2, 4), 0) + if indexer is tm.loc: + key = (slice("j", "k"), df.columns[0]) + + # "b" is among the categories for df["cat"] + indexer(df)[key] = ["b", "b"] + tm.assert_frame_equal(df, exp_parts_cats_col) + + # "c" not part of the categories + with pytest.raises(TypeError, match=msg1): + indexer(df)[key] = ["c", "c"] + + @pytest.mark.parametrize("indexer", [tm.getitem, tm.loc, tm.iloc]) + def test_getitem_preserve_object_index_with_dates(self, indexer): + # https://github.com/pandas-dev/pandas/pull/42950 - when selecting a column + # from dataframe, don't try to infer object dtype index on Series construction + idx = date_range("2012", periods=3).astype(object) + df = DataFrame({0: [1, 2, 3]}, index=idx) + assert df.index.dtype == object + + if indexer is tm.getitem: + ser = indexer(df)[0] + else: + ser = indexer(df)[:, 0] + + assert ser.index.dtype == object + + def test_loc_on_multiindex_one_level(self): + # GH#45779 + df = DataFrame( + data=[[0], [1]], + index=MultiIndex.from_tuples([("a",), ("b",)], names=["first"]), + ) + expected = DataFrame( + data=[[0]], index=MultiIndex.from_tuples([("a",)], names=["first"]) + ) + result = df.loc["a"] + tm.assert_frame_equal(result, expected) + + +class TestDeprecatedIndexers: + @pytest.mark.parametrize( + "key", [{1}, {1: 1}, ({1}, "a"), ({1: 1}, "a"), (1, {"a"}), (1, {"a": "a"})] + ) + def test_getitem_dict_and_set_deprecated(self, key): + # GH#42825 enforced in 2.0 + df = DataFrame([[1, 2], [3, 4]], columns=["a", "b"]) + with pytest.raises(TypeError, match="as an indexer is not supported"): + df.loc[key] + + @pytest.mark.parametrize( + "key", + [ + {1}, + {1: 1}, + (({1}, 2), "a"), + (({1: 1}, 2), "a"), + ((1, 2), {"a"}), + ((1, 2), {"a": "a"}), + ], + ) + def test_getitem_dict_and_set_deprecated_multiindex(self, key): + # GH#42825 enforced in 2.0 + df = DataFrame( + [[1, 2], [3, 4]], + columns=["a", "b"], + index=MultiIndex.from_tuples([(1, 2), (3, 4)]), + ) + with pytest.raises(TypeError, match="as an indexer is not supported"): + df.loc[key] + + @pytest.mark.parametrize( + "key", [{1}, {1: 1}, ({1}, "a"), ({1: 1}, "a"), (1, {"a"}), (1, {"a": "a"})] + ) + def test_setitem_dict_and_set_disallowed(self, key): + # GH#42825 enforced in 2.0 + df = DataFrame([[1, 2], [3, 4]], columns=["a", "b"]) + with pytest.raises(TypeError, match="as an indexer is not supported"): + df.loc[key] = 1 + + @pytest.mark.parametrize( + "key", + [ + {1}, + {1: 1}, + (({1}, 2), "a"), + (({1: 1}, 2), "a"), + ((1, 2), {"a"}), + ((1, 2), {"a": "a"}), + ], + ) + def test_setitem_dict_and_set_disallowed_multiindex(self, key): + # GH#42825 enforced in 2.0 + df = DataFrame( + [[1, 2], [3, 4]], + columns=["a", "b"], + index=MultiIndex.from_tuples([(1, 2), (3, 4)]), + ) + with pytest.raises(TypeError, match="as an indexer is not supported"): + df.loc[key] = 1 + + +def test_adding_new_conditional_column() -> None: + # https://github.com/pandas-dev/pandas/issues/55025 + df = DataFrame({"x": [1]}) + df.loc[df["x"] == 1, "y"] = "1" + expected = DataFrame({"x": [1], "y": ["1"]}) + tm.assert_frame_equal(df, expected) + + df = DataFrame({"x": [1]}) + # try inserting something which numpy would store as 'object' + value = lambda x: x + df.loc[df["x"] == 1, "y"] = value + expected = DataFrame({"x": [1], "y": [value]}) + tm.assert_frame_equal(df, expected) + + +@pytest.mark.parametrize( + ("dtype", "infer_string"), + [ + (object, False), + ("string[pyarrow_numpy]", True), + ], +) +def test_adding_new_conditional_column_with_string(dtype, infer_string) -> None: + # https://github.com/pandas-dev/pandas/issues/56204 + pytest.importorskip("pyarrow") + + df = DataFrame({"a": [1, 2], "b": [3, 4]}) + with pd.option_context("future.infer_string", infer_string): + df.loc[df["a"] == 1, "c"] = "1" + expected = DataFrame({"a": [1, 2], "b": [3, 4], "c": ["1", float("nan")]}).astype( + {"a": "int64", "b": "int64", "c": dtype} + ) + tm.assert_frame_equal(df, expected) + + +def test_add_new_column_infer_string(): + # GH#55366 + pytest.importorskip("pyarrow") + df = DataFrame({"x": [1]}) + with pd.option_context("future.infer_string", True): + df.loc[df["x"] == 1, "y"] = "1" + expected = DataFrame( + {"x": [1], "y": Series(["1"], dtype="string[pyarrow_numpy]")}, + columns=Index(["x", "y"], dtype=object), + ) + tm.assert_frame_equal(df, expected) + + +class TestSetitemValidation: + # This is adapted from pandas/tests/arrays/masked/test_indexing.py + # but checks for warnings instead of errors. + def _check_setitem_invalid(self, df, invalid, indexer, warn): + msg = "Setting an item of incompatible dtype is deprecated" + msg = re.escape(msg) + + orig_df = df.copy() + + # iloc + with tm.assert_produces_warning(warn, match=msg): + df.iloc[indexer, 0] = invalid + df = orig_df.copy() + + # loc + with tm.assert_produces_warning(warn, match=msg): + df.loc[indexer, "a"] = invalid + df = orig_df.copy() + + _invalid_scalars = [ + 1 + 2j, + "True", + "1", + "1.0", + pd.NaT, + np.datetime64("NaT"), + np.timedelta64("NaT"), + ] + _indexers = [0, [0], slice(0, 1), [True, False, False], slice(None, None, None)] + + @pytest.mark.parametrize( + "invalid", _invalid_scalars + [1, 1.0, np.int64(1), np.float64(1)] + ) + @pytest.mark.parametrize("indexer", _indexers) + def test_setitem_validation_scalar_bool(self, invalid, indexer): + df = DataFrame({"a": [True, False, False]}, dtype="bool") + self._check_setitem_invalid(df, invalid, indexer, FutureWarning) + + @pytest.mark.parametrize("invalid", _invalid_scalars + [True, 1.5, np.float64(1.5)]) + @pytest.mark.parametrize("indexer", _indexers) + def test_setitem_validation_scalar_int(self, invalid, any_int_numpy_dtype, indexer): + df = DataFrame({"a": [1, 2, 3]}, dtype=any_int_numpy_dtype) + if isna(invalid) and invalid is not pd.NaT and not np.isnat(invalid): + warn = None + else: + warn = FutureWarning + self._check_setitem_invalid(df, invalid, indexer, warn) + + @pytest.mark.parametrize("invalid", _invalid_scalars + [True]) + @pytest.mark.parametrize("indexer", _indexers) + def test_setitem_validation_scalar_float(self, invalid, float_numpy_dtype, indexer): + df = DataFrame({"a": [1, 2, None]}, dtype=float_numpy_dtype) + self._check_setitem_invalid(df, invalid, indexer, FutureWarning) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/indexing/test_insert.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/indexing/test_insert.py new file mode 100644 index 0000000000000000000000000000000000000000..7e702bdc993bd1444dc48f85e016c768dadd042f --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/indexing/test_insert.py @@ -0,0 +1,120 @@ +""" +test_insert is specifically for the DataFrame.insert method; not to be +confused with tests with "insert" in their names that are really testing +__setitem__. +""" +import numpy as np +import pytest + +from pandas.errors import PerformanceWarning + +from pandas import ( + DataFrame, + Index, +) +import pandas._testing as tm + + +class TestDataFrameInsert: + def test_insert(self): + df = DataFrame( + np.random.default_rng(2).standard_normal((5, 3)), + index=np.arange(5), + columns=["c", "b", "a"], + ) + + df.insert(0, "foo", df["a"]) + tm.assert_index_equal(df.columns, Index(["foo", "c", "b", "a"])) + tm.assert_series_equal(df["a"], df["foo"], check_names=False) + + df.insert(2, "bar", df["c"]) + tm.assert_index_equal(df.columns, Index(["foo", "c", "bar", "b", "a"])) + tm.assert_almost_equal(df["c"], df["bar"], check_names=False) + + with pytest.raises(ValueError, match="already exists"): + df.insert(1, "a", df["b"]) + + msg = "cannot insert c, already exists" + with pytest.raises(ValueError, match=msg): + df.insert(1, "c", df["b"]) + + df.columns.name = "some_name" + # preserve columns name field + df.insert(0, "baz", df["c"]) + assert df.columns.name == "some_name" + + def test_insert_column_bug_4032(self): + # GH#4032, inserting a column and renaming causing errors + df = DataFrame({"b": [1.1, 2.2]}) + + df = df.rename(columns={}) + df.insert(0, "a", [1, 2]) + result = df.rename(columns={}) + + expected = DataFrame([[1, 1.1], [2, 2.2]], columns=["a", "b"]) + tm.assert_frame_equal(result, expected) + + df.insert(0, "c", [1.3, 2.3]) + result = df.rename(columns={}) + + expected = DataFrame([[1.3, 1, 1.1], [2.3, 2, 2.2]], columns=["c", "a", "b"]) + tm.assert_frame_equal(result, expected) + + def test_insert_with_columns_dups(self): + # GH#14291 + df = DataFrame() + df.insert(0, "A", ["g", "h", "i"], allow_duplicates=True) + df.insert(0, "A", ["d", "e", "f"], allow_duplicates=True) + df.insert(0, "A", ["a", "b", "c"], allow_duplicates=True) + exp = DataFrame( + [["a", "d", "g"], ["b", "e", "h"], ["c", "f", "i"]], columns=["A", "A", "A"] + ) + tm.assert_frame_equal(df, exp) + + def test_insert_item_cache(self, using_array_manager, using_copy_on_write): + df = DataFrame(np.random.default_rng(2).standard_normal((4, 3))) + ser = df[0] + + if using_array_manager: + expected_warning = None + else: + # with BlockManager warn about high fragmentation of single dtype + expected_warning = PerformanceWarning + + with tm.assert_produces_warning(expected_warning): + for n in range(100): + df[n + 3] = df[1] * n + + if using_copy_on_write: + ser.iloc[0] = 99 + assert df.iloc[0, 0] == df[0][0] + assert df.iloc[0, 0] != 99 + else: + ser.values[0] = 99 + assert df.iloc[0, 0] == df[0][0] + assert df.iloc[0, 0] == 99 + + def test_insert_EA_no_warning(self): + # PerformanceWarning about fragmented frame should not be raised when + # using EAs (https://github.com/pandas-dev/pandas/issues/44098) + df = DataFrame( + np.random.default_rng(2).integers(0, 100, size=(3, 100)), dtype="Int64" + ) + with tm.assert_produces_warning(None): + df["a"] = np.array([1, 2, 3]) + + def test_insert_frame(self): + # GH#42403 + df = DataFrame({"col1": [1, 2], "col2": [3, 4]}) + + msg = ( + "Expected a one-dimensional object, got a DataFrame with 2 columns instead." + ) + with pytest.raises(ValueError, match=msg): + df.insert(1, "newcol", df) + + def test_insert_int64_loc(self): + # GH#53193 + df = DataFrame({"a": [1, 2]}) + df.insert(np.int64(0), "b", 0) + tm.assert_frame_equal(df, DataFrame({"b": [0, 0], "a": [1, 2]})) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/indexing/test_mask.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/indexing/test_mask.py new file mode 100644 index 0000000000000000000000000000000000000000..264e27c9c122ebb6d59c5b16531ebbdc8ce51320 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/indexing/test_mask.py @@ -0,0 +1,152 @@ +""" +Tests for DataFrame.mask; tests DataFrame.where as a side-effect. +""" + +import numpy as np + +from pandas import ( + NA, + DataFrame, + Float64Dtype, + Series, + StringDtype, + Timedelta, + isna, +) +import pandas._testing as tm + + +class TestDataFrameMask: + def test_mask(self): + df = DataFrame(np.random.default_rng(2).standard_normal((5, 3))) + cond = df > 0 + + rs = df.where(cond, np.nan) + tm.assert_frame_equal(rs, df.mask(df <= 0)) + tm.assert_frame_equal(rs, df.mask(~cond)) + + other = DataFrame(np.random.default_rng(2).standard_normal((5, 3))) + rs = df.where(cond, other) + tm.assert_frame_equal(rs, df.mask(df <= 0, other)) + tm.assert_frame_equal(rs, df.mask(~cond, other)) + + def test_mask2(self): + # see GH#21891 + df = DataFrame([1, 2]) + res = df.mask([[True], [False]]) + + exp = DataFrame([np.nan, 2]) + tm.assert_frame_equal(res, exp) + + def test_mask_inplace(self): + # GH#8801 + df = DataFrame(np.random.default_rng(2).standard_normal((5, 3))) + cond = df > 0 + + rdf = df.copy() + + return_value = rdf.where(cond, inplace=True) + assert return_value is None + tm.assert_frame_equal(rdf, df.where(cond)) + tm.assert_frame_equal(rdf, df.mask(~cond)) + + rdf = df.copy() + return_value = rdf.where(cond, -df, inplace=True) + assert return_value is None + tm.assert_frame_equal(rdf, df.where(cond, -df)) + tm.assert_frame_equal(rdf, df.mask(~cond, -df)) + + def test_mask_edge_case_1xN_frame(self): + # GH#4071 + df = DataFrame([[1, 2]]) + res = df.mask(DataFrame([[True, False]])) + expec = DataFrame([[np.nan, 2]]) + tm.assert_frame_equal(res, expec) + + def test_mask_callable(self): + # GH#12533 + df = DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]]) + result = df.mask(lambda x: x > 4, lambda x: x + 1) + exp = DataFrame([[1, 2, 3], [4, 6, 7], [8, 9, 10]]) + tm.assert_frame_equal(result, exp) + tm.assert_frame_equal(result, df.mask(df > 4, df + 1)) + + # return ndarray and scalar + result = df.mask(lambda x: (x % 2 == 0).values, lambda x: 99) + exp = DataFrame([[1, 99, 3], [99, 5, 99], [7, 99, 9]]) + tm.assert_frame_equal(result, exp) + tm.assert_frame_equal(result, df.mask(df % 2 == 0, 99)) + + # chain + result = (df + 2).mask(lambda x: x > 8, lambda x: x + 10) + exp = DataFrame([[3, 4, 5], [6, 7, 8], [19, 20, 21]]) + tm.assert_frame_equal(result, exp) + tm.assert_frame_equal(result, (df + 2).mask((df + 2) > 8, (df + 2) + 10)) + + def test_mask_dtype_bool_conversion(self): + # GH#3733 + df = DataFrame(data=np.random.default_rng(2).standard_normal((100, 50))) + df = df.where(df > 0) # create nans + bools = df > 0 + mask = isna(df) + expected = bools.astype(object).mask(mask) + result = bools.mask(mask) + tm.assert_frame_equal(result, expected) + + +def test_mask_stringdtype(frame_or_series): + # GH 40824 + obj = DataFrame( + {"A": ["foo", "bar", "baz", NA]}, + index=["id1", "id2", "id3", "id4"], + dtype=StringDtype(), + ) + filtered_obj = DataFrame( + {"A": ["this", "that"]}, index=["id2", "id3"], dtype=StringDtype() + ) + expected = DataFrame( + {"A": [NA, "this", "that", NA]}, + index=["id1", "id2", "id3", "id4"], + dtype=StringDtype(), + ) + if frame_or_series is Series: + obj = obj["A"] + filtered_obj = filtered_obj["A"] + expected = expected["A"] + + filter_ser = Series([False, True, True, False]) + result = obj.mask(filter_ser, filtered_obj) + + tm.assert_equal(result, expected) + + +def test_mask_where_dtype_timedelta(): + # https://github.com/pandas-dev/pandas/issues/39548 + df = DataFrame([Timedelta(i, unit="d") for i in range(5)]) + + expected = DataFrame(np.full(5, np.nan, dtype="timedelta64[ns]")) + tm.assert_frame_equal(df.mask(df.notna()), expected) + + expected = DataFrame( + [np.nan, np.nan, np.nan, Timedelta("3 day"), Timedelta("4 day")] + ) + tm.assert_frame_equal(df.where(df > Timedelta(2, unit="d")), expected) + + +def test_mask_return_dtype(): + # GH#50488 + ser = Series([0.0, 1.0, 2.0, 3.0], dtype=Float64Dtype()) + cond = ~ser.isna() + other = Series([True, False, True, False]) + excepted = Series([1.0, 0.0, 1.0, 0.0], dtype=ser.dtype) + result = ser.mask(cond, other) + tm.assert_series_equal(result, excepted) + + +def test_mask_inplace_no_other(): + # GH#51685 + df = DataFrame({"a": [1.0, 2.0], "b": ["x", "y"]}) + cond = DataFrame({"a": [True, False], "b": [False, True]}) + df.mask(cond, inplace=True) + expected = DataFrame({"a": [np.nan, 2], "b": ["x", np.nan]}) + tm.assert_frame_equal(df, expected) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/indexing/test_set_value.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/indexing/test_set_value.py new file mode 100644 index 0000000000000000000000000000000000000000..ce771280bc26494f1e2eeeba5a978c5bd4b2eb42 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/indexing/test_set_value.py @@ -0,0 +1,77 @@ +import numpy as np + +from pandas.core.dtypes.common import is_float_dtype + +from pandas import ( + DataFrame, + isna, +) +import pandas._testing as tm + + +class TestSetValue: + def test_set_value(self, float_frame): + for idx in float_frame.index: + for col in float_frame.columns: + float_frame._set_value(idx, col, 1) + assert float_frame[col][idx] == 1 + + def test_set_value_resize(self, float_frame, using_infer_string): + res = float_frame._set_value("foobar", "B", 0) + assert res is None + assert float_frame.index[-1] == "foobar" + assert float_frame._get_value("foobar", "B") == 0 + + float_frame.loc["foobar", "qux"] = 0 + assert float_frame._get_value("foobar", "qux") == 0 + + res = float_frame.copy() + res._set_value("foobar", "baz", "sam") + if using_infer_string: + assert res["baz"].dtype == "string" + else: + assert res["baz"].dtype == np.object_ + res = float_frame.copy() + res._set_value("foobar", "baz", True) + assert res["baz"].dtype == np.object_ + + res = float_frame.copy() + res._set_value("foobar", "baz", 5) + assert is_float_dtype(res["baz"]) + assert isna(res["baz"].drop(["foobar"])).all() + + with tm.assert_produces_warning( + FutureWarning, match="Setting an item of incompatible dtype" + ): + res._set_value("foobar", "baz", "sam") + assert res.loc["foobar", "baz"] == "sam" + + def test_set_value_with_index_dtype_change(self): + df_orig = DataFrame( + np.random.default_rng(2).standard_normal((3, 3)), + index=range(3), + columns=list("ABC"), + ) + + # this is actually ambiguous as the 2 is interpreted as a positional + # so column is not created + df = df_orig.copy() + df._set_value("C", 2, 1.0) + assert list(df.index) == list(df_orig.index) + ["C"] + # assert list(df.columns) == list(df_orig.columns) + [2] + + df = df_orig.copy() + df.loc["C", 2] = 1.0 + assert list(df.index) == list(df_orig.index) + ["C"] + # assert list(df.columns) == list(df_orig.columns) + [2] + + # create both new + df = df_orig.copy() + df._set_value("C", "D", 1.0) + assert list(df.index) == list(df_orig.index) + ["C"] + assert list(df.columns) == list(df_orig.columns) + ["D"] + + df = df_orig.copy() + df.loc["C", "D"] = 1.0 + assert list(df.index) == list(df_orig.index) + ["C"] + assert list(df.columns) == list(df_orig.columns) + ["D"] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/indexing/test_setitem.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/indexing/test_setitem.py new file mode 100644 index 0000000000000000000000000000000000000000..a58dd701f0f22e520391f169e9e6d942570991df --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/indexing/test_setitem.py @@ -0,0 +1,1419 @@ +from datetime import datetime + +import numpy as np +import pytest + +import pandas.util._test_decorators as td + +from pandas.core.dtypes.base import _registry as ea_registry +from pandas.core.dtypes.common import is_object_dtype +from pandas.core.dtypes.dtypes import ( + CategoricalDtype, + DatetimeTZDtype, + IntervalDtype, + PeriodDtype, +) + +import pandas as pd +from pandas import ( + Categorical, + DataFrame, + DatetimeIndex, + Index, + Interval, + IntervalIndex, + MultiIndex, + NaT, + Period, + PeriodIndex, + Series, + Timestamp, + cut, + date_range, + notna, + period_range, +) +import pandas._testing as tm +from pandas.core.arrays import SparseArray + +from pandas.tseries.offsets import BDay + + +class TestDataFrameSetItem: + def test_setitem_str_subclass(self): + # GH#37366 + class mystring(str): + pass + + data = ["2020-10-22 01:21:00+00:00"] + index = DatetimeIndex(data) + df = DataFrame({"a": [1]}, index=index) + df["b"] = 2 + df[mystring("c")] = 3 + expected = DataFrame({"a": [1], "b": [2], mystring("c"): [3]}, index=index) + tm.assert_equal(df, expected) + + @pytest.mark.parametrize( + "dtype", ["int32", "int64", "uint32", "uint64", "float32", "float64"] + ) + def test_setitem_dtype(self, dtype, float_frame): + # Use integers since casting negative floats to uints is undefined + arr = np.random.default_rng(2).integers(1, 10, len(float_frame)) + + float_frame[dtype] = np.array(arr, dtype=dtype) + assert float_frame[dtype].dtype.name == dtype + + def test_setitem_list_not_dataframe(self, float_frame): + data = np.random.default_rng(2).standard_normal((len(float_frame), 2)) + float_frame[["A", "B"]] = data + tm.assert_almost_equal(float_frame[["A", "B"]].values, data) + + def test_setitem_error_msmgs(self): + # GH 7432 + df = DataFrame( + {"bar": [1, 2, 3], "baz": ["d", "e", "f"]}, + index=Index(["a", "b", "c"], name="foo"), + ) + ser = Series( + ["g", "h", "i", "j"], + index=Index(["a", "b", "c", "a"], name="foo"), + name="fiz", + ) + msg = "cannot reindex on an axis with duplicate labels" + with pytest.raises(ValueError, match=msg): + df["newcol"] = ser + + # GH 4107, more descriptive error message + df = DataFrame( + np.random.default_rng(2).integers(0, 2, (4, 4)), + columns=["a", "b", "c", "d"], + ) + + msg = "Cannot set a DataFrame with multiple columns to the single column gr" + with pytest.raises(ValueError, match=msg): + df["gr"] = df.groupby(["b", "c"]).count() + + # GH 55956, specific message for zero columns + msg = "Cannot set a DataFrame without columns to the column gr" + with pytest.raises(ValueError, match=msg): + df["gr"] = DataFrame() + + def test_setitem_benchmark(self): + # from the vb_suite/frame_methods/frame_insert_columns + N = 10 + K = 5 + df = DataFrame(index=range(N)) + new_col = np.random.default_rng(2).standard_normal(N) + for i in range(K): + df[i] = new_col + expected = DataFrame(np.repeat(new_col, K).reshape(N, K), index=range(N)) + tm.assert_frame_equal(df, expected) + + def test_setitem_different_dtype(self): + df = DataFrame( + np.random.default_rng(2).standard_normal((5, 3)), + index=np.arange(5), + columns=["c", "b", "a"], + ) + df.insert(0, "foo", df["a"]) + df.insert(2, "bar", df["c"]) + + # diff dtype + + # new item + df["x"] = df["a"].astype("float32") + result = df.dtypes + expected = Series( + [np.dtype("float64")] * 5 + [np.dtype("float32")], + index=["foo", "c", "bar", "b", "a", "x"], + ) + tm.assert_series_equal(result, expected) + + # replacing current (in different block) + df["a"] = df["a"].astype("float32") + result = df.dtypes + expected = Series( + [np.dtype("float64")] * 4 + [np.dtype("float32")] * 2, + index=["foo", "c", "bar", "b", "a", "x"], + ) + tm.assert_series_equal(result, expected) + + df["y"] = df["a"].astype("int32") + result = df.dtypes + expected = Series( + [np.dtype("float64")] * 4 + [np.dtype("float32")] * 2 + [np.dtype("int32")], + index=["foo", "c", "bar", "b", "a", "x", "y"], + ) + tm.assert_series_equal(result, expected) + + def test_setitem_empty_columns(self): + # GH 13522 + df = DataFrame(index=["A", "B", "C"]) + df["X"] = df.index + df["X"] = ["x", "y", "z"] + exp = DataFrame(data={"X": ["x", "y", "z"]}, index=["A", "B", "C"]) + tm.assert_frame_equal(df, exp) + + def test_setitem_dt64_index_empty_columns(self): + rng = date_range("1/1/2000 00:00:00", "1/1/2000 1:59:50", freq="10s") + df = DataFrame(index=np.arange(len(rng))) + + df["A"] = rng + assert df["A"].dtype == np.dtype("M8[ns]") + + def test_setitem_timestamp_empty_columns(self): + # GH#19843 + df = DataFrame(index=range(3)) + df["now"] = Timestamp("20130101", tz="UTC").as_unit("ns") + + expected = DataFrame( + [[Timestamp("20130101", tz="UTC")]] * 3, index=[0, 1, 2], columns=["now"] + ) + tm.assert_frame_equal(df, expected) + + def test_setitem_wrong_length_categorical_dtype_raises(self): + # GH#29523 + cat = Categorical.from_codes([0, 1, 1, 0, 1, 2], ["a", "b", "c"]) + df = DataFrame(range(10), columns=["bar"]) + + msg = ( + rf"Length of values \({len(cat)}\) " + rf"does not match length of index \({len(df)}\)" + ) + with pytest.raises(ValueError, match=msg): + df["foo"] = cat + + def test_setitem_with_sparse_value(self): + # GH#8131 + df = DataFrame({"c_1": ["a", "b", "c"], "n_1": [1.0, 2.0, 3.0]}) + sp_array = SparseArray([0, 0, 1]) + df["new_column"] = sp_array + + expected = Series(sp_array, name="new_column") + tm.assert_series_equal(df["new_column"], expected) + + def test_setitem_with_unaligned_sparse_value(self): + df = DataFrame({"c_1": ["a", "b", "c"], "n_1": [1.0, 2.0, 3.0]}) + sp_series = Series(SparseArray([0, 0, 1]), index=[2, 1, 0]) + + df["new_column"] = sp_series + expected = Series(SparseArray([1, 0, 0]), name="new_column") + tm.assert_series_equal(df["new_column"], expected) + + def test_setitem_period_preserves_dtype(self): + # GH: 26861 + data = [Period("2003-12", "D")] + result = DataFrame([]) + result["a"] = data + + expected = DataFrame({"a": data}) + + tm.assert_frame_equal(result, expected) + + def test_setitem_dict_preserves_dtypes(self): + # https://github.com/pandas-dev/pandas/issues/34573 + expected = DataFrame( + { + "a": Series([0, 1, 2], dtype="int64"), + "b": Series([1, 2, 3], dtype=float), + "c": Series([1, 2, 3], dtype=float), + "d": Series([1, 2, 3], dtype="uint32"), + } + ) + df = DataFrame( + { + "a": Series([], dtype="int64"), + "b": Series([], dtype=float), + "c": Series([], dtype=float), + "d": Series([], dtype="uint32"), + } + ) + for idx, b in enumerate([1, 2, 3]): + df.loc[df.shape[0]] = { + "a": int(idx), + "b": float(b), + "c": float(b), + "d": np.uint32(b), + } + tm.assert_frame_equal(df, expected) + + @pytest.mark.parametrize( + "obj,dtype", + [ + (Period("2020-01"), PeriodDtype("M")), + (Interval(left=0, right=5), IntervalDtype("int64", "right")), + ( + Timestamp("2011-01-01", tz="US/Eastern"), + DatetimeTZDtype(unit="s", tz="US/Eastern"), + ), + ], + ) + def test_setitem_extension_types(self, obj, dtype): + # GH: 34832 + expected = DataFrame({"idx": [1, 2, 3], "obj": Series([obj] * 3, dtype=dtype)}) + + df = DataFrame({"idx": [1, 2, 3]}) + df["obj"] = obj + + tm.assert_frame_equal(df, expected) + + @pytest.mark.parametrize( + "ea_name", + [ + dtype.name + for dtype in ea_registry.dtypes + # property would require instantiation + if not isinstance(dtype.name, property) + ] + + ["datetime64[ns, UTC]", "period[D]"], + ) + def test_setitem_with_ea_name(self, ea_name): + # GH 38386 + result = DataFrame([0]) + result[ea_name] = [1] + expected = DataFrame({0: [0], ea_name: [1]}) + tm.assert_frame_equal(result, expected) + + def test_setitem_dt64_ndarray_with_NaT_and_diff_time_units(self): + # GH#7492 + data_ns = np.array([1, "nat"], dtype="datetime64[ns]") + result = Series(data_ns).to_frame() + result["new"] = data_ns + expected = DataFrame({0: [1, None], "new": [1, None]}, dtype="datetime64[ns]") + tm.assert_frame_equal(result, expected) + + # OutOfBoundsDatetime error shouldn't occur; as of 2.0 we preserve "M8[s]" + data_s = np.array([1, "nat"], dtype="datetime64[s]") + result["new"] = data_s + tm.assert_series_equal(result[0], expected[0]) + tm.assert_numpy_array_equal(result["new"].to_numpy(), data_s) + + @pytest.mark.parametrize("unit", ["h", "m", "s", "ms", "D", "M", "Y"]) + def test_frame_setitem_datetime64_col_other_units(self, unit): + # Check that non-nano dt64 values get cast to dt64 on setitem + # into a not-yet-existing column + n = 100 + + dtype = np.dtype(f"M8[{unit}]") + vals = np.arange(n, dtype=np.int64).view(dtype) + if unit in ["s", "ms"]: + # supported unit + ex_vals = vals + else: + # we get the nearest supported units, i.e. "s" + ex_vals = vals.astype("datetime64[s]") + + df = DataFrame({"ints": np.arange(n)}, index=np.arange(n)) + df[unit] = vals + + assert df[unit].dtype == ex_vals.dtype + assert (df[unit].values == ex_vals).all() + + @pytest.mark.parametrize("unit", ["h", "m", "s", "ms", "D", "M", "Y"]) + def test_frame_setitem_existing_datetime64_col_other_units(self, unit): + # Check that non-nano dt64 values get cast to dt64 on setitem + # into an already-existing dt64 column + n = 100 + + dtype = np.dtype(f"M8[{unit}]") + vals = np.arange(n, dtype=np.int64).view(dtype) + ex_vals = vals.astype("datetime64[ns]") + + df = DataFrame({"ints": np.arange(n)}, index=np.arange(n)) + df["dates"] = np.arange(n, dtype=np.int64).view("M8[ns]") + + # We overwrite existing dt64 column with new, non-nano dt64 vals + df["dates"] = vals + assert (df["dates"].values == ex_vals).all() + + def test_setitem_dt64tz(self, timezone_frame, using_copy_on_write): + df = timezone_frame + idx = df["B"].rename("foo") + + # setitem + df["C"] = idx + tm.assert_series_equal(df["C"], Series(idx, name="C")) + + df["D"] = "foo" + df["D"] = idx + tm.assert_series_equal(df["D"], Series(idx, name="D")) + del df["D"] + + # assert that A & C are not sharing the same base (e.g. they + # are copies) + # Note: This does not hold with Copy on Write (because of lazy copying) + v1 = df._mgr.arrays[1] + v2 = df._mgr.arrays[2] + tm.assert_extension_array_equal(v1, v2) + v1base = v1._ndarray.base + v2base = v2._ndarray.base + if not using_copy_on_write: + assert v1base is None or (id(v1base) != id(v2base)) + else: + assert id(v1base) == id(v2base) + + # with nan + df2 = df.copy() + df2.iloc[1, 1] = NaT + df2.iloc[1, 2] = NaT + result = df2["B"] + tm.assert_series_equal(notna(result), Series([True, False, True], name="B")) + tm.assert_series_equal(df2.dtypes, df.dtypes) + + def test_setitem_periodindex(self): + rng = period_range("1/1/2000", periods=5, name="index") + df = DataFrame(np.random.default_rng(2).standard_normal((5, 3)), index=rng) + + df["Index"] = rng + rs = Index(df["Index"]) + tm.assert_index_equal(rs, rng, check_names=False) + assert rs.name == "Index" + assert rng.name == "index" + + rs = df.reset_index().set_index("index") + assert isinstance(rs.index, PeriodIndex) + tm.assert_index_equal(rs.index, rng) + + def test_setitem_complete_column_with_array(self): + # GH#37954 + df = DataFrame({"a": ["one", "two", "three"], "b": [1, 2, 3]}) + arr = np.array([[1, 1], [3, 1], [5, 1]]) + df[["c", "d"]] = arr + expected = DataFrame( + { + "a": ["one", "two", "three"], + "b": [1, 2, 3], + "c": [1, 3, 5], + "d": [1, 1, 1], + } + ) + expected["c"] = expected["c"].astype(arr.dtype) + expected["d"] = expected["d"].astype(arr.dtype) + assert expected["c"].dtype == arr.dtype + assert expected["d"].dtype == arr.dtype + tm.assert_frame_equal(df, expected) + + def test_setitem_period_d_dtype(self): + # GH 39763 + rng = period_range("2016-01-01", periods=9, freq="D", name="A") + result = DataFrame(rng) + expected = DataFrame( + {"A": ["NaT", "NaT", "NaT", "NaT", "NaT", "NaT", "NaT", "NaT", "NaT"]}, + dtype="period[D]", + ) + result.iloc[:] = rng._na_value + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize("dtype", ["f8", "i8", "u8"]) + def test_setitem_bool_with_numeric_index(self, dtype): + # GH#36319 + cols = Index([1, 2, 3], dtype=dtype) + df = DataFrame(np.random.default_rng(2).standard_normal((3, 3)), columns=cols) + + df[False] = ["a", "b", "c"] + + expected_cols = Index([1, 2, 3, False], dtype=object) + if dtype == "f8": + expected_cols = Index([1.0, 2.0, 3.0, False], dtype=object) + + tm.assert_index_equal(df.columns, expected_cols) + + @pytest.mark.parametrize("indexer", ["B", ["B"]]) + def test_setitem_frame_length_0_str_key(self, indexer): + # GH#38831 + df = DataFrame(columns=["A", "B"]) + other = DataFrame({"B": [1, 2]}) + df[indexer] = other + expected = DataFrame({"A": [np.nan] * 2, "B": [1, 2]}) + expected["A"] = expected["A"].astype("object") + tm.assert_frame_equal(df, expected) + + def test_setitem_frame_duplicate_columns(self): + # GH#15695 + cols = ["A", "B", "C"] * 2 + df = DataFrame(index=range(3), columns=cols) + df.loc[0, "A"] = (0, 3) + df.loc[:, "B"] = (1, 4) + df["C"] = (2, 5) + expected = DataFrame( + [ + [0, 1, 2, 3, 4, 5], + [np.nan, 1, 2, np.nan, 4, 5], + [np.nan, 1, 2, np.nan, 4, 5], + ], + dtype="object", + ) + + # set these with unique columns to be extra-unambiguous + expected[2] = expected[2].astype(np.int64) + expected[5] = expected[5].astype(np.int64) + expected.columns = cols + + tm.assert_frame_equal(df, expected) + + def test_setitem_frame_duplicate_columns_size_mismatch(self): + # GH#39510 + cols = ["A", "B", "C"] * 2 + df = DataFrame(index=range(3), columns=cols) + with pytest.raises(ValueError, match="Columns must be same length as key"): + df[["A"]] = (0, 3, 5) + + df2 = df.iloc[:, :3] # unique columns + with pytest.raises(ValueError, match="Columns must be same length as key"): + df2[["A"]] = (0, 3, 5) + + @pytest.mark.parametrize("cols", [["a", "b", "c"], ["a", "a", "a"]]) + def test_setitem_df_wrong_column_number(self, cols): + # GH#38604 + df = DataFrame([[1, 2, 3]], columns=cols) + rhs = DataFrame([[10, 11]], columns=["d", "e"]) + msg = "Columns must be same length as key" + with pytest.raises(ValueError, match=msg): + df["a"] = rhs + + def test_setitem_listlike_indexer_duplicate_columns(self): + # GH#38604 + df = DataFrame([[1, 2, 3]], columns=["a", "b", "b"]) + rhs = DataFrame([[10, 11, 12]], columns=["a", "b", "b"]) + df[["a", "b"]] = rhs + expected = DataFrame([[10, 11, 12]], columns=["a", "b", "b"]) + tm.assert_frame_equal(df, expected) + + df[["c", "b"]] = rhs + expected = DataFrame([[10, 11, 12, 10]], columns=["a", "b", "b", "c"]) + tm.assert_frame_equal(df, expected) + + def test_setitem_listlike_indexer_duplicate_columns_not_equal_length(self): + # GH#39403 + df = DataFrame([[1, 2, 3]], columns=["a", "b", "b"]) + rhs = DataFrame([[10, 11]], columns=["a", "b"]) + msg = "Columns must be same length as key" + with pytest.raises(ValueError, match=msg): + df[["a", "b"]] = rhs + + def test_setitem_intervals(self): + df = DataFrame({"A": range(10)}) + ser = cut(df["A"], 5) + assert isinstance(ser.cat.categories, IntervalIndex) + + # B & D end up as Categoricals + # the remainder are converted to in-line objects + # containing an IntervalIndex.values + df["B"] = ser + df["C"] = np.array(ser) + df["D"] = ser.values + df["E"] = np.array(ser.values) + df["F"] = ser.astype(object) + + assert isinstance(df["B"].dtype, CategoricalDtype) + assert isinstance(df["B"].cat.categories.dtype, IntervalDtype) + assert isinstance(df["D"].dtype, CategoricalDtype) + assert isinstance(df["D"].cat.categories.dtype, IntervalDtype) + + # These go through the Series constructor and so get inferred back + # to IntervalDtype + assert isinstance(df["C"].dtype, IntervalDtype) + assert isinstance(df["E"].dtype, IntervalDtype) + + # But the Series constructor doesn't do inference on Series objects, + # so setting df["F"] doesn't get cast back to IntervalDtype + assert is_object_dtype(df["F"]) + + # they compare equal as Index + # when converted to numpy objects + c = lambda x: Index(np.array(x)) + tm.assert_index_equal(c(df.B), c(df.B)) + tm.assert_index_equal(c(df.B), c(df.C), check_names=False) + tm.assert_index_equal(c(df.B), c(df.D), check_names=False) + tm.assert_index_equal(c(df.C), c(df.D), check_names=False) + + # B & D are the same Series + tm.assert_series_equal(df["B"], df["B"]) + tm.assert_series_equal(df["B"], df["D"], check_names=False) + + # C & E are the same Series + tm.assert_series_equal(df["C"], df["C"]) + tm.assert_series_equal(df["C"], df["E"], check_names=False) + + def test_setitem_categorical(self): + # GH#35369 + df = DataFrame({"h": Series(list("mn")).astype("category")}) + df.h = df.h.cat.reorder_categories(["n", "m"]) + expected = DataFrame( + {"h": Categorical(["m", "n"]).reorder_categories(["n", "m"])} + ) + tm.assert_frame_equal(df, expected) + + def test_setitem_with_empty_listlike(self): + # GH#17101 + index = Index([], name="idx") + result = DataFrame(columns=["A"], index=index) + result["A"] = [] + expected = DataFrame(columns=["A"], index=index) + tm.assert_index_equal(result.index, expected.index) + + @pytest.mark.parametrize( + "cols, values, expected", + [ + (["C", "D", "D", "a"], [1, 2, 3, 4], 4), # with duplicates + (["D", "C", "D", "a"], [1, 2, 3, 4], 4), # mixed order + (["C", "B", "B", "a"], [1, 2, 3, 4], 4), # other duplicate cols + (["C", "B", "a"], [1, 2, 3], 3), # no duplicates + (["B", "C", "a"], [3, 2, 1], 1), # alphabetical order + (["C", "a", "B"], [3, 2, 1], 2), # in the middle + ], + ) + def test_setitem_same_column(self, cols, values, expected): + # GH#23239 + df = DataFrame([values], columns=cols) + df["a"] = df["a"] + result = df["a"].values[0] + assert result == expected + + def test_setitem_multi_index(self): + # GH#7655, test that assigning to a sub-frame of a frame + # with multi-index columns aligns both rows and columns + it = ["jim", "joe", "jolie"], ["first", "last"], ["left", "center", "right"] + + cols = MultiIndex.from_product(it) + index = date_range("20141006", periods=20) + vals = np.random.default_rng(2).integers(1, 1000, (len(index), len(cols))) + df = DataFrame(vals, columns=cols, index=index) + + i, j = df.index.values.copy(), it[-1][:] + + np.random.default_rng(2).shuffle(i) + df["jim"] = df["jolie"].loc[i, ::-1] + tm.assert_frame_equal(df["jim"], df["jolie"]) + + np.random.default_rng(2).shuffle(j) + df[("joe", "first")] = df[("jolie", "last")].loc[i, j] + tm.assert_frame_equal(df[("joe", "first")], df[("jolie", "last")]) + + np.random.default_rng(2).shuffle(j) + df[("joe", "last")] = df[("jolie", "first")].loc[i, j] + tm.assert_frame_equal(df[("joe", "last")], df[("jolie", "first")]) + + @pytest.mark.parametrize( + "columns,box,expected", + [ + ( + ["A", "B", "C", "D"], + 7, + DataFrame( + [[7, 7, 7, 7], [7, 7, 7, 7], [7, 7, 7, 7]], + columns=["A", "B", "C", "D"], + ), + ), + ( + ["C", "D"], + [7, 8], + DataFrame( + [[1, 2, 7, 8], [3, 4, 7, 8], [5, 6, 7, 8]], + columns=["A", "B", "C", "D"], + ), + ), + ( + ["A", "B", "C"], + np.array([7, 8, 9], dtype=np.int64), + DataFrame([[7, 8, 9], [7, 8, 9], [7, 8, 9]], columns=["A", "B", "C"]), + ), + ( + ["B", "C", "D"], + [[7, 8, 9], [10, 11, 12], [13, 14, 15]], + DataFrame( + [[1, 7, 8, 9], [3, 10, 11, 12], [5, 13, 14, 15]], + columns=["A", "B", "C", "D"], + ), + ), + ( + ["C", "A", "D"], + np.array([[7, 8, 9], [10, 11, 12], [13, 14, 15]], dtype=np.int64), + DataFrame( + [[8, 2, 7, 9], [11, 4, 10, 12], [14, 6, 13, 15]], + columns=["A", "B", "C", "D"], + ), + ), + ( + ["A", "C"], + DataFrame([[7, 8], [9, 10], [11, 12]], columns=["A", "C"]), + DataFrame( + [[7, 2, 8], [9, 4, 10], [11, 6, 12]], columns=["A", "B", "C"] + ), + ), + ], + ) + def test_setitem_list_missing_columns(self, columns, box, expected): + # GH#29334 + df = DataFrame([[1, 2], [3, 4], [5, 6]], columns=["A", "B"]) + df[columns] = box + tm.assert_frame_equal(df, expected) + + def test_setitem_list_of_tuples(self, float_frame): + tuples = list(zip(float_frame["A"], float_frame["B"])) + float_frame["tuples"] = tuples + + result = float_frame["tuples"] + expected = Series(tuples, index=float_frame.index, name="tuples") + tm.assert_series_equal(result, expected) + + def test_setitem_iloc_generator(self): + # GH#39614 + df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) + indexer = (x for x in [1, 2]) + df.iloc[indexer] = 1 + expected = DataFrame({"a": [1, 1, 1], "b": [4, 1, 1]}) + tm.assert_frame_equal(df, expected) + + def test_setitem_iloc_two_dimensional_generator(self): + df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) + indexer = (x for x in [1, 2]) + df.iloc[indexer, 1] = 1 + expected = DataFrame({"a": [1, 2, 3], "b": [4, 1, 1]}) + tm.assert_frame_equal(df, expected) + + def test_setitem_dtypes_bytes_type_to_object(self): + # GH 20734 + index = Series(name="id", dtype="S24") + df = DataFrame(index=index) + df["a"] = Series(name="a", index=index, dtype=np.uint32) + df["b"] = Series(name="b", index=index, dtype="S64") + df["c"] = Series(name="c", index=index, dtype="S64") + df["d"] = Series(name="d", index=index, dtype=np.uint8) + result = df.dtypes + expected = Series([np.uint32, object, object, np.uint8], index=list("abcd")) + tm.assert_series_equal(result, expected) + + def test_boolean_mask_nullable_int64(self): + # GH 28928 + result = DataFrame({"a": [3, 4], "b": [5, 6]}).astype( + {"a": "int64", "b": "Int64"} + ) + mask = Series(False, index=result.index) + result.loc[mask, "a"] = result["a"] + result.loc[mask, "b"] = result["b"] + expected = DataFrame({"a": [3, 4], "b": [5, 6]}).astype( + {"a": "int64", "b": "Int64"} + ) + tm.assert_frame_equal(result, expected) + + def test_setitem_ea_dtype_rhs_series(self): + # GH#47425 + df = DataFrame({"a": [1, 2]}) + df["a"] = Series([1, 2], dtype="Int64") + expected = DataFrame({"a": [1, 2]}, dtype="Int64") + tm.assert_frame_equal(df, expected) + + # TODO(ArrayManager) set column with 2d column array, see #44788 + @td.skip_array_manager_not_yet_implemented + def test_setitem_npmatrix_2d(self): + # GH#42376 + # for use-case df["x"] = sparse.random((10, 10)).mean(axis=1) + expected = DataFrame( + {"np-array": np.ones(10), "np-matrix": np.ones(10)}, index=np.arange(10) + ) + + a = np.ones((10, 1)) + df = DataFrame(index=np.arange(10)) + df["np-array"] = a + + # Instantiation of `np.matrix` gives PendingDeprecationWarning + with tm.assert_produces_warning(PendingDeprecationWarning): + df["np-matrix"] = np.matrix(a) + + tm.assert_frame_equal(df, expected) + + @pytest.mark.parametrize("vals", [{}, {"d": "a"}]) + def test_setitem_aligning_dict_with_index(self, vals): + # GH#47216 + df = DataFrame({"a": [1, 2], "b": [3, 4], **vals}) + df.loc[:, "a"] = {1: 100, 0: 200} + df.loc[:, "c"] = {0: 5, 1: 6} + df.loc[:, "e"] = {1: 5} + expected = DataFrame( + {"a": [200, 100], "b": [3, 4], **vals, "c": [5, 6], "e": [np.nan, 5]} + ) + tm.assert_frame_equal(df, expected) + + def test_setitem_rhs_dataframe(self): + # GH#47578 + df = DataFrame({"a": [1, 2]}) + df["a"] = DataFrame({"a": [10, 11]}, index=[1, 2]) + expected = DataFrame({"a": [np.nan, 10]}) + tm.assert_frame_equal(df, expected) + + df = DataFrame({"a": [1, 2]}) + df.isetitem(0, DataFrame({"a": [10, 11]}, index=[1, 2])) + tm.assert_frame_equal(df, expected) + + def test_setitem_frame_overwrite_with_ea_dtype(self, any_numeric_ea_dtype): + # GH#46896 + df = DataFrame(columns=["a", "b"], data=[[1, 2], [3, 4]]) + df["a"] = DataFrame({"a": [10, 11]}, dtype=any_numeric_ea_dtype) + expected = DataFrame( + { + "a": Series([10, 11], dtype=any_numeric_ea_dtype), + "b": [2, 4], + } + ) + tm.assert_frame_equal(df, expected) + + def test_setitem_string_option_object_index(self): + # GH#55638 + pytest.importorskip("pyarrow") + df = DataFrame({"a": [1, 2]}) + with pd.option_context("future.infer_string", True): + df["b"] = Index(["a", "b"], dtype=object) + expected = DataFrame({"a": [1, 2], "b": Series(["a", "b"], dtype=object)}) + tm.assert_frame_equal(df, expected) + + def test_setitem_frame_midx_columns(self): + # GH#49121 + df = DataFrame({("a", "b"): [10]}) + expected = df.copy() + col_name = ("a", "b") + df[col_name] = df[[col_name]] + tm.assert_frame_equal(df, expected) + + def test_loc_setitem_ea_dtype(self): + # GH#55604 + df = DataFrame({"a": np.array([10], dtype="i8")}) + df.loc[:, "a"] = Series([11], dtype="Int64") + expected = DataFrame({"a": np.array([11], dtype="i8")}) + tm.assert_frame_equal(df, expected) + + df = DataFrame({"a": np.array([10], dtype="i8")}) + df.iloc[:, 0] = Series([11], dtype="Int64") + tm.assert_frame_equal(df, expected) + + def test_setitem_object_inferring(self): + # GH#56102 + idx = Index([Timestamp("2019-12-31")], dtype=object) + df = DataFrame({"a": [1]}) + with tm.assert_produces_warning(FutureWarning, match="infer"): + df.loc[:, "b"] = idx + with tm.assert_produces_warning(FutureWarning, match="infer"): + df["c"] = idx + + expected = DataFrame( + { + "a": [1], + "b": Series([Timestamp("2019-12-31")], dtype="datetime64[ns]"), + "c": Series([Timestamp("2019-12-31")], dtype="datetime64[ns]"), + } + ) + tm.assert_frame_equal(df, expected) + + +class TestSetitemTZAwareValues: + @pytest.fixture + def idx(self): + naive = DatetimeIndex(["2013-1-1 13:00", "2013-1-2 14:00"], name="B") + idx = naive.tz_localize("US/Pacific") + return idx + + @pytest.fixture + def expected(self, idx): + expected = Series(np.array(idx.tolist(), dtype="object"), name="B") + assert expected.dtype == idx.dtype + return expected + + def test_setitem_dt64series(self, idx, expected): + # convert to utc + df = DataFrame(np.random.default_rng(2).standard_normal((2, 1)), columns=["A"]) + df["B"] = idx + df["B"] = idx.to_series(index=[0, 1]).dt.tz_convert(None) + + result = df["B"] + comp = Series(idx.tz_convert("UTC").tz_localize(None), name="B") + tm.assert_series_equal(result, comp) + + def test_setitem_datetimeindex(self, idx, expected): + # setting a DataFrame column with a tzaware DTI retains the dtype + df = DataFrame(np.random.default_rng(2).standard_normal((2, 1)), columns=["A"]) + + # assign to frame + df["B"] = idx + result = df["B"] + tm.assert_series_equal(result, expected) + + def test_setitem_object_array_of_tzaware_datetimes(self, idx, expected): + # setting a DataFrame column with a tzaware DTI retains the dtype + df = DataFrame(np.random.default_rng(2).standard_normal((2, 1)), columns=["A"]) + + # object array of datetimes with a tz + df["B"] = idx.to_pydatetime() + result = df["B"] + tm.assert_series_equal(result, expected) + + +class TestDataFrameSetItemWithExpansion: + def test_setitem_listlike_views(self, using_copy_on_write, warn_copy_on_write): + # GH#38148 + df = DataFrame({"a": [1, 2, 3], "b": [4, 4, 6]}) + + # get one column as a view of df + ser = df["a"] + + # add columns with list-like indexer + df[["c", "d"]] = np.array([[0.1, 0.2], [0.3, 0.4], [0.4, 0.5]]) + + # edit in place the first column to check view semantics + with tm.assert_cow_warning(warn_copy_on_write): + df.iloc[0, 0] = 100 + + if using_copy_on_write: + expected = Series([1, 2, 3], name="a") + else: + expected = Series([100, 2, 3], name="a") + tm.assert_series_equal(ser, expected) + + def test_setitem_string_column_numpy_dtype_raising(self): + # GH#39010 + df = DataFrame([[1, 2], [3, 4]]) + df["0 - Name"] = [5, 6] + expected = DataFrame([[1, 2, 5], [3, 4, 6]], columns=[0, 1, "0 - Name"]) + tm.assert_frame_equal(df, expected) + + def test_setitem_empty_df_duplicate_columns(self, using_copy_on_write): + # GH#38521 + df = DataFrame(columns=["a", "b", "b"], dtype="float64") + df.loc[:, "a"] = list(range(2)) + expected = DataFrame( + [[0, np.nan, np.nan], [1, np.nan, np.nan]], columns=["a", "b", "b"] + ) + tm.assert_frame_equal(df, expected) + + def test_setitem_with_expansion_categorical_dtype(self): + # assignment + df = DataFrame( + { + "value": np.array( + np.random.default_rng(2).integers(0, 10000, 100), dtype="int32" + ) + } + ) + labels = Categorical([f"{i} - {i + 499}" for i in range(0, 10000, 500)]) + + df = df.sort_values(by=["value"], ascending=True) + ser = cut(df.value, range(0, 10500, 500), right=False, labels=labels) + cat = ser.values + + # setting with a Categorical + df["D"] = cat + result = df.dtypes + expected = Series( + [np.dtype("int32"), CategoricalDtype(categories=labels, ordered=False)], + index=["value", "D"], + ) + tm.assert_series_equal(result, expected) + + # setting with a Series + df["E"] = ser + result = df.dtypes + expected = Series( + [ + np.dtype("int32"), + CategoricalDtype(categories=labels, ordered=False), + CategoricalDtype(categories=labels, ordered=False), + ], + index=["value", "D", "E"], + ) + tm.assert_series_equal(result, expected) + + result1 = df["D"] + result2 = df["E"] + tm.assert_categorical_equal(result1._mgr.array, cat) + + # sorting + ser.name = "E" + tm.assert_series_equal(result2.sort_index(), ser.sort_index()) + + def test_setitem_scalars_no_index(self): + # GH#16823 / GH#17894 + df = DataFrame() + df["foo"] = 1 + expected = DataFrame(columns=["foo"]).astype(np.int64) + tm.assert_frame_equal(df, expected) + + def test_setitem_newcol_tuple_key(self, float_frame): + assert ( + "A", + "B", + ) not in float_frame.columns + float_frame["A", "B"] = float_frame["A"] + assert ("A", "B") in float_frame.columns + + result = float_frame["A", "B"] + expected = float_frame["A"] + tm.assert_series_equal(result, expected, check_names=False) + + def test_frame_setitem_newcol_timestamp(self): + # GH#2155 + columns = date_range(start="1/1/2012", end="2/1/2012", freq=BDay()) + data = DataFrame(columns=columns, index=range(10)) + t = datetime(2012, 11, 1) + ts = Timestamp(t) + data[ts] = np.nan # works, mostly a smoke-test + assert np.isnan(data[ts]).all() + + def test_frame_setitem_rangeindex_into_new_col(self): + # GH#47128 + df = DataFrame({"a": ["a", "b"]}) + df["b"] = df.index + df.loc[[False, True], "b"] = 100 + result = df.loc[[1], :] + expected = DataFrame({"a": ["b"], "b": [100]}, index=[1]) + tm.assert_frame_equal(result, expected) + + def test_setitem_frame_keep_ea_dtype(self, any_numeric_ea_dtype): + # GH#46896 + df = DataFrame(columns=["a", "b"], data=[[1, 2], [3, 4]]) + df["c"] = DataFrame({"a": [10, 11]}, dtype=any_numeric_ea_dtype) + expected = DataFrame( + { + "a": [1, 3], + "b": [2, 4], + "c": Series([10, 11], dtype=any_numeric_ea_dtype), + } + ) + tm.assert_frame_equal(df, expected) + + def test_loc_expansion_with_timedelta_type(self): + result = DataFrame(columns=list("abc")) + result.loc[0] = { + "a": pd.to_timedelta(5, unit="s"), + "b": pd.to_timedelta(72, unit="s"), + "c": "23", + } + expected = DataFrame( + [[pd.Timedelta("0 days 00:00:05"), pd.Timedelta("0 days 00:01:12"), "23"]], + index=Index([0]), + columns=(["a", "b", "c"]), + ) + tm.assert_frame_equal(result, expected) + + +class TestDataFrameSetItemSlicing: + def test_setitem_slice_position(self): + # GH#31469 + df = DataFrame(np.zeros((100, 1))) + df[-4:] = 1 + arr = np.zeros((100, 1)) + arr[-4:] = 1 + expected = DataFrame(arr) + tm.assert_frame_equal(df, expected) + + @pytest.mark.parametrize("indexer", [tm.setitem, tm.iloc]) + @pytest.mark.parametrize("box", [Series, np.array, list, pd.array]) + @pytest.mark.parametrize("n", [1, 2, 3]) + def test_setitem_slice_indexer_broadcasting_rhs(self, n, box, indexer): + # GH#40440 + df = DataFrame([[1, 3, 5]] + [[2, 4, 6]] * n, columns=["a", "b", "c"]) + indexer(df)[1:] = box([10, 11, 12]) + expected = DataFrame([[1, 3, 5]] + [[10, 11, 12]] * n, columns=["a", "b", "c"]) + tm.assert_frame_equal(df, expected) + + @pytest.mark.parametrize("box", [Series, np.array, list, pd.array]) + @pytest.mark.parametrize("n", [1, 2, 3]) + def test_setitem_list_indexer_broadcasting_rhs(self, n, box): + # GH#40440 + df = DataFrame([[1, 3, 5]] + [[2, 4, 6]] * n, columns=["a", "b", "c"]) + df.iloc[list(range(1, n + 1))] = box([10, 11, 12]) + expected = DataFrame([[1, 3, 5]] + [[10, 11, 12]] * n, columns=["a", "b", "c"]) + tm.assert_frame_equal(df, expected) + + @pytest.mark.parametrize("indexer", [tm.setitem, tm.iloc]) + @pytest.mark.parametrize("box", [Series, np.array, list, pd.array]) + @pytest.mark.parametrize("n", [1, 2, 3]) + def test_setitem_slice_broadcasting_rhs_mixed_dtypes(self, n, box, indexer): + # GH#40440 + df = DataFrame( + [[1, 3, 5], ["x", "y", "z"]] + [[2, 4, 6]] * n, columns=["a", "b", "c"] + ) + indexer(df)[1:] = box([10, 11, 12]) + expected = DataFrame( + [[1, 3, 5]] + [[10, 11, 12]] * (n + 1), + columns=["a", "b", "c"], + dtype="object", + ) + tm.assert_frame_equal(df, expected) + + +class TestDataFrameSetItemCallable: + def test_setitem_callable(self): + # GH#12533 + df = DataFrame({"A": [1, 2, 3, 4], "B": [5, 6, 7, 8]}) + df[lambda x: "A"] = [11, 12, 13, 14] + + exp = DataFrame({"A": [11, 12, 13, 14], "B": [5, 6, 7, 8]}) + tm.assert_frame_equal(df, exp) + + def test_setitem_other_callable(self): + # GH#13299 + def inc(x): + return x + 1 + + # Set dtype object straight away to avoid upcast when setting inc below + df = DataFrame([[-1, 1], [1, -1]], dtype=object) + df[df > 0] = inc + + expected = DataFrame([[-1, inc], [inc, -1]]) + tm.assert_frame_equal(df, expected) + + +class TestDataFrameSetItemBooleanMask: + @td.skip_array_manager_invalid_test # TODO(ArrayManager) rewrite not using .values + @pytest.mark.parametrize( + "mask_type", + [lambda df: df > np.abs(df) / 2, lambda df: (df > np.abs(df) / 2).values], + ids=["dataframe", "array"], + ) + def test_setitem_boolean_mask(self, mask_type, float_frame): + # Test for issue #18582 + df = float_frame.copy() + mask = mask_type(df) + + # index with boolean mask + result = df.copy() + result[mask] = np.nan + + expected = df.values.copy() + expected[np.array(mask)] = np.nan + expected = DataFrame(expected, index=df.index, columns=df.columns) + tm.assert_frame_equal(result, expected) + + @pytest.mark.xfail(reason="Currently empty indexers are treated as all False") + @pytest.mark.parametrize("box", [list, np.array, Series]) + def test_setitem_loc_empty_indexer_raises_with_non_empty_value(self, box): + # GH#37672 + df = DataFrame({"a": ["a"], "b": [1], "c": [1]}) + if box == Series: + indexer = box([], dtype="object") + else: + indexer = box([]) + msg = "Must have equal len keys and value when setting with an iterable" + with pytest.raises(ValueError, match=msg): + df.loc[indexer, ["b"]] = [1] + + @pytest.mark.parametrize("box", [list, np.array, Series]) + def test_setitem_loc_only_false_indexer_dtype_changed(self, box): + # GH#37550 + # Dtype is only changed when value to set is a Series and indexer is + # empty/bool all False + df = DataFrame({"a": ["a"], "b": [1], "c": [1]}) + indexer = box([False]) + df.loc[indexer, ["b"]] = 10 - df["c"] + expected = DataFrame({"a": ["a"], "b": [1], "c": [1]}) + tm.assert_frame_equal(df, expected) + + df.loc[indexer, ["b"]] = 9 + tm.assert_frame_equal(df, expected) + + @pytest.mark.parametrize("indexer", [tm.setitem, tm.loc]) + def test_setitem_boolean_mask_aligning(self, indexer): + # GH#39931 + df = DataFrame({"a": [1, 4, 2, 3], "b": [5, 6, 7, 8]}) + expected = df.copy() + mask = df["a"] >= 3 + indexer(df)[mask] = indexer(df)[mask].sort_values("a") + tm.assert_frame_equal(df, expected) + + def test_setitem_mask_categorical(self): + # assign multiple rows (mixed values) (-> array) -> exp_multi_row + # changed multiple rows + cats2 = Categorical(["a", "a", "b", "b", "a", "a", "a"], categories=["a", "b"]) + idx2 = Index(["h", "i", "j", "k", "l", "m", "n"]) + values2 = [1, 1, 2, 2, 1, 1, 1] + exp_multi_row = DataFrame({"cats": cats2, "values": values2}, index=idx2) + + catsf = Categorical( + ["a", "a", "c", "c", "a", "a", "a"], categories=["a", "b", "c"] + ) + idxf = Index(["h", "i", "j", "k", "l", "m", "n"]) + valuesf = [1, 1, 3, 3, 1, 1, 1] + df = DataFrame({"cats": catsf, "values": valuesf}, index=idxf) + + exp_fancy = exp_multi_row.copy() + exp_fancy["cats"] = exp_fancy["cats"].cat.set_categories(["a", "b", "c"]) + + mask = df["cats"] == "c" + df[mask] = ["b", 2] + # category c is kept in .categories + tm.assert_frame_equal(df, exp_fancy) + + @pytest.mark.parametrize("dtype", ["float", "int64"]) + @pytest.mark.parametrize("kwargs", [{}, {"index": [1]}, {"columns": ["A"]}]) + def test_setitem_empty_frame_with_boolean(self, dtype, kwargs): + # see GH#10126 + kwargs["dtype"] = dtype + df = DataFrame(**kwargs) + + df2 = df.copy() + df[df > df2] = 47 + tm.assert_frame_equal(df, df2) + + def test_setitem_boolean_indexing(self): + idx = list(range(3)) + cols = ["A", "B", "C"] + df1 = DataFrame( + index=idx, + columns=cols, + data=np.array( + [[0.0, 0.5, 1.0], [1.5, 2.0, 2.5], [3.0, 3.5, 4.0]], dtype=float + ), + ) + df2 = DataFrame(index=idx, columns=cols, data=np.ones((len(idx), len(cols)))) + + expected = DataFrame( + index=idx, + columns=cols, + data=np.array([[0.0, 0.5, 1.0], [1.5, 2.0, -1], [-1, -1, -1]], dtype=float), + ) + + df1[df1 > 2.0 * df2] = -1 + tm.assert_frame_equal(df1, expected) + with pytest.raises(ValueError, match="Item wrong length"): + df1[df1.index[:-1] > 2] = -1 + + def test_loc_setitem_all_false_boolean_two_blocks(self): + # GH#40885 + df = DataFrame({"a": [1, 2], "b": [3, 4], "c": "a"}) + expected = df.copy() + indexer = Series([False, False], name="c") + df.loc[indexer, ["b"]] = DataFrame({"b": [5, 6]}, index=[0, 1]) + tm.assert_frame_equal(df, expected) + + def test_setitem_ea_boolean_mask(self): + # GH#47125 + df = DataFrame([[-1, 2], [3, -4]]) + expected = DataFrame([[0, 2], [3, 0]]) + boolean_indexer = DataFrame( + { + 0: Series([True, False], dtype="boolean"), + 1: Series([pd.NA, True], dtype="boolean"), + } + ) + df[boolean_indexer] = 0 + tm.assert_frame_equal(df, expected) + + +class TestDataFrameSetitemCopyViewSemantics: + def test_setitem_always_copy(self, float_frame): + assert "E" not in float_frame.columns + s = float_frame["A"].copy() + float_frame["E"] = s + + float_frame.iloc[5:10, float_frame.columns.get_loc("E")] = np.nan + assert notna(s[5:10]).all() + + @pytest.mark.parametrize("consolidate", [True, False]) + def test_setitem_partial_column_inplace( + self, consolidate, using_array_manager, using_copy_on_write + ): + # This setting should be in-place, regardless of whether frame is + # single-block or multi-block + # GH#304 this used to be incorrectly not-inplace, in which case + # we needed to ensure _item_cache was cleared. + + df = DataFrame( + {"x": [1.1, 2.1, 3.1, 4.1], "y": [5.1, 6.1, 7.1, 8.1]}, index=[0, 1, 2, 3] + ) + df.insert(2, "z", np.nan) + if not using_array_manager: + if consolidate: + df._consolidate_inplace() + assert len(df._mgr.blocks) == 1 + else: + assert len(df._mgr.blocks) == 2 + + zvals = df["z"]._values + + df.loc[2:, "z"] = 42 + + expected = Series([np.nan, np.nan, 42, 42], index=df.index, name="z") + tm.assert_series_equal(df["z"], expected) + + # check setting occurred in-place + if not using_copy_on_write: + tm.assert_numpy_array_equal(zvals, expected.values) + assert np.shares_memory(zvals, df["z"]._values) + + def test_setitem_duplicate_columns_not_inplace(self): + # GH#39510 + cols = ["A", "B"] * 2 + df = DataFrame(0.0, index=[0], columns=cols) + df_copy = df.copy() + df_view = df[:] + df["B"] = (2, 5) + + expected = DataFrame([[0.0, 2, 0.0, 5]], columns=cols) + tm.assert_frame_equal(df_view, df_copy) + tm.assert_frame_equal(df, expected) + + @pytest.mark.parametrize( + "value", [1, np.array([[1], [1]], dtype="int64"), [[1], [1]]] + ) + def test_setitem_same_dtype_not_inplace(self, value, using_array_manager): + # GH#39510 + cols = ["A", "B"] + df = DataFrame(0, index=[0, 1], columns=cols) + df_copy = df.copy() + df_view = df[:] + df[["B"]] = value + + expected = DataFrame([[0, 1], [0, 1]], columns=cols) + tm.assert_frame_equal(df, expected) + tm.assert_frame_equal(df_view, df_copy) + + @pytest.mark.parametrize("value", [1.0, np.array([[1.0], [1.0]]), [[1.0], [1.0]]]) + def test_setitem_listlike_key_scalar_value_not_inplace(self, value): + # GH#39510 + cols = ["A", "B"] + df = DataFrame(0, index=[0, 1], columns=cols) + df_copy = df.copy() + df_view = df[:] + df[["B"]] = value + + expected = DataFrame([[0, 1.0], [0, 1.0]], columns=cols) + tm.assert_frame_equal(df_view, df_copy) + tm.assert_frame_equal(df, expected) + + @pytest.mark.parametrize( + "indexer", + [ + "a", + ["a"], + pytest.param( + [True, False], + marks=pytest.mark.xfail( + reason="Boolean indexer incorrectly setting inplace", + strict=False, # passing on some builds, no obvious pattern + ), + ), + ], + ) + @pytest.mark.parametrize( + "value, set_value", + [ + (1, 5), + (1.0, 5.0), + (Timestamp("2020-12-31"), Timestamp("2021-12-31")), + ("a", "b"), + ], + ) + def test_setitem_not_operating_inplace(self, value, set_value, indexer): + # GH#43406 + df = DataFrame({"a": value}, index=[0, 1]) + expected = df.copy() + view = df[:] + df[indexer] = set_value + tm.assert_frame_equal(view, expected) + + @td.skip_array_manager_invalid_test + def test_setitem_column_update_inplace( + self, using_copy_on_write, warn_copy_on_write + ): + # https://github.com/pandas-dev/pandas/issues/47172 + + labels = [f"c{i}" for i in range(10)] + df = DataFrame({col: np.zeros(len(labels)) for col in labels}, index=labels) + values = df._mgr.blocks[0].values + + with tm.raises_chained_assignment_error(): + for label in df.columns: + df[label][label] = 1 + if not using_copy_on_write: + # diagonal values all updated + assert np.all(values[np.arange(10), np.arange(10)] == 1) + else: + # original dataframe not updated + assert np.all(values[np.arange(10), np.arange(10)] == 0) + + def test_setitem_column_frame_as_category(self): + # GH31581 + df = DataFrame([1, 2, 3]) + df["col1"] = DataFrame([1, 2, 3], dtype="category") + df["col2"] = Series([1, 2, 3], dtype="category") + + expected_types = Series( + ["int64", "category", "category"], index=[0, "col1", "col2"], dtype=object + ) + tm.assert_series_equal(df.dtypes, expected_types) + + @pytest.mark.parametrize("dtype", ["int64", "Int64"]) + def test_setitem_iloc_with_numpy_array(self, dtype): + # GH-33828 + df = DataFrame({"a": np.ones(3)}, dtype=dtype) + df.iloc[np.array([0]), np.array([0])] = np.array([[2]]) + + expected = DataFrame({"a": [2, 1, 1]}, dtype=dtype) + tm.assert_frame_equal(df, expected) + + def test_setitem_frame_dup_cols_dtype(self): + # GH#53143 + df = DataFrame([[1, 2, 3, 4], [4, 5, 6, 7]], columns=["a", "b", "a", "c"]) + rhs = DataFrame([[0, 1.5], [2, 2.5]], columns=["a", "a"]) + df["a"] = rhs + expected = DataFrame( + [[0, 2, 1.5, 4], [2, 5, 2.5, 7]], columns=["a", "b", "a", "c"] + ) + tm.assert_frame_equal(df, expected) + + df = DataFrame([[1, 2, 3], [4, 5, 6]], columns=["a", "a", "b"]) + rhs = DataFrame([[0, 1.5], [2, 2.5]], columns=["a", "a"]) + df["a"] = rhs + expected = DataFrame([[0, 1.5, 3], [2, 2.5, 6]], columns=["a", "a", "b"]) + tm.assert_frame_equal(df, expected) + + def test_frame_setitem_empty_dataframe(self): + # GH#28871 + dti = DatetimeIndex(["2000-01-01"], dtype="M8[ns]", name="date") + df = DataFrame({"date": dti}).set_index("date") + df = df[0:0].copy() + + df["3010"] = None + df["2010"] = None + + expected = DataFrame( + [], + columns=["3010", "2010"], + index=dti[:0], + ) + tm.assert_frame_equal(df, expected) + + +def test_full_setter_loc_incompatible_dtype(): + # https://github.com/pandas-dev/pandas/issues/55791 + df = DataFrame({"a": [1, 2]}) + with tm.assert_produces_warning(FutureWarning, match="incompatible dtype"): + df.loc[:, "a"] = True + expected = DataFrame({"a": [True, True]}) + tm.assert_frame_equal(df, expected) + + df = DataFrame({"a": [1, 2]}) + with tm.assert_produces_warning(FutureWarning, match="incompatible dtype"): + df.loc[:, "a"] = {0: 3.5, 1: 4.5} + expected = DataFrame({"a": [3.5, 4.5]}) + tm.assert_frame_equal(df, expected) + + df = DataFrame({"a": [1, 2]}) + df.loc[:, "a"] = {0: 3, 1: 4} + expected = DataFrame({"a": [3, 4]}) + tm.assert_frame_equal(df, expected) + + +def test_setitem_partial_row_multiple_columns(): + # https://github.com/pandas-dev/pandas/issues/56503 + df = DataFrame({"A": [1, 2, 3], "B": [4.0, 5, 6]}) + # should not warn + df.loc[df.index <= 1, ["F", "G"]] = (1, "abc") + expected = DataFrame( + { + "A": [1, 2, 3], + "B": [4.0, 5, 6], + "F": [1.0, 1, float("nan")], + "G": ["abc", "abc", float("nan")], + } + ) + tm.assert_frame_equal(df, expected) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/indexing/test_take.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/indexing/test_take.py new file mode 100644 index 0000000000000000000000000000000000000000..8c172314409171bc102e599bb26ca0d1e0b12078 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/indexing/test_take.py @@ -0,0 +1,92 @@ +import pytest + +import pandas._testing as tm + + +class TestDataFrameTake: + def test_take_slices_deprecated(self, float_frame): + # GH#51539 + df = float_frame + + slc = slice(0, 4, 1) + with tm.assert_produces_warning(FutureWarning): + df.take(slc, axis=0) + with tm.assert_produces_warning(FutureWarning): + df.take(slc, axis=1) + + def test_take(self, float_frame): + # homogeneous + order = [3, 1, 2, 0] + for df in [float_frame]: + result = df.take(order, axis=0) + expected = df.reindex(df.index.take(order)) + tm.assert_frame_equal(result, expected) + + # axis = 1 + result = df.take(order, axis=1) + expected = df.loc[:, ["D", "B", "C", "A"]] + tm.assert_frame_equal(result, expected, check_names=False) + + # negative indices + order = [2, 1, -1] + for df in [float_frame]: + result = df.take(order, axis=0) + expected = df.reindex(df.index.take(order)) + tm.assert_frame_equal(result, expected) + + result = df.take(order, axis=0) + tm.assert_frame_equal(result, expected) + + # axis = 1 + result = df.take(order, axis=1) + expected = df.loc[:, ["C", "B", "D"]] + tm.assert_frame_equal(result, expected, check_names=False) + + # illegal indices + msg = "indices are out-of-bounds" + with pytest.raises(IndexError, match=msg): + df.take([3, 1, 2, 30], axis=0) + with pytest.raises(IndexError, match=msg): + df.take([3, 1, 2, -31], axis=0) + with pytest.raises(IndexError, match=msg): + df.take([3, 1, 2, 5], axis=1) + with pytest.raises(IndexError, match=msg): + df.take([3, 1, 2, -5], axis=1) + + def test_take_mixed_type(self, float_string_frame): + # mixed-dtype + order = [4, 1, 2, 0, 3] + for df in [float_string_frame]: + result = df.take(order, axis=0) + expected = df.reindex(df.index.take(order)) + tm.assert_frame_equal(result, expected) + + # axis = 1 + result = df.take(order, axis=1) + expected = df.loc[:, ["foo", "B", "C", "A", "D"]] + tm.assert_frame_equal(result, expected) + + # negative indices + order = [4, 1, -2] + for df in [float_string_frame]: + result = df.take(order, axis=0) + expected = df.reindex(df.index.take(order)) + tm.assert_frame_equal(result, expected) + + # axis = 1 + result = df.take(order, axis=1) + expected = df.loc[:, ["foo", "B", "D"]] + tm.assert_frame_equal(result, expected) + + def test_take_mixed_numeric(self, mixed_float_frame, mixed_int_frame): + # by dtype + order = [1, 2, 0, 3] + for df in [mixed_float_frame, mixed_int_frame]: + result = df.take(order, axis=0) + expected = df.reindex(df.index.take(order)) + tm.assert_frame_equal(result, expected) + + # axis = 1 + result = df.take(order, axis=1) + expected = df.loc[:, ["B", "C", "A", "D"]] + tm.assert_frame_equal(result, expected) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/indexing/test_where.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/indexing/test_where.py new file mode 100644 index 0000000000000000000000000000000000000000..3d36d0471f02f9b13c1da4bfd2826621646e97d5 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/indexing/test_where.py @@ -0,0 +1,1099 @@ +from datetime import datetime + +from hypothesis import given +import numpy as np +import pytest + +from pandas.core.dtypes.common import is_scalar + +import pandas as pd +from pandas import ( + DataFrame, + DatetimeIndex, + Index, + Series, + StringDtype, + Timestamp, + date_range, + isna, +) +import pandas._testing as tm +from pandas._testing._hypothesis import OPTIONAL_ONE_OF_ALL + + +@pytest.fixture(params=["default", "float_string", "mixed_float", "mixed_int"]) +def where_frame(request, float_string_frame, mixed_float_frame, mixed_int_frame): + if request.param == "default": + return DataFrame( + np.random.default_rng(2).standard_normal((5, 3)), columns=["A", "B", "C"] + ) + if request.param == "float_string": + return float_string_frame + if request.param == "mixed_float": + return mixed_float_frame + if request.param == "mixed_int": + return mixed_int_frame + + +def _safe_add(df): + # only add to the numeric items + def is_ok(s): + return ( + issubclass(s.dtype.type, (np.integer, np.floating)) and s.dtype != "uint8" + ) + + return DataFrame(dict((c, s + 1) if is_ok(s) else (c, s) for c, s in df.items())) + + +class TestDataFrameIndexingWhere: + def test_where_get(self, where_frame, float_string_frame): + def _check_get(df, cond, check_dtypes=True): + other1 = _safe_add(df) + rs = df.where(cond, other1) + rs2 = df.where(cond.values, other1) + for k, v in rs.items(): + exp = Series(np.where(cond[k], df[k], other1[k]), index=v.index) + tm.assert_series_equal(v, exp, check_names=False) + tm.assert_frame_equal(rs, rs2) + + # dtypes + if check_dtypes: + assert (rs.dtypes == df.dtypes).all() + + # check getting + df = where_frame + if df is float_string_frame: + msg = "'>' not supported between instances of 'str' and 'int'" + with pytest.raises(TypeError, match=msg): + df > 0 + return + cond = df > 0 + _check_get(df, cond) + + def test_where_upcasting(self): + # upcasting case (GH # 2794) + df = DataFrame( + { + c: Series([1] * 3, dtype=c) + for c in ["float32", "float64", "int32", "int64"] + } + ) + df.iloc[1, :] = 0 + result = df.dtypes + expected = Series( + [ + np.dtype("float32"), + np.dtype("float64"), + np.dtype("int32"), + np.dtype("int64"), + ], + index=["float32", "float64", "int32", "int64"], + ) + + # when we don't preserve boolean casts + # + # expected = Series({ 'float32' : 1, 'float64' : 3 }) + + tm.assert_series_equal(result, expected) + + @pytest.mark.filterwarnings("ignore:Downcasting object dtype arrays:FutureWarning") + def test_where_alignment(self, where_frame, float_string_frame): + # aligning + def _check_align(df, cond, other, check_dtypes=True): + rs = df.where(cond, other) + for i, k in enumerate(rs.columns): + result = rs[k] + d = df[k].values + c = cond[k].reindex(df[k].index).fillna(False).values + + if is_scalar(other): + o = other + elif isinstance(other, np.ndarray): + o = Series(other[:, i], index=result.index).values + else: + o = other[k].values + + new_values = d if c.all() else np.where(c, d, o) + expected = Series(new_values, index=result.index, name=k) + + # since we can't always have the correct numpy dtype + # as numpy doesn't know how to downcast, don't check + tm.assert_series_equal(result, expected, check_dtype=False) + + # dtypes + # can't check dtype when other is an ndarray + + if check_dtypes and not isinstance(other, np.ndarray): + assert (rs.dtypes == df.dtypes).all() + + df = where_frame + if df is float_string_frame: + msg = "'>' not supported between instances of 'str' and 'int'" + with pytest.raises(TypeError, match=msg): + df > 0 + return + + # other is a frame + cond = (df > 0)[1:] + _check_align(df, cond, _safe_add(df)) + + # check other is ndarray + cond = df > 0 + _check_align(df, cond, (_safe_add(df).values)) + + # integers are upcast, so don't check the dtypes + cond = df > 0 + check_dtypes = all(not issubclass(s.type, np.integer) for s in df.dtypes) + _check_align(df, cond, np.nan, check_dtypes=check_dtypes) + + # Ignore deprecation warning in Python 3.12 for inverting a bool + @pytest.mark.filterwarnings("ignore::DeprecationWarning") + def test_where_invalid(self): + # invalid conditions + df = DataFrame( + np.random.default_rng(2).standard_normal((5, 3)), columns=["A", "B", "C"] + ) + cond = df > 0 + + err1 = (df + 1).values[0:2, :] + msg = "other must be the same shape as self when an ndarray" + with pytest.raises(ValueError, match=msg): + df.where(cond, err1) + + err2 = cond.iloc[:2, :].values + other1 = _safe_add(df) + msg = "Array conditional must be same shape as self" + with pytest.raises(ValueError, match=msg): + df.where(err2, other1) + + with pytest.raises(ValueError, match=msg): + df.mask(True) + with pytest.raises(ValueError, match=msg): + df.mask(0) + + @pytest.mark.filterwarnings("ignore:Downcasting object dtype arrays:FutureWarning") + def test_where_set(self, where_frame, float_string_frame, mixed_int_frame): + # where inplace + + def _check_set(df, cond, check_dtypes=True): + dfi = df.copy() + econd = cond.reindex_like(df).fillna(True).infer_objects(copy=False) + expected = dfi.mask(~econd) + + return_value = dfi.where(cond, np.nan, inplace=True) + assert return_value is None + tm.assert_frame_equal(dfi, expected) + + # dtypes (and confirm upcasts)x + if check_dtypes: + for k, v in df.dtypes.items(): + if issubclass(v.type, np.integer) and not cond[k].all(): + v = np.dtype("float64") + assert dfi[k].dtype == v + + df = where_frame + if df is float_string_frame: + msg = "'>' not supported between instances of 'str' and 'int'" + with pytest.raises(TypeError, match=msg): + df > 0 + return + if df is mixed_int_frame: + df = df.astype("float64") + + cond = df > 0 + _check_set(df, cond) + + cond = df >= 0 + _check_set(df, cond) + + # aligning + cond = (df >= 0)[1:] + _check_set(df, cond) + + def test_where_series_slicing(self): + # GH 10218 + # test DataFrame.where with Series slicing + df = DataFrame({"a": range(3), "b": range(4, 7)}) + result = df.where(df["a"] == 1) + expected = df[df["a"] == 1].reindex(df.index) + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize("klass", [list, tuple, np.array]) + def test_where_array_like(self, klass): + # see gh-15414 + df = DataFrame({"a": [1, 2, 3]}) + cond = [[False], [True], [True]] + expected = DataFrame({"a": [np.nan, 2, 3]}) + + result = df.where(klass(cond)) + tm.assert_frame_equal(result, expected) + + df["b"] = 2 + expected["b"] = [2, np.nan, 2] + cond = [[False, True], [True, False], [True, True]] + + result = df.where(klass(cond)) + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize( + "cond", + [ + [[1], [0], [1]], + Series([[2], [5], [7]]), + DataFrame({"a": [2, 5, 7]}), + [["True"], ["False"], ["True"]], + [[Timestamp("2017-01-01")], [pd.NaT], [Timestamp("2017-01-02")]], + ], + ) + def test_where_invalid_input_single(self, cond): + # see gh-15414: only boolean arrays accepted + df = DataFrame({"a": [1, 2, 3]}) + msg = "Boolean array expected for the condition" + + with pytest.raises(ValueError, match=msg): + df.where(cond) + + @pytest.mark.parametrize( + "cond", + [ + [[0, 1], [1, 0], [1, 1]], + Series([[0, 2], [5, 0], [4, 7]]), + [["False", "True"], ["True", "False"], ["True", "True"]], + DataFrame({"a": [2, 5, 7], "b": [4, 8, 9]}), + [ + [pd.NaT, Timestamp("2017-01-01")], + [Timestamp("2017-01-02"), pd.NaT], + [Timestamp("2017-01-03"), Timestamp("2017-01-03")], + ], + ], + ) + def test_where_invalid_input_multiple(self, cond): + # see gh-15414: only boolean arrays accepted + df = DataFrame({"a": [1, 2, 3], "b": [2, 2, 2]}) + msg = "Boolean array expected for the condition" + + with pytest.raises(ValueError, match=msg): + df.where(cond) + + def test_where_dataframe_col_match(self): + df = DataFrame([[1, 2, 3], [4, 5, 6]]) + cond = DataFrame([[True, False, True], [False, False, True]]) + + result = df.where(cond) + expected = DataFrame([[1.0, np.nan, 3], [np.nan, np.nan, 6]]) + tm.assert_frame_equal(result, expected) + + # this *does* align, though has no matching columns + cond.columns = ["a", "b", "c"] + result = df.where(cond) + expected = DataFrame(np.nan, index=df.index, columns=df.columns) + tm.assert_frame_equal(result, expected) + + def test_where_ndframe_align(self): + msg = "Array conditional must be same shape as self" + df = DataFrame([[1, 2, 3], [4, 5, 6]]) + + cond = [True] + with pytest.raises(ValueError, match=msg): + df.where(cond) + + expected = DataFrame([[1, 2, 3], [np.nan, np.nan, np.nan]]) + + out = df.where(Series(cond)) + tm.assert_frame_equal(out, expected) + + cond = np.array([False, True, False, True]) + with pytest.raises(ValueError, match=msg): + df.where(cond) + + expected = DataFrame([[np.nan, np.nan, np.nan], [4, 5, 6]]) + + out = df.where(Series(cond)) + tm.assert_frame_equal(out, expected) + + def test_where_bug(self): + # see gh-2793 + df = DataFrame( + {"a": [1.0, 2.0, 3.0, 4.0], "b": [4.0, 3.0, 2.0, 1.0]}, dtype="float64" + ) + expected = DataFrame( + {"a": [np.nan, np.nan, 3.0, 4.0], "b": [4.0, 3.0, np.nan, np.nan]}, + dtype="float64", + ) + result = df.where(df > 2, np.nan) + tm.assert_frame_equal(result, expected) + + result = df.copy() + return_value = result.where(result > 2, np.nan, inplace=True) + assert return_value is None + tm.assert_frame_equal(result, expected) + + def test_where_bug_mixed(self, any_signed_int_numpy_dtype): + # see gh-2793 + df = DataFrame( + { + "a": np.array([1, 2, 3, 4], dtype=any_signed_int_numpy_dtype), + "b": np.array([4.0, 3.0, 2.0, 1.0], dtype="float64"), + } + ) + + expected = DataFrame( + {"a": [-1, -1, 3, 4], "b": [4.0, 3.0, -1, -1]}, + ).astype({"a": any_signed_int_numpy_dtype, "b": "float64"}) + + result = df.where(df > 2, -1) + tm.assert_frame_equal(result, expected) + + result = df.copy() + return_value = result.where(result > 2, -1, inplace=True) + assert return_value is None + tm.assert_frame_equal(result, expected) + + def test_where_bug_transposition(self): + # see gh-7506 + a = DataFrame({0: [1, 2], 1: [3, 4], 2: [5, 6]}) + b = DataFrame({0: [np.nan, 8], 1: [9, np.nan], 2: [np.nan, np.nan]}) + do_not_replace = b.isna() | (a > b) + + expected = a.copy() + expected[~do_not_replace] = b + + msg = "Downcasting behavior in Series and DataFrame methods 'where'" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = a.where(do_not_replace, b) + tm.assert_frame_equal(result, expected) + + a = DataFrame({0: [4, 6], 1: [1, 0]}) + b = DataFrame({0: [np.nan, 3], 1: [3, np.nan]}) + do_not_replace = b.isna() | (a > b) + + expected = a.copy() + expected[~do_not_replace] = b + + with tm.assert_produces_warning(FutureWarning, match=msg): + result = a.where(do_not_replace, b) + tm.assert_frame_equal(result, expected) + + def test_where_datetime(self): + # GH 3311 + df = DataFrame( + { + "A": date_range("20130102", periods=5), + "B": date_range("20130104", periods=5), + "C": np.random.default_rng(2).standard_normal(5), + } + ) + + stamp = datetime(2013, 1, 3) + msg = "'>' not supported between instances of 'float' and 'datetime.datetime'" + with pytest.raises(TypeError, match=msg): + df > stamp + + result = df[df.iloc[:, :-1] > stamp] + + expected = df.copy() + expected.loc[[0, 1], "A"] = np.nan + + expected.loc[:, "C"] = np.nan + tm.assert_frame_equal(result, expected) + + def test_where_none(self): + # GH 4667 + # setting with None changes dtype + df = DataFrame({"series": Series(range(10))}).astype(float) + df[df > 7] = None + expected = DataFrame( + {"series": Series([0, 1, 2, 3, 4, 5, 6, 7, np.nan, np.nan])} + ) + tm.assert_frame_equal(df, expected) + + # GH 7656 + df = DataFrame( + [ + {"A": 1, "B": np.nan, "C": "Test"}, + {"A": np.nan, "B": "Test", "C": np.nan}, + ] + ) + + orig = df.copy() + + mask = ~isna(df) + df.where(mask, None, inplace=True) + expected = DataFrame( + { + "A": [1.0, np.nan], + "B": [None, "Test"], + "C": ["Test", None], + } + ) + tm.assert_frame_equal(df, expected) + + df = orig.copy() + df[~mask] = None + tm.assert_frame_equal(df, expected) + + def test_where_empty_df_and_empty_cond_having_non_bool_dtypes(self): + # see gh-21947 + df = DataFrame(columns=["a"]) + cond = df + assert (cond.dtypes == object).all() + + result = df.where(cond) + tm.assert_frame_equal(result, df) + + def test_where_align(self): + def create(): + df = DataFrame(np.random.default_rng(2).standard_normal((10, 3))) + df.iloc[3:5, 0] = np.nan + df.iloc[4:6, 1] = np.nan + df.iloc[5:8, 2] = np.nan + return df + + # series + df = create() + expected = df.fillna(df.mean()) + result = df.where(pd.notna(df), df.mean(), axis="columns") + tm.assert_frame_equal(result, expected) + + return_value = df.where(pd.notna(df), df.mean(), inplace=True, axis="columns") + assert return_value is None + tm.assert_frame_equal(df, expected) + + df = create().fillna(0) + expected = df.apply(lambda x, y: x.where(x > 0, y), y=df[0]) + result = df.where(df > 0, df[0], axis="index") + tm.assert_frame_equal(result, expected) + result = df.where(df > 0, df[0], axis="rows") + tm.assert_frame_equal(result, expected) + + # frame + df = create() + expected = df.fillna(1) + result = df.where( + pd.notna(df), DataFrame(1, index=df.index, columns=df.columns) + ) + tm.assert_frame_equal(result, expected) + + def test_where_complex(self): + # GH 6345 + expected = DataFrame([[1 + 1j, 2], [np.nan, 4 + 1j]], columns=["a", "b"]) + df = DataFrame([[1 + 1j, 2], [5 + 1j, 4 + 1j]], columns=["a", "b"]) + df[df.abs() >= 5] = np.nan + tm.assert_frame_equal(df, expected) + + def test_where_axis(self): + # GH 9736 + df = DataFrame(np.random.default_rng(2).standard_normal((2, 2))) + mask = DataFrame([[False, False], [False, False]]) + ser = Series([0, 1]) + + expected = DataFrame([[0, 0], [1, 1]], dtype="float64") + result = df.where(mask, ser, axis="index") + tm.assert_frame_equal(result, expected) + + result = df.copy() + return_value = result.where(mask, ser, axis="index", inplace=True) + assert return_value is None + tm.assert_frame_equal(result, expected) + + expected = DataFrame([[0, 1], [0, 1]], dtype="float64") + result = df.where(mask, ser, axis="columns") + tm.assert_frame_equal(result, expected) + + result = df.copy() + return_value = result.where(mask, ser, axis="columns", inplace=True) + assert return_value is None + tm.assert_frame_equal(result, expected) + + def test_where_axis_with_upcast(self): + # Upcast needed + df = DataFrame([[1, 2], [3, 4]], dtype="int64") + mask = DataFrame([[False, False], [False, False]]) + ser = Series([0, np.nan]) + + expected = DataFrame([[0, 0], [np.nan, np.nan]], dtype="float64") + result = df.where(mask, ser, axis="index") + tm.assert_frame_equal(result, expected) + + result = df.copy() + with tm.assert_produces_warning(FutureWarning, match="incompatible dtype"): + return_value = result.where(mask, ser, axis="index", inplace=True) + assert return_value is None + tm.assert_frame_equal(result, expected) + + expected = DataFrame([[0, np.nan], [0, np.nan]]) + result = df.where(mask, ser, axis="columns") + tm.assert_frame_equal(result, expected) + + expected = DataFrame( + { + 0: np.array([0, 0], dtype="int64"), + 1: np.array([np.nan, np.nan], dtype="float64"), + } + ) + result = df.copy() + with tm.assert_produces_warning(FutureWarning, match="incompatible dtype"): + return_value = result.where(mask, ser, axis="columns", inplace=True) + assert return_value is None + tm.assert_frame_equal(result, expected) + + def test_where_axis_multiple_dtypes(self): + # Multiple dtypes (=> multiple Blocks) + df = pd.concat( + [ + DataFrame(np.random.default_rng(2).standard_normal((10, 2))), + DataFrame( + np.random.default_rng(2).integers(0, 10, size=(10, 2)), + dtype="int64", + ), + ], + ignore_index=True, + axis=1, + ) + mask = DataFrame(False, columns=df.columns, index=df.index) + s1 = Series(1, index=df.columns) + s2 = Series(2, index=df.index) + + result = df.where(mask, s1, axis="columns") + expected = DataFrame(1.0, columns=df.columns, index=df.index) + expected[2] = expected[2].astype("int64") + expected[3] = expected[3].astype("int64") + tm.assert_frame_equal(result, expected) + + result = df.copy() + return_value = result.where(mask, s1, axis="columns", inplace=True) + assert return_value is None + tm.assert_frame_equal(result, expected) + + result = df.where(mask, s2, axis="index") + expected = DataFrame(2.0, columns=df.columns, index=df.index) + expected[2] = expected[2].astype("int64") + expected[3] = expected[3].astype("int64") + tm.assert_frame_equal(result, expected) + + result = df.copy() + return_value = result.where(mask, s2, axis="index", inplace=True) + assert return_value is None + tm.assert_frame_equal(result, expected) + + # DataFrame vs DataFrame + d1 = df.copy().drop(1, axis=0) + # Explicit cast to avoid implicit cast when setting value to np.nan + expected = df.copy().astype("float") + expected.loc[1, :] = np.nan + + result = df.where(mask, d1) + tm.assert_frame_equal(result, expected) + result = df.where(mask, d1, axis="index") + tm.assert_frame_equal(result, expected) + result = df.copy() + with tm.assert_produces_warning(FutureWarning, match="incompatible dtype"): + return_value = result.where(mask, d1, inplace=True) + assert return_value is None + tm.assert_frame_equal(result, expected) + result = df.copy() + with tm.assert_produces_warning(FutureWarning, match="incompatible dtype"): + return_value = result.where(mask, d1, inplace=True, axis="index") + assert return_value is None + tm.assert_frame_equal(result, expected) + + d2 = df.copy().drop(1, axis=1) + expected = df.copy() + expected.loc[:, 1] = np.nan + + result = df.where(mask, d2) + tm.assert_frame_equal(result, expected) + result = df.where(mask, d2, axis="columns") + tm.assert_frame_equal(result, expected) + result = df.copy() + return_value = result.where(mask, d2, inplace=True) + assert return_value is None + tm.assert_frame_equal(result, expected) + result = df.copy() + return_value = result.where(mask, d2, inplace=True, axis="columns") + assert return_value is None + tm.assert_frame_equal(result, expected) + + def test_where_callable(self): + # GH 12533 + df = DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]]) + result = df.where(lambda x: x > 4, lambda x: x + 1) + exp = DataFrame([[2, 3, 4], [5, 5, 6], [7, 8, 9]]) + tm.assert_frame_equal(result, exp) + tm.assert_frame_equal(result, df.where(df > 4, df + 1)) + + # return ndarray and scalar + result = df.where(lambda x: (x % 2 == 0).values, lambda x: 99) + exp = DataFrame([[99, 2, 99], [4, 99, 6], [99, 8, 99]]) + tm.assert_frame_equal(result, exp) + tm.assert_frame_equal(result, df.where(df % 2 == 0, 99)) + + # chain + result = (df + 2).where(lambda x: x > 8, lambda x: x + 10) + exp = DataFrame([[13, 14, 15], [16, 17, 18], [9, 10, 11]]) + tm.assert_frame_equal(result, exp) + tm.assert_frame_equal(result, (df + 2).where((df + 2) > 8, (df + 2) + 10)) + + def test_where_tz_values(self, tz_naive_fixture, frame_or_series): + obj1 = DataFrame( + DatetimeIndex(["20150101", "20150102", "20150103"], tz=tz_naive_fixture), + columns=["date"], + ) + obj2 = DataFrame( + DatetimeIndex(["20150103", "20150104", "20150105"], tz=tz_naive_fixture), + columns=["date"], + ) + mask = DataFrame([True, True, False], columns=["date"]) + exp = DataFrame( + DatetimeIndex(["20150101", "20150102", "20150105"], tz=tz_naive_fixture), + columns=["date"], + ) + if frame_or_series is Series: + obj1 = obj1["date"] + obj2 = obj2["date"] + mask = mask["date"] + exp = exp["date"] + + result = obj1.where(mask, obj2) + tm.assert_equal(exp, result) + + def test_df_where_change_dtype(self): + # GH#16979 + df = DataFrame(np.arange(2 * 3).reshape(2, 3), columns=list("ABC")) + mask = np.array([[True, False, False], [False, False, True]]) + + result = df.where(mask) + expected = DataFrame( + [[0, np.nan, np.nan], [np.nan, np.nan, 5]], columns=list("ABC") + ) + + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize("kwargs", [{}, {"other": None}]) + def test_df_where_with_category(self, kwargs): + # GH#16979 + data = np.arange(2 * 3, dtype=np.int64).reshape(2, 3) + df = DataFrame(data, columns=list("ABC")) + mask = np.array([[True, False, False], [False, False, True]]) + + # change type to category + df.A = df.A.astype("category") + df.B = df.B.astype("category") + df.C = df.C.astype("category") + + result = df.where(mask, **kwargs) + A = pd.Categorical([0, np.nan], categories=[0, 3]) + B = pd.Categorical([np.nan, np.nan], categories=[1, 4]) + C = pd.Categorical([np.nan, 5], categories=[2, 5]) + expected = DataFrame({"A": A, "B": B, "C": C}) + + tm.assert_frame_equal(result, expected) + + # Check Series.where while we're here + result = df.A.where(mask[:, 0], **kwargs) + expected = Series(A, name="A") + + tm.assert_series_equal(result, expected) + + def test_where_categorical_filtering(self): + # GH#22609 Verify filtering operations on DataFrames with categorical Series + df = DataFrame(data=[[0, 0], [1, 1]], columns=["a", "b"]) + df["b"] = df["b"].astype("category") + + result = df.where(df["a"] > 0) + # Explicitly cast to 'float' to avoid implicit cast when setting np.nan + expected = df.copy().astype({"a": "float"}) + expected.loc[0, :] = np.nan + + tm.assert_equal(result, expected) + + def test_where_ea_other(self): + # GH#38729/GH#38742 + df = DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]}) + arr = pd.array([7, pd.NA, 9]) + ser = Series(arr) + mask = np.ones(df.shape, dtype=bool) + mask[1, :] = False + + # TODO: ideally we would get Int64 instead of object + result = df.where(mask, ser, axis=0) + expected = DataFrame({"A": [1, np.nan, 3], "B": [4, np.nan, 6]}) + tm.assert_frame_equal(result, expected) + + ser2 = Series(arr[:2], index=["A", "B"]) + expected = DataFrame({"A": [1, 7, 3], "B": [4, np.nan, 6]}) + result = df.where(mask, ser2, axis=1) + tm.assert_frame_equal(result, expected) + + def test_where_interval_noop(self): + # GH#44181 + df = DataFrame([pd.Interval(0, 0)]) + res = df.where(df.notna()) + tm.assert_frame_equal(res, df) + + ser = df[0] + res = ser.where(ser.notna()) + tm.assert_series_equal(res, ser) + + def test_where_interval_fullop_downcast(self, frame_or_series): + # GH#45768 + obj = frame_or_series([pd.Interval(0, 0)] * 2) + other = frame_or_series([1.0, 2.0]) + + msg = "Downcasting behavior in Series and DataFrame methods 'where'" + with tm.assert_produces_warning(FutureWarning, match=msg): + res = obj.where(~obj.notna(), other) + + # since all entries are being changed, we will downcast result + # from object to ints (not floats) + tm.assert_equal(res, other.astype(np.int64)) + + # unlike where, Block.putmask does not downcast + with tm.assert_produces_warning( + FutureWarning, match="Setting an item of incompatible dtype" + ): + obj.mask(obj.notna(), other, inplace=True) + tm.assert_equal(obj, other.astype(object)) + + @pytest.mark.parametrize( + "dtype", + [ + "timedelta64[ns]", + "datetime64[ns]", + "datetime64[ns, Asia/Tokyo]", + "Period[D]", + ], + ) + def test_where_datetimelike_noop(self, dtype): + # GH#45135, analogue to GH#44181 for Period don't raise on no-op + # For td64/dt64/dt64tz we already don't raise, but also are + # checking that we don't unnecessarily upcast to object. + with tm.assert_produces_warning(FutureWarning, match="is deprecated"): + ser = Series(np.arange(3) * 10**9, dtype=np.int64).view(dtype) + df = ser.to_frame() + mask = np.array([False, False, False]) + + res = ser.where(~mask, "foo") + tm.assert_series_equal(res, ser) + + mask2 = mask.reshape(-1, 1) + res2 = df.where(~mask2, "foo") + tm.assert_frame_equal(res2, df) + + res3 = ser.mask(mask, "foo") + tm.assert_series_equal(res3, ser) + + res4 = df.mask(mask2, "foo") + tm.assert_frame_equal(res4, df) + + # opposite case where we are replacing *all* values -> we downcast + # from object dtype # GH#45768 + msg = "Downcasting behavior in Series and DataFrame methods 'where'" + with tm.assert_produces_warning(FutureWarning, match=msg): + res5 = df.where(mask2, 4) + expected = DataFrame(4, index=df.index, columns=df.columns) + tm.assert_frame_equal(res5, expected) + + # unlike where, Block.putmask does not downcast + with tm.assert_produces_warning( + FutureWarning, match="Setting an item of incompatible dtype" + ): + df.mask(~mask2, 4, inplace=True) + tm.assert_frame_equal(df, expected.astype(object)) + + +def test_where_int_downcasting_deprecated(): + # GH#44597 + arr = np.arange(6).astype(np.int16).reshape(3, 2) + df = DataFrame(arr) + + mask = np.zeros(arr.shape, dtype=bool) + mask[:, 0] = True + + res = df.where(mask, 2**17) + + expected = DataFrame({0: arr[:, 0], 1: np.array([2**17] * 3, dtype=np.int32)}) + tm.assert_frame_equal(res, expected) + + +def test_where_copies_with_noop(frame_or_series): + # GH-39595 + result = frame_or_series([1, 2, 3, 4]) + expected = result.copy() + col = result[0] if frame_or_series is DataFrame else result + + where_res = result.where(col < 5) + where_res *= 2 + + tm.assert_equal(result, expected) + + where_res = result.where(col > 5, [1, 2, 3, 4]) + where_res *= 2 + + tm.assert_equal(result, expected) + + +def test_where_string_dtype(frame_or_series): + # GH40824 + obj = frame_or_series( + ["a", "b", "c", "d"], index=["id1", "id2", "id3", "id4"], dtype=StringDtype() + ) + filtered_obj = frame_or_series( + ["b", "c"], index=["id2", "id3"], dtype=StringDtype() + ) + filter_ser = Series([False, True, True, False]) + + result = obj.where(filter_ser, filtered_obj) + expected = frame_or_series( + [pd.NA, "b", "c", pd.NA], + index=["id1", "id2", "id3", "id4"], + dtype=StringDtype(), + ) + tm.assert_equal(result, expected) + + result = obj.mask(~filter_ser, filtered_obj) + tm.assert_equal(result, expected) + + obj.mask(~filter_ser, filtered_obj, inplace=True) + tm.assert_equal(result, expected) + + +def test_where_bool_comparison(): + # GH 10336 + df_mask = DataFrame( + {"AAA": [True] * 4, "BBB": [False] * 4, "CCC": [True, False, True, False]} + ) + result = df_mask.where(df_mask == False) # noqa: E712 + expected = DataFrame( + { + "AAA": np.array([np.nan] * 4, dtype=object), + "BBB": [False] * 4, + "CCC": [np.nan, False, np.nan, False], + } + ) + tm.assert_frame_equal(result, expected) + + +def test_where_none_nan_coerce(): + # GH 15613 + expected = DataFrame( + { + "A": [Timestamp("20130101"), pd.NaT, Timestamp("20130103")], + "B": [1, 2, np.nan], + } + ) + result = expected.where(expected.notnull(), None) + tm.assert_frame_equal(result, expected) + + +def test_where_duplicate_axes_mixed_dtypes(): + # GH 25399, verify manually masking is not affected anymore by dtype of column for + # duplicate axes. + result = DataFrame(data=[[0, np.nan]], columns=Index(["A", "A"])) + index, columns = result.axes + mask = DataFrame(data=[[True, True]], columns=columns, index=index) + a = result.astype(object).where(mask) + b = result.astype("f8").where(mask) + c = result.T.where(mask.T).T + d = result.where(mask) # used to fail with "cannot reindex from a duplicate axis" + tm.assert_frame_equal(a.astype("f8"), b.astype("f8")) + tm.assert_frame_equal(b.astype("f8"), c.astype("f8")) + tm.assert_frame_equal(c.astype("f8"), d.astype("f8")) + + +def test_where_columns_casting(): + # GH 42295 + + df = DataFrame({"a": [1.0, 2.0], "b": [3, np.nan]}) + expected = df.copy() + result = df.where(pd.notnull(df), None) + # make sure dtypes don't change + tm.assert_frame_equal(expected, result) + + +@pytest.mark.parametrize("as_cat", [True, False]) +def test_where_period_invalid_na(frame_or_series, as_cat, request): + # GH#44697 + idx = pd.period_range("2016-01-01", periods=3, freq="D") + if as_cat: + idx = idx.astype("category") + obj = frame_or_series(idx) + + # NA value that we should *not* cast to Period dtype + tdnat = pd.NaT.to_numpy("m8[ns]") + + mask = np.array([True, True, False], ndmin=obj.ndim).T + + if as_cat: + msg = ( + r"Cannot setitem on a Categorical with a new category \(NaT\), " + "set the categories first" + ) + else: + msg = "value should be a 'Period'" + + if as_cat: + with pytest.raises(TypeError, match=msg): + obj.where(mask, tdnat) + + with pytest.raises(TypeError, match=msg): + obj.mask(mask, tdnat) + + with pytest.raises(TypeError, match=msg): + obj.mask(mask, tdnat, inplace=True) + + else: + # With PeriodDtype, ser[i] = tdnat coerces instead of raising, + # so for consistency, ser[mask] = tdnat must as well + expected = obj.astype(object).where(mask, tdnat) + result = obj.where(mask, tdnat) + tm.assert_equal(result, expected) + + expected = obj.astype(object).mask(mask, tdnat) + result = obj.mask(mask, tdnat) + tm.assert_equal(result, expected) + + with tm.assert_produces_warning( + FutureWarning, match="Setting an item of incompatible dtype" + ): + obj.mask(mask, tdnat, inplace=True) + tm.assert_equal(obj, expected) + + +def test_where_nullable_invalid_na(frame_or_series, any_numeric_ea_dtype): + # GH#44697 + arr = pd.array([1, 2, 3], dtype=any_numeric_ea_dtype) + obj = frame_or_series(arr) + + mask = np.array([True, True, False], ndmin=obj.ndim).T + + msg = r"Invalid value '.*' for dtype (U?Int|Float)\d{1,2}" + + for null in tm.NP_NAT_OBJECTS + [pd.NaT]: + # NaT is an NA value that we should *not* cast to pd.NA dtype + with pytest.raises(TypeError, match=msg): + obj.where(mask, null) + + with pytest.raises(TypeError, match=msg): + obj.mask(mask, null) + + +@given(data=OPTIONAL_ONE_OF_ALL) +def test_where_inplace_casting(data): + # GH 22051 + df = DataFrame({"a": data}) + df_copy = df.where(pd.notnull(df), None).copy() + df.where(pd.notnull(df), None, inplace=True) + tm.assert_equal(df, df_copy) + + +def test_where_downcast_to_td64(): + ser = Series([1, 2, 3]) + + mask = np.array([False, False, False]) + + td = pd.Timedelta(days=1) + + msg = "Downcasting behavior in Series and DataFrame methods 'where'" + with tm.assert_produces_warning(FutureWarning, match=msg): + res = ser.where(mask, td) + expected = Series([td, td, td], dtype="m8[ns]") + tm.assert_series_equal(res, expected) + + with pd.option_context("future.no_silent_downcasting", True): + with tm.assert_produces_warning(None, match=msg): + res2 = ser.where(mask, td) + expected2 = expected.astype(object) + tm.assert_series_equal(res2, expected2) + + +def _check_where_equivalences(df, mask, other, expected): + # similar to tests.series.indexing.test_setitem.SetitemCastingEquivalences + # but with DataFrame in mind and less fleshed-out + res = df.where(mask, other) + tm.assert_frame_equal(res, expected) + + res = df.mask(~mask, other) + tm.assert_frame_equal(res, expected) + + # Note: frame.mask(~mask, other, inplace=True) takes some more work bc + # Block.putmask does *not* downcast. The change to 'expected' here + # is specific to the cases in test_where_dt64_2d. + df = df.copy() + df.mask(~mask, other, inplace=True) + if not mask.all(): + # with mask.all(), Block.putmask is a no-op, so does not downcast + expected = expected.copy() + expected["A"] = expected["A"].astype(object) + tm.assert_frame_equal(df, expected) + + +def test_where_dt64_2d(): + dti = date_range("2016-01-01", periods=6) + dta = dti._data.reshape(3, 2) + other = dta - dta[0, 0] + + df = DataFrame(dta, columns=["A", "B"]) + + mask = np.asarray(df.isna()).copy() + mask[:, 1] = True + + # setting all of one column, none of the other + expected = DataFrame({"A": other[:, 0], "B": dta[:, 1]}) + with tm.assert_produces_warning( + FutureWarning, match="Setting an item of incompatible dtype" + ): + _check_where_equivalences(df, mask, other, expected) + + # setting part of one column, none of the other + mask[1, 0] = True + expected = DataFrame( + { + "A": np.array([other[0, 0], dta[1, 0], other[2, 0]], dtype=object), + "B": dta[:, 1], + } + ) + with tm.assert_produces_warning( + FutureWarning, match="Setting an item of incompatible dtype" + ): + _check_where_equivalences(df, mask, other, expected) + + # setting nothing in either column + mask[:] = True + expected = df + _check_where_equivalences(df, mask, other, expected) + + +def test_where_producing_ea_cond_for_np_dtype(): + # GH#44014 + df = DataFrame({"a": Series([1, pd.NA, 2], dtype="Int64"), "b": [1, 2, 3]}) + result = df.where(lambda x: x.apply(lambda y: y > 1, axis=1)) + expected = DataFrame( + {"a": Series([pd.NA, pd.NA, 2], dtype="Int64"), "b": [np.nan, 2, 3]} + ) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize( + "replacement", [0.001, True, "snake", None, datetime(2022, 5, 4)] +) +def test_where_int_overflow(replacement, using_infer_string, request): + # GH 31687 + df = DataFrame([[1.0, 2e25, "nine"], [np.nan, 0.1, None]]) + if using_infer_string and replacement not in (None, "snake"): + request.node.add_marker( + pytest.mark.xfail(reason="Can't set non-string into string column") + ) + result = df.where(pd.notnull(df), replacement) + expected = DataFrame([[1.0, 2e25, "nine"], [replacement, 0.1, replacement]]) + + tm.assert_frame_equal(result, expected) + + +def test_where_inplace_no_other(): + # GH#51685 + df = DataFrame({"a": [1.0, 2.0], "b": ["x", "y"]}) + cond = DataFrame({"a": [True, False], "b": [False, True]}) + df.where(cond, inplace=True) + expected = DataFrame({"a": [1, np.nan], "b": [np.nan, "y"]}) + tm.assert_frame_equal(df, expected) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/indexing/test_xs.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/indexing/test_xs.py new file mode 100644 index 0000000000000000000000000000000000000000..be809e3a17c8e19bb1b2c68502a7537d29f5b45d --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/indexing/test_xs.py @@ -0,0 +1,444 @@ +import re + +import numpy as np +import pytest + +from pandas.errors import SettingWithCopyError + +from pandas import ( + DataFrame, + Index, + IndexSlice, + MultiIndex, + Series, + concat, +) +import pandas._testing as tm + +from pandas.tseries.offsets import BDay + + +@pytest.fixture +def four_level_index_dataframe(): + arr = np.array( + [ + [-0.5109, -2.3358, -0.4645, 0.05076, 0.364], + [0.4473, 1.4152, 0.2834, 1.00661, 0.1744], + [-0.6662, -0.5243, -0.358, 0.89145, 2.5838], + ] + ) + index = MultiIndex( + levels=[["a", "x"], ["b", "q"], [10.0032, 20.0, 30.0], [3, 4, 5]], + codes=[[0, 0, 1], [0, 1, 1], [0, 1, 2], [2, 1, 0]], + names=["one", "two", "three", "four"], + ) + return DataFrame(arr, index=index, columns=list("ABCDE")) + + +class TestXS: + def test_xs( + self, float_frame, datetime_frame, using_copy_on_write, warn_copy_on_write + ): + float_frame_orig = float_frame.copy() + idx = float_frame.index[5] + xs = float_frame.xs(idx) + for item, value in xs.items(): + if np.isnan(value): + assert np.isnan(float_frame[item][idx]) + else: + assert value == float_frame[item][idx] + + # mixed-type xs + test_data = {"A": {"1": 1, "2": 2}, "B": {"1": "1", "2": "2", "3": "3"}} + frame = DataFrame(test_data) + xs = frame.xs("1") + assert xs.dtype == np.object_ + assert xs["A"] == 1 + assert xs["B"] == "1" + + with pytest.raises( + KeyError, match=re.escape("Timestamp('1999-12-31 00:00:00')") + ): + datetime_frame.xs(datetime_frame.index[0] - BDay()) + + # xs get column + series = float_frame.xs("A", axis=1) + expected = float_frame["A"] + tm.assert_series_equal(series, expected) + + # view is returned if possible + series = float_frame.xs("A", axis=1) + with tm.assert_cow_warning(warn_copy_on_write): + series[:] = 5 + if using_copy_on_write: + # but with CoW the view shouldn't propagate mutations + tm.assert_series_equal(float_frame["A"], float_frame_orig["A"]) + assert not (expected == 5).all() + else: + assert (expected == 5).all() + + def test_xs_corner(self): + # pathological mixed-type reordering case + df = DataFrame(index=[0]) + df["A"] = 1.0 + df["B"] = "foo" + df["C"] = 2.0 + df["D"] = "bar" + df["E"] = 3.0 + + xs = df.xs(0) + exp = Series([1.0, "foo", 2.0, "bar", 3.0], index=list("ABCDE"), name=0) + tm.assert_series_equal(xs, exp) + + # no columns but Index(dtype=object) + df = DataFrame(index=["a", "b", "c"]) + result = df.xs("a") + expected = Series([], name="a", dtype=np.float64) + tm.assert_series_equal(result, expected) + + def test_xs_duplicates(self): + df = DataFrame( + np.random.default_rng(2).standard_normal((5, 2)), + index=["b", "b", "c", "b", "a"], + ) + + cross = df.xs("c") + exp = df.iloc[2] + tm.assert_series_equal(cross, exp) + + def test_xs_keep_level(self): + df = DataFrame( + { + "day": {0: "sat", 1: "sun"}, + "flavour": {0: "strawberry", 1: "strawberry"}, + "sales": {0: 10, 1: 12}, + "year": {0: 2008, 1: 2008}, + } + ).set_index(["year", "flavour", "day"]) + result = df.xs("sat", level="day", drop_level=False) + expected = df[:1] + tm.assert_frame_equal(result, expected) + + result = df.xs((2008, "sat"), level=["year", "day"], drop_level=False) + tm.assert_frame_equal(result, expected) + + def test_xs_view( + self, using_array_manager, using_copy_on_write, warn_copy_on_write + ): + # in 0.14 this will return a view if possible a copy otherwise, but + # this is numpy dependent + + dm = DataFrame(np.arange(20.0).reshape(4, 5), index=range(4), columns=range(5)) + df_orig = dm.copy() + + if using_copy_on_write: + with tm.raises_chained_assignment_error(): + dm.xs(2)[:] = 20 + tm.assert_frame_equal(dm, df_orig) + elif using_array_manager: + # INFO(ArrayManager) with ArrayManager getting a row as a view is + # not possible + msg = r"\nA value is trying to be set on a copy of a slice from a DataFrame" + with pytest.raises(SettingWithCopyError, match=msg): + dm.xs(2)[:] = 20 + assert not (dm.xs(2) == 20).any() + else: + with tm.raises_chained_assignment_error(): + dm.xs(2)[:] = 20 + assert (dm.xs(2) == 20).all() + + +class TestXSWithMultiIndex: + def test_xs_doc_example(self): + # TODO: more descriptive name + # based on example in advanced.rst + arrays = [ + ["bar", "bar", "baz", "baz", "foo", "foo", "qux", "qux"], + ["one", "two", "one", "two", "one", "two", "one", "two"], + ] + tuples = list(zip(*arrays)) + + index = MultiIndex.from_tuples(tuples, names=["first", "second"]) + df = DataFrame( + np.random.default_rng(2).standard_normal((3, 8)), + index=["A", "B", "C"], + columns=index, + ) + + result = df.xs(("one", "bar"), level=("second", "first"), axis=1) + + expected = df.iloc[:, [0]] + tm.assert_frame_equal(result, expected) + + def test_xs_integer_key(self): + # see GH#2107 + dates = range(20111201, 20111205) + ids = list("abcde") + index = MultiIndex.from_product([dates, ids], names=["date", "secid"]) + df = DataFrame( + np.random.default_rng(2).standard_normal((len(index), 3)), + index, + ["X", "Y", "Z"], + ) + + result = df.xs(20111201, level="date") + expected = df.loc[20111201, :] + tm.assert_frame_equal(result, expected) + + def test_xs_level(self, multiindex_dataframe_random_data): + df = multiindex_dataframe_random_data + result = df.xs("two", level="second") + expected = df[df.index.get_level_values(1) == "two"] + expected.index = Index(["foo", "bar", "baz", "qux"], name="first") + tm.assert_frame_equal(result, expected) + + def test_xs_level_eq_2(self): + arr = np.random.default_rng(2).standard_normal((3, 5)) + index = MultiIndex( + levels=[["a", "p", "x"], ["b", "q", "y"], ["c", "r", "z"]], + codes=[[2, 0, 1], [2, 0, 1], [2, 0, 1]], + ) + df = DataFrame(arr, index=index) + expected = DataFrame(arr[1:2], index=[["a"], ["b"]]) + result = df.xs("c", level=2) + tm.assert_frame_equal(result, expected) + + def test_xs_setting_with_copy_error( + self, + multiindex_dataframe_random_data, + using_copy_on_write, + warn_copy_on_write, + ): + # this is a copy in 0.14 + df = multiindex_dataframe_random_data + df_orig = df.copy() + result = df.xs("two", level="second") + + if using_copy_on_write or warn_copy_on_write: + result[:] = 10 + else: + # setting this will give a SettingWithCopyError + # as we are trying to write a view + msg = "A value is trying to be set on a copy of a slice from a DataFrame" + with pytest.raises(SettingWithCopyError, match=msg): + result[:] = 10 + tm.assert_frame_equal(df, df_orig) + + def test_xs_setting_with_copy_error_multiple( + self, four_level_index_dataframe, using_copy_on_write, warn_copy_on_write + ): + # this is a copy in 0.14 + df = four_level_index_dataframe + df_orig = df.copy() + result = df.xs(("a", 4), level=["one", "four"]) + + if using_copy_on_write or warn_copy_on_write: + result[:] = 10 + else: + # setting this will give a SettingWithCopyError + # as we are trying to write a view + msg = "A value is trying to be set on a copy of a slice from a DataFrame" + with pytest.raises(SettingWithCopyError, match=msg): + result[:] = 10 + tm.assert_frame_equal(df, df_orig) + + @pytest.mark.parametrize("key, level", [("one", "second"), (["one"], ["second"])]) + def test_xs_with_duplicates(self, key, level, multiindex_dataframe_random_data): + # see GH#13719 + frame = multiindex_dataframe_random_data + df = concat([frame] * 2) + assert df.index.is_unique is False + expected = concat([frame.xs("one", level="second")] * 2) + + if isinstance(key, list): + result = df.xs(tuple(key), level=level) + else: + result = df.xs(key, level=level) + tm.assert_frame_equal(result, expected) + + def test_xs_missing_values_in_index(self): + # see GH#6574 + # missing values in returned index should be preserved + acc = [ + ("a", "abcde", 1), + ("b", "bbcde", 2), + ("y", "yzcde", 25), + ("z", "xbcde", 24), + ("z", None, 26), + ("z", "zbcde", 25), + ("z", "ybcde", 26), + ] + df = DataFrame(acc, columns=["a1", "a2", "cnt"]).set_index(["a1", "a2"]) + expected = DataFrame( + {"cnt": [24, 26, 25, 26]}, + index=Index(["xbcde", np.nan, "zbcde", "ybcde"], name="a2"), + ) + + result = df.xs("z", level="a1") + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize( + "key, level, exp_arr, exp_index", + [ + ("a", "lvl0", lambda x: x[:, 0:2], Index(["bar", "foo"], name="lvl1")), + ("foo", "lvl1", lambda x: x[:, 1:2], Index(["a"], name="lvl0")), + ], + ) + def test_xs_named_levels_axis_eq_1(self, key, level, exp_arr, exp_index): + # see GH#2903 + arr = np.random.default_rng(2).standard_normal((4, 4)) + index = MultiIndex( + levels=[["a", "b"], ["bar", "foo", "hello", "world"]], + codes=[[0, 0, 1, 1], [0, 1, 2, 3]], + names=["lvl0", "lvl1"], + ) + df = DataFrame(arr, columns=index) + result = df.xs(key, level=level, axis=1) + expected = DataFrame(exp_arr(arr), columns=exp_index) + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize( + "indexer", + [ + lambda df: df.xs(("a", 4), level=["one", "four"]), + lambda df: df.xs("a").xs(4, level="four"), + ], + ) + def test_xs_level_multiple(self, indexer, four_level_index_dataframe): + df = four_level_index_dataframe + expected_values = [[0.4473, 1.4152, 0.2834, 1.00661, 0.1744]] + expected_index = MultiIndex( + levels=[["q"], [20.0]], codes=[[0], [0]], names=["two", "three"] + ) + expected = DataFrame( + expected_values, index=expected_index, columns=list("ABCDE") + ) + result = indexer(df) + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize( + "indexer", [lambda df: df.xs("a", level=0), lambda df: df.xs("a")] + ) + def test_xs_level0(self, indexer, four_level_index_dataframe): + df = four_level_index_dataframe + expected_values = [ + [-0.5109, -2.3358, -0.4645, 0.05076, 0.364], + [0.4473, 1.4152, 0.2834, 1.00661, 0.1744], + ] + expected_index = MultiIndex( + levels=[["b", "q"], [10.0032, 20.0], [4, 5]], + codes=[[0, 1], [0, 1], [1, 0]], + names=["two", "three", "four"], + ) + expected = DataFrame( + expected_values, index=expected_index, columns=list("ABCDE") + ) + + result = indexer(df) + tm.assert_frame_equal(result, expected) + + def test_xs_values(self, multiindex_dataframe_random_data): + df = multiindex_dataframe_random_data + result = df.xs(("bar", "two")).values + expected = df.values[4] + tm.assert_almost_equal(result, expected) + + def test_xs_loc_equality(self, multiindex_dataframe_random_data): + df = multiindex_dataframe_random_data + result = df.xs(("bar", "two")) + expected = df.loc[("bar", "two")] + tm.assert_series_equal(result, expected) + + def test_xs_IndexSlice_argument_not_implemented(self, frame_or_series): + # GH#35301 + + index = MultiIndex( + levels=[[("foo", "bar", 0), ("foo", "baz", 0), ("foo", "qux", 0)], [0, 1]], + codes=[[0, 0, 1, 1, 2, 2], [0, 1, 0, 1, 0, 1]], + ) + + obj = DataFrame(np.random.default_rng(2).standard_normal((6, 4)), index=index) + if frame_or_series is Series: + obj = obj[0] + + expected = obj.iloc[-2:].droplevel(0) + + result = obj.xs(IndexSlice[("foo", "qux", 0), :]) + tm.assert_equal(result, expected) + + result = obj.loc[IndexSlice[("foo", "qux", 0), :]] + tm.assert_equal(result, expected) + + def test_xs_levels_raises(self, frame_or_series): + obj = DataFrame({"A": [1, 2, 3]}) + if frame_or_series is Series: + obj = obj["A"] + + msg = "Index must be a MultiIndex" + with pytest.raises(TypeError, match=msg): + obj.xs(0, level="as") + + def test_xs_multiindex_droplevel_false(self): + # GH#19056 + mi = MultiIndex.from_tuples( + [("a", "x"), ("a", "y"), ("b", "x")], names=["level1", "level2"] + ) + df = DataFrame([[1, 2, 3]], columns=mi) + result = df.xs("a", axis=1, drop_level=False) + expected = DataFrame( + [[1, 2]], + columns=MultiIndex.from_tuples( + [("a", "x"), ("a", "y")], names=["level1", "level2"] + ), + ) + tm.assert_frame_equal(result, expected) + + def test_xs_droplevel_false(self): + # GH#19056 + df = DataFrame([[1, 2, 3]], columns=Index(["a", "b", "c"])) + result = df.xs("a", axis=1, drop_level=False) + expected = DataFrame({"a": [1]}) + tm.assert_frame_equal(result, expected) + + def test_xs_droplevel_false_view( + self, using_array_manager, using_copy_on_write, warn_copy_on_write + ): + # GH#37832 + df = DataFrame([[1, 2, 3]], columns=Index(["a", "b", "c"])) + result = df.xs("a", axis=1, drop_level=False) + # check that result still views the same data as df + assert np.shares_memory(result.iloc[:, 0]._values, df.iloc[:, 0]._values) + + with tm.assert_cow_warning(warn_copy_on_write): + df.iloc[0, 0] = 2 + if using_copy_on_write: + # with copy on write the subset is never modified + expected = DataFrame({"a": [1]}) + else: + # modifying original df also modifies result when having a single block + expected = DataFrame({"a": [2]}) + tm.assert_frame_equal(result, expected) + + # with mixed dataframe, modifying the parent doesn't modify result + # TODO the "split" path behaves differently here as with single block + df = DataFrame([[1, 2.5, "a"]], columns=Index(["a", "b", "c"])) + result = df.xs("a", axis=1, drop_level=False) + df.iloc[0, 0] = 2 + if using_copy_on_write: + # with copy on write the subset is never modified + expected = DataFrame({"a": [1]}) + elif using_array_manager: + # Here the behavior is consistent + expected = DataFrame({"a": [2]}) + else: + # FIXME: iloc does not update the array inplace using + # "split" path + expected = DataFrame({"a": [1]}) + tm.assert_frame_equal(result, expected) + + def test_xs_list_indexer_droplevel_false(self): + # GH#41760 + mi = MultiIndex.from_tuples([("x", "m", "a"), ("x", "n", "b"), ("y", "o", "c")]) + df = DataFrame([[1, 2, 3], [4, 5, 6]], columns=mi) + with pytest.raises(KeyError, match="y"): + df.xs(("x", "y"), drop_level=False, axis=1) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/__init__.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..245594bfdc9e72ff5cb3a4799e9055c7cd6b5a3e --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/__init__.py @@ -0,0 +1,7 @@ +""" +Test files dedicated to individual (stand-alone) DataFrame methods + +Ideally these files/tests should correspond 1-to-1 with tests.series.methods + +These may also present opportunities for sharing/de-duplicating test code. +""" diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/__pycache__/test_align.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/__pycache__/test_align.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..4c4cbc7d46b5014bc40297efdab0e607a84f9c2f Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/__pycache__/test_align.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/__pycache__/test_asof.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/__pycache__/test_asof.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..dac96ac5b17911c39c9f9f8436a5de6eb24dd7ec Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/__pycache__/test_asof.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/__pycache__/test_assign.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/__pycache__/test_assign.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..4ffd3dd89f2de8d19a18b827c1aa8cba8b74e994 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/__pycache__/test_assign.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/__pycache__/test_astype.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/__pycache__/test_astype.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..8913185df3177bbe52c2a0631c6e359fc816257f Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/__pycache__/test_astype.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/__pycache__/test_at_time.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/__pycache__/test_at_time.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..7f4d59c4812353a852e51fa527217157f8cabe36 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/__pycache__/test_at_time.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/__pycache__/test_between_time.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/__pycache__/test_between_time.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..3aedb3c946fe1d0bac839ae7423be19c0d38ccfc Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/__pycache__/test_between_time.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/__pycache__/test_combine_first.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/__pycache__/test_combine_first.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e42dd4e769b6055eb0075a81af61a394db19cdb4 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/__pycache__/test_combine_first.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/__pycache__/test_compare.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/__pycache__/test_compare.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a61c3440afeb51e75384bf0c73ee311c67855de4 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/__pycache__/test_compare.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/__pycache__/test_copy.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/__pycache__/test_copy.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..7d53c0225019e9eb30cc2d4ba41af307f8f605dc Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/__pycache__/test_copy.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/__pycache__/test_count.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/__pycache__/test_count.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..13ea521f439cb4a9f68faf689263f9e0e479cd07 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/__pycache__/test_count.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/__pycache__/test_cov_corr.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/__pycache__/test_cov_corr.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..5cb91b178c0ebd974d65532c2107e84a23912a11 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/__pycache__/test_cov_corr.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/__pycache__/test_describe.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/__pycache__/test_describe.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a15dda9061bea8fefbf987f5603997fa13c8b0d9 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/__pycache__/test_describe.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/__pycache__/test_dot.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/__pycache__/test_dot.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..dfb15e1480d6fe955d70ce0075fa19d34f2ea8c8 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/__pycache__/test_dot.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/__pycache__/test_drop.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/__pycache__/test_drop.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c9faa35cf46d2e2077b56b3dbc09f44c8e08bbe1 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/__pycache__/test_drop.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/__pycache__/test_drop_duplicates.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/__pycache__/test_drop_duplicates.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..36335ef72c54552e562a3328fb33f3b063015b73 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/__pycache__/test_drop_duplicates.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/__pycache__/test_droplevel.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/__pycache__/test_droplevel.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1d183c74c6d80e121fa8d0057d4296e2583d7a00 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/__pycache__/test_droplevel.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/__pycache__/test_dropna.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/__pycache__/test_dropna.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ba6786b30087bff0e14806714e6f5591b6199826 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/__pycache__/test_dropna.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/__pycache__/test_dtypes.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/__pycache__/test_dtypes.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a111374c6b4038ade2ac3be964959431db644ae6 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/__pycache__/test_dtypes.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/__pycache__/test_duplicated.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/__pycache__/test_duplicated.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..99e27bf6d2a856ab6277361d47429dfc58e2b220 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/__pycache__/test_duplicated.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/__pycache__/test_explode.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/__pycache__/test_explode.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ef4bc6267653c9dc0a9911e39e84bd573af66478 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/__pycache__/test_explode.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/__pycache__/test_fillna.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/__pycache__/test_fillna.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..5a3869a54d7f3e038409c9b69b05b3645e450fb7 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/__pycache__/test_fillna.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/__pycache__/test_info.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/__pycache__/test_info.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..8a35f79b178d0207cfbe58c3ae9555f0df81f3d2 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/__pycache__/test_info.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/__pycache__/test_interpolate.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/__pycache__/test_interpolate.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e99646bf141c75316632b885fda69247c9784654 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/__pycache__/test_interpolate.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/__pycache__/test_is_homogeneous_dtype.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/__pycache__/test_is_homogeneous_dtype.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..928828c52dcf5e3643f70d83738d206a193d5b51 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/__pycache__/test_is_homogeneous_dtype.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/__pycache__/test_isin.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/__pycache__/test_isin.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..3900b118b8439696382e098497cd84cfdf64697d Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/__pycache__/test_isin.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/__pycache__/test_join.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/__pycache__/test_join.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..9c5b240b8e233527d832606cece48026b6c485db Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/__pycache__/test_join.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/__pycache__/test_nlargest.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/__pycache__/test_nlargest.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..99a38552b901e671ec632204a217c9eb441a50a7 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/__pycache__/test_nlargest.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/__pycache__/test_pop.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/__pycache__/test_pop.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..4530112fca7ae661efb21d5bd2467adc2f77f440 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/__pycache__/test_pop.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/__pycache__/test_quantile.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/__pycache__/test_quantile.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..fb7188e2e3c1112daa04b2a7d8d2c9ab79c0e4dd Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/__pycache__/test_quantile.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/__pycache__/test_reindex_like.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/__pycache__/test_reindex_like.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..24fda12306bcc13d100575bd125628d09cb95fec Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/__pycache__/test_reindex_like.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/__pycache__/test_rename.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/__pycache__/test_rename.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..29b4d482d290f624f8cef91d24c37a56ee63fe42 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/__pycache__/test_rename.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/__pycache__/test_rename_axis.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/__pycache__/test_rename_axis.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ea655d024bcd2bdb1e4d3c41287ff8f0298f95da Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/__pycache__/test_rename_axis.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/__pycache__/test_replace.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/__pycache__/test_replace.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..7bb3d8e7c0fe00f79802b07d0a0f7d38083a8b8b Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/__pycache__/test_replace.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/__pycache__/test_reset_index.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/__pycache__/test_reset_index.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..121726ab78a2eb9ca91beea80974fd1a73fa9d8a Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/__pycache__/test_reset_index.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/__pycache__/test_round.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/__pycache__/test_round.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..09a0124808d80aa67b5bc659a2b7f20f84f93899 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/__pycache__/test_round.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/__pycache__/test_select_dtypes.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/__pycache__/test_select_dtypes.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..428d34e6920f80cbf2bcfe08736ee2c2d463230a Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/__pycache__/test_select_dtypes.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/__pycache__/test_size.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/__pycache__/test_size.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b7084167bd05500ab54ae1b8b63772a8beb107d0 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/__pycache__/test_size.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/__pycache__/test_sort_index.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/__pycache__/test_sort_index.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..0d5ba3872da6750329300b557ef2fb30056ddce9 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/__pycache__/test_sort_index.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/__pycache__/test_swapaxes.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/__pycache__/test_swapaxes.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..21a168bff173436136a744004e9f59d680ac75a8 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/__pycache__/test_swapaxes.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/__pycache__/test_swaplevel.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/__pycache__/test_swaplevel.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..be7a90b4dc64ae6958c79502c690c2ff051137bd Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/__pycache__/test_swaplevel.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/__pycache__/test_to_dict.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/__pycache__/test_to_dict.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..980c32eb4714e4f9fc8bf6668f2546828a71c0c4 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/__pycache__/test_to_dict.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/__pycache__/test_to_dict_of_blocks.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/__pycache__/test_to_dict_of_blocks.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ca31796c59b7dd24566c6360732632c08ddd9433 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/__pycache__/test_to_dict_of_blocks.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/__pycache__/test_to_period.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/__pycache__/test_to_period.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..06dc4956210647a0ba0d1fd7f3d4786c9d0fbaa8 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/__pycache__/test_to_period.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/__pycache__/test_to_records.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/__pycache__/test_to_records.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f5bdbc97fe0132d9a8a97d4871090eaa0a357566 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/__pycache__/test_to_records.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/__pycache__/test_transpose.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/__pycache__/test_transpose.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..9b913353cb9645528e4c6229efb7ce62e26d8de0 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/__pycache__/test_transpose.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/__pycache__/test_tz_convert.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/__pycache__/test_tz_convert.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c1ee92e6bb1ae62cc528284f3f64cce40b849316 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/__pycache__/test_tz_convert.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/__pycache__/test_tz_localize.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/__pycache__/test_tz_localize.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d433c8bd4c9da9e15c9364415320416ffd453223 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/__pycache__/test_tz_localize.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/test_add_prefix_suffix.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/test_add_prefix_suffix.py new file mode 100644 index 0000000000000000000000000000000000000000..92d7cdd7990e168721610b7f52f653a69ac1e078 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/test_add_prefix_suffix.py @@ -0,0 +1,49 @@ +import pytest + +from pandas import Index +import pandas._testing as tm + + +def test_add_prefix_suffix(float_frame): + with_prefix = float_frame.add_prefix("foo#") + expected = Index([f"foo#{c}" for c in float_frame.columns]) + tm.assert_index_equal(with_prefix.columns, expected) + + with_suffix = float_frame.add_suffix("#foo") + expected = Index([f"{c}#foo" for c in float_frame.columns]) + tm.assert_index_equal(with_suffix.columns, expected) + + with_pct_prefix = float_frame.add_prefix("%") + expected = Index([f"%{c}" for c in float_frame.columns]) + tm.assert_index_equal(with_pct_prefix.columns, expected) + + with_pct_suffix = float_frame.add_suffix("%") + expected = Index([f"{c}%" for c in float_frame.columns]) + tm.assert_index_equal(with_pct_suffix.columns, expected) + + +def test_add_prefix_suffix_axis(float_frame): + # GH 47819 + with_prefix = float_frame.add_prefix("foo#", axis=0) + expected = Index([f"foo#{c}" for c in float_frame.index]) + tm.assert_index_equal(with_prefix.index, expected) + + with_prefix = float_frame.add_prefix("foo#", axis=1) + expected = Index([f"foo#{c}" for c in float_frame.columns]) + tm.assert_index_equal(with_prefix.columns, expected) + + with_pct_suffix = float_frame.add_suffix("#foo", axis=0) + expected = Index([f"{c}#foo" for c in float_frame.index]) + tm.assert_index_equal(with_pct_suffix.index, expected) + + with_pct_suffix = float_frame.add_suffix("#foo", axis=1) + expected = Index([f"{c}#foo" for c in float_frame.columns]) + tm.assert_index_equal(with_pct_suffix.columns, expected) + + +def test_add_prefix_suffix_invalid_axis(float_frame): + with pytest.raises(ValueError, match="No axis named 2 for object type DataFrame"): + float_frame.add_prefix("foo#", axis=2) + + with pytest.raises(ValueError, match="No axis named 2 for object type DataFrame"): + float_frame.add_suffix("foo#", axis=2) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/test_align.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/test_align.py new file mode 100644 index 0000000000000000000000000000000000000000..5a9c47866dae8102fdf51541aaae5c61eeb7e84c --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/test_align.py @@ -0,0 +1,484 @@ +from datetime import timezone + +import numpy as np +import pytest + +import pandas as pd +from pandas import ( + DataFrame, + Index, + Series, + date_range, +) +import pandas._testing as tm + + +class TestDataFrameAlign: + def test_align_asfreq_method_raises(self): + df = DataFrame({"A": [1, np.nan, 2]}) + msg = "Invalid fill method" + msg2 = "The 'method', 'limit', and 'fill_axis' keywords" + with pytest.raises(ValueError, match=msg): + with tm.assert_produces_warning(FutureWarning, match=msg2): + df.align(df.iloc[::-1], method="asfreq") + + def test_frame_align_aware(self): + idx1 = date_range("2001", periods=5, freq="h", tz="US/Eastern") + idx2 = date_range("2001", periods=5, freq="2h", tz="US/Eastern") + df1 = DataFrame(np.random.default_rng(2).standard_normal((len(idx1), 3)), idx1) + df2 = DataFrame(np.random.default_rng(2).standard_normal((len(idx2), 3)), idx2) + new1, new2 = df1.align(df2) + assert df1.index.tz == new1.index.tz + assert df2.index.tz == new2.index.tz + + # different timezones convert to UTC + + # frame with frame + df1_central = df1.tz_convert("US/Central") + new1, new2 = df1.align(df1_central) + assert new1.index.tz is timezone.utc + assert new2.index.tz is timezone.utc + + # frame with Series + new1, new2 = df1.align(df1_central[0], axis=0) + assert new1.index.tz is timezone.utc + assert new2.index.tz is timezone.utc + + df1[0].align(df1_central, axis=0) + assert new1.index.tz is timezone.utc + assert new2.index.tz is timezone.utc + + def test_align_float(self, float_frame, using_copy_on_write): + af, bf = float_frame.align(float_frame) + assert af._mgr is not float_frame._mgr + + af, bf = float_frame.align(float_frame, copy=False) + if not using_copy_on_write: + assert af._mgr is float_frame._mgr + else: + assert af._mgr is not float_frame._mgr + + # axis = 0 + other = float_frame.iloc[:-5, :3] + af, bf = float_frame.align(other, axis=0, fill_value=-1) + + tm.assert_index_equal(bf.columns, other.columns) + + # test fill value + join_idx = float_frame.index.join(other.index) + diff_a = float_frame.index.difference(join_idx) + diff_a_vals = af.reindex(diff_a).values + assert (diff_a_vals == -1).all() + + af, bf = float_frame.align(other, join="right", axis=0) + tm.assert_index_equal(bf.columns, other.columns) + tm.assert_index_equal(bf.index, other.index) + tm.assert_index_equal(af.index, other.index) + + # axis = 1 + other = float_frame.iloc[:-5, :3].copy() + af, bf = float_frame.align(other, axis=1) + tm.assert_index_equal(bf.columns, float_frame.columns) + tm.assert_index_equal(bf.index, other.index) + + # test fill value + join_idx = float_frame.index.join(other.index) + diff_a = float_frame.index.difference(join_idx) + diff_a_vals = af.reindex(diff_a).values + + assert (diff_a_vals == -1).all() + + af, bf = float_frame.align(other, join="inner", axis=1) + tm.assert_index_equal(bf.columns, other.columns) + + msg = ( + "The 'method', 'limit', and 'fill_axis' keywords in DataFrame.align " + "are deprecated" + ) + with tm.assert_produces_warning(FutureWarning, match=msg): + af, bf = float_frame.align(other, join="inner", axis=1, method="pad") + tm.assert_index_equal(bf.columns, other.columns) + + msg = ( + "The 'method', 'limit', and 'fill_axis' keywords in DataFrame.align " + "are deprecated" + ) + with tm.assert_produces_warning(FutureWarning, match=msg): + af, bf = float_frame.align( + other.iloc[:, 0], join="inner", axis=1, method=None, fill_value=None + ) + tm.assert_index_equal(bf.index, Index([]).astype(bf.index.dtype)) + + msg = ( + "The 'method', 'limit', and 'fill_axis' keywords in DataFrame.align " + "are deprecated" + ) + with tm.assert_produces_warning(FutureWarning, match=msg): + af, bf = float_frame.align( + other.iloc[:, 0], join="inner", axis=1, method=None, fill_value=0 + ) + tm.assert_index_equal(bf.index, Index([]).astype(bf.index.dtype)) + + # Try to align DataFrame to Series along bad axis + msg = "No axis named 2 for object type DataFrame" + with pytest.raises(ValueError, match=msg): + float_frame.align(af.iloc[0, :3], join="inner", axis=2) + + def test_align_frame_with_series(self, float_frame): + # align dataframe to series with broadcast or not + idx = float_frame.index + s = Series(range(len(idx)), index=idx) + + left, right = float_frame.align(s, axis=0) + tm.assert_index_equal(left.index, float_frame.index) + tm.assert_index_equal(right.index, float_frame.index) + assert isinstance(right, Series) + + msg = "The 'broadcast_axis' keyword in DataFrame.align is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + left, right = float_frame.align(s, broadcast_axis=1) + tm.assert_index_equal(left.index, float_frame.index) + expected = {c: s for c in float_frame.columns} + expected = DataFrame( + expected, index=float_frame.index, columns=float_frame.columns + ) + tm.assert_frame_equal(right, expected) + + def test_align_series_condition(self): + # see gh-9558 + df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) + result = df[df["a"] == 2] + expected = DataFrame([[2, 5]], index=[1], columns=["a", "b"]) + tm.assert_frame_equal(result, expected) + + result = df.where(df["a"] == 2, 0) + expected = DataFrame({"a": [0, 2, 0], "b": [0, 5, 0]}) + tm.assert_frame_equal(result, expected) + + def test_align_int(self, int_frame): + # test other non-float types + other = DataFrame(index=range(5), columns=["A", "B", "C"]) + + msg = ( + "The 'method', 'limit', and 'fill_axis' keywords in DataFrame.align " + "are deprecated" + ) + with tm.assert_produces_warning(FutureWarning, match=msg): + af, bf = int_frame.align(other, join="inner", axis=1, method="pad") + tm.assert_index_equal(bf.columns, other.columns) + + def test_align_mixed_type(self, float_string_frame): + msg = ( + "The 'method', 'limit', and 'fill_axis' keywords in DataFrame.align " + "are deprecated" + ) + with tm.assert_produces_warning(FutureWarning, match=msg): + af, bf = float_string_frame.align( + float_string_frame, join="inner", axis=1, method="pad" + ) + tm.assert_index_equal(bf.columns, float_string_frame.columns) + + def test_align_mixed_float(self, mixed_float_frame): + # mixed floats/ints + other = DataFrame(index=range(5), columns=["A", "B", "C"]) + + msg = ( + "The 'method', 'limit', and 'fill_axis' keywords in DataFrame.align " + "are deprecated" + ) + with tm.assert_produces_warning(FutureWarning, match=msg): + af, bf = mixed_float_frame.align( + other.iloc[:, 0], join="inner", axis=1, method=None, fill_value=0 + ) + tm.assert_index_equal(bf.index, Index([])) + + def test_align_mixed_int(self, mixed_int_frame): + other = DataFrame(index=range(5), columns=["A", "B", "C"]) + + msg = ( + "The 'method', 'limit', and 'fill_axis' keywords in DataFrame.align " + "are deprecated" + ) + with tm.assert_produces_warning(FutureWarning, match=msg): + af, bf = mixed_int_frame.align( + other.iloc[:, 0], join="inner", axis=1, method=None, fill_value=0 + ) + tm.assert_index_equal(bf.index, Index([])) + + @pytest.mark.parametrize( + "l_ordered,r_ordered,expected", + [ + [True, True, pd.CategoricalIndex], + [True, False, Index], + [False, True, Index], + [False, False, pd.CategoricalIndex], + ], + ) + def test_align_categorical(self, l_ordered, r_ordered, expected): + # GH-28397 + df_1 = DataFrame( + { + "A": np.arange(6, dtype="int64"), + "B": Series(list("aabbca")).astype( + pd.CategoricalDtype(list("cab"), ordered=l_ordered) + ), + } + ).set_index("B") + df_2 = DataFrame( + { + "A": np.arange(5, dtype="int64"), + "B": Series(list("babca")).astype( + pd.CategoricalDtype(list("cab"), ordered=r_ordered) + ), + } + ).set_index("B") + + aligned_1, aligned_2 = df_1.align(df_2) + assert isinstance(aligned_1.index, expected) + assert isinstance(aligned_2.index, expected) + tm.assert_index_equal(aligned_1.index, aligned_2.index) + + def test_align_multiindex(self): + # GH#10665 + # same test cases as test_align_multiindex in test_series.py + + midx = pd.MultiIndex.from_product( + [range(2), range(3), range(2)], names=("a", "b", "c") + ) + idx = Index(range(2), name="b") + df1 = DataFrame(np.arange(12, dtype="int64"), index=midx) + df2 = DataFrame(np.arange(2, dtype="int64"), index=idx) + + # these must be the same results (but flipped) + res1l, res1r = df1.align(df2, join="left") + res2l, res2r = df2.align(df1, join="right") + + expl = df1 + tm.assert_frame_equal(expl, res1l) + tm.assert_frame_equal(expl, res2r) + expr = DataFrame([0, 0, 1, 1, np.nan, np.nan] * 2, index=midx) + tm.assert_frame_equal(expr, res1r) + tm.assert_frame_equal(expr, res2l) + + res1l, res1r = df1.align(df2, join="right") + res2l, res2r = df2.align(df1, join="left") + + exp_idx = pd.MultiIndex.from_product( + [range(2), range(2), range(2)], names=("a", "b", "c") + ) + expl = DataFrame([0, 1, 2, 3, 6, 7, 8, 9], index=exp_idx) + tm.assert_frame_equal(expl, res1l) + tm.assert_frame_equal(expl, res2r) + expr = DataFrame([0, 0, 1, 1] * 2, index=exp_idx) + tm.assert_frame_equal(expr, res1r) + tm.assert_frame_equal(expr, res2l) + + def test_align_series_combinations(self): + df = DataFrame({"a": [1, 3, 5], "b": [1, 3, 5]}, index=list("ACE")) + s = Series([1, 2, 4], index=list("ABD"), name="x") + + # frame + series + res1, res2 = df.align(s, axis=0) + exp1 = DataFrame( + {"a": [1, np.nan, 3, np.nan, 5], "b": [1, np.nan, 3, np.nan, 5]}, + index=list("ABCDE"), + ) + exp2 = Series([1, 2, np.nan, 4, np.nan], index=list("ABCDE"), name="x") + + tm.assert_frame_equal(res1, exp1) + tm.assert_series_equal(res2, exp2) + + # series + frame + res1, res2 = s.align(df) + tm.assert_series_equal(res1, exp2) + tm.assert_frame_equal(res2, exp1) + + def test_multiindex_align_to_series_with_common_index_level(self): + # GH-46001 + foo_index = Index([1, 2, 3], name="foo") + bar_index = Index([1, 2], name="bar") + + series = Series([1, 2], index=bar_index, name="foo_series") + df = DataFrame( + {"col": np.arange(6)}, + index=pd.MultiIndex.from_product([foo_index, bar_index]), + ) + + expected_r = Series([1, 2] * 3, index=df.index, name="foo_series") + result_l, result_r = df.align(series, axis=0) + + tm.assert_frame_equal(result_l, df) + tm.assert_series_equal(result_r, expected_r) + + def test_multiindex_align_to_series_with_common_index_level_missing_in_left(self): + # GH-46001 + foo_index = Index([1, 2, 3], name="foo") + bar_index = Index([1, 2], name="bar") + + series = Series( + [1, 2, 3, 4], index=Index([1, 2, 3, 4], name="bar"), name="foo_series" + ) + df = DataFrame( + {"col": np.arange(6)}, + index=pd.MultiIndex.from_product([foo_index, bar_index]), + ) + + expected_r = Series([1, 2] * 3, index=df.index, name="foo_series") + result_l, result_r = df.align(series, axis=0) + + tm.assert_frame_equal(result_l, df) + tm.assert_series_equal(result_r, expected_r) + + def test_multiindex_align_to_series_with_common_index_level_missing_in_right(self): + # GH-46001 + foo_index = Index([1, 2, 3], name="foo") + bar_index = Index([1, 2, 3, 4], name="bar") + + series = Series([1, 2], index=Index([1, 2], name="bar"), name="foo_series") + df = DataFrame( + {"col": np.arange(12)}, + index=pd.MultiIndex.from_product([foo_index, bar_index]), + ) + + expected_r = Series( + [1, 2, np.nan, np.nan] * 3, index=df.index, name="foo_series" + ) + result_l, result_r = df.align(series, axis=0) + + tm.assert_frame_equal(result_l, df) + tm.assert_series_equal(result_r, expected_r) + + def test_multiindex_align_to_series_with_common_index_level_missing_in_both(self): + # GH-46001 + foo_index = Index([1, 2, 3], name="foo") + bar_index = Index([1, 3, 4], name="bar") + + series = Series( + [1, 2, 3], index=Index([1, 2, 4], name="bar"), name="foo_series" + ) + df = DataFrame( + {"col": np.arange(9)}, + index=pd.MultiIndex.from_product([foo_index, bar_index]), + ) + + expected_r = Series([1, np.nan, 3] * 3, index=df.index, name="foo_series") + result_l, result_r = df.align(series, axis=0) + + tm.assert_frame_equal(result_l, df) + tm.assert_series_equal(result_r, expected_r) + + def test_multiindex_align_to_series_with_common_index_level_non_unique_cols(self): + # GH-46001 + foo_index = Index([1, 2, 3], name="foo") + bar_index = Index([1, 2], name="bar") + + series = Series([1, 2], index=bar_index, name="foo_series") + df = DataFrame( + np.arange(18).reshape(6, 3), + index=pd.MultiIndex.from_product([foo_index, bar_index]), + ) + df.columns = ["cfoo", "cbar", "cfoo"] + + expected = Series([1, 2] * 3, index=df.index, name="foo_series") + result_left, result_right = df.align(series, axis=0) + + tm.assert_series_equal(result_right, expected) + tm.assert_index_equal(result_left.columns, df.columns) + + def test_missing_axis_specification_exception(self): + df = DataFrame(np.arange(50).reshape((10, 5))) + series = Series(np.arange(5)) + + with pytest.raises(ValueError, match=r"axis=0 or 1"): + df.align(series) + + @pytest.mark.parametrize("method", ["pad", "bfill"]) + @pytest.mark.parametrize("axis", [0, 1, None]) + @pytest.mark.parametrize("fill_axis", [0, 1]) + @pytest.mark.parametrize("how", ["inner", "outer", "left", "right"]) + @pytest.mark.parametrize( + "left_slice", + [ + [slice(4), slice(10)], + [slice(0), slice(0)], + ], + ) + @pytest.mark.parametrize( + "right_slice", + [ + [slice(2, None), slice(6, None)], + [slice(0), slice(0)], + ], + ) + @pytest.mark.parametrize("limit", [1, None]) + def test_align_fill_method( + self, how, method, axis, fill_axis, float_frame, left_slice, right_slice, limit + ): + frame = float_frame + left = frame.iloc[left_slice[0], left_slice[1]] + right = frame.iloc[right_slice[0], right_slice[1]] + + msg = ( + "The 'method', 'limit', and 'fill_axis' keywords in DataFrame.align " + "are deprecated" + ) + + with tm.assert_produces_warning(FutureWarning, match=msg): + aa, ab = left.align( + right, + axis=axis, + join=how, + method=method, + limit=limit, + fill_axis=fill_axis, + ) + + join_index, join_columns = None, None + + ea, eb = left, right + if axis is None or axis == 0: + join_index = left.index.join(right.index, how=how) + ea = ea.reindex(index=join_index) + eb = eb.reindex(index=join_index) + + if axis is None or axis == 1: + join_columns = left.columns.join(right.columns, how=how) + ea = ea.reindex(columns=join_columns) + eb = eb.reindex(columns=join_columns) + + msg = "DataFrame.fillna with 'method' is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + ea = ea.fillna(axis=fill_axis, method=method, limit=limit) + eb = eb.fillna(axis=fill_axis, method=method, limit=limit) + + tm.assert_frame_equal(aa, ea) + tm.assert_frame_equal(ab, eb) + + def test_align_series_check_copy(self): + # GH# + df = DataFrame({0: [1, 2]}) + ser = Series([1], name=0) + expected = ser.copy() + result, other = df.align(ser, axis=1) + ser.iloc[0] = 100 + tm.assert_series_equal(other, expected) + + def test_align_identical_different_object(self): + # GH#51032 + df = DataFrame({"a": [1, 2]}) + ser = Series([3, 4]) + result, result2 = df.align(ser, axis=0) + tm.assert_frame_equal(result, df) + tm.assert_series_equal(result2, ser) + assert df is not result + assert ser is not result2 + + def test_align_identical_different_object_columns(self): + # GH#51032 + df = DataFrame({"a": [1, 2]}) + ser = Series([1], index=["a"]) + result, result2 = df.align(ser, axis=1) + tm.assert_frame_equal(result, df) + tm.assert_series_equal(result2, ser) + assert df is not result + assert ser is not result2 diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/test_asfreq.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/test_asfreq.py new file mode 100644 index 0000000000000000000000000000000000000000..ef72ca1ac86b9a6eb395a5a64bbfb99aef76a02a --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/test_asfreq.py @@ -0,0 +1,263 @@ +from datetime import datetime + +import numpy as np +import pytest + +from pandas._libs.tslibs.offsets import MonthEnd + +from pandas import ( + DataFrame, + DatetimeIndex, + Series, + date_range, + period_range, + to_datetime, +) +import pandas._testing as tm + +from pandas.tseries import offsets + + +class TestAsFreq: + @pytest.fixture(params=["s", "ms", "us", "ns"]) + def unit(self, request): + return request.param + + def test_asfreq2(self, frame_or_series): + ts = frame_or_series( + [0.0, 1.0, 2.0], + index=DatetimeIndex( + [ + datetime(2009, 10, 30), + datetime(2009, 11, 30), + datetime(2009, 12, 31), + ], + dtype="M8[ns]", + freq="BME", + ), + ) + + daily_ts = ts.asfreq("B") + monthly_ts = daily_ts.asfreq("BME") + tm.assert_equal(monthly_ts, ts) + + daily_ts = ts.asfreq("B", method="pad") + monthly_ts = daily_ts.asfreq("BME") + tm.assert_equal(monthly_ts, ts) + + daily_ts = ts.asfreq(offsets.BDay()) + monthly_ts = daily_ts.asfreq(offsets.BMonthEnd()) + tm.assert_equal(monthly_ts, ts) + + result = ts[:0].asfreq("ME") + assert len(result) == 0 + assert result is not ts + + if frame_or_series is Series: + daily_ts = ts.asfreq("D", fill_value=-1) + result = daily_ts.value_counts().sort_index() + expected = Series( + [60, 1, 1, 1], index=[-1.0, 2.0, 1.0, 0.0], name="count" + ).sort_index() + tm.assert_series_equal(result, expected) + + def test_asfreq_datetimeindex_empty(self, frame_or_series): + # GH#14320 + index = DatetimeIndex(["2016-09-29 11:00"]) + expected = frame_or_series(index=index, dtype=object).asfreq("h") + result = frame_or_series([3], index=index.copy()).asfreq("h") + tm.assert_index_equal(expected.index, result.index) + + @pytest.mark.parametrize("tz", ["US/Eastern", "dateutil/US/Eastern"]) + def test_tz_aware_asfreq_smoke(self, tz, frame_or_series): + dr = date_range("2011-12-01", "2012-07-20", freq="D", tz=tz) + + obj = frame_or_series( + np.random.default_rng(2).standard_normal(len(dr)), index=dr + ) + + # it works! + obj.asfreq("min") + + def test_asfreq_normalize(self, frame_or_series): + rng = date_range("1/1/2000 09:30", periods=20) + norm = date_range("1/1/2000", periods=20) + + vals = np.random.default_rng(2).standard_normal((20, 3)) + + obj = DataFrame(vals, index=rng) + expected = DataFrame(vals, index=norm) + if frame_or_series is Series: + obj = obj[0] + expected = expected[0] + + result = obj.asfreq("D", normalize=True) + tm.assert_equal(result, expected) + + def test_asfreq_keep_index_name(self, frame_or_series): + # GH#9854 + index_name = "bar" + index = date_range("20130101", periods=20, name=index_name) + obj = DataFrame(list(range(20)), columns=["foo"], index=index) + obj = tm.get_obj(obj, frame_or_series) + + assert index_name == obj.index.name + assert index_name == obj.asfreq("10D").index.name + + def test_asfreq_ts(self, frame_or_series): + index = period_range(freq="Y", start="1/1/2001", end="12/31/2010") + obj = DataFrame( + np.random.default_rng(2).standard_normal((len(index), 3)), index=index + ) + obj = tm.get_obj(obj, frame_or_series) + + result = obj.asfreq("D", how="end") + exp_index = index.asfreq("D", how="end") + assert len(result) == len(obj) + tm.assert_index_equal(result.index, exp_index) + + result = obj.asfreq("D", how="start") + exp_index = index.asfreq("D", how="start") + assert len(result) == len(obj) + tm.assert_index_equal(result.index, exp_index) + + def test_asfreq_resample_set_correct_freq(self, frame_or_series): + # GH#5613 + # we test if .asfreq() and .resample() set the correct value for .freq + dti = to_datetime(["2012-01-01", "2012-01-02", "2012-01-03"]) + obj = DataFrame({"col": [1, 2, 3]}, index=dti) + obj = tm.get_obj(obj, frame_or_series) + + # testing the settings before calling .asfreq() and .resample() + assert obj.index.freq is None + assert obj.index.inferred_freq == "D" + + # does .asfreq() set .freq correctly? + assert obj.asfreq("D").index.freq == "D" + + # does .resample() set .freq correctly? + assert obj.resample("D").asfreq().index.freq == "D" + + def test_asfreq_empty(self, datetime_frame): + # test does not blow up on length-0 DataFrame + zero_length = datetime_frame.reindex([]) + result = zero_length.asfreq("BME") + assert result is not zero_length + + def test_asfreq(self, datetime_frame): + offset_monthly = datetime_frame.asfreq(offsets.BMonthEnd()) + rule_monthly = datetime_frame.asfreq("BME") + + tm.assert_frame_equal(offset_monthly, rule_monthly) + + rule_monthly.asfreq("B", method="pad") + # TODO: actually check that this worked. + + # don't forget! + rule_monthly.asfreq("B", method="pad") + + def test_asfreq_datetimeindex(self): + df = DataFrame( + {"A": [1, 2, 3]}, + index=[datetime(2011, 11, 1), datetime(2011, 11, 2), datetime(2011, 11, 3)], + ) + df = df.asfreq("B") + assert isinstance(df.index, DatetimeIndex) + + ts = df["A"].asfreq("B") + assert isinstance(ts.index, DatetimeIndex) + + def test_asfreq_fillvalue(self): + # test for fill value during upsampling, related to issue 3715 + + # setup + rng = date_range("1/1/2016", periods=10, freq="2s") + # Explicit cast to 'float' to avoid implicit cast when setting None + ts = Series(np.arange(len(rng)), index=rng, dtype="float") + df = DataFrame({"one": ts}) + + # insert pre-existing missing value + df.loc["2016-01-01 00:00:08", "one"] = None + + actual_df = df.asfreq(freq="1s", fill_value=9.0) + expected_df = df.asfreq(freq="1s").fillna(9.0) + expected_df.loc["2016-01-01 00:00:08", "one"] = None + tm.assert_frame_equal(expected_df, actual_df) + + expected_series = ts.asfreq(freq="1s").fillna(9.0) + actual_series = ts.asfreq(freq="1s", fill_value=9.0) + tm.assert_series_equal(expected_series, actual_series) + + def test_asfreq_with_date_object_index(self, frame_or_series): + rng = date_range("1/1/2000", periods=20) + ts = frame_or_series(np.random.default_rng(2).standard_normal(20), index=rng) + + ts2 = ts.copy() + ts2.index = [x.date() for x in ts2.index] + + result = ts2.asfreq("4h", method="ffill") + expected = ts.asfreq("4h", method="ffill") + tm.assert_equal(result, expected) + + def test_asfreq_with_unsorted_index(self, frame_or_series): + # GH#39805 + # Test that rows are not dropped when the datetime index is out of order + index = to_datetime(["2021-01-04", "2021-01-02", "2021-01-03", "2021-01-01"]) + result = frame_or_series(range(4), index=index) + + expected = result.reindex(sorted(index)) + expected.index = expected.index._with_freq("infer") + + result = result.asfreq("D") + tm.assert_equal(result, expected) + + def test_asfreq_after_normalize(self, unit): + # https://github.com/pandas-dev/pandas/issues/50727 + result = DatetimeIndex( + date_range("2000", periods=2).as_unit(unit).normalize(), freq="D" + ) + expected = DatetimeIndex(["2000-01-01", "2000-01-02"], freq="D").as_unit(unit) + tm.assert_index_equal(result, expected) + + @pytest.mark.parametrize( + "freq, freq_half", + [ + ("2ME", "ME"), + (MonthEnd(2), MonthEnd(1)), + ], + ) + def test_asfreq_2ME(self, freq, freq_half): + index = date_range("1/1/2000", periods=6, freq=freq_half) + df = DataFrame({"s": Series([0.0, 1.0, 2.0, 3.0, 4.0, 5.0], index=index)}) + expected = df.asfreq(freq=freq) + + index = date_range("1/1/2000", periods=3, freq=freq) + result = DataFrame({"s": Series([0.0, 2.0, 4.0], index=index)}) + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize( + "freq, freq_depr", + [ + ("2ME", "2M"), + ("2QE", "2Q"), + ("2QE-SEP", "2Q-SEP"), + ("1BQE", "1BQ"), + ("2BQE-SEP", "2BQ-SEP"), + ("1YE", "1Y"), + ("2YE-MAR", "2Y-MAR"), + ("1YE", "1A"), + ("2YE-MAR", "2A-MAR"), + ("2BYE-MAR", "2BA-MAR"), + ], + ) + def test_asfreq_frequency_M_Q_Y_A_deprecated(self, freq, freq_depr): + # GH#9586, #55978 + depr_msg = f"'{freq_depr[1:]}' is deprecated and will be removed " + f"in a future version, please use '{freq[1:]}' instead." + + index = date_range("1/1/2000", periods=4, freq=f"{freq[1:]}") + df = DataFrame({"s": Series([0.0, 1.0, 2.0, 3.0], index=index)}) + expected = df.asfreq(freq=freq) + with tm.assert_produces_warning(FutureWarning, match=depr_msg): + result = df.asfreq(freq=freq_depr) + tm.assert_frame_equal(result, expected) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/test_asof.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/test_asof.py new file mode 100644 index 0000000000000000000000000000000000000000..4a8adf89b3aef83001f6bb7669d8a9eae12529ea --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/test_asof.py @@ -0,0 +1,198 @@ +import numpy as np +import pytest + +from pandas._libs.tslibs import IncompatibleFrequency + +from pandas import ( + DataFrame, + Period, + Series, + Timestamp, + date_range, + period_range, + to_datetime, +) +import pandas._testing as tm + + +@pytest.fixture +def date_range_frame(): + """ + Fixture for DataFrame of ints with date_range index + + Columns are ['A', 'B']. + """ + N = 50 + rng = date_range("1/1/1990", periods=N, freq="53s") + return DataFrame({"A": np.arange(N), "B": np.arange(N)}, index=rng) + + +class TestFrameAsof: + def test_basic(self, date_range_frame): + # Explicitly cast to float to avoid implicit cast when setting np.nan + df = date_range_frame.astype({"A": "float"}) + N = 50 + df.loc[df.index[15:30], "A"] = np.nan + dates = date_range("1/1/1990", periods=N * 3, freq="25s") + + result = df.asof(dates) + assert result.notna().all(1).all() + lb = df.index[14] + ub = df.index[30] + + dates = list(dates) + + result = df.asof(dates) + assert result.notna().all(1).all() + + mask = (result.index >= lb) & (result.index < ub) + rs = result[mask] + assert (rs == 14).all(1).all() + + def test_subset(self, date_range_frame): + N = 10 + # explicitly cast to float to avoid implicit upcast when setting to np.nan + df = date_range_frame.iloc[:N].copy().astype({"A": "float"}) + df.loc[df.index[4:8], "A"] = np.nan + dates = date_range("1/1/1990", periods=N * 3, freq="25s") + + # with a subset of A should be the same + result = df.asof(dates, subset="A") + expected = df.asof(dates) + tm.assert_frame_equal(result, expected) + + # same with A/B + result = df.asof(dates, subset=["A", "B"]) + expected = df.asof(dates) + tm.assert_frame_equal(result, expected) + + # B gives df.asof + result = df.asof(dates, subset="B") + expected = df.resample("25s", closed="right").ffill().reindex(dates) + expected.iloc[20:] = 9 + # no "missing", so "B" can retain int dtype (df["A"].dtype platform-dependent) + expected["B"] = expected["B"].astype(df["B"].dtype) + + tm.assert_frame_equal(result, expected) + + def test_missing(self, date_range_frame): + # GH 15118 + # no match found - `where` value before earliest date in index + N = 10 + # Cast to 'float64' to avoid upcast when introducing nan in df.asof + df = date_range_frame.iloc[:N].copy().astype("float64") + + result = df.asof("1989-12-31") + + expected = Series( + index=["A", "B"], name=Timestamp("1989-12-31"), dtype=np.float64 + ) + tm.assert_series_equal(result, expected) + + result = df.asof(to_datetime(["1989-12-31"])) + expected = DataFrame( + index=to_datetime(["1989-12-31"]), columns=["A", "B"], dtype="float64" + ) + tm.assert_frame_equal(result, expected) + + # Check that we handle PeriodIndex correctly, dont end up with + # period.ordinal for series name + df = df.to_period("D") + result = df.asof("1989-12-31") + assert isinstance(result.name, Period) + + def test_asof_all_nans(self, frame_or_series): + # GH 15713 + # DataFrame/Series is all nans + result = frame_or_series([np.nan]).asof([0]) + expected = frame_or_series([np.nan]) + tm.assert_equal(result, expected) + + def test_all_nans(self, date_range_frame): + # GH 15713 + # DataFrame is all nans + + # testing non-default indexes, multiple inputs + N = 150 + rng = date_range_frame.index + dates = date_range("1/1/1990", periods=N, freq="25s") + result = DataFrame(np.nan, index=rng, columns=["A"]).asof(dates) + expected = DataFrame(np.nan, index=dates, columns=["A"]) + tm.assert_frame_equal(result, expected) + + # testing multiple columns + dates = date_range("1/1/1990", periods=N, freq="25s") + result = DataFrame(np.nan, index=rng, columns=["A", "B", "C"]).asof(dates) + expected = DataFrame(np.nan, index=dates, columns=["A", "B", "C"]) + tm.assert_frame_equal(result, expected) + + # testing scalar input + result = DataFrame(np.nan, index=[1, 2], columns=["A", "B"]).asof([3]) + expected = DataFrame(np.nan, index=[3], columns=["A", "B"]) + tm.assert_frame_equal(result, expected) + + result = DataFrame(np.nan, index=[1, 2], columns=["A", "B"]).asof(3) + expected = Series(np.nan, index=["A", "B"], name=3) + tm.assert_series_equal(result, expected) + + @pytest.mark.parametrize( + "stamp,expected", + [ + ( + Timestamp("2018-01-01 23:22:43.325+00:00"), + Series(2, name=Timestamp("2018-01-01 23:22:43.325+00:00")), + ), + ( + Timestamp("2018-01-01 22:33:20.682+01:00"), + Series(1, name=Timestamp("2018-01-01 22:33:20.682+01:00")), + ), + ], + ) + def test_time_zone_aware_index(self, stamp, expected): + # GH21194 + # Testing awareness of DataFrame index considering different + # UTC and timezone + df = DataFrame( + data=[1, 2], + index=[ + Timestamp("2018-01-01 21:00:05.001+00:00"), + Timestamp("2018-01-01 22:35:10.550+00:00"), + ], + ) + + result = df.asof(stamp) + tm.assert_series_equal(result, expected) + + def test_is_copy(self, date_range_frame): + # GH-27357, GH-30784: ensure the result of asof is an actual copy and + # doesn't track the parent dataframe / doesn't give SettingWithCopy warnings + df = date_range_frame.astype({"A": "float"}) + N = 50 + df.loc[df.index[15:30], "A"] = np.nan + dates = date_range("1/1/1990", periods=N * 3, freq="25s") + + result = df.asof(dates) + + with tm.assert_produces_warning(None): + result["C"] = 1 + + def test_asof_periodindex_mismatched_freq(self): + N = 50 + rng = period_range("1/1/1990", periods=N, freq="h") + df = DataFrame(np.random.default_rng(2).standard_normal(N), index=rng) + + # Mismatched freq + msg = "Input has different freq" + with pytest.raises(IncompatibleFrequency, match=msg): + df.asof(rng.asfreq("D")) + + def test_asof_preserves_bool_dtype(self): + # GH#16063 was casting bools to floats + dti = date_range("2017-01-01", freq="MS", periods=4) + ser = Series([True, False, True], index=dti[:-1]) + + ts = dti[-1] + res = ser.asof([ts]) + + expected = Series([True], index=[ts]) + tm.assert_series_equal(res, expected) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/test_assign.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/test_assign.py new file mode 100644 index 0000000000000000000000000000000000000000..0ae501d43e74252a420acf96b9428ab4b8f5f211 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/test_assign.py @@ -0,0 +1,84 @@ +import pytest + +from pandas import DataFrame +import pandas._testing as tm + + +class TestAssign: + def test_assign(self): + df = DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]}) + original = df.copy() + result = df.assign(C=df.B / df.A) + expected = df.copy() + expected["C"] = [4, 2.5, 2] + tm.assert_frame_equal(result, expected) + + # lambda syntax + result = df.assign(C=lambda x: x.B / x.A) + tm.assert_frame_equal(result, expected) + + # original is unmodified + tm.assert_frame_equal(df, original) + + # Non-Series array-like + result = df.assign(C=[4, 2.5, 2]) + tm.assert_frame_equal(result, expected) + # original is unmodified + tm.assert_frame_equal(df, original) + + result = df.assign(B=df.B / df.A) + expected = expected.drop("B", axis=1).rename(columns={"C": "B"}) + tm.assert_frame_equal(result, expected) + + # overwrite + result = df.assign(A=df.A + df.B) + expected = df.copy() + expected["A"] = [5, 7, 9] + tm.assert_frame_equal(result, expected) + + # lambda + result = df.assign(A=lambda x: x.A + x.B) + tm.assert_frame_equal(result, expected) + + def test_assign_multiple(self): + df = DataFrame([[1, 4], [2, 5], [3, 6]], columns=["A", "B"]) + result = df.assign(C=[7, 8, 9], D=df.A, E=lambda x: x.B) + expected = DataFrame( + [[1, 4, 7, 1, 4], [2, 5, 8, 2, 5], [3, 6, 9, 3, 6]], columns=list("ABCDE") + ) + tm.assert_frame_equal(result, expected) + + def test_assign_order(self): + # GH 9818 + df = DataFrame([[1, 2], [3, 4]], columns=["A", "B"]) + result = df.assign(D=df.A + df.B, C=df.A - df.B) + + expected = DataFrame([[1, 2, 3, -1], [3, 4, 7, -1]], columns=list("ABDC")) + tm.assert_frame_equal(result, expected) + result = df.assign(C=df.A - df.B, D=df.A + df.B) + + expected = DataFrame([[1, 2, -1, 3], [3, 4, -1, 7]], columns=list("ABCD")) + + tm.assert_frame_equal(result, expected) + + def test_assign_bad(self): + df = DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]}) + + # non-keyword argument + msg = r"assign\(\) takes 1 positional argument but 2 were given" + with pytest.raises(TypeError, match=msg): + df.assign(lambda x: x.A) + msg = "'DataFrame' object has no attribute 'C'" + with pytest.raises(AttributeError, match=msg): + df.assign(C=df.A, D=df.A + df.C) + + def test_assign_dependent(self): + df = DataFrame({"A": [1, 2], "B": [3, 4]}) + + result = df.assign(C=df.A, D=lambda x: x["A"] + x["C"]) + expected = DataFrame([[1, 3, 1, 2], [2, 4, 2, 4]], columns=list("ABCD")) + tm.assert_frame_equal(result, expected) + + result = df.assign(C=lambda df: df.A, D=lambda df: df["A"] + df["C"]) + expected = DataFrame([[1, 3, 1, 2], [2, 4, 2, 4]], columns=list("ABCD")) + tm.assert_frame_equal(result, expected) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/test_astype.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/test_astype.py new file mode 100644 index 0000000000000000000000000000000000000000..5a1e3cd786f84f3ddd60acbb4a10b214b602658e --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/test_astype.py @@ -0,0 +1,911 @@ +import re + +import numpy as np +import pytest + +import pandas.util._test_decorators as td + +import pandas as pd +from pandas import ( + Categorical, + CategoricalDtype, + DataFrame, + DatetimeTZDtype, + Index, + Interval, + IntervalDtype, + NaT, + Series, + Timedelta, + Timestamp, + concat, + date_range, + option_context, +) +import pandas._testing as tm + + +def _check_cast(df, v): + """ + Check if all dtypes of df are equal to v + """ + assert all(s.dtype.name == v for _, s in df.items()) + + +class TestAstype: + def test_astype_float(self, float_frame): + casted = float_frame.astype(int) + expected = DataFrame( + float_frame.values.astype(int), + index=float_frame.index, + columns=float_frame.columns, + ) + tm.assert_frame_equal(casted, expected) + + casted = float_frame.astype(np.int32) + expected = DataFrame( + float_frame.values.astype(np.int32), + index=float_frame.index, + columns=float_frame.columns, + ) + tm.assert_frame_equal(casted, expected) + + float_frame["foo"] = "5" + casted = float_frame.astype(int) + expected = DataFrame( + float_frame.values.astype(int), + index=float_frame.index, + columns=float_frame.columns, + ) + tm.assert_frame_equal(casted, expected) + + def test_astype_mixed_float(self, mixed_float_frame): + # mixed casting + casted = mixed_float_frame.reindex(columns=["A", "B"]).astype("float32") + _check_cast(casted, "float32") + + casted = mixed_float_frame.reindex(columns=["A", "B"]).astype("float16") + _check_cast(casted, "float16") + + def test_astype_mixed_type(self): + # mixed casting + df = DataFrame( + { + "a": 1.0, + "b": 2, + "c": "foo", + "float32": np.array([1.0] * 10, dtype="float32"), + "int32": np.array([1] * 10, dtype="int32"), + }, + index=np.arange(10), + ) + mn = df._get_numeric_data().copy() + mn["little_float"] = np.array(12345.0, dtype="float16") + mn["big_float"] = np.array(123456789101112.0, dtype="float64") + + casted = mn.astype("float64") + _check_cast(casted, "float64") + + casted = mn.astype("int64") + _check_cast(casted, "int64") + + casted = mn.reindex(columns=["little_float"]).astype("float16") + _check_cast(casted, "float16") + + casted = mn.astype("float32") + _check_cast(casted, "float32") + + casted = mn.astype("int32") + _check_cast(casted, "int32") + + # to object + casted = mn.astype("O") + _check_cast(casted, "object") + + def test_astype_with_exclude_string(self, float_frame): + df = float_frame.copy() + expected = float_frame.astype(int) + df["string"] = "foo" + casted = df.astype(int, errors="ignore") + + expected["string"] = "foo" + tm.assert_frame_equal(casted, expected) + + df = float_frame.copy() + expected = float_frame.astype(np.int32) + df["string"] = "foo" + casted = df.astype(np.int32, errors="ignore") + + expected["string"] = "foo" + tm.assert_frame_equal(casted, expected) + + def test_astype_with_view_float(self, float_frame): + # this is the only real reason to do it this way + tf = np.round(float_frame).astype(np.int32) + tf.astype(np.float32, copy=False) + + # TODO(wesm): verification? + tf = float_frame.astype(np.float64) + tf.astype(np.int64, copy=False) + + def test_astype_with_view_mixed_float(self, mixed_float_frame): + tf = mixed_float_frame.reindex(columns=["A", "B", "C"]) + + tf.astype(np.int64) + tf.astype(np.float32) + + @pytest.mark.parametrize("dtype", [np.int32, np.int64]) + @pytest.mark.parametrize("val", [np.nan, np.inf]) + def test_astype_cast_nan_inf_int(self, val, dtype): + # see GH#14265 + # + # Check NaN and inf --> raise error when converting to int. + msg = "Cannot convert non-finite values \\(NA or inf\\) to integer" + df = DataFrame([val]) + + with pytest.raises(ValueError, match=msg): + df.astype(dtype) + + def test_astype_str(self): + # see GH#9757 + a = Series(date_range("2010-01-04", periods=5)) + b = Series(date_range("3/6/2012 00:00", periods=5, tz="US/Eastern")) + c = Series([Timedelta(x, unit="d") for x in range(5)]) + d = Series(range(5)) + e = Series([0.0, 0.2, 0.4, 0.6, 0.8]) + + df = DataFrame({"a": a, "b": b, "c": c, "d": d, "e": e}) + + # Datetime-like + result = df.astype(str) + + expected = DataFrame( + { + "a": list(map(str, (Timestamp(x)._date_repr for x in a._values))), + "b": list(map(str, map(Timestamp, b._values))), + "c": [Timedelta(x)._repr_base() for x in c._values], + "d": list(map(str, d._values)), + "e": list(map(str, e._values)), + }, + dtype="object", + ) + + tm.assert_frame_equal(result, expected) + + def test_astype_str_float(self): + # see GH#11302 + result = DataFrame([np.nan]).astype(str) + expected = DataFrame(["nan"], dtype="object") + + tm.assert_frame_equal(result, expected) + result = DataFrame([1.12345678901234567890]).astype(str) + + val = "1.1234567890123457" + expected = DataFrame([val], dtype="object") + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize("dtype_class", [dict, Series]) + def test_astype_dict_like(self, dtype_class): + # GH7271 & GH16717 + a = Series(date_range("2010-01-04", periods=5)) + b = Series(range(5)) + c = Series([0.0, 0.2, 0.4, 0.6, 0.8]) + d = Series(["1.0", "2", "3.14", "4", "5.4"]) + df = DataFrame({"a": a, "b": b, "c": c, "d": d}) + original = df.copy(deep=True) + + # change type of a subset of columns + dt1 = dtype_class({"b": "str", "d": "float32"}) + result = df.astype(dt1) + expected = DataFrame( + { + "a": a, + "b": Series(["0", "1", "2", "3", "4"], dtype="object"), + "c": c, + "d": Series([1.0, 2.0, 3.14, 4.0, 5.4], dtype="float32"), + } + ) + tm.assert_frame_equal(result, expected) + tm.assert_frame_equal(df, original) + + dt2 = dtype_class({"b": np.float32, "c": "float32", "d": np.float64}) + result = df.astype(dt2) + expected = DataFrame( + { + "a": a, + "b": Series([0.0, 1.0, 2.0, 3.0, 4.0], dtype="float32"), + "c": Series([0.0, 0.2, 0.4, 0.6, 0.8], dtype="float32"), + "d": Series([1.0, 2.0, 3.14, 4.0, 5.4], dtype="float64"), + } + ) + tm.assert_frame_equal(result, expected) + tm.assert_frame_equal(df, original) + + # change all columns + dt3 = dtype_class({"a": str, "b": str, "c": str, "d": str}) + tm.assert_frame_equal(df.astype(dt3), df.astype(str)) + tm.assert_frame_equal(df, original) + + # error should be raised when using something other than column labels + # in the keys of the dtype dict + dt4 = dtype_class({"b": str, 2: str}) + dt5 = dtype_class({"e": str}) + msg_frame = ( + "Only a column name can be used for the key in a dtype mappings argument. " + "'{}' not found in columns." + ) + with pytest.raises(KeyError, match=msg_frame.format(2)): + df.astype(dt4) + with pytest.raises(KeyError, match=msg_frame.format("e")): + df.astype(dt5) + tm.assert_frame_equal(df, original) + + # if the dtypes provided are the same as the original dtypes, the + # resulting DataFrame should be the same as the original DataFrame + dt6 = dtype_class({col: df[col].dtype for col in df.columns}) + equiv = df.astype(dt6) + tm.assert_frame_equal(df, equiv) + tm.assert_frame_equal(df, original) + + # GH#16717 + # if dtypes provided is empty, the resulting DataFrame + # should be the same as the original DataFrame + dt7 = dtype_class({}) if dtype_class is dict else dtype_class({}, dtype=object) + equiv = df.astype(dt7) + tm.assert_frame_equal(df, equiv) + tm.assert_frame_equal(df, original) + + def test_astype_duplicate_col(self): + a1 = Series([1, 2, 3, 4, 5], name="a") + b = Series([0.1, 0.2, 0.4, 0.6, 0.8], name="b") + a2 = Series([0, 1, 2, 3, 4], name="a") + df = concat([a1, b, a2], axis=1) + + result = df.astype(str) + a1_str = Series(["1", "2", "3", "4", "5"], dtype="str", name="a") + b_str = Series(["0.1", "0.2", "0.4", "0.6", "0.8"], dtype=str, name="b") + a2_str = Series(["0", "1", "2", "3", "4"], dtype="str", name="a") + expected = concat([a1_str, b_str, a2_str], axis=1) + tm.assert_frame_equal(result, expected) + + result = df.astype({"a": "str"}) + expected = concat([a1_str, b, a2_str], axis=1) + tm.assert_frame_equal(result, expected) + + def test_astype_duplicate_col_series_arg(self): + # GH#44417 + vals = np.random.default_rng(2).standard_normal((3, 4)) + df = DataFrame(vals, columns=["A", "B", "C", "A"]) + dtypes = df.dtypes + dtypes.iloc[0] = str + dtypes.iloc[2] = "Float64" + + result = df.astype(dtypes) + expected = DataFrame( + { + 0: Series(vals[:, 0].astype(str), dtype=object), + 1: vals[:, 1], + 2: pd.array(vals[:, 2], dtype="Float64"), + 3: vals[:, 3], + } + ) + expected.columns = df.columns + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize( + "dtype", + [ + "category", + CategoricalDtype(), + CategoricalDtype(ordered=True), + CategoricalDtype(ordered=False), + CategoricalDtype(categories=list("abcdef")), + CategoricalDtype(categories=list("edba"), ordered=False), + CategoricalDtype(categories=list("edcb"), ordered=True), + ], + ids=repr, + ) + def test_astype_categorical(self, dtype): + # GH#18099 + d = {"A": list("abbc"), "B": list("bccd"), "C": list("cdde")} + df = DataFrame(d) + result = df.astype(dtype) + expected = DataFrame({k: Categorical(v, dtype=dtype) for k, v in d.items()}) + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize("cls", [CategoricalDtype, DatetimeTZDtype, IntervalDtype]) + def test_astype_categoricaldtype_class_raises(self, cls): + df = DataFrame({"A": ["a", "a", "b", "c"]}) + xpr = f"Expected an instance of {cls.__name__}" + with pytest.raises(TypeError, match=xpr): + df.astype({"A": cls}) + + with pytest.raises(TypeError, match=xpr): + df["A"].astype(cls) + + @pytest.mark.parametrize("dtype", ["Int64", "Int32", "Int16"]) + def test_astype_extension_dtypes(self, dtype): + # GH#22578 + df = DataFrame([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]], columns=["a", "b"]) + + expected1 = DataFrame( + { + "a": pd.array([1, 3, 5], dtype=dtype), + "b": pd.array([2, 4, 6], dtype=dtype), + } + ) + tm.assert_frame_equal(df.astype(dtype), expected1) + tm.assert_frame_equal(df.astype("int64").astype(dtype), expected1) + tm.assert_frame_equal(df.astype(dtype).astype("float64"), df) + + df = DataFrame([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]], columns=["a", "b"]) + df["b"] = df["b"].astype(dtype) + expected2 = DataFrame( + {"a": [1.0, 3.0, 5.0], "b": pd.array([2, 4, 6], dtype=dtype)} + ) + tm.assert_frame_equal(df, expected2) + + tm.assert_frame_equal(df.astype(dtype), expected1) + tm.assert_frame_equal(df.astype("int64").astype(dtype), expected1) + + @pytest.mark.parametrize("dtype", ["Int64", "Int32", "Int16"]) + def test_astype_extension_dtypes_1d(self, dtype): + # GH#22578 + df = DataFrame({"a": [1.0, 2.0, 3.0]}) + + expected1 = DataFrame({"a": pd.array([1, 2, 3], dtype=dtype)}) + tm.assert_frame_equal(df.astype(dtype), expected1) + tm.assert_frame_equal(df.astype("int64").astype(dtype), expected1) + + df = DataFrame({"a": [1.0, 2.0, 3.0]}) + df["a"] = df["a"].astype(dtype) + expected2 = DataFrame({"a": pd.array([1, 2, 3], dtype=dtype)}) + tm.assert_frame_equal(df, expected2) + + tm.assert_frame_equal(df.astype(dtype), expected1) + tm.assert_frame_equal(df.astype("int64").astype(dtype), expected1) + + @pytest.mark.parametrize("dtype", ["category", "Int64"]) + def test_astype_extension_dtypes_duplicate_col(self, dtype): + # GH#24704 + a1 = Series([0, np.nan, 4], name="a") + a2 = Series([np.nan, 3, 5], name="a") + df = concat([a1, a2], axis=1) + + result = df.astype(dtype) + expected = concat([a1.astype(dtype), a2.astype(dtype)], axis=1) + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize( + "dtype", [{100: "float64", 200: "uint64"}, "category", "float64"] + ) + def test_astype_column_metadata(self, dtype): + # GH#19920 + columns = Index([100, 200, 300], dtype=np.uint64, name="foo") + df = DataFrame(np.arange(15).reshape(5, 3), columns=columns) + df = df.astype(dtype) + tm.assert_index_equal(df.columns, columns) + + @pytest.mark.parametrize("unit", ["Y", "M", "W", "D", "h", "m"]) + def test_astype_from_object_to_datetime_unit(self, unit): + vals = [ + ["2015-01-01", "2015-01-02", "2015-01-03"], + ["2017-01-01", "2017-01-02", "2017-02-03"], + ] + df = DataFrame(vals, dtype=object) + msg = ( + rf"Unexpected value for 'dtype': 'datetime64\[{unit}\]'. " + r"Must be 'datetime64\[s\]', 'datetime64\[ms\]', 'datetime64\[us\]', " + r"'datetime64\[ns\]' or DatetimeTZDtype" + ) + with pytest.raises(ValueError, match=msg): + df.astype(f"M8[{unit}]") + + @pytest.mark.parametrize("unit", ["Y", "M", "W", "D", "h", "m"]) + def test_astype_from_object_to_timedelta_unit(self, unit): + vals = [ + ["1 Day", "2 Days", "3 Days"], + ["4 Days", "5 Days", "6 Days"], + ] + df = DataFrame(vals, dtype=object) + msg = ( + r"Cannot convert from timedelta64\[ns\] to timedelta64\[.*\]. " + "Supported resolutions are 's', 'ms', 'us', 'ns'" + ) + with pytest.raises(ValueError, match=msg): + # TODO: this is ValueError while for DatetimeArray it is TypeError; + # get these consistent + df.astype(f"m8[{unit}]") + + @pytest.mark.parametrize("dtype", ["M8", "m8"]) + @pytest.mark.parametrize("unit", ["ns", "us", "ms", "s", "h", "m", "D"]) + def test_astype_from_datetimelike_to_object(self, dtype, unit): + # tests astype to object dtype + # GH#19223 / GH#12425 + dtype = f"{dtype}[{unit}]" + arr = np.array([[1, 2, 3]], dtype=dtype) + df = DataFrame(arr) + result = df.astype(object) + assert (result.dtypes == object).all() + + if dtype.startswith("M8"): + assert result.iloc[0, 0] == Timestamp(1, unit=unit) + else: + assert result.iloc[0, 0] == Timedelta(1, unit=unit) + + @pytest.mark.parametrize("arr_dtype", [np.int64, np.float64]) + @pytest.mark.parametrize("dtype", ["M8", "m8"]) + @pytest.mark.parametrize("unit", ["ns", "us", "ms", "s", "h", "m", "D"]) + def test_astype_to_datetimelike_unit(self, arr_dtype, dtype, unit): + # tests all units from numeric origination + # GH#19223 / GH#12425 + dtype = f"{dtype}[{unit}]" + arr = np.array([[1, 2, 3]], dtype=arr_dtype) + df = DataFrame(arr) + result = df.astype(dtype) + expected = DataFrame(arr.astype(dtype)) + + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize("unit", ["ns", "us", "ms", "s", "h", "m", "D"]) + def test_astype_to_datetime_unit(self, unit): + # tests all units from datetime origination + # GH#19223 + dtype = f"M8[{unit}]" + arr = np.array([[1, 2, 3]], dtype=dtype) + df = DataFrame(arr) + ser = df.iloc[:, 0] + idx = Index(ser) + dta = ser._values + + if unit in ["ns", "us", "ms", "s"]: + # GH#48928 + result = df.astype(dtype) + else: + # we use the nearest supported dtype (i.e. M8[s]) + msg = rf"Cannot cast DatetimeArray to dtype datetime64\[{unit}\]" + with pytest.raises(TypeError, match=msg): + df.astype(dtype) + + with pytest.raises(TypeError, match=msg): + ser.astype(dtype) + + with pytest.raises(TypeError, match=msg.replace("Array", "Index")): + idx.astype(dtype) + + with pytest.raises(TypeError, match=msg): + dta.astype(dtype) + + return + + exp_df = DataFrame(arr.astype(dtype)) + assert (exp_df.dtypes == dtype).all() + tm.assert_frame_equal(result, exp_df) + + res_ser = ser.astype(dtype) + exp_ser = exp_df.iloc[:, 0] + assert exp_ser.dtype == dtype + tm.assert_series_equal(res_ser, exp_ser) + + exp_dta = exp_ser._values + + res_index = idx.astype(dtype) + exp_index = Index(exp_ser) + assert exp_index.dtype == dtype + tm.assert_index_equal(res_index, exp_index) + + res_dta = dta.astype(dtype) + assert exp_dta.dtype == dtype + tm.assert_extension_array_equal(res_dta, exp_dta) + + @pytest.mark.parametrize("unit", ["ns"]) + def test_astype_to_timedelta_unit_ns(self, unit): + # preserver the timedelta conversion + # GH#19223 + dtype = f"m8[{unit}]" + arr = np.array([[1, 2, 3]], dtype=dtype) + df = DataFrame(arr) + result = df.astype(dtype) + expected = DataFrame(arr.astype(dtype)) + + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize("unit", ["us", "ms", "s", "h", "m", "D"]) + def test_astype_to_timedelta_unit(self, unit): + # coerce to float + # GH#19223 until 2.0 used to coerce to float + dtype = f"m8[{unit}]" + arr = np.array([[1, 2, 3]], dtype=dtype) + df = DataFrame(arr) + ser = df.iloc[:, 0] + tdi = Index(ser) + tda = tdi._values + + if unit in ["us", "ms", "s"]: + assert (df.dtypes == dtype).all() + result = df.astype(dtype) + else: + # We get the nearest supported unit, i.e. "s" + assert (df.dtypes == "m8[s]").all() + + msg = ( + rf"Cannot convert from timedelta64\[s\] to timedelta64\[{unit}\]. " + "Supported resolutions are 's', 'ms', 'us', 'ns'" + ) + with pytest.raises(ValueError, match=msg): + df.astype(dtype) + with pytest.raises(ValueError, match=msg): + ser.astype(dtype) + with pytest.raises(ValueError, match=msg): + tdi.astype(dtype) + with pytest.raises(ValueError, match=msg): + tda.astype(dtype) + + return + + result = df.astype(dtype) + # The conversion is a no-op, so we just get a copy + expected = df + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize("unit", ["ns", "us", "ms", "s", "h", "m", "D"]) + def test_astype_to_incorrect_datetimelike(self, unit): + # trying to astype a m to a M, or vice-versa + # GH#19224 + dtype = f"M8[{unit}]" + other = f"m8[{unit}]" + + df = DataFrame(np.array([[1, 2, 3]], dtype=dtype)) + msg = "|".join( + [ + # BlockManager path + rf"Cannot cast DatetimeArray to dtype timedelta64\[{unit}\]", + # ArrayManager path + "cannot astype a datetimelike from " + rf"\[datetime64\[ns\]\] to \[timedelta64\[{unit}\]\]", + ] + ) + with pytest.raises(TypeError, match=msg): + df.astype(other) + + msg = "|".join( + [ + # BlockManager path + rf"Cannot cast TimedeltaArray to dtype datetime64\[{unit}\]", + # ArrayManager path + "cannot astype a timedelta from " + rf"\[timedelta64\[ns\]\] to \[datetime64\[{unit}\]\]", + ] + ) + df = DataFrame(np.array([[1, 2, 3]], dtype=other)) + with pytest.raises(TypeError, match=msg): + df.astype(dtype) + + def test_astype_arg_for_errors(self): + # GH#14878 + + df = DataFrame([1, 2, 3]) + + msg = ( + "Expected value of kwarg 'errors' to be one of " + "['raise', 'ignore']. Supplied value is 'True'" + ) + with pytest.raises(ValueError, match=re.escape(msg)): + df.astype(np.float64, errors=True) + + df.astype(np.int8, errors="ignore") + + def test_astype_invalid_conversion(self): + # GH#47571 + df = DataFrame({"a": [1, 2, "text"], "b": [1, 2, 3]}) + + msg = ( + "invalid literal for int() with base 10: 'text': " + "Error while type casting for column 'a'" + ) + + with pytest.raises(ValueError, match=re.escape(msg)): + df.astype({"a": int}) + + def test_astype_arg_for_errors_dictlist(self): + # GH#25905 + df = DataFrame( + [ + {"a": "1", "b": "16.5%", "c": "test"}, + {"a": "2.2", "b": "15.3", "c": "another_test"}, + ] + ) + expected = DataFrame( + [ + {"a": 1.0, "b": "16.5%", "c": "test"}, + {"a": 2.2, "b": "15.3", "c": "another_test"}, + ] + ) + expected["c"] = expected["c"].astype("object") + type_dict = {"a": "float64", "b": "float64", "c": "object"} + + result = df.astype(dtype=type_dict, errors="ignore") + + tm.assert_frame_equal(result, expected) + + def test_astype_dt64tz(self, timezone_frame): + # astype + expected = np.array( + [ + [ + Timestamp("2013-01-01 00:00:00"), + Timestamp("2013-01-02 00:00:00"), + Timestamp("2013-01-03 00:00:00"), + ], + [ + Timestamp("2013-01-01 00:00:00-0500", tz="US/Eastern"), + NaT, + Timestamp("2013-01-03 00:00:00-0500", tz="US/Eastern"), + ], + [ + Timestamp("2013-01-01 00:00:00+0100", tz="CET"), + NaT, + Timestamp("2013-01-03 00:00:00+0100", tz="CET"), + ], + ], + dtype=object, + ).T + expected = DataFrame( + expected, + index=timezone_frame.index, + columns=timezone_frame.columns, + dtype=object, + ) + result = timezone_frame.astype(object) + tm.assert_frame_equal(result, expected) + + msg = "Cannot use .astype to convert from timezone-aware dtype to timezone-" + with pytest.raises(TypeError, match=msg): + # dt64tz->dt64 deprecated + timezone_frame.astype("datetime64[ns]") + + def test_astype_dt64tz_to_str(self, timezone_frame): + # str formatting + result = timezone_frame.astype(str) + expected = DataFrame( + [ + [ + "2013-01-01", + "2013-01-01 00:00:00-05:00", + "2013-01-01 00:00:00+01:00", + ], + ["2013-01-02", "NaT", "NaT"], + [ + "2013-01-03", + "2013-01-03 00:00:00-05:00", + "2013-01-03 00:00:00+01:00", + ], + ], + columns=timezone_frame.columns, + dtype="object", + ) + tm.assert_frame_equal(result, expected) + + with option_context("display.max_columns", 20): + result = str(timezone_frame) + assert ( + "0 2013-01-01 2013-01-01 00:00:00-05:00 2013-01-01 00:00:00+01:00" + ) in result + assert ( + "1 2013-01-02 NaT NaT" + ) in result + assert ( + "2 2013-01-03 2013-01-03 00:00:00-05:00 2013-01-03 00:00:00+01:00" + ) in result + + def test_astype_empty_dtype_dict(self): + # issue mentioned further down in the following issue's thread + # https://github.com/pandas-dev/pandas/issues/33113 + df = DataFrame() + result = df.astype({}) + tm.assert_frame_equal(result, df) + assert result is not df + + @pytest.mark.parametrize( + "data, dtype", + [ + (["x", "y", "z"], "string[python]"), + pytest.param( + ["x", "y", "z"], + "string[pyarrow]", + marks=td.skip_if_no("pyarrow"), + ), + (["x", "y", "z"], "category"), + (3 * [Timestamp("2020-01-01", tz="UTC")], None), + (3 * [Interval(0, 1)], None), + ], + ) + @pytest.mark.parametrize("errors", ["raise", "ignore"]) + def test_astype_ignores_errors_for_extension_dtypes(self, data, dtype, errors): + # https://github.com/pandas-dev/pandas/issues/35471 + df = DataFrame(Series(data, dtype=dtype)) + if errors == "ignore": + expected = df + result = df.astype(float, errors=errors) + tm.assert_frame_equal(result, expected) + else: + msg = "(Cannot cast)|(could not convert)" + with pytest.raises((ValueError, TypeError), match=msg): + df.astype(float, errors=errors) + + def test_astype_tz_conversion(self): + # GH 35973 + val = {"tz": date_range("2020-08-30", freq="d", periods=2, tz="Europe/London")} + df = DataFrame(val) + result = df.astype({"tz": "datetime64[ns, Europe/Berlin]"}) + + expected = df + expected["tz"] = expected["tz"].dt.tz_convert("Europe/Berlin") + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize("tz", ["UTC", "Europe/Berlin"]) + def test_astype_tz_object_conversion(self, tz): + # GH 35973 + val = {"tz": date_range("2020-08-30", freq="d", periods=2, tz="Europe/London")} + expected = DataFrame(val) + + # convert expected to object dtype from other tz str (independently tested) + result = expected.astype({"tz": f"datetime64[ns, {tz}]"}) + result = result.astype({"tz": "object"}) + + # do real test: object dtype to a specified tz, different from construction tz. + result = result.astype({"tz": "datetime64[ns, Europe/London]"}) + tm.assert_frame_equal(result, expected) + + def test_astype_dt64_to_string( + self, frame_or_series, tz_naive_fixture, using_infer_string + ): + # GH#41409 + tz = tz_naive_fixture + + dti = date_range("2016-01-01", periods=3, tz=tz) + dta = dti._data + dta[0] = NaT + + obj = frame_or_series(dta) + result = obj.astype("string") + + # Check that Series/DataFrame.astype matches DatetimeArray.astype + expected = frame_or_series(dta.astype("string")) + tm.assert_equal(result, expected) + + item = result.iloc[0] + if frame_or_series is DataFrame: + item = item.iloc[0] + if using_infer_string: + assert item is np.nan + else: + assert item is pd.NA + + # For non-NA values, we should match what we get for non-EA str + alt = obj.astype(str) + assert np.all(alt.iloc[1:] == result.iloc[1:]) + + def test_astype_td64_to_string(self, frame_or_series): + # GH#41409 + tdi = pd.timedelta_range("1 Day", periods=3) + obj = frame_or_series(tdi) + + expected = frame_or_series(["1 days", "2 days", "3 days"], dtype="string") + result = obj.astype("string") + tm.assert_equal(result, expected) + + def test_astype_bytes(self): + # GH#39474 + result = DataFrame(["foo", "bar", "baz"]).astype(bytes) + assert result.dtypes[0] == np.dtype("S3") + + @pytest.mark.parametrize( + "index_slice", + [ + np.s_[:2, :2], + np.s_[:1, :2], + np.s_[:2, :1], + np.s_[::2, ::2], + np.s_[::1, ::2], + np.s_[::2, ::1], + ], + ) + def test_astype_noncontiguous(self, index_slice): + # GH#42396 + data = np.arange(16).reshape(4, 4) + df = DataFrame(data) + + result = df.iloc[index_slice].astype("int16") + expected = df.iloc[index_slice] + tm.assert_frame_equal(result, expected, check_dtype=False) + + def test_astype_retain_attrs(self, any_numpy_dtype): + # GH#44414 + df = DataFrame({"a": [0, 1, 2], "b": [3, 4, 5]}) + df.attrs["Location"] = "Michigan" + + result = df.astype({"a": any_numpy_dtype}).attrs + expected = df.attrs + + tm.assert_dict_equal(expected, result) + + +class TestAstypeCategorical: + def test_astype_from_categorical3(self): + df = DataFrame({"cats": [1, 2, 3, 4, 5, 6], "vals": [1, 2, 3, 4, 5, 6]}) + cats = Categorical([1, 2, 3, 4, 5, 6]) + exp_df = DataFrame({"cats": cats, "vals": [1, 2, 3, 4, 5, 6]}) + df["cats"] = df["cats"].astype("category") + tm.assert_frame_equal(exp_df, df) + + def test_astype_from_categorical4(self): + df = DataFrame( + {"cats": ["a", "b", "b", "a", "a", "d"], "vals": [1, 2, 3, 4, 5, 6]} + ) + cats = Categorical(["a", "b", "b", "a", "a", "d"]) + exp_df = DataFrame({"cats": cats, "vals": [1, 2, 3, 4, 5, 6]}) + df["cats"] = df["cats"].astype("category") + tm.assert_frame_equal(exp_df, df) + + def test_categorical_astype_to_int(self, any_int_dtype): + # GH#39402 + + df = DataFrame(data={"col1": pd.array([2.0, 1.0, 3.0])}) + df.col1 = df.col1.astype("category") + df.col1 = df.col1.astype(any_int_dtype) + expected = DataFrame({"col1": pd.array([2, 1, 3], dtype=any_int_dtype)}) + tm.assert_frame_equal(df, expected) + + def test_astype_categorical_to_string_missing(self): + # https://github.com/pandas-dev/pandas/issues/41797 + df = DataFrame(["a", "b", np.nan]) + expected = df.astype(str) + cat = df.astype("category") + result = cat.astype(str) + tm.assert_frame_equal(result, expected) + + +class IntegerArrayNoCopy(pd.core.arrays.IntegerArray): + # GH 42501 + + def copy(self): + assert False + + +class Int16DtypeNoCopy(pd.Int16Dtype): + # GH 42501 + + @classmethod + def construct_array_type(cls): + return IntegerArrayNoCopy + + +def test_frame_astype_no_copy(): + # GH 42501 + df = DataFrame({"a": [1, 4, None, 5], "b": [6, 7, 8, 9]}, dtype=object) + result = df.astype({"a": Int16DtypeNoCopy()}, copy=False) + + assert result.a.dtype == pd.Int16Dtype() + assert np.shares_memory(df.b.values, result.b.values) + + +@pytest.mark.parametrize("dtype", ["int64", "Int64"]) +def test_astype_copies(dtype): + # GH#50984 + pytest.importorskip("pyarrow") + df = DataFrame({"a": [1, 2, 3]}, dtype=dtype) + result = df.astype("int64[pyarrow]", copy=True) + df.iloc[0, 0] = 100 + expected = DataFrame({"a": [1, 2, 3]}, dtype="int64[pyarrow]") + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("val", [None, 1, 1.5, np.nan, NaT]) +def test_astype_to_string_not_modifying_input(string_storage, val): + # GH#51073 + df = DataFrame({"a": ["a", "b", val]}) + expected = df.copy() + with option_context("mode.string_storage", string_storage): + df.astype("string", copy=False) + tm.assert_frame_equal(df, expected) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/test_at_time.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/test_at_time.py new file mode 100644 index 0000000000000000000000000000000000000000..4c1434bd66aff127e07c4ff3fce90a22721b2035 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/test_at_time.py @@ -0,0 +1,132 @@ +from datetime import time + +import numpy as np +import pytest +import pytz + +from pandas._libs.tslibs import timezones + +from pandas import ( + DataFrame, + date_range, +) +import pandas._testing as tm + + +class TestAtTime: + @pytest.mark.parametrize("tzstr", ["US/Eastern", "dateutil/US/Eastern"]) + def test_localized_at_time(self, tzstr, frame_or_series): + tz = timezones.maybe_get_tz(tzstr) + + rng = date_range("4/16/2012", "5/1/2012", freq="h") + ts = frame_or_series( + np.random.default_rng(2).standard_normal(len(rng)), index=rng + ) + + ts_local = ts.tz_localize(tzstr) + + result = ts_local.at_time(time(10, 0)) + expected = ts.at_time(time(10, 0)).tz_localize(tzstr) + tm.assert_equal(result, expected) + assert timezones.tz_compare(result.index.tz, tz) + + def test_at_time(self, frame_or_series): + rng = date_range("1/1/2000", "1/5/2000", freq="5min") + ts = DataFrame( + np.random.default_rng(2).standard_normal((len(rng), 2)), index=rng + ) + ts = tm.get_obj(ts, frame_or_series) + rs = ts.at_time(rng[1]) + assert (rs.index.hour == rng[1].hour).all() + assert (rs.index.minute == rng[1].minute).all() + assert (rs.index.second == rng[1].second).all() + + result = ts.at_time("9:30") + expected = ts.at_time(time(9, 30)) + tm.assert_equal(result, expected) + + def test_at_time_midnight(self, frame_or_series): + # midnight, everything + rng = date_range("1/1/2000", "1/31/2000") + ts = DataFrame( + np.random.default_rng(2).standard_normal((len(rng), 3)), index=rng + ) + ts = tm.get_obj(ts, frame_or_series) + + result = ts.at_time(time(0, 0)) + tm.assert_equal(result, ts) + + def test_at_time_nonexistent(self, frame_or_series): + # time doesn't exist + rng = date_range("1/1/2012", freq="23Min", periods=384) + ts = DataFrame(np.random.default_rng(2).standard_normal(len(rng)), rng) + ts = tm.get_obj(ts, frame_or_series) + rs = ts.at_time("16:00") + assert len(rs) == 0 + + @pytest.mark.parametrize( + "hour", ["1:00", "1:00AM", time(1), time(1, tzinfo=pytz.UTC)] + ) + def test_at_time_errors(self, hour): + # GH#24043 + dti = date_range("2018", periods=3, freq="h") + df = DataFrame(list(range(len(dti))), index=dti) + if getattr(hour, "tzinfo", None) is None: + result = df.at_time(hour) + expected = df.iloc[1:2] + tm.assert_frame_equal(result, expected) + else: + with pytest.raises(ValueError, match="Index must be timezone"): + df.at_time(hour) + + def test_at_time_tz(self): + # GH#24043 + dti = date_range("2018", periods=3, freq="h", tz="US/Pacific") + df = DataFrame(list(range(len(dti))), index=dti) + result = df.at_time(time(4, tzinfo=pytz.timezone("US/Eastern"))) + expected = df.iloc[1:2] + tm.assert_frame_equal(result, expected) + + def test_at_time_raises(self, frame_or_series): + # GH#20725 + obj = DataFrame([[1, 2, 3], [4, 5, 6]]) + obj = tm.get_obj(obj, frame_or_series) + msg = "Index must be DatetimeIndex" + with pytest.raises(TypeError, match=msg): # index is not a DatetimeIndex + obj.at_time("00:00") + + @pytest.mark.parametrize("axis", ["index", "columns", 0, 1]) + def test_at_time_axis(self, axis): + # issue 8839 + rng = date_range("1/1/2000", "1/5/2000", freq="5min") + ts = DataFrame(np.random.default_rng(2).standard_normal((len(rng), len(rng)))) + ts.index, ts.columns = rng, rng + + indices = rng[(rng.hour == 9) & (rng.minute == 30) & (rng.second == 0)] + + if axis in ["index", 0]: + expected = ts.loc[indices, :] + elif axis in ["columns", 1]: + expected = ts.loc[:, indices] + + result = ts.at_time("9:30", axis=axis) + + # Without clearing freq, result has freq 1440T and expected 5T + result.index = result.index._with_freq(None) + expected.index = expected.index._with_freq(None) + tm.assert_frame_equal(result, expected) + + def test_at_time_datetimeindex(self): + index = date_range("2012-01-01", "2012-01-05", freq="30min") + df = DataFrame( + np.random.default_rng(2).standard_normal((len(index), 5)), index=index + ) + akey = time(12, 0, 0) + ainds = [24, 72, 120, 168] + + result = df.at_time(akey) + expected = df.loc[akey] + expected2 = df.iloc[ainds] + tm.assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected2) + assert len(result) == 4 diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/test_between_time.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/test_between_time.py new file mode 100644 index 0000000000000000000000000000000000000000..74d6291707e19d2b6536f4a5b758302ce3aa8e2b --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/test_between_time.py @@ -0,0 +1,227 @@ +from datetime import ( + datetime, + time, +) + +import numpy as np +import pytest + +from pandas._libs.tslibs import timezones +import pandas.util._test_decorators as td + +from pandas import ( + DataFrame, + Series, + date_range, +) +import pandas._testing as tm + + +class TestBetweenTime: + @td.skip_if_not_us_locale + def test_between_time_formats(self, frame_or_series): + # GH#11818 + rng = date_range("1/1/2000", "1/5/2000", freq="5min") + ts = DataFrame( + np.random.default_rng(2).standard_normal((len(rng), 2)), index=rng + ) + ts = tm.get_obj(ts, frame_or_series) + + strings = [ + ("2:00", "2:30"), + ("0200", "0230"), + ("2:00am", "2:30am"), + ("0200am", "0230am"), + ("2:00:00", "2:30:00"), + ("020000", "023000"), + ("2:00:00am", "2:30:00am"), + ("020000am", "023000am"), + ] + expected_length = 28 + + for time_string in strings: + assert len(ts.between_time(*time_string)) == expected_length + + @pytest.mark.parametrize("tzstr", ["US/Eastern", "dateutil/US/Eastern"]) + def test_localized_between_time(self, tzstr, frame_or_series): + tz = timezones.maybe_get_tz(tzstr) + + rng = date_range("4/16/2012", "5/1/2012", freq="h") + ts = Series(np.random.default_rng(2).standard_normal(len(rng)), index=rng) + if frame_or_series is DataFrame: + ts = ts.to_frame() + + ts_local = ts.tz_localize(tzstr) + + t1, t2 = time(10, 0), time(11, 0) + result = ts_local.between_time(t1, t2) + expected = ts.between_time(t1, t2).tz_localize(tzstr) + tm.assert_equal(result, expected) + assert timezones.tz_compare(result.index.tz, tz) + + def test_between_time_types(self, frame_or_series): + # GH11818 + rng = date_range("1/1/2000", "1/5/2000", freq="5min") + obj = DataFrame({"A": 0}, index=rng) + obj = tm.get_obj(obj, frame_or_series) + + msg = r"Cannot convert arg \[datetime\.datetime\(2010, 1, 2, 1, 0\)\] to a time" + with pytest.raises(ValueError, match=msg): + obj.between_time(datetime(2010, 1, 2, 1), datetime(2010, 1, 2, 5)) + + def test_between_time(self, inclusive_endpoints_fixture, frame_or_series): + rng = date_range("1/1/2000", "1/5/2000", freq="5min") + ts = DataFrame( + np.random.default_rng(2).standard_normal((len(rng), 2)), index=rng + ) + ts = tm.get_obj(ts, frame_or_series) + + stime = time(0, 0) + etime = time(1, 0) + inclusive = inclusive_endpoints_fixture + + filtered = ts.between_time(stime, etime, inclusive=inclusive) + exp_len = 13 * 4 + 1 + + if inclusive in ["right", "neither"]: + exp_len -= 5 + if inclusive in ["left", "neither"]: + exp_len -= 4 + + assert len(filtered) == exp_len + for rs in filtered.index: + t = rs.time() + if inclusive in ["left", "both"]: + assert t >= stime + else: + assert t > stime + + if inclusive in ["right", "both"]: + assert t <= etime + else: + assert t < etime + + result = ts.between_time("00:00", "01:00") + expected = ts.between_time(stime, etime) + tm.assert_equal(result, expected) + + # across midnight + rng = date_range("1/1/2000", "1/5/2000", freq="5min") + ts = DataFrame( + np.random.default_rng(2).standard_normal((len(rng), 2)), index=rng + ) + ts = tm.get_obj(ts, frame_or_series) + stime = time(22, 0) + etime = time(9, 0) + + filtered = ts.between_time(stime, etime, inclusive=inclusive) + exp_len = (12 * 11 + 1) * 4 + 1 + if inclusive in ["right", "neither"]: + exp_len -= 4 + if inclusive in ["left", "neither"]: + exp_len -= 4 + + assert len(filtered) == exp_len + for rs in filtered.index: + t = rs.time() + if inclusive in ["left", "both"]: + assert (t >= stime) or (t <= etime) + else: + assert (t > stime) or (t <= etime) + + if inclusive in ["right", "both"]: + assert (t <= etime) or (t >= stime) + else: + assert (t < etime) or (t >= stime) + + def test_between_time_raises(self, frame_or_series): + # GH#20725 + obj = DataFrame([[1, 2, 3], [4, 5, 6]]) + obj = tm.get_obj(obj, frame_or_series) + + msg = "Index must be DatetimeIndex" + with pytest.raises(TypeError, match=msg): # index is not a DatetimeIndex + obj.between_time(start_time="00:00", end_time="12:00") + + def test_between_time_axis(self, frame_or_series): + # GH#8839 + rng = date_range("1/1/2000", periods=100, freq="10min") + ts = Series(np.random.default_rng(2).standard_normal(len(rng)), index=rng) + if frame_or_series is DataFrame: + ts = ts.to_frame() + + stime, etime = ("08:00:00", "09:00:00") + expected_length = 7 + + assert len(ts.between_time(stime, etime)) == expected_length + assert len(ts.between_time(stime, etime, axis=0)) == expected_length + msg = f"No axis named {ts.ndim} for object type {type(ts).__name__}" + with pytest.raises(ValueError, match=msg): + ts.between_time(stime, etime, axis=ts.ndim) + + def test_between_time_axis_aliases(self, axis): + # GH#8839 + rng = date_range("1/1/2000", periods=100, freq="10min") + ts = DataFrame(np.random.default_rng(2).standard_normal((len(rng), len(rng)))) + stime, etime = ("08:00:00", "09:00:00") + exp_len = 7 + + if axis in ["index", 0]: + ts.index = rng + assert len(ts.between_time(stime, etime)) == exp_len + assert len(ts.between_time(stime, etime, axis=0)) == exp_len + + if axis in ["columns", 1]: + ts.columns = rng + selected = ts.between_time(stime, etime, axis=1).columns + assert len(selected) == exp_len + + def test_between_time_axis_raises(self, axis): + # issue 8839 + rng = date_range("1/1/2000", periods=100, freq="10min") + mask = np.arange(0, len(rng)) + rand_data = np.random.default_rng(2).standard_normal((len(rng), len(rng))) + ts = DataFrame(rand_data, index=rng, columns=rng) + stime, etime = ("08:00:00", "09:00:00") + + msg = "Index must be DatetimeIndex" + if axis in ["columns", 1]: + ts.index = mask + with pytest.raises(TypeError, match=msg): + ts.between_time(stime, etime) + with pytest.raises(TypeError, match=msg): + ts.between_time(stime, etime, axis=0) + + if axis in ["index", 0]: + ts.columns = mask + with pytest.raises(TypeError, match=msg): + ts.between_time(stime, etime, axis=1) + + def test_between_time_datetimeindex(self): + index = date_range("2012-01-01", "2012-01-05", freq="30min") + df = DataFrame( + np.random.default_rng(2).standard_normal((len(index), 5)), index=index + ) + bkey = slice(time(13, 0, 0), time(14, 0, 0)) + binds = [26, 27, 28, 74, 75, 76, 122, 123, 124, 170, 171, 172] + + result = df.between_time(bkey.start, bkey.stop) + expected = df.loc[bkey] + expected2 = df.iloc[binds] + tm.assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected2) + assert len(result) == 12 + + def test_between_time_incorrect_arg_inclusive(self): + # GH40245 + rng = date_range("1/1/2000", "1/5/2000", freq="5min") + ts = DataFrame( + np.random.default_rng(2).standard_normal((len(rng), 2)), index=rng + ) + + stime = time(0, 0) + etime = time(1, 0) + inclusive = "bad_string" + msg = "Inclusive has to be either 'both', 'neither', 'left' or 'right'" + with pytest.raises(ValueError, match=msg): + ts.between_time(stime, etime, inclusive=inclusive) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/test_clip.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/test_clip.py new file mode 100644 index 0000000000000000000000000000000000000000..f783a388d75179e9b68e373da65218dfffb04b55 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/test_clip.py @@ -0,0 +1,199 @@ +import numpy as np +import pytest + +from pandas import ( + DataFrame, + Series, +) +import pandas._testing as tm + + +class TestDataFrameClip: + def test_clip(self, float_frame): + median = float_frame.median().median() + original = float_frame.copy() + + double = float_frame.clip(upper=median, lower=median) + assert not (double.values != median).any() + + # Verify that float_frame was not changed inplace + assert (float_frame.values == original.values).all() + + def test_inplace_clip(self, float_frame): + # GH#15388 + median = float_frame.median().median() + frame_copy = float_frame.copy() + + return_value = frame_copy.clip(upper=median, lower=median, inplace=True) + assert return_value is None + assert not (frame_copy.values != median).any() + + def test_dataframe_clip(self): + # GH#2747 + df = DataFrame(np.random.default_rng(2).standard_normal((1000, 2))) + + for lb, ub in [(-1, 1), (1, -1)]: + clipped_df = df.clip(lb, ub) + + lb, ub = min(lb, ub), max(ub, lb) + lb_mask = df.values <= lb + ub_mask = df.values >= ub + mask = ~lb_mask & ~ub_mask + assert (clipped_df.values[lb_mask] == lb).all() + assert (clipped_df.values[ub_mask] == ub).all() + assert (clipped_df.values[mask] == df.values[mask]).all() + + def test_clip_mixed_numeric(self): + # clip on mixed integer or floats + # GH#24162, clipping now preserves numeric types per column + df = DataFrame({"A": [1, 2, 3], "B": [1.0, np.nan, 3.0]}) + result = df.clip(1, 2) + expected = DataFrame({"A": [1, 2, 2], "B": [1.0, np.nan, 2.0]}) + tm.assert_frame_equal(result, expected) + + df = DataFrame([[1, 2, 3.4], [3, 4, 5.6]], columns=["foo", "bar", "baz"]) + expected = df.dtypes + result = df.clip(upper=3).dtypes + tm.assert_series_equal(result, expected) + + @pytest.mark.parametrize("inplace", [True, False]) + def test_clip_against_series(self, inplace): + # GH#6966 + + df = DataFrame(np.random.default_rng(2).standard_normal((1000, 2))) + lb = Series(np.random.default_rng(2).standard_normal(1000)) + ub = lb + 1 + + original = df.copy() + clipped_df = df.clip(lb, ub, axis=0, inplace=inplace) + + if inplace: + clipped_df = df + + for i in range(2): + lb_mask = original.iloc[:, i] <= lb + ub_mask = original.iloc[:, i] >= ub + mask = ~lb_mask & ~ub_mask + + result = clipped_df.loc[lb_mask, i] + tm.assert_series_equal(result, lb[lb_mask], check_names=False) + assert result.name == i + + result = clipped_df.loc[ub_mask, i] + tm.assert_series_equal(result, ub[ub_mask], check_names=False) + assert result.name == i + + tm.assert_series_equal(clipped_df.loc[mask, i], df.loc[mask, i]) + + @pytest.mark.parametrize("inplace", [True, False]) + @pytest.mark.parametrize("lower", [[2, 3, 4], np.asarray([2, 3, 4])]) + @pytest.mark.parametrize( + "axis,res", + [ + (0, [[2.0, 2.0, 3.0], [4.0, 5.0, 6.0], [7.0, 7.0, 7.0]]), + (1, [[2.0, 3.0, 4.0], [4.0, 5.0, 6.0], [5.0, 6.0, 7.0]]), + ], + ) + def test_clip_against_list_like(self, inplace, lower, axis, res): + # GH#15390 + arr = np.array([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0], [7.0, 8.0, 9.0]]) + + original = DataFrame( + arr, columns=["one", "two", "three"], index=["a", "b", "c"] + ) + + result = original.clip(lower=lower, upper=[5, 6, 7], axis=axis, inplace=inplace) + + expected = DataFrame(res, columns=original.columns, index=original.index) + if inplace: + result = original + tm.assert_frame_equal(result, expected, check_exact=True) + + @pytest.mark.parametrize("axis", [0, 1, None]) + def test_clip_against_frame(self, axis): + df = DataFrame(np.random.default_rng(2).standard_normal((1000, 2))) + lb = DataFrame(np.random.default_rng(2).standard_normal((1000, 2))) + ub = lb + 1 + + clipped_df = df.clip(lb, ub, axis=axis) + + lb_mask = df <= lb + ub_mask = df >= ub + mask = ~lb_mask & ~ub_mask + + tm.assert_frame_equal(clipped_df[lb_mask], lb[lb_mask]) + tm.assert_frame_equal(clipped_df[ub_mask], ub[ub_mask]) + tm.assert_frame_equal(clipped_df[mask], df[mask]) + + def test_clip_against_unordered_columns(self): + # GH#20911 + df1 = DataFrame( + np.random.default_rng(2).standard_normal((1000, 4)), + columns=["A", "B", "C", "D"], + ) + df2 = DataFrame( + np.random.default_rng(2).standard_normal((1000, 4)), + columns=["D", "A", "B", "C"], + ) + df3 = DataFrame(df2.values - 1, columns=["B", "D", "C", "A"]) + result_upper = df1.clip(lower=0, upper=df2) + expected_upper = df1.clip(lower=0, upper=df2[df1.columns]) + result_lower = df1.clip(lower=df3, upper=3) + expected_lower = df1.clip(lower=df3[df1.columns], upper=3) + result_lower_upper = df1.clip(lower=df3, upper=df2) + expected_lower_upper = df1.clip(lower=df3[df1.columns], upper=df2[df1.columns]) + tm.assert_frame_equal(result_upper, expected_upper) + tm.assert_frame_equal(result_lower, expected_lower) + tm.assert_frame_equal(result_lower_upper, expected_lower_upper) + + def test_clip_with_na_args(self, float_frame): + """Should process np.nan argument as None""" + # GH#17276 + tm.assert_frame_equal(float_frame.clip(np.nan), float_frame) + tm.assert_frame_equal(float_frame.clip(upper=np.nan, lower=np.nan), float_frame) + + # GH#19992 and adjusted in GH#40420 + df = DataFrame({"col_0": [1, 2, 3], "col_1": [4, 5, 6], "col_2": [7, 8, 9]}) + + msg = "Downcasting behavior in Series and DataFrame methods 'where'" + # TODO: avoid this warning here? seems like we should never be upcasting + # in the first place? + with tm.assert_produces_warning(FutureWarning, match=msg): + result = df.clip(lower=[4, 5, np.nan], axis=0) + expected = DataFrame( + {"col_0": [4, 5, 3], "col_1": [4, 5, 6], "col_2": [7, 8, 9]} + ) + tm.assert_frame_equal(result, expected) + + result = df.clip(lower=[4, 5, np.nan], axis=1) + expected = DataFrame( + {"col_0": [4, 4, 4], "col_1": [5, 5, 6], "col_2": [7, 8, 9]} + ) + tm.assert_frame_equal(result, expected) + + # GH#40420 + data = {"col_0": [9, -3, 0, -1, 5], "col_1": [-2, -7, 6, 8, -5]} + df = DataFrame(data) + t = Series([2, -4, np.nan, 6, 3]) + with tm.assert_produces_warning(FutureWarning, match=msg): + result = df.clip(lower=t, axis=0) + expected = DataFrame({"col_0": [9, -3, 0, 6, 5], "col_1": [2, -4, 6, 8, 3]}) + tm.assert_frame_equal(result, expected) + + def test_clip_int_data_with_float_bound(self): + # GH51472 + df = DataFrame({"a": [1, 2, 3]}) + result = df.clip(lower=1.5) + expected = DataFrame({"a": [1.5, 2.0, 3.0]}) + tm.assert_frame_equal(result, expected) + + def test_clip_with_list_bound(self): + # GH#54817 + df = DataFrame([1, 5]) + expected = DataFrame([3, 5]) + result = df.clip([3]) + tm.assert_frame_equal(result, expected) + + expected = DataFrame([1, 3]) + result = df.clip(upper=[3]) + tm.assert_frame_equal(result, expected) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/test_combine.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/test_combine.py new file mode 100644 index 0000000000000000000000000000000000000000..bc6a67e4e1f320dbb71220d33ba03eaf788bcb4b --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/test_combine.py @@ -0,0 +1,47 @@ +import numpy as np +import pytest + +import pandas as pd +import pandas._testing as tm + + +class TestCombine: + @pytest.mark.parametrize( + "data", + [ + pd.date_range("2000", periods=4), + pd.date_range("2000", periods=4, tz="US/Central"), + pd.period_range("2000", periods=4), + pd.timedelta_range(0, periods=4), + ], + ) + def test_combine_datetlike_udf(self, data): + # GH#23079 + df = pd.DataFrame({"A": data}) + other = df.copy() + df.iloc[1, 0] = None + + def combiner(a, b): + return b + + result = df.combine(other, combiner) + tm.assert_frame_equal(result, other) + + def test_combine_generic(self, float_frame): + df1 = float_frame + df2 = float_frame.loc[float_frame.index[:-5], ["A", "B", "C"]] + + combined = df1.combine(df2, np.add) + combined2 = df2.combine(df1, np.add) + assert combined["D"].isna().all() + assert combined2["D"].isna().all() + + chunk = combined.loc[combined.index[:-5], ["A", "B", "C"]] + chunk2 = combined2.loc[combined2.index[:-5], ["A", "B", "C"]] + + exp = ( + float_frame.loc[float_frame.index[:-5], ["A", "B", "C"]].reindex_like(chunk) + * 2 + ) + tm.assert_frame_equal(chunk, exp) + tm.assert_frame_equal(chunk2, exp) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/test_combine_first.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/test_combine_first.py new file mode 100644 index 0000000000000000000000000000000000000000..8aeab5dacd8b4aeb96ef5b91ea0de34c485eab2c --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/test_combine_first.py @@ -0,0 +1,556 @@ +from datetime import datetime + +import numpy as np +import pytest + +from pandas.core.dtypes.cast import find_common_type +from pandas.core.dtypes.common import is_dtype_equal + +import pandas as pd +from pandas import ( + DataFrame, + Index, + MultiIndex, + Series, +) +import pandas._testing as tm + + +class TestDataFrameCombineFirst: + def test_combine_first_mixed(self): + a = Series(["a", "b"], index=range(2)) + b = Series(range(2), index=range(2)) + f = DataFrame({"A": a, "B": b}) + + a = Series(["a", "b"], index=range(5, 7)) + b = Series(range(2), index=range(5, 7)) + g = DataFrame({"A": a, "B": b}) + + exp = DataFrame({"A": list("abab"), "B": [0, 1, 0, 1]}, index=[0, 1, 5, 6]) + combined = f.combine_first(g) + tm.assert_frame_equal(combined, exp) + + def test_combine_first(self, float_frame, using_infer_string): + # disjoint + head, tail = float_frame[:5], float_frame[5:] + + combined = head.combine_first(tail) + reordered_frame = float_frame.reindex(combined.index) + tm.assert_frame_equal(combined, reordered_frame) + tm.assert_index_equal(combined.columns, float_frame.columns) + tm.assert_series_equal(combined["A"], reordered_frame["A"]) + + # same index + fcopy = float_frame.copy() + fcopy["A"] = 1 + del fcopy["C"] + + fcopy2 = float_frame.copy() + fcopy2["B"] = 0 + del fcopy2["D"] + + combined = fcopy.combine_first(fcopy2) + + assert (combined["A"] == 1).all() + tm.assert_series_equal(combined["B"], fcopy["B"]) + tm.assert_series_equal(combined["C"], fcopy2["C"]) + tm.assert_series_equal(combined["D"], fcopy["D"]) + + # overlap + head, tail = reordered_frame[:10].copy(), reordered_frame + head["A"] = 1 + + combined = head.combine_first(tail) + assert (combined["A"][:10] == 1).all() + + # reverse overlap + tail.iloc[:10, tail.columns.get_loc("A")] = 0 + combined = tail.combine_first(head) + assert (combined["A"][:10] == 0).all() + + # no overlap + f = float_frame[:10] + g = float_frame[10:] + combined = f.combine_first(g) + tm.assert_series_equal(combined["A"].reindex(f.index), f["A"]) + tm.assert_series_equal(combined["A"].reindex(g.index), g["A"]) + + # corner cases + warning = FutureWarning if using_infer_string else None + with tm.assert_produces_warning(warning, match="empty entries"): + comb = float_frame.combine_first(DataFrame()) + tm.assert_frame_equal(comb, float_frame) + + comb = DataFrame().combine_first(float_frame) + tm.assert_frame_equal(comb, float_frame.sort_index()) + + comb = float_frame.combine_first(DataFrame(index=["faz", "boo"])) + assert "faz" in comb.index + + # #2525 + df = DataFrame({"a": [1]}, index=[datetime(2012, 1, 1)]) + df2 = DataFrame(columns=["b"]) + result = df.combine_first(df2) + assert "b" in result + + def test_combine_first_mixed_bug(self): + idx = Index(["a", "b", "c", "e"]) + ser1 = Series([5.0, -9.0, 4.0, 100.0], index=idx) + ser2 = Series(["a", "b", "c", "e"], index=idx) + ser3 = Series([12, 4, 5, 97], index=idx) + + frame1 = DataFrame({"col0": ser1, "col2": ser2, "col3": ser3}) + + idx = Index(["a", "b", "c", "f"]) + ser1 = Series([5.0, -9.0, 4.0, 100.0], index=idx) + ser2 = Series(["a", "b", "c", "f"], index=idx) + ser3 = Series([12, 4, 5, 97], index=idx) + + frame2 = DataFrame({"col1": ser1, "col2": ser2, "col5": ser3}) + + combined = frame1.combine_first(frame2) + assert len(combined.columns) == 5 + + def test_combine_first_same_as_in_update(self): + # gh 3016 (same as in update) + df = DataFrame( + [[1.0, 2.0, False, True], [4.0, 5.0, True, False]], + columns=["A", "B", "bool1", "bool2"], + ) + + other = DataFrame([[45, 45]], index=[0], columns=["A", "B"]) + result = df.combine_first(other) + tm.assert_frame_equal(result, df) + + df.loc[0, "A"] = np.nan + result = df.combine_first(other) + df.loc[0, "A"] = 45 + tm.assert_frame_equal(result, df) + + def test_combine_first_doc_example(self): + # doc example + df1 = DataFrame( + {"A": [1.0, np.nan, 3.0, 5.0, np.nan], "B": [np.nan, 2.0, 3.0, np.nan, 6.0]} + ) + + df2 = DataFrame( + { + "A": [5.0, 2.0, 4.0, np.nan, 3.0, 7.0], + "B": [np.nan, np.nan, 3.0, 4.0, 6.0, 8.0], + } + ) + + result = df1.combine_first(df2) + expected = DataFrame({"A": [1, 2, 3, 5, 3, 7.0], "B": [np.nan, 2, 3, 4, 6, 8]}) + tm.assert_frame_equal(result, expected) + + def test_combine_first_return_obj_type_with_bools(self): + # GH3552 + + df1 = DataFrame( + [[np.nan, 3.0, True], [-4.6, np.nan, True], [np.nan, 7.0, False]] + ) + df2 = DataFrame([[-42.6, np.nan, True], [-5.0, 1.6, False]], index=[1, 2]) + + expected = Series([True, True, False], name=2, dtype=bool) + + result_12 = df1.combine_first(df2)[2] + tm.assert_series_equal(result_12, expected) + + result_21 = df2.combine_first(df1)[2] + tm.assert_series_equal(result_21, expected) + + @pytest.mark.parametrize( + "data1, data2, data_expected", + ( + ( + [datetime(2000, 1, 1), datetime(2000, 1, 2), datetime(2000, 1, 3)], + [pd.NaT, pd.NaT, pd.NaT], + [datetime(2000, 1, 1), datetime(2000, 1, 2), datetime(2000, 1, 3)], + ), + ( + [pd.NaT, pd.NaT, pd.NaT], + [datetime(2000, 1, 1), datetime(2000, 1, 2), datetime(2000, 1, 3)], + [datetime(2000, 1, 1), datetime(2000, 1, 2), datetime(2000, 1, 3)], + ), + ( + [datetime(2000, 1, 2), pd.NaT, pd.NaT], + [datetime(2000, 1, 1), datetime(2000, 1, 2), datetime(2000, 1, 3)], + [datetime(2000, 1, 2), datetime(2000, 1, 2), datetime(2000, 1, 3)], + ), + ( + [datetime(2000, 1, 1), datetime(2000, 1, 2), datetime(2000, 1, 3)], + [datetime(2000, 1, 2), pd.NaT, pd.NaT], + [datetime(2000, 1, 1), datetime(2000, 1, 2), datetime(2000, 1, 3)], + ), + ), + ) + def test_combine_first_convert_datatime_correctly( + self, data1, data2, data_expected + ): + # GH 3593 + + df1, df2 = DataFrame({"a": data1}), DataFrame({"a": data2}) + result = df1.combine_first(df2) + expected = DataFrame({"a": data_expected}) + tm.assert_frame_equal(result, expected) + + def test_combine_first_align_nan(self): + # GH 7509 (not fixed) + dfa = DataFrame([[pd.Timestamp("2011-01-01"), 2]], columns=["a", "b"]) + dfb = DataFrame([[4], [5]], columns=["b"]) + assert dfa["a"].dtype == "datetime64[ns]" + assert dfa["b"].dtype == "int64" + + res = dfa.combine_first(dfb) + exp = DataFrame( + {"a": [pd.Timestamp("2011-01-01"), pd.NaT], "b": [2, 5]}, + columns=["a", "b"], + ) + tm.assert_frame_equal(res, exp) + assert res["a"].dtype == "datetime64[ns]" + # TODO: this must be int64 + assert res["b"].dtype == "int64" + + res = dfa.iloc[:0].combine_first(dfb) + exp = DataFrame({"a": [np.nan, np.nan], "b": [4, 5]}, columns=["a", "b"]) + tm.assert_frame_equal(res, exp) + # TODO: this must be datetime64 + assert res["a"].dtype == "float64" + # TODO: this must be int64 + assert res["b"].dtype == "int64" + + def test_combine_first_timezone(self, unit): + # see gh-7630 + data1 = pd.to_datetime("20100101 01:01").tz_localize("UTC").as_unit(unit) + df1 = DataFrame( + columns=["UTCdatetime", "abc"], + data=data1, + index=pd.date_range("20140627", periods=1), + ) + data2 = pd.to_datetime("20121212 12:12").tz_localize("UTC").as_unit(unit) + df2 = DataFrame( + columns=["UTCdatetime", "xyz"], + data=data2, + index=pd.date_range("20140628", periods=1), + ) + res = df2[["UTCdatetime"]].combine_first(df1) + exp = DataFrame( + { + "UTCdatetime": [ + pd.Timestamp("2010-01-01 01:01", tz="UTC"), + pd.Timestamp("2012-12-12 12:12", tz="UTC"), + ], + "abc": [pd.Timestamp("2010-01-01 01:01:00", tz="UTC"), pd.NaT], + }, + columns=["UTCdatetime", "abc"], + index=pd.date_range("20140627", periods=2, freq="D"), + dtype=f"datetime64[{unit}, UTC]", + ) + assert res["UTCdatetime"].dtype == f"datetime64[{unit}, UTC]" + assert res["abc"].dtype == f"datetime64[{unit}, UTC]" + + tm.assert_frame_equal(res, exp) + + def test_combine_first_timezone2(self, unit): + # see gh-10567 + dts1 = pd.date_range("2015-01-01", "2015-01-05", tz="UTC", unit=unit) + df1 = DataFrame({"DATE": dts1}) + dts2 = pd.date_range("2015-01-03", "2015-01-05", tz="UTC", unit=unit) + df2 = DataFrame({"DATE": dts2}) + + res = df1.combine_first(df2) + tm.assert_frame_equal(res, df1) + assert res["DATE"].dtype == f"datetime64[{unit}, UTC]" + + def test_combine_first_timezone3(self, unit): + dts1 = pd.DatetimeIndex( + ["2011-01-01", "NaT", "2011-01-03", "2011-01-04"], tz="US/Eastern" + ).as_unit(unit) + df1 = DataFrame({"DATE": dts1}, index=[1, 3, 5, 7]) + dts2 = pd.DatetimeIndex( + ["2012-01-01", "2012-01-02", "2012-01-03"], tz="US/Eastern" + ).as_unit(unit) + df2 = DataFrame({"DATE": dts2}, index=[2, 4, 5]) + + res = df1.combine_first(df2) + exp_dts = pd.DatetimeIndex( + [ + "2011-01-01", + "2012-01-01", + "NaT", + "2012-01-02", + "2011-01-03", + "2011-01-04", + ], + tz="US/Eastern", + ).as_unit(unit) + exp = DataFrame({"DATE": exp_dts}, index=[1, 2, 3, 4, 5, 7]) + tm.assert_frame_equal(res, exp) + + # FIXME: parametrizing over unit breaks on non-nano + def test_combine_first_timezone4(self): + # different tz + dts1 = pd.date_range("2015-01-01", "2015-01-05", tz="US/Eastern") + df1 = DataFrame({"DATE": dts1}) + dts2 = pd.date_range("2015-01-03", "2015-01-05") + df2 = DataFrame({"DATE": dts2}) + + # if df1 doesn't have NaN, keep its dtype + res = df1.combine_first(df2) + tm.assert_frame_equal(res, df1) + assert res["DATE"].dtype == "datetime64[ns, US/Eastern]" + + def test_combine_first_timezone5(self, unit): + dts1 = pd.date_range("2015-01-01", "2015-01-02", tz="US/Eastern", unit=unit) + df1 = DataFrame({"DATE": dts1}) + dts2 = pd.date_range("2015-01-01", "2015-01-03", unit=unit) + df2 = DataFrame({"DATE": dts2}) + + res = df1.combine_first(df2) + exp_dts = [ + pd.Timestamp("2015-01-01", tz="US/Eastern"), + pd.Timestamp("2015-01-02", tz="US/Eastern"), + pd.Timestamp("2015-01-03"), + ] + exp = DataFrame({"DATE": exp_dts}) + tm.assert_frame_equal(res, exp) + assert res["DATE"].dtype == "object" + + def test_combine_first_timedelta(self): + data1 = pd.TimedeltaIndex(["1 day", "NaT", "3 day", "4day"]) + df1 = DataFrame({"TD": data1}, index=[1, 3, 5, 7]) + data2 = pd.TimedeltaIndex(["10 day", "11 day", "12 day"]) + df2 = DataFrame({"TD": data2}, index=[2, 4, 5]) + + res = df1.combine_first(df2) + exp_dts = pd.TimedeltaIndex( + ["1 day", "10 day", "NaT", "11 day", "3 day", "4 day"] + ) + exp = DataFrame({"TD": exp_dts}, index=[1, 2, 3, 4, 5, 7]) + tm.assert_frame_equal(res, exp) + assert res["TD"].dtype == "timedelta64[ns]" + + def test_combine_first_period(self): + data1 = pd.PeriodIndex(["2011-01", "NaT", "2011-03", "2011-04"], freq="M") + df1 = DataFrame({"P": data1}, index=[1, 3, 5, 7]) + data2 = pd.PeriodIndex(["2012-01-01", "2012-02", "2012-03"], freq="M") + df2 = DataFrame({"P": data2}, index=[2, 4, 5]) + + res = df1.combine_first(df2) + exp_dts = pd.PeriodIndex( + ["2011-01", "2012-01", "NaT", "2012-02", "2011-03", "2011-04"], freq="M" + ) + exp = DataFrame({"P": exp_dts}, index=[1, 2, 3, 4, 5, 7]) + tm.assert_frame_equal(res, exp) + assert res["P"].dtype == data1.dtype + + # different freq + dts2 = pd.PeriodIndex(["2012-01-01", "2012-01-02", "2012-01-03"], freq="D") + df2 = DataFrame({"P": dts2}, index=[2, 4, 5]) + + res = df1.combine_first(df2) + exp_dts = [ + pd.Period("2011-01", freq="M"), + pd.Period("2012-01-01", freq="D"), + pd.NaT, + pd.Period("2012-01-02", freq="D"), + pd.Period("2011-03", freq="M"), + pd.Period("2011-04", freq="M"), + ] + exp = DataFrame({"P": exp_dts}, index=[1, 2, 3, 4, 5, 7]) + tm.assert_frame_equal(res, exp) + assert res["P"].dtype == "object" + + def test_combine_first_int(self): + # GH14687 - integer series that do no align exactly + + df1 = DataFrame({"a": [0, 1, 3, 5]}, dtype="int64") + df2 = DataFrame({"a": [1, 4]}, dtype="int64") + + result_12 = df1.combine_first(df2) + expected_12 = DataFrame({"a": [0, 1, 3, 5]}) + tm.assert_frame_equal(result_12, expected_12) + + result_21 = df2.combine_first(df1) + expected_21 = DataFrame({"a": [1, 4, 3, 5]}) + tm.assert_frame_equal(result_21, expected_21) + + @pytest.mark.parametrize("val", [1, 1.0]) + def test_combine_first_with_asymmetric_other(self, val): + # see gh-20699 + df1 = DataFrame({"isNum": [val]}) + df2 = DataFrame({"isBool": [True]}) + + res = df1.combine_first(df2) + exp = DataFrame({"isBool": [True], "isNum": [val]}) + + tm.assert_frame_equal(res, exp) + + def test_combine_first_string_dtype_only_na(self, nullable_string_dtype): + # GH: 37519 + df = DataFrame( + {"a": ["962", "85"], "b": [pd.NA] * 2}, dtype=nullable_string_dtype + ) + df2 = DataFrame({"a": ["85"], "b": [pd.NA]}, dtype=nullable_string_dtype) + df.set_index(["a", "b"], inplace=True) + df2.set_index(["a", "b"], inplace=True) + result = df.combine_first(df2) + expected = DataFrame( + {"a": ["962", "85"], "b": [pd.NA] * 2}, dtype=nullable_string_dtype + ).set_index(["a", "b"]) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize( + "scalar1, scalar2", + [ + (datetime(2020, 1, 1), datetime(2020, 1, 2)), + (pd.Period("2020-01-01", "D"), pd.Period("2020-01-02", "D")), + (pd.Timedelta("89 days"), pd.Timedelta("60 min")), + (pd.Interval(left=0, right=1), pd.Interval(left=2, right=3, closed="left")), + ], +) +def test_combine_first_timestamp_bug(scalar1, scalar2, nulls_fixture): + # GH28481 + na_value = nulls_fixture + + frame = DataFrame([[na_value, na_value]], columns=["a", "b"]) + other = DataFrame([[scalar1, scalar2]], columns=["b", "c"]) + + common_dtype = find_common_type([frame.dtypes["b"], other.dtypes["b"]]) + + if is_dtype_equal(common_dtype, "object") or frame.dtypes["b"] == other.dtypes["b"]: + val = scalar1 + else: + val = na_value + + result = frame.combine_first(other) + + expected = DataFrame([[na_value, val, scalar2]], columns=["a", "b", "c"]) + + expected["b"] = expected["b"].astype(common_dtype) + + tm.assert_frame_equal(result, expected) + + +def test_combine_first_timestamp_bug_NaT(): + # GH28481 + frame = DataFrame([[pd.NaT, pd.NaT]], columns=["a", "b"]) + other = DataFrame( + [[datetime(2020, 1, 1), datetime(2020, 1, 2)]], columns=["b", "c"] + ) + + result = frame.combine_first(other) + expected = DataFrame( + [[pd.NaT, datetime(2020, 1, 1), datetime(2020, 1, 2)]], columns=["a", "b", "c"] + ) + + tm.assert_frame_equal(result, expected) + + +def test_combine_first_with_nan_multiindex(): + # gh-36562 + + mi1 = MultiIndex.from_arrays( + [["b", "b", "c", "a", "b", np.nan], [1, 2, 3, 4, 5, 6]], names=["a", "b"] + ) + df = DataFrame({"c": [1, 1, 1, 1, 1, 1]}, index=mi1) + mi2 = MultiIndex.from_arrays( + [["a", "b", "c", "a", "b", "d"], [1, 1, 1, 1, 1, 1]], names=["a", "b"] + ) + s = Series([1, 2, 3, 4, 5, 6], index=mi2) + res = df.combine_first(DataFrame({"d": s})) + mi_expected = MultiIndex.from_arrays( + [ + ["a", "a", "a", "b", "b", "b", "b", "c", "c", "d", np.nan], + [1, 1, 4, 1, 1, 2, 5, 1, 3, 1, 6], + ], + names=["a", "b"], + ) + expected = DataFrame( + { + "c": [np.nan, np.nan, 1, 1, 1, 1, 1, np.nan, 1, np.nan, 1], + "d": [1.0, 4.0, np.nan, 2.0, 5.0, np.nan, np.nan, 3.0, np.nan, 6.0, np.nan], + }, + index=mi_expected, + ) + tm.assert_frame_equal(res, expected) + + +def test_combine_preserve_dtypes(): + # GH7509 + a_column = Series(["a", "b"], index=range(2)) + b_column = Series(range(2), index=range(2)) + df1 = DataFrame({"A": a_column, "B": b_column}) + + c_column = Series(["a", "b"], index=range(5, 7)) + b_column = Series(range(-1, 1), index=range(5, 7)) + df2 = DataFrame({"B": b_column, "C": c_column}) + + expected = DataFrame( + { + "A": ["a", "b", np.nan, np.nan], + "B": [0, 1, -1, 0], + "C": [np.nan, np.nan, "a", "b"], + }, + index=[0, 1, 5, 6], + ) + combined = df1.combine_first(df2) + tm.assert_frame_equal(combined, expected) + + +def test_combine_first_duplicates_rows_for_nan_index_values(): + # GH39881 + df1 = DataFrame( + {"x": [9, 10, 11]}, + index=MultiIndex.from_arrays([[1, 2, 3], [np.nan, 5, 6]], names=["a", "b"]), + ) + + df2 = DataFrame( + {"y": [12, 13, 14]}, + index=MultiIndex.from_arrays([[1, 2, 4], [np.nan, 5, 7]], names=["a", "b"]), + ) + + expected = DataFrame( + { + "x": [9.0, 10.0, 11.0, np.nan], + "y": [12.0, 13.0, np.nan, 14.0], + }, + index=MultiIndex.from_arrays( + [[1, 2, 3, 4], [np.nan, 5, 6, 7]], names=["a", "b"] + ), + ) + combined = df1.combine_first(df2) + tm.assert_frame_equal(combined, expected) + + +def test_combine_first_int64_not_cast_to_float64(): + # GH 28613 + df_1 = DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]}) + df_2 = DataFrame({"A": [1, 20, 30], "B": [40, 50, 60], "C": [12, 34, 65]}) + result = df_1.combine_first(df_2) + expected = DataFrame({"A": [1, 2, 3], "B": [4, 5, 6], "C": [12, 34, 65]}) + tm.assert_frame_equal(result, expected) + + +def test_midx_losing_dtype(): + # GH#49830 + midx = MultiIndex.from_arrays([[0, 0], [np.nan, np.nan]]) + midx2 = MultiIndex.from_arrays([[1, 1], [np.nan, np.nan]]) + df1 = DataFrame({"a": [None, 4]}, index=midx) + df2 = DataFrame({"a": [3, 3]}, index=midx2) + result = df1.combine_first(df2) + expected_midx = MultiIndex.from_arrays( + [[0, 0, 1, 1], [np.nan, np.nan, np.nan, np.nan]] + ) + expected = DataFrame({"a": [np.nan, 4, 3, 3]}, index=expected_midx) + tm.assert_frame_equal(result, expected) + + +def test_combine_first_empty_columns(): + left = DataFrame(columns=["a", "b"]) + right = DataFrame(columns=["a", "c"]) + result = left.combine_first(right) + expected = DataFrame(columns=["a", "b", "c"]) + tm.assert_frame_equal(result, expected) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/test_compare.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/test_compare.py new file mode 100644 index 0000000000000000000000000000000000000000..a4d0a7068a3a650beb11529065d0b62ab702143b --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/test_compare.py @@ -0,0 +1,305 @@ +import numpy as np +import pytest + +from pandas.compat.numpy import np_version_gte1p25 + +import pandas as pd +import pandas._testing as tm + + +@pytest.mark.parametrize("align_axis", [0, 1, "index", "columns"]) +def test_compare_axis(align_axis): + # GH#30429 + df = pd.DataFrame( + {"col1": ["a", "b", "c"], "col2": [1.0, 2.0, np.nan], "col3": [1.0, 2.0, 3.0]}, + columns=["col1", "col2", "col3"], + ) + df2 = df.copy() + df2.loc[0, "col1"] = "c" + df2.loc[2, "col3"] = 4.0 + + result = df.compare(df2, align_axis=align_axis) + + if align_axis in (1, "columns"): + indices = pd.Index([0, 2]) + columns = pd.MultiIndex.from_product([["col1", "col3"], ["self", "other"]]) + expected = pd.DataFrame( + [["a", "c", np.nan, np.nan], [np.nan, np.nan, 3.0, 4.0]], + index=indices, + columns=columns, + ) + else: + indices = pd.MultiIndex.from_product([[0, 2], ["self", "other"]]) + columns = pd.Index(["col1", "col3"]) + expected = pd.DataFrame( + [["a", np.nan], ["c", np.nan], [np.nan, 3.0], [np.nan, 4.0]], + index=indices, + columns=columns, + ) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize( + "keep_shape, keep_equal", + [ + (True, False), + (False, True), + (True, True), + # False, False case is already covered in test_compare_axis + ], +) +def test_compare_various_formats(keep_shape, keep_equal): + df = pd.DataFrame( + {"col1": ["a", "b", "c"], "col2": [1.0, 2.0, np.nan], "col3": [1.0, 2.0, 3.0]}, + columns=["col1", "col2", "col3"], + ) + df2 = df.copy() + df2.loc[0, "col1"] = "c" + df2.loc[2, "col3"] = 4.0 + + result = df.compare(df2, keep_shape=keep_shape, keep_equal=keep_equal) + + if keep_shape: + indices = pd.Index([0, 1, 2]) + columns = pd.MultiIndex.from_product( + [["col1", "col2", "col3"], ["self", "other"]] + ) + if keep_equal: + expected = pd.DataFrame( + [ + ["a", "c", 1.0, 1.0, 1.0, 1.0], + ["b", "b", 2.0, 2.0, 2.0, 2.0], + ["c", "c", np.nan, np.nan, 3.0, 4.0], + ], + index=indices, + columns=columns, + ) + else: + expected = pd.DataFrame( + [ + ["a", "c", np.nan, np.nan, np.nan, np.nan], + [np.nan, np.nan, np.nan, np.nan, np.nan, np.nan], + [np.nan, np.nan, np.nan, np.nan, 3.0, 4.0], + ], + index=indices, + columns=columns, + ) + else: + indices = pd.Index([0, 2]) + columns = pd.MultiIndex.from_product([["col1", "col3"], ["self", "other"]]) + expected = pd.DataFrame( + [["a", "c", 1.0, 1.0], ["c", "c", 3.0, 4.0]], index=indices, columns=columns + ) + tm.assert_frame_equal(result, expected) + + +def test_compare_with_equal_nulls(): + # We want to make sure two NaNs are considered the same + # and dropped where applicable + df = pd.DataFrame( + {"col1": ["a", "b", "c"], "col2": [1.0, 2.0, np.nan], "col3": [1.0, 2.0, 3.0]}, + columns=["col1", "col2", "col3"], + ) + df2 = df.copy() + df2.loc[0, "col1"] = "c" + + result = df.compare(df2) + indices = pd.Index([0]) + columns = pd.MultiIndex.from_product([["col1"], ["self", "other"]]) + expected = pd.DataFrame([["a", "c"]], index=indices, columns=columns) + tm.assert_frame_equal(result, expected) + + +def test_compare_with_non_equal_nulls(): + # We want to make sure the relevant NaNs do not get dropped + # even if the entire row or column are NaNs + df = pd.DataFrame( + {"col1": ["a", "b", "c"], "col2": [1.0, 2.0, np.nan], "col3": [1.0, 2.0, 3.0]}, + columns=["col1", "col2", "col3"], + ) + df2 = df.copy() + df2.loc[0, "col1"] = "c" + df2.loc[2, "col3"] = np.nan + + result = df.compare(df2) + + indices = pd.Index([0, 2]) + columns = pd.MultiIndex.from_product([["col1", "col3"], ["self", "other"]]) + expected = pd.DataFrame( + [["a", "c", np.nan, np.nan], [np.nan, np.nan, 3.0, np.nan]], + index=indices, + columns=columns, + ) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("align_axis", [0, 1]) +def test_compare_multi_index(align_axis): + df = pd.DataFrame( + {"col1": ["a", "b", "c"], "col2": [1.0, 2.0, np.nan], "col3": [1.0, 2.0, 3.0]} + ) + df.columns = pd.MultiIndex.from_arrays([["a", "a", "b"], ["col1", "col2", "col3"]]) + df.index = pd.MultiIndex.from_arrays([["x", "x", "y"], [0, 1, 2]]) + + df2 = df.copy() + df2.iloc[0, 0] = "c" + df2.iloc[2, 2] = 4.0 + + result = df.compare(df2, align_axis=align_axis) + + if align_axis == 0: + indices = pd.MultiIndex.from_arrays( + [["x", "x", "y", "y"], [0, 0, 2, 2], ["self", "other", "self", "other"]] + ) + columns = pd.MultiIndex.from_arrays([["a", "b"], ["col1", "col3"]]) + data = [["a", np.nan], ["c", np.nan], [np.nan, 3.0], [np.nan, 4.0]] + else: + indices = pd.MultiIndex.from_arrays([["x", "y"], [0, 2]]) + columns = pd.MultiIndex.from_arrays( + [ + ["a", "a", "b", "b"], + ["col1", "col1", "col3", "col3"], + ["self", "other", "self", "other"], + ] + ) + data = [["a", "c", np.nan, np.nan], [np.nan, np.nan, 3.0, 4.0]] + + expected = pd.DataFrame(data=data, index=indices, columns=columns) + tm.assert_frame_equal(result, expected) + + +def test_compare_unaligned_objects(): + # test DataFrames with different indices + msg = ( + r"Can only compare identically-labeled \(both index and columns\) DataFrame " + "objects" + ) + with pytest.raises(ValueError, match=msg): + df1 = pd.DataFrame([1, 2, 3], index=["a", "b", "c"]) + df2 = pd.DataFrame([1, 2, 3], index=["a", "b", "d"]) + df1.compare(df2) + + # test DataFrames with different shapes + msg = ( + r"Can only compare identically-labeled \(both index and columns\) DataFrame " + "objects" + ) + with pytest.raises(ValueError, match=msg): + df1 = pd.DataFrame(np.ones((3, 3))) + df2 = pd.DataFrame(np.zeros((2, 1))) + df1.compare(df2) + + +def test_compare_result_names(): + # GH 44354 + df1 = pd.DataFrame( + {"col1": ["a", "b", "c"], "col2": [1.0, 2.0, np.nan], "col3": [1.0, 2.0, 3.0]}, + ) + df2 = pd.DataFrame( + { + "col1": ["c", "b", "c"], + "col2": [1.0, 2.0, np.nan], + "col3": [1.0, 2.0, np.nan], + }, + ) + result = df1.compare(df2, result_names=("left", "right")) + expected = pd.DataFrame( + { + ("col1", "left"): {0: "a", 2: np.nan}, + ("col1", "right"): {0: "c", 2: np.nan}, + ("col3", "left"): {0: np.nan, 2: 3.0}, + ("col3", "right"): {0: np.nan, 2: np.nan}, + } + ) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize( + "result_names", + [ + [1, 2], + "HK", + {"2": 2, "3": 3}, + 3, + 3.0, + ], +) +def test_invalid_input_result_names(result_names): + # GH 44354 + df1 = pd.DataFrame( + {"col1": ["a", "b", "c"], "col2": [1.0, 2.0, np.nan], "col3": [1.0, 2.0, 3.0]}, + ) + df2 = pd.DataFrame( + { + "col1": ["c", "b", "c"], + "col2": [1.0, 2.0, np.nan], + "col3": [1.0, 2.0, np.nan], + }, + ) + with pytest.raises( + TypeError, + match=( + f"Passing 'result_names' as a {type(result_names)} is not " + "supported. Provide 'result_names' as a tuple instead." + ), + ): + df1.compare(df2, result_names=result_names) + + +@pytest.mark.parametrize( + "val1,val2", + [(4, pd.NA), (pd.NA, pd.NA), (pd.NA, 4)], +) +def test_compare_ea_and_np_dtype(val1, val2): + # GH 48966 + arr = [4.0, val1] + ser = pd.Series([1, val2], dtype="Int64") + + df1 = pd.DataFrame({"a": arr, "b": [1.0, 2]}) + df2 = pd.DataFrame({"a": ser, "b": [1.0, 2]}) + expected = pd.DataFrame( + { + ("a", "self"): arr, + ("a", "other"): ser, + ("b", "self"): np.nan, + ("b", "other"): np.nan, + } + ) + if val1 is pd.NA and val2 is pd.NA: + # GH#18463 TODO: is this really the desired behavior? + expected.loc[1, ("a", "self")] = np.nan + + if val1 is pd.NA and np_version_gte1p25: + # can't compare with numpy array if it contains pd.NA + with pytest.raises(TypeError, match="boolean value of NA is ambiguous"): + result = df1.compare(df2, keep_shape=True) + else: + result = df1.compare(df2, keep_shape=True) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize( + "df1_val,df2_val,diff_self,diff_other", + [ + (4, 3, 4, 3), + (4, 4, pd.NA, pd.NA), + (4, pd.NA, 4, pd.NA), + (pd.NA, pd.NA, pd.NA, pd.NA), + ], +) +def test_compare_nullable_int64_dtype(df1_val, df2_val, diff_self, diff_other): + # GH 48966 + df1 = pd.DataFrame({"a": pd.Series([df1_val, pd.NA], dtype="Int64"), "b": [1.0, 2]}) + df2 = df1.copy() + df2.loc[0, "a"] = df2_val + + expected = pd.DataFrame( + { + ("a", "self"): pd.Series([diff_self, pd.NA], dtype="Int64"), + ("a", "other"): pd.Series([diff_other, pd.NA], dtype="Int64"), + ("b", "self"): np.nan, + ("b", "other"): np.nan, + } + ) + result = df1.compare(df2, keep_shape=True) + tm.assert_frame_equal(result, expected) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/test_convert_dtypes.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/test_convert_dtypes.py new file mode 100644 index 0000000000000000000000000000000000000000..521d2cb14ac6adf0d127e817833ef6620d6cb5e8 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/test_convert_dtypes.py @@ -0,0 +1,202 @@ +import datetime + +import numpy as np +import pytest + +import pandas as pd +import pandas._testing as tm + + +class TestConvertDtypes: + @pytest.mark.parametrize( + "convert_integer, expected", [(False, np.dtype("int32")), (True, "Int32")] + ) + def test_convert_dtypes( + self, convert_integer, expected, string_storage, using_infer_string + ): + # Specific types are tested in tests/series/test_dtypes.py + # Just check that it works for DataFrame here + if using_infer_string: + string_storage = "pyarrow_numpy" + df = pd.DataFrame( + { + "a": pd.Series([1, 2, 3], dtype=np.dtype("int32")), + "b": pd.Series(["x", "y", "z"], dtype=np.dtype("O")), + } + ) + with pd.option_context("string_storage", string_storage): + result = df.convert_dtypes(True, True, convert_integer, False) + expected = pd.DataFrame( + { + "a": pd.Series([1, 2, 3], dtype=expected), + "b": pd.Series(["x", "y", "z"], dtype=f"string[{string_storage}]"), + } + ) + tm.assert_frame_equal(result, expected) + + def test_convert_empty(self): + # Empty DataFrame can pass convert_dtypes, see GH#40393 + empty_df = pd.DataFrame() + tm.assert_frame_equal(empty_df, empty_df.convert_dtypes()) + + def test_convert_dtypes_retain_column_names(self): + # GH#41435 + df = pd.DataFrame({"a": [1, 2], "b": [3, 4]}) + df.columns.name = "cols" + + result = df.convert_dtypes() + tm.assert_index_equal(result.columns, df.columns) + assert result.columns.name == "cols" + + def test_pyarrow_dtype_backend(self): + pa = pytest.importorskip("pyarrow") + df = pd.DataFrame( + { + "a": pd.Series([1, 2, 3], dtype=np.dtype("int32")), + "b": pd.Series(["x", "y", None], dtype=np.dtype("O")), + "c": pd.Series([True, False, None], dtype=np.dtype("O")), + "d": pd.Series([np.nan, 100.5, 200], dtype=np.dtype("float")), + "e": pd.Series(pd.date_range("2022", periods=3)), + "f": pd.Series(pd.date_range("2022", periods=3, tz="UTC").as_unit("s")), + "g": pd.Series(pd.timedelta_range("1D", periods=3)), + } + ) + result = df.convert_dtypes(dtype_backend="pyarrow") + expected = pd.DataFrame( + { + "a": pd.arrays.ArrowExtensionArray( + pa.array([1, 2, 3], type=pa.int32()) + ), + "b": pd.arrays.ArrowExtensionArray(pa.array(["x", "y", None])), + "c": pd.arrays.ArrowExtensionArray(pa.array([True, False, None])), + "d": pd.arrays.ArrowExtensionArray(pa.array([None, 100.5, 200.0])), + "e": pd.arrays.ArrowExtensionArray( + pa.array( + [ + datetime.datetime(2022, 1, 1), + datetime.datetime(2022, 1, 2), + datetime.datetime(2022, 1, 3), + ], + type=pa.timestamp(unit="ns"), + ) + ), + "f": pd.arrays.ArrowExtensionArray( + pa.array( + [ + datetime.datetime(2022, 1, 1), + datetime.datetime(2022, 1, 2), + datetime.datetime(2022, 1, 3), + ], + type=pa.timestamp(unit="s", tz="UTC"), + ) + ), + "g": pd.arrays.ArrowExtensionArray( + pa.array( + [ + datetime.timedelta(1), + datetime.timedelta(2), + datetime.timedelta(3), + ], + type=pa.duration("ns"), + ) + ), + } + ) + tm.assert_frame_equal(result, expected) + + def test_pyarrow_dtype_backend_already_pyarrow(self): + pytest.importorskip("pyarrow") + expected = pd.DataFrame([1, 2, 3], dtype="int64[pyarrow]") + result = expected.convert_dtypes(dtype_backend="pyarrow") + tm.assert_frame_equal(result, expected) + + def test_pyarrow_dtype_backend_from_pandas_nullable(self): + pa = pytest.importorskip("pyarrow") + df = pd.DataFrame( + { + "a": pd.Series([1, 2, None], dtype="Int32"), + "b": pd.Series(["x", "y", None], dtype="string[python]"), + "c": pd.Series([True, False, None], dtype="boolean"), + "d": pd.Series([None, 100.5, 200], dtype="Float64"), + } + ) + result = df.convert_dtypes(dtype_backend="pyarrow") + expected = pd.DataFrame( + { + "a": pd.arrays.ArrowExtensionArray( + pa.array([1, 2, None], type=pa.int32()) + ), + "b": pd.arrays.ArrowExtensionArray(pa.array(["x", "y", None])), + "c": pd.arrays.ArrowExtensionArray(pa.array([True, False, None])), + "d": pd.arrays.ArrowExtensionArray(pa.array([None, 100.5, 200.0])), + } + ) + tm.assert_frame_equal(result, expected) + + def test_pyarrow_dtype_empty_object(self): + # GH 50970 + pytest.importorskip("pyarrow") + expected = pd.DataFrame(columns=[0]) + result = expected.convert_dtypes(dtype_backend="pyarrow") + tm.assert_frame_equal(result, expected) + + def test_pyarrow_engine_lines_false(self): + # GH 48893 + df = pd.DataFrame({"a": [1, 2, 3]}) + msg = ( + "dtype_backend numpy is invalid, only 'numpy_nullable' and " + "'pyarrow' are allowed." + ) + with pytest.raises(ValueError, match=msg): + df.convert_dtypes(dtype_backend="numpy") + + def test_pyarrow_backend_no_conversion(self): + # GH#52872 + pytest.importorskip("pyarrow") + df = pd.DataFrame({"a": [1, 2], "b": 1.5, "c": True, "d": "x"}) + expected = df.copy() + result = df.convert_dtypes( + convert_floating=False, + convert_integer=False, + convert_boolean=False, + convert_string=False, + dtype_backend="pyarrow", + ) + tm.assert_frame_equal(result, expected) + + def test_convert_dtypes_pyarrow_to_np_nullable(self): + # GH 53648 + pytest.importorskip("pyarrow") + ser = pd.DataFrame(range(2), dtype="int32[pyarrow]") + result = ser.convert_dtypes(dtype_backend="numpy_nullable") + expected = pd.DataFrame(range(2), dtype="Int32") + tm.assert_frame_equal(result, expected) + + def test_convert_dtypes_pyarrow_timestamp(self): + # GH 54191 + pytest.importorskip("pyarrow") + ser = pd.Series(pd.date_range("2020-01-01", "2020-01-02", freq="1min")) + expected = ser.astype("timestamp[ms][pyarrow]") + result = expected.convert_dtypes(dtype_backend="pyarrow") + tm.assert_series_equal(result, expected) + + def test_convert_dtypes_avoid_block_splitting(self): + # GH#55341 + df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": "a"}) + result = df.convert_dtypes(convert_integer=False) + expected = pd.DataFrame( + { + "a": [1, 2, 3], + "b": [4, 5, 6], + "c": pd.Series(["a"] * 3, dtype="string[python]"), + } + ) + tm.assert_frame_equal(result, expected) + assert result._mgr.nblocks == 2 + + def test_convert_dtypes_from_arrow(self): + # GH#56581 + df = pd.DataFrame([["a", datetime.time(18, 12)]], columns=["a", "b"]) + result = df.convert_dtypes() + expected = df.astype({"a": "string[python]"}) + tm.assert_frame_equal(result, expected) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/test_copy.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/test_copy.py new file mode 100644 index 0000000000000000000000000000000000000000..e7901ed36310668dc21b96d44fed0686de368b1f --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/test_copy.py @@ -0,0 +1,64 @@ +import numpy as np +import pytest + +import pandas.util._test_decorators as td + +from pandas import DataFrame +import pandas._testing as tm + + +class TestCopy: + @pytest.mark.parametrize("attr", ["index", "columns"]) + def test_copy_index_name_checking(self, float_frame, attr): + # don't want to be able to modify the index stored elsewhere after + # making a copy + ind = getattr(float_frame, attr) + ind.name = None + cp = float_frame.copy() + getattr(cp, attr).name = "foo" + assert getattr(float_frame, attr).name is None + + @td.skip_copy_on_write_invalid_test + def test_copy_cache(self): + # GH#31784 _item_cache not cleared on copy causes incorrect reads after updates + df = DataFrame({"a": [1]}) + + df["x"] = [0] + df["a"] + + df.copy() + + df["a"].values[0] = -1 + + tm.assert_frame_equal(df, DataFrame({"a": [-1], "x": [0]})) + + df["y"] = [0] + + assert df["a"].values[0] == -1 + tm.assert_frame_equal(df, DataFrame({"a": [-1], "x": [0], "y": [0]})) + + def test_copy(self, float_frame, float_string_frame): + cop = float_frame.copy() + cop["E"] = cop["A"] + assert "E" not in float_frame + + # copy objects + copy = float_string_frame.copy() + assert copy._mgr is not float_string_frame._mgr + + @td.skip_array_manager_invalid_test + def test_copy_consolidates(self): + # GH#42477 + df = DataFrame( + { + "a": np.random.default_rng(2).integers(0, 100, size=55), + "b": np.random.default_rng(2).integers(0, 100, size=55), + } + ) + + for i in range(10): + df.loc[:, f"n_{i}"] = np.random.default_rng(2).integers(0, 100, size=55) + + assert len(df._mgr.blocks) == 11 + result = df.copy() + assert len(result._mgr.blocks) == 1 diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/test_count.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/test_count.py new file mode 100644 index 0000000000000000000000000000000000000000..1553a8a86305dd931c5378245daf272472d41b20 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/test_count.py @@ -0,0 +1,39 @@ +from pandas import ( + DataFrame, + Series, +) +import pandas._testing as tm + + +class TestDataFrameCount: + def test_count(self): + # corner case + frame = DataFrame() + ct1 = frame.count(1) + assert isinstance(ct1, Series) + + ct2 = frame.count(0) + assert isinstance(ct2, Series) + + # GH#423 + df = DataFrame(index=range(10)) + result = df.count(1) + expected = Series(0, index=df.index) + tm.assert_series_equal(result, expected) + + df = DataFrame(columns=range(10)) + result = df.count(0) + expected = Series(0, index=df.columns) + tm.assert_series_equal(result, expected) + + df = DataFrame() + result = df.count() + expected = Series(dtype="int64") + tm.assert_series_equal(result, expected) + + def test_count_objects(self, float_string_frame): + dm = DataFrame(float_string_frame._series) + df = DataFrame(float_string_frame._series) + + tm.assert_series_equal(dm.count(), df.count()) + tm.assert_series_equal(dm.count(1), df.count(1)) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/test_cov_corr.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/test_cov_corr.py new file mode 100644 index 0000000000000000000000000000000000000000..04a08c8b9bc5237d72376fda51b645b047b74966 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/test_cov_corr.py @@ -0,0 +1,471 @@ +import numpy as np +import pytest + +import pandas.util._test_decorators as td + +import pandas as pd +from pandas import ( + DataFrame, + Index, + Series, + date_range, + isna, +) +import pandas._testing as tm + + +class TestDataFrameCov: + def test_cov(self, float_frame, float_string_frame): + # min_periods no NAs (corner case) + expected = float_frame.cov() + result = float_frame.cov(min_periods=len(float_frame)) + + tm.assert_frame_equal(expected, result) + + result = float_frame.cov(min_periods=len(float_frame) + 1) + assert isna(result.values).all() + + # with NAs + frame = float_frame.copy() + frame.iloc[:5, frame.columns.get_loc("A")] = np.nan + frame.iloc[5:10, frame.columns.get_loc("B")] = np.nan + result = frame.cov(min_periods=len(frame) - 8) + expected = frame.cov() + expected.loc["A", "B"] = np.nan + expected.loc["B", "A"] = np.nan + tm.assert_frame_equal(result, expected) + + # regular + result = frame.cov() + expected = frame["A"].cov(frame["C"]) + tm.assert_almost_equal(result["A"]["C"], expected) + + # fails on non-numeric types + with pytest.raises(ValueError, match="could not convert string to float"): + float_string_frame.cov() + result = float_string_frame.cov(numeric_only=True) + expected = float_string_frame.loc[:, ["A", "B", "C", "D"]].cov() + tm.assert_frame_equal(result, expected) + + # Single column frame + df = DataFrame(np.linspace(0.0, 1.0, 10)) + result = df.cov() + expected = DataFrame( + np.cov(df.values.T).reshape((1, 1)), index=df.columns, columns=df.columns + ) + tm.assert_frame_equal(result, expected) + df.loc[0] = np.nan + result = df.cov() + expected = DataFrame( + np.cov(df.values[1:].T).reshape((1, 1)), + index=df.columns, + columns=df.columns, + ) + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize("test_ddof", [None, 0, 1, 2, 3]) + def test_cov_ddof(self, test_ddof): + # GH#34611 + np_array1 = np.random.default_rng(2).random(10) + np_array2 = np.random.default_rng(2).random(10) + df = DataFrame({0: np_array1, 1: np_array2}) + result = df.cov(ddof=test_ddof) + expected_np = np.cov(np_array1, np_array2, ddof=test_ddof) + expected = DataFrame(expected_np) + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize( + "other_column", [pd.array([1, 2, 3]), np.array([1.0, 2.0, 3.0])] + ) + def test_cov_nullable_integer(self, other_column): + # https://github.com/pandas-dev/pandas/issues/33803 + data = DataFrame({"a": pd.array([1, 2, None]), "b": other_column}) + result = data.cov() + arr = np.array([[0.5, 0.5], [0.5, 1.0]]) + expected = DataFrame(arr, columns=["a", "b"], index=["a", "b"]) + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize("numeric_only", [True, False]) + def test_cov_numeric_only(self, numeric_only): + # when dtypes of pandas series are different + # then ndarray will have dtype=object, + # so it need to be properly handled + df = DataFrame({"a": [1, 0], "c": ["x", "y"]}) + expected = DataFrame(0.5, index=["a"], columns=["a"]) + if numeric_only: + result = df.cov(numeric_only=numeric_only) + tm.assert_frame_equal(result, expected) + else: + with pytest.raises(ValueError, match="could not convert string to float"): + df.cov(numeric_only=numeric_only) + + +class TestDataFrameCorr: + # DataFrame.corr(), as opposed to DataFrame.corrwith + + @pytest.mark.parametrize("method", ["pearson", "kendall", "spearman"]) + def test_corr_scipy_method(self, float_frame, method): + pytest.importorskip("scipy") + float_frame.loc[float_frame.index[:5], "A"] = np.nan + float_frame.loc[float_frame.index[5:10], "B"] = np.nan + float_frame.loc[float_frame.index[:10], "A"] = float_frame["A"][10:20].copy() + + correls = float_frame.corr(method=method) + expected = float_frame["A"].corr(float_frame["C"], method=method) + tm.assert_almost_equal(correls["A"]["C"], expected) + + # --------------------------------------------------------------------- + + def test_corr_non_numeric(self, float_string_frame): + with pytest.raises(ValueError, match="could not convert string to float"): + float_string_frame.corr() + result = float_string_frame.corr(numeric_only=True) + expected = float_string_frame.loc[:, ["A", "B", "C", "D"]].corr() + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize("meth", ["pearson", "kendall", "spearman"]) + def test_corr_nooverlap(self, meth): + # nothing in common + pytest.importorskip("scipy") + df = DataFrame( + { + "A": [1, 1.5, 1, np.nan, np.nan, np.nan], + "B": [np.nan, np.nan, np.nan, 1, 1.5, 1], + "C": [np.nan, np.nan, np.nan, np.nan, np.nan, np.nan], + } + ) + rs = df.corr(meth) + assert isna(rs.loc["A", "B"]) + assert isna(rs.loc["B", "A"]) + assert rs.loc["A", "A"] == 1 + assert rs.loc["B", "B"] == 1 + assert isna(rs.loc["C", "C"]) + + @pytest.mark.parametrize("meth", ["pearson", "spearman"]) + def test_corr_constant(self, meth): + # constant --> all NA + df = DataFrame( + { + "A": [1, 1, 1, np.nan, np.nan, np.nan], + "B": [np.nan, np.nan, np.nan, 1, 1, 1], + } + ) + rs = df.corr(meth) + assert isna(rs.values).all() + + @pytest.mark.filterwarnings("ignore::RuntimeWarning") + @pytest.mark.parametrize("meth", ["pearson", "kendall", "spearman"]) + def test_corr_int_and_boolean(self, meth): + # when dtypes of pandas series are different + # then ndarray will have dtype=object, + # so it need to be properly handled + pytest.importorskip("scipy") + df = DataFrame({"a": [True, False], "b": [1, 0]}) + + expected = DataFrame(np.ones((2, 2)), index=["a", "b"], columns=["a", "b"]) + result = df.corr(meth) + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize("method", ["cov", "corr"]) + def test_corr_cov_independent_index_column(self, method): + # GH#14617 + df = DataFrame( + np.random.default_rng(2).standard_normal(4 * 10).reshape(10, 4), + columns=list("abcd"), + ) + result = getattr(df, method)() + assert result.index is not result.columns + assert result.index.equals(result.columns) + + def test_corr_invalid_method(self): + # GH#22298 + df = DataFrame(np.random.default_rng(2).normal(size=(10, 2))) + msg = "method must be either 'pearson', 'spearman', 'kendall', or a callable, " + with pytest.raises(ValueError, match=msg): + df.corr(method="____") + + def test_corr_int(self): + # dtypes other than float64 GH#1761 + df = DataFrame({"a": [1, 2, 3, 4], "b": [1, 2, 3, 4]}) + + df.cov() + df.corr() + + @pytest.mark.parametrize( + "nullable_column", [pd.array([1, 2, 3]), pd.array([1, 2, None])] + ) + @pytest.mark.parametrize( + "other_column", + [pd.array([1, 2, 3]), np.array([1.0, 2.0, 3.0]), np.array([1.0, 2.0, np.nan])], + ) + @pytest.mark.parametrize("method", ["pearson", "spearman", "kendall"]) + def test_corr_nullable_integer(self, nullable_column, other_column, method): + # https://github.com/pandas-dev/pandas/issues/33803 + pytest.importorskip("scipy") + data = DataFrame({"a": nullable_column, "b": other_column}) + result = data.corr(method=method) + expected = DataFrame(np.ones((2, 2)), columns=["a", "b"], index=["a", "b"]) + tm.assert_frame_equal(result, expected) + + def test_corr_item_cache(self, using_copy_on_write, warn_copy_on_write): + # Check that corr does not lead to incorrect entries in item_cache + + df = DataFrame({"A": range(10)}) + df["B"] = range(10)[::-1] + + ser = df["A"] # populate item_cache + assert len(df._mgr.arrays) == 2 # i.e. 2 blocks + + _ = df.corr(numeric_only=True) + + if using_copy_on_write: + ser.iloc[0] = 99 + assert df.loc[0, "A"] == 0 + else: + # Check that the corr didn't break link between ser and df + ser.values[0] = 99 + assert df.loc[0, "A"] == 99 + if not warn_copy_on_write: + assert df["A"] is ser + assert df.values[0, 0] == 99 + + @pytest.mark.parametrize("length", [2, 20, 200, 2000]) + def test_corr_for_constant_columns(self, length): + # GH: 37448 + df = DataFrame(length * [[0.4, 0.1]], columns=["A", "B"]) + result = df.corr() + expected = DataFrame( + {"A": [np.nan, np.nan], "B": [np.nan, np.nan]}, index=["A", "B"] + ) + tm.assert_frame_equal(result, expected) + + def test_calc_corr_small_numbers(self): + # GH: 37452 + df = DataFrame( + {"A": [1.0e-20, 2.0e-20, 3.0e-20], "B": [1.0e-20, 2.0e-20, 3.0e-20]} + ) + result = df.corr() + expected = DataFrame({"A": [1.0, 1.0], "B": [1.0, 1.0]}, index=["A", "B"]) + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize("method", ["pearson", "spearman", "kendall"]) + def test_corr_min_periods_greater_than_length(self, method): + pytest.importorskip("scipy") + df = DataFrame({"A": [1, 2], "B": [1, 2]}) + result = df.corr(method=method, min_periods=3) + expected = DataFrame( + {"A": [np.nan, np.nan], "B": [np.nan, np.nan]}, index=["A", "B"] + ) + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize("meth", ["pearson", "kendall", "spearman"]) + @pytest.mark.parametrize("numeric_only", [True, False]) + def test_corr_numeric_only(self, meth, numeric_only): + # when dtypes of pandas series are different + # then ndarray will have dtype=object, + # so it need to be properly handled + pytest.importorskip("scipy") + df = DataFrame({"a": [1, 0], "b": [1, 0], "c": ["x", "y"]}) + expected = DataFrame(np.ones((2, 2)), index=["a", "b"], columns=["a", "b"]) + if numeric_only: + result = df.corr(meth, numeric_only=numeric_only) + tm.assert_frame_equal(result, expected) + else: + with pytest.raises(ValueError, match="could not convert string to float"): + df.corr(meth, numeric_only=numeric_only) + + +class TestDataFrameCorrWith: + @pytest.mark.parametrize( + "dtype", + [ + "float64", + "Float64", + pytest.param("float64[pyarrow]", marks=td.skip_if_no("pyarrow")), + ], + ) + def test_corrwith(self, datetime_frame, dtype): + datetime_frame = datetime_frame.astype(dtype) + + a = datetime_frame + noise = Series(np.random.default_rng(2).standard_normal(len(a)), index=a.index) + + b = datetime_frame.add(noise, axis=0) + + # make sure order does not matter + b = b.reindex(columns=b.columns[::-1], index=b.index[::-1][10:]) + del b["B"] + + colcorr = a.corrwith(b, axis=0) + tm.assert_almost_equal(colcorr["A"], a["A"].corr(b["A"])) + + rowcorr = a.corrwith(b, axis=1) + tm.assert_series_equal(rowcorr, a.T.corrwith(b.T, axis=0)) + + dropped = a.corrwith(b, axis=0, drop=True) + tm.assert_almost_equal(dropped["A"], a["A"].corr(b["A"])) + assert "B" not in dropped + + dropped = a.corrwith(b, axis=1, drop=True) + assert a.index[-1] not in dropped.index + + # non time-series data + index = ["a", "b", "c", "d", "e"] + columns = ["one", "two", "three", "four"] + df1 = DataFrame( + np.random.default_rng(2).standard_normal((5, 4)), + index=index, + columns=columns, + ) + df2 = DataFrame( + np.random.default_rng(2).standard_normal((4, 4)), + index=index[:4], + columns=columns, + ) + correls = df1.corrwith(df2, axis=1) + for row in index[:4]: + tm.assert_almost_equal(correls[row], df1.loc[row].corr(df2.loc[row])) + + def test_corrwith_with_objects(self, using_infer_string): + df1 = DataFrame( + np.random.default_rng(2).standard_normal((10, 4)), + columns=Index(list("ABCD"), dtype=object), + index=date_range("2000-01-01", periods=10, freq="B"), + ) + df2 = df1.copy() + cols = ["A", "B", "C", "D"] + + df1["obj"] = "foo" + df2["obj"] = "bar" + + if using_infer_string: + import pyarrow as pa + + with pytest.raises(pa.lib.ArrowNotImplementedError, match="has no kernel"): + df1.corrwith(df2) + else: + with pytest.raises(TypeError, match="Could not convert"): + df1.corrwith(df2) + result = df1.corrwith(df2, numeric_only=True) + expected = df1.loc[:, cols].corrwith(df2.loc[:, cols]) + tm.assert_series_equal(result, expected) + + with pytest.raises(TypeError, match="unsupported operand type"): + df1.corrwith(df2, axis=1) + result = df1.corrwith(df2, axis=1, numeric_only=True) + expected = df1.loc[:, cols].corrwith(df2.loc[:, cols], axis=1) + tm.assert_series_equal(result, expected) + + def test_corrwith_series(self, datetime_frame): + result = datetime_frame.corrwith(datetime_frame["A"]) + expected = datetime_frame.apply(datetime_frame["A"].corr) + + tm.assert_series_equal(result, expected) + + def test_corrwith_matches_corrcoef(self): + df1 = DataFrame(np.arange(10000), columns=["a"]) + df2 = DataFrame(np.arange(10000) ** 2, columns=["a"]) + c1 = df1.corrwith(df2)["a"] + c2 = np.corrcoef(df1["a"], df2["a"])[0][1] + + tm.assert_almost_equal(c1, c2) + assert c1 < 1 + + @pytest.mark.parametrize("numeric_only", [True, False]) + def test_corrwith_mixed_dtypes(self, numeric_only): + # GH#18570 + df = DataFrame( + {"a": [1, 4, 3, 2], "b": [4, 6, 7, 3], "c": ["a", "b", "c", "d"]} + ) + s = Series([0, 6, 7, 3]) + if numeric_only: + result = df.corrwith(s, numeric_only=numeric_only) + corrs = [df["a"].corr(s), df["b"].corr(s)] + expected = Series(data=corrs, index=["a", "b"]) + tm.assert_series_equal(result, expected) + else: + with pytest.raises( + ValueError, + match="could not convert string to float", + ): + df.corrwith(s, numeric_only=numeric_only) + + def test_corrwith_index_intersection(self): + df1 = DataFrame( + np.random.default_rng(2).random(size=(10, 2)), columns=["a", "b"] + ) + df2 = DataFrame( + np.random.default_rng(2).random(size=(10, 3)), columns=["a", "b", "c"] + ) + + result = df1.corrwith(df2, drop=True).index.sort_values() + expected = df1.columns.intersection(df2.columns).sort_values() + tm.assert_index_equal(result, expected) + + def test_corrwith_index_union(self): + df1 = DataFrame( + np.random.default_rng(2).random(size=(10, 2)), columns=["a", "b"] + ) + df2 = DataFrame( + np.random.default_rng(2).random(size=(10, 3)), columns=["a", "b", "c"] + ) + + result = df1.corrwith(df2, drop=False).index.sort_values() + expected = df1.columns.union(df2.columns).sort_values() + tm.assert_index_equal(result, expected) + + def test_corrwith_dup_cols(self): + # GH#21925 + df1 = DataFrame(np.vstack([np.arange(10)] * 3).T) + df2 = df1.copy() + df2 = pd.concat((df2, df2[0]), axis=1) + + result = df1.corrwith(df2) + expected = Series(np.ones(4), index=[0, 0, 1, 2]) + tm.assert_series_equal(result, expected) + + def test_corr_numerical_instabilities(self): + # GH#45640 + df = DataFrame([[0.2, 0.4], [0.4, 0.2]]) + result = df.corr() + expected = DataFrame({0: [1.0, -1.0], 1: [-1.0, 1.0]}) + tm.assert_frame_equal(result - 1, expected - 1, atol=1e-17) + + def test_corrwith_spearman(self): + # GH#21925 + pytest.importorskip("scipy") + df = DataFrame(np.random.default_rng(2).random(size=(100, 3))) + result = df.corrwith(df**2, method="spearman") + expected = Series(np.ones(len(result))) + tm.assert_series_equal(result, expected) + + def test_corrwith_kendall(self): + # GH#21925 + pytest.importorskip("scipy") + df = DataFrame(np.random.default_rng(2).random(size=(100, 3))) + result = df.corrwith(df**2, method="kendall") + expected = Series(np.ones(len(result))) + tm.assert_series_equal(result, expected) + + def test_corrwith_spearman_with_tied_data(self): + # GH#48826 + pytest.importorskip("scipy") + df1 = DataFrame( + { + "A": [1, np.nan, 7, 8], + "B": [False, True, True, False], + "C": [10, 4, 9, 3], + } + ) + df2 = df1[["B", "C"]] + result = (df1 + 1).corrwith(df2.B, method="spearman") + expected = Series([0.0, 1.0, 0.0], index=["A", "B", "C"]) + tm.assert_series_equal(result, expected) + + df_bool = DataFrame( + {"A": [True, True, False, False], "B": [True, False, False, True]} + ) + ser_bool = Series([True, True, False, True]) + result = df_bool.corrwith(ser_bool) + expected = Series([0.57735, 0.57735], index=["A", "B"]) + tm.assert_series_equal(result, expected) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/test_describe.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/test_describe.py new file mode 100644 index 0000000000000000000000000000000000000000..5beb09940acf32a4a597819f5b130863d90261e5 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/test_describe.py @@ -0,0 +1,417 @@ +import numpy as np +import pytest + +import pandas as pd +from pandas import ( + Categorical, + DataFrame, + Series, + Timestamp, + date_range, +) +import pandas._testing as tm + + +class TestDataFrameDescribe: + def test_describe_bool_in_mixed_frame(self): + df = DataFrame( + { + "string_data": ["a", "b", "c", "d", "e"], + "bool_data": [True, True, False, False, False], + "int_data": [10, 20, 30, 40, 50], + } + ) + + # Integer data are included in .describe() output, + # Boolean and string data are not. + result = df.describe() + expected = DataFrame( + {"int_data": [5, 30, df.int_data.std(), 10, 20, 30, 40, 50]}, + index=["count", "mean", "std", "min", "25%", "50%", "75%", "max"], + ) + tm.assert_frame_equal(result, expected) + + # Top value is a boolean value that is False + result = df.describe(include=["bool"]) + + expected = DataFrame( + {"bool_data": [5, 2, False, 3]}, index=["count", "unique", "top", "freq"] + ) + tm.assert_frame_equal(result, expected) + + def test_describe_empty_object(self): + # GH#27183 + df = DataFrame({"A": [None, None]}, dtype=object) + result = df.describe() + expected = DataFrame( + {"A": [0, 0, np.nan, np.nan]}, + dtype=object, + index=["count", "unique", "top", "freq"], + ) + tm.assert_frame_equal(result, expected) + + result = df.iloc[:0].describe() + tm.assert_frame_equal(result, expected) + + def test_describe_bool_frame(self): + # GH#13891 + df = DataFrame( + { + "bool_data_1": [False, False, True, True], + "bool_data_2": [False, True, True, True], + } + ) + result = df.describe() + expected = DataFrame( + {"bool_data_1": [4, 2, False, 2], "bool_data_2": [4, 2, True, 3]}, + index=["count", "unique", "top", "freq"], + ) + tm.assert_frame_equal(result, expected) + + df = DataFrame( + { + "bool_data": [False, False, True, True, False], + "int_data": [0, 1, 2, 3, 4], + } + ) + result = df.describe() + expected = DataFrame( + {"int_data": [5, 2, df.int_data.std(), 0, 1, 2, 3, 4]}, + index=["count", "mean", "std", "min", "25%", "50%", "75%", "max"], + ) + tm.assert_frame_equal(result, expected) + + df = DataFrame( + {"bool_data": [False, False, True, True], "str_data": ["a", "b", "c", "a"]} + ) + result = df.describe() + expected = DataFrame( + {"bool_data": [4, 2, False, 2], "str_data": [4, 3, "a", 2]}, + index=["count", "unique", "top", "freq"], + ) + tm.assert_frame_equal(result, expected) + + def test_describe_categorical(self): + df = DataFrame({"value": np.random.default_rng(2).integers(0, 10000, 100)}) + labels = [f"{i} - {i + 499}" for i in range(0, 10000, 500)] + cat_labels = Categorical(labels, labels) + + df = df.sort_values(by=["value"], ascending=True) + df["value_group"] = pd.cut( + df.value, range(0, 10500, 500), right=False, labels=cat_labels + ) + cat = df + + # Categoricals should not show up together with numerical columns + result = cat.describe() + assert len(result.columns) == 1 + + # In a frame, describe() for the cat should be the same as for string + # arrays (count, unique, top, freq) + + cat = Categorical( + ["a", "b", "b", "b"], categories=["a", "b", "c"], ordered=True + ) + s = Series(cat) + result = s.describe() + expected = Series([4, 2, "b", 3], index=["count", "unique", "top", "freq"]) + tm.assert_series_equal(result, expected) + + cat = Series(Categorical(["a", "b", "c", "c"])) + df3 = DataFrame({"cat": cat, "s": ["a", "b", "c", "c"]}) + result = df3.describe() + tm.assert_numpy_array_equal(result["cat"].values, result["s"].values) + + def test_describe_empty_categorical_column(self): + # GH#26397 + # Ensure the index of an empty categorical DataFrame column + # also contains (count, unique, top, freq) + df = DataFrame({"empty_col": Categorical([])}) + result = df.describe() + expected = DataFrame( + {"empty_col": [0, 0, np.nan, np.nan]}, + index=["count", "unique", "top", "freq"], + dtype="object", + ) + tm.assert_frame_equal(result, expected) + # ensure NaN, not None + assert np.isnan(result.iloc[2, 0]) + assert np.isnan(result.iloc[3, 0]) + + def test_describe_categorical_columns(self): + # GH#11558 + columns = pd.CategoricalIndex(["int1", "int2", "obj"], ordered=True, name="XXX") + df = DataFrame( + { + "int1": [10, 20, 30, 40, 50], + "int2": [10, 20, 30, 40, 50], + "obj": ["A", 0, None, "X", 1], + }, + columns=columns, + ) + result = df.describe() + + exp_columns = pd.CategoricalIndex( + ["int1", "int2"], + categories=["int1", "int2", "obj"], + ordered=True, + name="XXX", + ) + expected = DataFrame( + { + "int1": [5, 30, df.int1.std(), 10, 20, 30, 40, 50], + "int2": [5, 30, df.int2.std(), 10, 20, 30, 40, 50], + }, + index=["count", "mean", "std", "min", "25%", "50%", "75%", "max"], + columns=exp_columns, + ) + + tm.assert_frame_equal(result, expected) + tm.assert_categorical_equal(result.columns.values, expected.columns.values) + + def test_describe_datetime_columns(self): + columns = pd.DatetimeIndex( + ["2011-01-01", "2011-02-01", "2011-03-01"], + freq="MS", + tz="US/Eastern", + name="XXX", + ) + df = DataFrame( + { + 0: [10, 20, 30, 40, 50], + 1: [10, 20, 30, 40, 50], + 2: ["A", 0, None, "X", 1], + } + ) + df.columns = columns + result = df.describe() + + exp_columns = pd.DatetimeIndex( + ["2011-01-01", "2011-02-01"], freq="MS", tz="US/Eastern", name="XXX" + ) + expected = DataFrame( + { + 0: [5, 30, df.iloc[:, 0].std(), 10, 20, 30, 40, 50], + 1: [5, 30, df.iloc[:, 1].std(), 10, 20, 30, 40, 50], + }, + index=["count", "mean", "std", "min", "25%", "50%", "75%", "max"], + ) + expected.columns = exp_columns + tm.assert_frame_equal(result, expected) + assert result.columns.freq == "MS" + assert result.columns.tz == expected.columns.tz + + def test_describe_timedelta_values(self): + # GH#6145 + t1 = pd.timedelta_range("1 days", freq="D", periods=5) + t2 = pd.timedelta_range("1 hours", freq="h", periods=5) + df = DataFrame({"t1": t1, "t2": t2}) + + expected = DataFrame( + { + "t1": [ + 5, + pd.Timedelta("3 days"), + df.iloc[:, 0].std(), + pd.Timedelta("1 days"), + pd.Timedelta("2 days"), + pd.Timedelta("3 days"), + pd.Timedelta("4 days"), + pd.Timedelta("5 days"), + ], + "t2": [ + 5, + pd.Timedelta("3 hours"), + df.iloc[:, 1].std(), + pd.Timedelta("1 hours"), + pd.Timedelta("2 hours"), + pd.Timedelta("3 hours"), + pd.Timedelta("4 hours"), + pd.Timedelta("5 hours"), + ], + }, + index=["count", "mean", "std", "min", "25%", "50%", "75%", "max"], + ) + + result = df.describe() + tm.assert_frame_equal(result, expected) + + exp_repr = ( + " t1 t2\n" + "count 5 5\n" + "mean 3 days 00:00:00 0 days 03:00:00\n" + "std 1 days 13:56:50.394919273 0 days 01:34:52.099788303\n" + "min 1 days 00:00:00 0 days 01:00:00\n" + "25% 2 days 00:00:00 0 days 02:00:00\n" + "50% 3 days 00:00:00 0 days 03:00:00\n" + "75% 4 days 00:00:00 0 days 04:00:00\n" + "max 5 days 00:00:00 0 days 05:00:00" + ) + assert repr(result) == exp_repr + + def test_describe_tz_values(self, tz_naive_fixture): + # GH#21332 + tz = tz_naive_fixture + s1 = Series(range(5)) + start = Timestamp(2018, 1, 1) + end = Timestamp(2018, 1, 5) + s2 = Series(date_range(start, end, tz=tz)) + df = DataFrame({"s1": s1, "s2": s2}) + + expected = DataFrame( + { + "s1": [5, 2, 0, 1, 2, 3, 4, 1.581139], + "s2": [ + 5, + Timestamp(2018, 1, 3).tz_localize(tz), + start.tz_localize(tz), + s2[1], + s2[2], + s2[3], + end.tz_localize(tz), + np.nan, + ], + }, + index=["count", "mean", "min", "25%", "50%", "75%", "max", "std"], + ) + result = df.describe(include="all") + tm.assert_frame_equal(result, expected) + + def test_datetime_is_numeric_includes_datetime(self): + df = DataFrame({"a": date_range("2012", periods=3), "b": [1, 2, 3]}) + result = df.describe() + expected = DataFrame( + { + "a": [ + 3, + Timestamp("2012-01-02"), + Timestamp("2012-01-01"), + Timestamp("2012-01-01T12:00:00"), + Timestamp("2012-01-02"), + Timestamp("2012-01-02T12:00:00"), + Timestamp("2012-01-03"), + np.nan, + ], + "b": [3, 2, 1, 1.5, 2, 2.5, 3, 1], + }, + index=["count", "mean", "min", "25%", "50%", "75%", "max", "std"], + ) + tm.assert_frame_equal(result, expected) + + def test_describe_tz_values2(self): + tz = "CET" + s1 = Series(range(5)) + start = Timestamp(2018, 1, 1) + end = Timestamp(2018, 1, 5) + s2 = Series(date_range(start, end, tz=tz)) + df = DataFrame({"s1": s1, "s2": s2}) + + s1_ = s1.describe() + s2_ = s2.describe() + idx = [ + "count", + "mean", + "min", + "25%", + "50%", + "75%", + "max", + "std", + ] + expected = pd.concat([s1_, s2_], axis=1, keys=["s1", "s2"]).reindex( + idx, copy=False + ) + + result = df.describe(include="all") + tm.assert_frame_equal(result, expected) + + def test_describe_percentiles_integer_idx(self): + # GH#26660 + df = DataFrame({"x": [1]}) + pct = np.linspace(0, 1, 10 + 1) + result = df.describe(percentiles=pct) + + expected = DataFrame( + {"x": [1.0, 1.0, np.nan, 1.0, *(1.0 for _ in pct), 1.0]}, + index=[ + "count", + "mean", + "std", + "min", + "0%", + "10%", + "20%", + "30%", + "40%", + "50%", + "60%", + "70%", + "80%", + "90%", + "100%", + "max", + ], + ) + tm.assert_frame_equal(result, expected) + + def test_describe_does_not_raise_error_for_dictlike_elements(self): + # GH#32409 + df = DataFrame([{"test": {"a": "1"}}, {"test": {"a": "2"}}]) + expected = DataFrame( + {"test": [2, 2, {"a": "1"}, 1]}, index=["count", "unique", "top", "freq"] + ) + result = df.describe() + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize("exclude", ["x", "y", ["x", "y"], ["x", "z"]]) + def test_describe_when_include_all_exclude_not_allowed(self, exclude): + """ + When include is 'all', then setting exclude != None is not allowed. + """ + df = DataFrame({"x": [1], "y": [2], "z": [3]}) + msg = "exclude must be None when include is 'all'" + with pytest.raises(ValueError, match=msg): + df.describe(include="all", exclude=exclude) + + def test_describe_with_duplicate_columns(self): + df = DataFrame( + [[1, 1, 1], [2, 2, 2], [3, 3, 3]], + columns=["bar", "a", "a"], + dtype="float64", + ) + result = df.describe() + ser = df.iloc[:, 0].describe() + expected = pd.concat([ser, ser, ser], keys=df.columns, axis=1) + tm.assert_frame_equal(result, expected) + + def test_ea_with_na(self, any_numeric_ea_dtype): + # GH#48778 + + df = DataFrame({"a": [1, pd.NA, pd.NA], "b": pd.NA}, dtype=any_numeric_ea_dtype) + result = df.describe() + expected = DataFrame( + {"a": [1.0, 1.0, pd.NA] + [1.0] * 5, "b": [0.0] + [pd.NA] * 7}, + index=["count", "mean", "std", "min", "25%", "50%", "75%", "max"], + dtype="Float64", + ) + tm.assert_frame_equal(result, expected) + + def test_describe_exclude_pa_dtype(self): + # GH#52570 + pa = pytest.importorskip("pyarrow") + df = DataFrame( + { + "a": Series([1, 2, 3], dtype=pd.ArrowDtype(pa.int8())), + "b": Series([1, 2, 3], dtype=pd.ArrowDtype(pa.int16())), + "c": Series([1, 2, 3], dtype=pd.ArrowDtype(pa.int32())), + } + ) + result = df.describe( + include=pd.ArrowDtype(pa.int8()), exclude=pd.ArrowDtype(pa.int32()) + ) + expected = DataFrame( + {"a": [3, 2, 1, 1, 1.5, 2, 2.5, 3]}, + index=["count", "mean", "std", "min", "25%", "50%", "75%", "max"], + dtype=pd.ArrowDtype(pa.float64()), + ) + tm.assert_frame_equal(result, expected) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/test_diff.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/test_diff.py new file mode 100644 index 0000000000000000000000000000000000000000..bef18dbaf8a8a914eae683c16f4e71cc90514c39 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/test_diff.py @@ -0,0 +1,308 @@ +import numpy as np +import pytest + +import pandas as pd +from pandas import ( + DataFrame, + Series, + Timestamp, + date_range, +) +import pandas._testing as tm + + +class TestDataFrameDiff: + def test_diff_requires_integer(self): + df = DataFrame(np.random.default_rng(2).standard_normal((2, 2))) + with pytest.raises(ValueError, match="periods must be an integer"): + df.diff(1.5) + + # GH#44572 np.int64 is accepted + @pytest.mark.parametrize("num", [1, np.int64(1)]) + def test_diff(self, datetime_frame, num): + df = datetime_frame + the_diff = df.diff(num) + + expected = df["A"] - df["A"].shift(num) + tm.assert_series_equal(the_diff["A"], expected) + + def test_diff_int_dtype(self): + # int dtype + a = 10_000_000_000_000_000 + b = a + 1 + ser = Series([a, b]) + + rs = DataFrame({"s": ser}).diff() + assert rs.s[1] == 1 + + def test_diff_mixed_numeric(self, datetime_frame): + # mixed numeric + tf = datetime_frame.astype("float32") + the_diff = tf.diff(1) + tm.assert_series_equal(the_diff["A"], tf["A"] - tf["A"].shift(1)) + + def test_diff_axis1_nonconsolidated(self): + # GH#10907 + df = DataFrame({"y": Series([2]), "z": Series([3])}) + df.insert(0, "x", 1) + result = df.diff(axis=1) + expected = DataFrame({"x": np.nan, "y": Series(1), "z": Series(1)}) + tm.assert_frame_equal(result, expected) + + def test_diff_timedelta64_with_nat(self): + # GH#32441 + arr = np.arange(6).reshape(3, 2).astype("timedelta64[ns]") + arr[:, 0] = np.timedelta64("NaT", "ns") + + df = DataFrame(arr) + result = df.diff(1, axis=0) + + expected = DataFrame({0: df[0], 1: [pd.NaT, pd.Timedelta(2), pd.Timedelta(2)]}) + tm.assert_equal(result, expected) + + result = df.diff(0) + expected = df - df + assert expected[0].isna().all() + tm.assert_equal(result, expected) + + result = df.diff(-1, axis=1) + expected = df * np.nan + tm.assert_equal(result, expected) + + @pytest.mark.parametrize("tz", [None, "UTC"]) + def test_diff_datetime_axis0_with_nat(self, tz, unit): + # GH#32441 + dti = pd.DatetimeIndex(["NaT", "2019-01-01", "2019-01-02"], tz=tz).as_unit(unit) + ser = Series(dti) + + df = ser.to_frame() + + result = df.diff() + ex_index = pd.TimedeltaIndex([pd.NaT, pd.NaT, pd.Timedelta(days=1)]).as_unit( + unit + ) + expected = Series(ex_index).to_frame() + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize("tz", [None, "UTC"]) + def test_diff_datetime_with_nat_zero_periods(self, tz): + # diff on NaT values should give NaT, not timedelta64(0) + dti = date_range("2016-01-01", periods=4, tz=tz) + ser = Series(dti) + df = ser.to_frame().copy() + + df[1] = ser.copy() + + df.iloc[:, 0] = pd.NaT + + expected = df - df + assert expected[0].isna().all() + + result = df.diff(0, axis=0) + tm.assert_frame_equal(result, expected) + + result = df.diff(0, axis=1) + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize("tz", [None, "UTC"]) + def test_diff_datetime_axis0(self, tz): + # GH#18578 + df = DataFrame( + { + 0: date_range("2010", freq="D", periods=2, tz=tz), + 1: date_range("2010", freq="D", periods=2, tz=tz), + } + ) + + result = df.diff(axis=0) + expected = DataFrame( + { + 0: pd.TimedeltaIndex(["NaT", "1 days"]), + 1: pd.TimedeltaIndex(["NaT", "1 days"]), + } + ) + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize("tz", [None, "UTC"]) + def test_diff_datetime_axis1(self, tz): + # GH#18578 + df = DataFrame( + { + 0: date_range("2010", freq="D", periods=2, tz=tz), + 1: date_range("2010", freq="D", periods=2, tz=tz), + } + ) + + result = df.diff(axis=1) + expected = DataFrame( + { + 0: pd.TimedeltaIndex(["NaT", "NaT"]), + 1: pd.TimedeltaIndex(["0 days", "0 days"]), + } + ) + tm.assert_frame_equal(result, expected) + + def test_diff_timedelta(self, unit): + # GH#4533 + df = DataFrame( + { + "time": [Timestamp("20130101 9:01"), Timestamp("20130101 9:02")], + "value": [1.0, 2.0], + } + ) + df["time"] = df["time"].dt.as_unit(unit) + + res = df.diff() + exp = DataFrame( + [[pd.NaT, np.nan], [pd.Timedelta("00:01:00"), 1]], columns=["time", "value"] + ) + exp["time"] = exp["time"].dt.as_unit(unit) + tm.assert_frame_equal(res, exp) + + def test_diff_mixed_dtype(self): + df = DataFrame(np.random.default_rng(2).standard_normal((5, 3))) + df["A"] = np.array([1, 2, 3, 4, 5], dtype=object) + + result = df.diff() + assert result[0].dtype == np.float64 + + def test_diff_neg_n(self, datetime_frame): + rs = datetime_frame.diff(-1) + xp = datetime_frame - datetime_frame.shift(-1) + tm.assert_frame_equal(rs, xp) + + def test_diff_float_n(self, datetime_frame): + rs = datetime_frame.diff(1.0) + xp = datetime_frame.diff(1) + tm.assert_frame_equal(rs, xp) + + def test_diff_axis(self): + # GH#9727 + df = DataFrame([[1.0, 2.0], [3.0, 4.0]]) + tm.assert_frame_equal( + df.diff(axis=1), DataFrame([[np.nan, 1.0], [np.nan, 1.0]]) + ) + tm.assert_frame_equal( + df.diff(axis=0), DataFrame([[np.nan, np.nan], [2.0, 2.0]]) + ) + + def test_diff_period(self): + # GH#32995 Don't pass an incorrect axis + pi = date_range("2016-01-01", periods=3).to_period("D") + df = DataFrame({"A": pi}) + + result = df.diff(1, axis=1) + + expected = (df - pd.NaT).astype(object) + tm.assert_frame_equal(result, expected) + + def test_diff_axis1_mixed_dtypes(self): + # GH#32995 operate column-wise when we have mixed dtypes and axis=1 + df = DataFrame({"A": range(3), "B": 2 * np.arange(3, dtype=np.float64)}) + + expected = DataFrame({"A": [np.nan, np.nan, np.nan], "B": df["B"] / 2}) + + result = df.diff(axis=1) + tm.assert_frame_equal(result, expected) + + # GH#21437 mixed-float-dtypes + df = DataFrame( + {"a": np.arange(3, dtype="float32"), "b": np.arange(3, dtype="float64")} + ) + result = df.diff(axis=1) + expected = DataFrame({"a": df["a"] * np.nan, "b": df["b"] * 0}) + tm.assert_frame_equal(result, expected) + + def test_diff_axis1_mixed_dtypes_large_periods(self): + # GH#32995 operate column-wise when we have mixed dtypes and axis=1 + df = DataFrame({"A": range(3), "B": 2 * np.arange(3, dtype=np.float64)}) + + expected = df * np.nan + + result = df.diff(axis=1, periods=3) + tm.assert_frame_equal(result, expected) + + def test_diff_axis1_mixed_dtypes_negative_periods(self): + # GH#32995 operate column-wise when we have mixed dtypes and axis=1 + df = DataFrame({"A": range(3), "B": 2 * np.arange(3, dtype=np.float64)}) + + expected = DataFrame({"A": -1.0 * df["A"], "B": df["B"] * np.nan}) + + result = df.diff(axis=1, periods=-1) + tm.assert_frame_equal(result, expected) + + def test_diff_sparse(self): + # GH#28813 .diff() should work for sparse dataframes as well + sparse_df = DataFrame([[0, 1], [1, 0]], dtype="Sparse[int]") + + result = sparse_df.diff() + expected = DataFrame( + [[np.nan, np.nan], [1.0, -1.0]], dtype=pd.SparseDtype("float", 0.0) + ) + + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize( + "axis,expected", + [ + ( + 0, + DataFrame( + { + "a": [np.nan, 0, 1, 0, np.nan, np.nan, np.nan, 0], + "b": [np.nan, 1, np.nan, np.nan, -2, 1, np.nan, np.nan], + "c": np.repeat(np.nan, 8), + "d": [np.nan, 3, 5, 7, 9, 11, 13, 15], + }, + dtype="Int64", + ), + ), + ( + 1, + DataFrame( + { + "a": np.repeat(np.nan, 8), + "b": [0, 1, np.nan, 1, np.nan, np.nan, np.nan, 0], + "c": np.repeat(np.nan, 8), + "d": np.repeat(np.nan, 8), + }, + dtype="Int64", + ), + ), + ], + ) + def test_diff_integer_na(self, axis, expected): + # GH#24171 IntegerNA Support for DataFrame.diff() + df = DataFrame( + { + "a": np.repeat([0, 1, np.nan, 2], 2), + "b": np.tile([0, 1, np.nan, 2], 2), + "c": np.repeat(np.nan, 8), + "d": np.arange(1, 9) ** 2, + }, + dtype="Int64", + ) + + # Test case for default behaviour of diff + result = df.diff(axis=axis) + tm.assert_frame_equal(result, expected) + + def test_diff_readonly(self): + # https://github.com/pandas-dev/pandas/issues/35559 + arr = np.random.default_rng(2).standard_normal((5, 2)) + arr.flags.writeable = False + df = DataFrame(arr) + result = df.diff() + expected = DataFrame(np.array(df)).diff() + tm.assert_frame_equal(result, expected) + + def test_diff_all_int_dtype(self, any_int_numpy_dtype): + # GH 14773 + df = DataFrame(range(5)) + df = df.astype(any_int_numpy_dtype) + result = df.diff() + expected_dtype = ( + "float32" if any_int_numpy_dtype in ("int8", "int16") else "float64" + ) + expected = DataFrame([np.nan, 1.0, 1.0, 1.0, 1.0], dtype=expected_dtype) + tm.assert_frame_equal(result, expected) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/test_dot.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/test_dot.py new file mode 100644 index 0000000000000000000000000000000000000000..3e01f67c8794bcf35d2b7be57f8bedcc06c2a137 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/test_dot.py @@ -0,0 +1,155 @@ +import numpy as np +import pytest + +from pandas import ( + DataFrame, + Series, +) +import pandas._testing as tm + + +class DotSharedTests: + @pytest.fixture + def obj(self): + raise NotImplementedError + + @pytest.fixture + def other(self) -> DataFrame: + """ + other is a DataFrame that is indexed so that obj.dot(other) is valid + """ + raise NotImplementedError + + @pytest.fixture + def expected(self, obj, other) -> DataFrame: + """ + The expected result of obj.dot(other) + """ + raise NotImplementedError + + @classmethod + def reduced_dim_assert(cls, result, expected): + """ + Assertion about results with 1 fewer dimension that self.obj + """ + raise NotImplementedError + + def test_dot_equiv_values_dot(self, obj, other, expected): + # `expected` is constructed from obj.values.dot(other.values) + result = obj.dot(other) + tm.assert_equal(result, expected) + + def test_dot_2d_ndarray(self, obj, other, expected): + # Check ndarray argument; in this case we get matching values, + # but index/columns may not match + result = obj.dot(other.values) + assert np.all(result == expected.values) + + def test_dot_1d_ndarray(self, obj, expected): + # can pass correct-length array + row = obj.iloc[0] if obj.ndim == 2 else obj + + result = obj.dot(row.values) + expected = obj.dot(row) + self.reduced_dim_assert(result, expected) + + def test_dot_series(self, obj, other, expected): + # Check series argument + result = obj.dot(other["1"]) + self.reduced_dim_assert(result, expected["1"]) + + def test_dot_series_alignment(self, obj, other, expected): + result = obj.dot(other.iloc[::-1]["1"]) + self.reduced_dim_assert(result, expected["1"]) + + def test_dot_aligns(self, obj, other, expected): + # Check index alignment + other2 = other.iloc[::-1] + result = obj.dot(other2) + tm.assert_equal(result, expected) + + def test_dot_shape_mismatch(self, obj): + msg = "Dot product shape mismatch" + # exception raised is of type Exception + with pytest.raises(Exception, match=msg): + obj.dot(obj.values[:3]) + + def test_dot_misaligned(self, obj, other): + msg = "matrices are not aligned" + with pytest.raises(ValueError, match=msg): + obj.dot(other.T) + + +class TestSeriesDot(DotSharedTests): + @pytest.fixture + def obj(self): + return Series( + np.random.default_rng(2).standard_normal(4), index=["p", "q", "r", "s"] + ) + + @pytest.fixture + def other(self): + return DataFrame( + np.random.default_rng(2).standard_normal((3, 4)), + index=["1", "2", "3"], + columns=["p", "q", "r", "s"], + ).T + + @pytest.fixture + def expected(self, obj, other): + return Series(np.dot(obj.values, other.values), index=other.columns) + + @classmethod + def reduced_dim_assert(cls, result, expected): + """ + Assertion about results with 1 fewer dimension that self.obj + """ + tm.assert_almost_equal(result, expected) + + +class TestDataFrameDot(DotSharedTests): + @pytest.fixture + def obj(self): + return DataFrame( + np.random.default_rng(2).standard_normal((3, 4)), + index=["a", "b", "c"], + columns=["p", "q", "r", "s"], + ) + + @pytest.fixture + def other(self): + return DataFrame( + np.random.default_rng(2).standard_normal((4, 2)), + index=["p", "q", "r", "s"], + columns=["1", "2"], + ) + + @pytest.fixture + def expected(self, obj, other): + return DataFrame( + np.dot(obj.values, other.values), index=obj.index, columns=other.columns + ) + + @classmethod + def reduced_dim_assert(cls, result, expected): + """ + Assertion about results with 1 fewer dimension that self.obj + """ + tm.assert_series_equal(result, expected, check_names=False) + assert result.name is None + + +@pytest.mark.parametrize( + "dtype,exp_dtype", + [("Float32", "Float64"), ("Int16", "Int32"), ("float[pyarrow]", "double[pyarrow]")], +) +def test_arrow_dtype(dtype, exp_dtype): + pytest.importorskip("pyarrow") + + cols = ["a", "b"] + df_a = DataFrame([[1, 2], [3, 4], [5, 6]], columns=cols, dtype="int32") + df_b = DataFrame([[1, 0], [0, 1]], index=cols, dtype=dtype) + result = df_a.dot(df_b) + expected = DataFrame([[1, 2], [3, 4], [5, 6]], dtype=exp_dtype) + + tm.assert_frame_equal(result, expected) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/test_drop.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/test_drop.py new file mode 100644 index 0000000000000000000000000000000000000000..06cd51b43a0aa038868d533d4e664db6681bc801 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/test_drop.py @@ -0,0 +1,546 @@ +import re + +import numpy as np +import pytest + +from pandas.errors import PerformanceWarning + +import pandas as pd +from pandas import ( + DataFrame, + DatetimeIndex, + Index, + MultiIndex, + Series, + Timestamp, +) +import pandas._testing as tm + + +@pytest.mark.parametrize( + "msg,labels,level", + [ + (r"labels \[4\] not found in level", 4, "a"), + (r"labels \[7\] not found in level", 7, "b"), + ], +) +def test_drop_raise_exception_if_labels_not_in_level(msg, labels, level): + # GH 8594 + mi = MultiIndex.from_arrays([[1, 2, 3], [4, 5, 6]], names=["a", "b"]) + s = Series([10, 20, 30], index=mi) + df = DataFrame([10, 20, 30], index=mi) + + with pytest.raises(KeyError, match=msg): + s.drop(labels, level=level) + with pytest.raises(KeyError, match=msg): + df.drop(labels, level=level) + + +@pytest.mark.parametrize("labels,level", [(4, "a"), (7, "b")]) +def test_drop_errors_ignore(labels, level): + # GH 8594 + mi = MultiIndex.from_arrays([[1, 2, 3], [4, 5, 6]], names=["a", "b"]) + s = Series([10, 20, 30], index=mi) + df = DataFrame([10, 20, 30], index=mi) + + expected_s = s.drop(labels, level=level, errors="ignore") + tm.assert_series_equal(s, expected_s) + + expected_df = df.drop(labels, level=level, errors="ignore") + tm.assert_frame_equal(df, expected_df) + + +def test_drop_with_non_unique_datetime_index_and_invalid_keys(): + # GH 30399 + + # define dataframe with unique datetime index + df = DataFrame( + np.random.default_rng(2).standard_normal((5, 3)), + columns=["a", "b", "c"], + index=pd.date_range("2012", freq="h", periods=5), + ) + # create dataframe with non-unique datetime index + df = df.iloc[[0, 2, 2, 3]].copy() + + with pytest.raises(KeyError, match="not found in axis"): + df.drop(["a", "b"]) # Dropping with labels not exist in the index + + +class TestDataFrameDrop: + def test_drop_names(self): + df = DataFrame( + [[1, 2, 3], [3, 4, 5], [5, 6, 7]], + index=["a", "b", "c"], + columns=["d", "e", "f"], + ) + df.index.name, df.columns.name = "first", "second" + df_dropped_b = df.drop("b") + df_dropped_e = df.drop("e", axis=1) + df_inplace_b, df_inplace_e = df.copy(), df.copy() + return_value = df_inplace_b.drop("b", inplace=True) + assert return_value is None + return_value = df_inplace_e.drop("e", axis=1, inplace=True) + assert return_value is None + for obj in (df_dropped_b, df_dropped_e, df_inplace_b, df_inplace_e): + assert obj.index.name == "first" + assert obj.columns.name == "second" + assert list(df.columns) == ["d", "e", "f"] + + msg = r"\['g'\] not found in axis" + with pytest.raises(KeyError, match=msg): + df.drop(["g"]) + with pytest.raises(KeyError, match=msg): + df.drop(["g"], axis=1) + + # errors = 'ignore' + dropped = df.drop(["g"], errors="ignore") + expected = Index(["a", "b", "c"], name="first") + tm.assert_index_equal(dropped.index, expected) + + dropped = df.drop(["b", "g"], errors="ignore") + expected = Index(["a", "c"], name="first") + tm.assert_index_equal(dropped.index, expected) + + dropped = df.drop(["g"], axis=1, errors="ignore") + expected = Index(["d", "e", "f"], name="second") + tm.assert_index_equal(dropped.columns, expected) + + dropped = df.drop(["d", "g"], axis=1, errors="ignore") + expected = Index(["e", "f"], name="second") + tm.assert_index_equal(dropped.columns, expected) + + # GH 16398 + dropped = df.drop([], errors="ignore") + expected = Index(["a", "b", "c"], name="first") + tm.assert_index_equal(dropped.index, expected) + + def test_drop(self): + simple = DataFrame({"A": [1, 2, 3, 4], "B": [0, 1, 2, 3]}) + tm.assert_frame_equal(simple.drop("A", axis=1), simple[["B"]]) + tm.assert_frame_equal(simple.drop(["A", "B"], axis="columns"), simple[[]]) + tm.assert_frame_equal(simple.drop([0, 1, 3], axis=0), simple.loc[[2], :]) + tm.assert_frame_equal(simple.drop([0, 3], axis="index"), simple.loc[[1, 2], :]) + + with pytest.raises(KeyError, match=r"\[5\] not found in axis"): + simple.drop(5) + with pytest.raises(KeyError, match=r"\['C'\] not found in axis"): + simple.drop("C", axis=1) + with pytest.raises(KeyError, match=r"\[5\] not found in axis"): + simple.drop([1, 5]) + with pytest.raises(KeyError, match=r"\['C'\] not found in axis"): + simple.drop(["A", "C"], axis=1) + + # GH 42881 + with pytest.raises(KeyError, match=r"\['C', 'D', 'F'\] not found in axis"): + simple.drop(["C", "D", "F"], axis=1) + + # errors = 'ignore' + tm.assert_frame_equal(simple.drop(5, errors="ignore"), simple) + tm.assert_frame_equal( + simple.drop([0, 5], errors="ignore"), simple.loc[[1, 2, 3], :] + ) + tm.assert_frame_equal(simple.drop("C", axis=1, errors="ignore"), simple) + tm.assert_frame_equal( + simple.drop(["A", "C"], axis=1, errors="ignore"), simple[["B"]] + ) + + # non-unique - wheee! + nu_df = DataFrame( + list(zip(range(3), range(-3, 1), list("abc"))), columns=["a", "a", "b"] + ) + tm.assert_frame_equal(nu_df.drop("a", axis=1), nu_df[["b"]]) + tm.assert_frame_equal(nu_df.drop("b", axis="columns"), nu_df["a"]) + tm.assert_frame_equal(nu_df.drop([]), nu_df) # GH 16398 + + nu_df = nu_df.set_index(Index(["X", "Y", "X"])) + nu_df.columns = list("abc") + tm.assert_frame_equal(nu_df.drop("X", axis="rows"), nu_df.loc[["Y"], :]) + tm.assert_frame_equal(nu_df.drop(["X", "Y"], axis=0), nu_df.loc[[], :]) + + # inplace cache issue + # GH#5628 + df = DataFrame( + np.random.default_rng(2).standard_normal((10, 3)), columns=list("abc") + ) + expected = df[~(df.b > 0)] + return_value = df.drop(labels=df[df.b > 0].index, inplace=True) + assert return_value is None + tm.assert_frame_equal(df, expected) + + def test_drop_multiindex_not_lexsorted(self): + # GH#11640 + + # define the lexsorted version + lexsorted_mi = MultiIndex.from_tuples( + [("a", ""), ("b1", "c1"), ("b2", "c2")], names=["b", "c"] + ) + lexsorted_df = DataFrame([[1, 3, 4]], columns=lexsorted_mi) + assert lexsorted_df.columns._is_lexsorted() + + # define the non-lexsorted version + not_lexsorted_df = DataFrame( + columns=["a", "b", "c", "d"], data=[[1, "b1", "c1", 3], [1, "b2", "c2", 4]] + ) + not_lexsorted_df = not_lexsorted_df.pivot_table( + index="a", columns=["b", "c"], values="d" + ) + not_lexsorted_df = not_lexsorted_df.reset_index() + assert not not_lexsorted_df.columns._is_lexsorted() + + expected = lexsorted_df.drop("a", axis=1).astype(float) + with tm.assert_produces_warning(PerformanceWarning): + result = not_lexsorted_df.drop("a", axis=1) + + tm.assert_frame_equal(result, expected) + + def test_drop_api_equivalence(self): + # equivalence of the labels/axis and index/columns API's (GH#12392) + df = DataFrame( + [[1, 2, 3], [3, 4, 5], [5, 6, 7]], + index=["a", "b", "c"], + columns=["d", "e", "f"], + ) + + res1 = df.drop("a") + res2 = df.drop(index="a") + tm.assert_frame_equal(res1, res2) + + res1 = df.drop("d", axis=1) + res2 = df.drop(columns="d") + tm.assert_frame_equal(res1, res2) + + res1 = df.drop(labels="e", axis=1) + res2 = df.drop(columns="e") + tm.assert_frame_equal(res1, res2) + + res1 = df.drop(["a"], axis=0) + res2 = df.drop(index=["a"]) + tm.assert_frame_equal(res1, res2) + + res1 = df.drop(["a"], axis=0).drop(["d"], axis=1) + res2 = df.drop(index=["a"], columns=["d"]) + tm.assert_frame_equal(res1, res2) + + msg = "Cannot specify both 'labels' and 'index'/'columns'" + with pytest.raises(ValueError, match=msg): + df.drop(labels="a", index="b") + + with pytest.raises(ValueError, match=msg): + df.drop(labels="a", columns="b") + + msg = "Need to specify at least one of 'labels', 'index' or 'columns'" + with pytest.raises(ValueError, match=msg): + df.drop(axis=1) + + data = [[1, 2, 3], [1, 2, 3]] + + @pytest.mark.parametrize( + "actual", + [ + DataFrame(data=data, index=["a", "a"]), + DataFrame(data=data, index=["a", "b"]), + DataFrame(data=data, index=["a", "b"]).set_index([0, 1]), + DataFrame(data=data, index=["a", "a"]).set_index([0, 1]), + ], + ) + def test_raise_on_drop_duplicate_index(self, actual): + # GH#19186 + level = 0 if isinstance(actual.index, MultiIndex) else None + msg = re.escape("\"['c'] not found in axis\"") + with pytest.raises(KeyError, match=msg): + actual.drop("c", level=level, axis=0) + with pytest.raises(KeyError, match=msg): + actual.T.drop("c", level=level, axis=1) + expected_no_err = actual.drop("c", axis=0, level=level, errors="ignore") + tm.assert_frame_equal(expected_no_err, actual) + expected_no_err = actual.T.drop("c", axis=1, level=level, errors="ignore") + tm.assert_frame_equal(expected_no_err.T, actual) + + @pytest.mark.parametrize("index", [[1, 2, 3], [1, 1, 2]]) + @pytest.mark.parametrize("drop_labels", [[], [1], [2]]) + def test_drop_empty_list(self, index, drop_labels): + # GH#21494 + expected_index = [i for i in index if i not in drop_labels] + frame = DataFrame(index=index).drop(drop_labels) + tm.assert_frame_equal(frame, DataFrame(index=expected_index)) + + @pytest.mark.parametrize("index", [[1, 2, 3], [1, 2, 2]]) + @pytest.mark.parametrize("drop_labels", [[1, 4], [4, 5]]) + def test_drop_non_empty_list(self, index, drop_labels): + # GH# 21494 + with pytest.raises(KeyError, match="not found in axis"): + DataFrame(index=index).drop(drop_labels) + + @pytest.mark.parametrize( + "empty_listlike", + [ + [], + {}, + np.array([]), + Series([], dtype="datetime64[ns]"), + Index([]), + DatetimeIndex([]), + ], + ) + def test_drop_empty_listlike_non_unique_datetime_index(self, empty_listlike): + # GH#27994 + data = {"column_a": [5, 10], "column_b": ["one", "two"]} + index = [Timestamp("2021-01-01"), Timestamp("2021-01-01")] + df = DataFrame(data, index=index) + + # Passing empty list-like should return the same DataFrame. + expected = df.copy() + result = df.drop(empty_listlike) + tm.assert_frame_equal(result, expected) + + def test_mixed_depth_drop(self): + arrays = [ + ["a", "top", "top", "routine1", "routine1", "routine2"], + ["", "OD", "OD", "result1", "result2", "result1"], + ["", "wx", "wy", "", "", ""], + ] + + tuples = sorted(zip(*arrays)) + index = MultiIndex.from_tuples(tuples) + df = DataFrame(np.random.default_rng(2).standard_normal((4, 6)), columns=index) + + result = df.drop("a", axis=1) + expected = df.drop([("a", "", "")], axis=1) + tm.assert_frame_equal(expected, result) + + result = df.drop(["top"], axis=1) + expected = df.drop([("top", "OD", "wx")], axis=1) + expected = expected.drop([("top", "OD", "wy")], axis=1) + tm.assert_frame_equal(expected, result) + + result = df.drop(("top", "OD", "wx"), axis=1) + expected = df.drop([("top", "OD", "wx")], axis=1) + tm.assert_frame_equal(expected, result) + + expected = df.drop([("top", "OD", "wy")], axis=1) + expected = df.drop("top", axis=1) + + result = df.drop("result1", level=1, axis=1) + expected = df.drop( + [("routine1", "result1", ""), ("routine2", "result1", "")], axis=1 + ) + tm.assert_frame_equal(expected, result) + + def test_drop_multiindex_other_level_nan(self): + # GH#12754 + df = ( + DataFrame( + { + "A": ["one", "one", "two", "two"], + "B": [np.nan, 0.0, 1.0, 2.0], + "C": ["a", "b", "c", "c"], + "D": [1, 2, 3, 4], + } + ) + .set_index(["A", "B", "C"]) + .sort_index() + ) + result = df.drop("c", level="C") + expected = DataFrame( + [2, 1], + columns=["D"], + index=MultiIndex.from_tuples( + [("one", 0.0, "b"), ("one", np.nan, "a")], names=["A", "B", "C"] + ), + ) + tm.assert_frame_equal(result, expected) + + def test_drop_nonunique(self): + df = DataFrame( + [ + ["x-a", "x", "a", 1.5], + ["x-a", "x", "a", 1.2], + ["z-c", "z", "c", 3.1], + ["x-a", "x", "a", 4.1], + ["x-b", "x", "b", 5.1], + ["x-b", "x", "b", 4.1], + ["x-b", "x", "b", 2.2], + ["y-a", "y", "a", 1.2], + ["z-b", "z", "b", 2.1], + ], + columns=["var1", "var2", "var3", "var4"], + ) + + grp_size = df.groupby("var1").size() + drop_idx = grp_size.loc[grp_size == 1] + + idf = df.set_index(["var1", "var2", "var3"]) + + # it works! GH#2101 + result = idf.drop(drop_idx.index, level=0).reset_index() + expected = df[-df.var1.isin(drop_idx.index)] + + result.index = expected.index + + tm.assert_frame_equal(result, expected) + + def test_drop_level(self, multiindex_dataframe_random_data): + frame = multiindex_dataframe_random_data + + result = frame.drop(["bar", "qux"], level="first") + expected = frame.iloc[[0, 1, 2, 5, 6]] + tm.assert_frame_equal(result, expected) + + result = frame.drop(["two"], level="second") + expected = frame.iloc[[0, 2, 3, 6, 7, 9]] + tm.assert_frame_equal(result, expected) + + result = frame.T.drop(["bar", "qux"], axis=1, level="first") + expected = frame.iloc[[0, 1, 2, 5, 6]].T + tm.assert_frame_equal(result, expected) + + result = frame.T.drop(["two"], axis=1, level="second") + expected = frame.iloc[[0, 2, 3, 6, 7, 9]].T + tm.assert_frame_equal(result, expected) + + def test_drop_level_nonunique_datetime(self): + # GH#12701 + idx = Index([2, 3, 4, 4, 5], name="id") + idxdt = pd.to_datetime( + [ + "2016-03-23 14:00", + "2016-03-23 15:00", + "2016-03-23 16:00", + "2016-03-23 16:00", + "2016-03-23 17:00", + ] + ) + df = DataFrame(np.arange(10).reshape(5, 2), columns=list("ab"), index=idx) + df["tstamp"] = idxdt + df = df.set_index("tstamp", append=True) + ts = Timestamp("201603231600") + assert df.index.is_unique is False + + result = df.drop(ts, level="tstamp") + expected = df.loc[idx != 4] + tm.assert_frame_equal(result, expected) + + def test_drop_tz_aware_timestamp_across_dst(self, frame_or_series): + # GH#21761 + start = Timestamp("2017-10-29", tz="Europe/Berlin") + end = Timestamp("2017-10-29 04:00:00", tz="Europe/Berlin") + index = pd.date_range(start, end, freq="15min") + data = frame_or_series(data=[1] * len(index), index=index) + result = data.drop(start) + expected_start = Timestamp("2017-10-29 00:15:00", tz="Europe/Berlin") + expected_idx = pd.date_range(expected_start, end, freq="15min") + expected = frame_or_series(data=[1] * len(expected_idx), index=expected_idx) + tm.assert_equal(result, expected) + + def test_drop_preserve_names(self): + index = MultiIndex.from_arrays( + [[0, 0, 0, 1, 1, 1], [1, 2, 3, 1, 2, 3]], names=["one", "two"] + ) + + df = DataFrame(np.random.default_rng(2).standard_normal((6, 3)), index=index) + + result = df.drop([(0, 2)]) + assert result.index.names == ("one", "two") + + @pytest.mark.parametrize( + "operation", ["__iadd__", "__isub__", "__imul__", "__ipow__"] + ) + @pytest.mark.parametrize("inplace", [False, True]) + def test_inplace_drop_and_operation(self, operation, inplace): + # GH#30484 + df = DataFrame({"x": range(5)}) + expected = df.copy() + df["y"] = range(5) + y = df["y"] + + with tm.assert_produces_warning(None): + if inplace: + df.drop("y", axis=1, inplace=inplace) + else: + df = df.drop("y", axis=1, inplace=inplace) + + # Perform operation and check result + getattr(y, operation)(1) + tm.assert_frame_equal(df, expected) + + def test_drop_with_non_unique_multiindex(self): + # GH#36293 + mi = MultiIndex.from_arrays([["x", "y", "x"], ["i", "j", "i"]]) + df = DataFrame([1, 2, 3], index=mi) + result = df.drop(index="x") + expected = DataFrame([2], index=MultiIndex.from_arrays([["y"], ["j"]])) + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize("indexer", [("a", "a"), [("a", "a")]]) + def test_drop_tuple_with_non_unique_multiindex(self, indexer): + # GH#42771 + idx = MultiIndex.from_product([["a", "b"], ["a", "a"]]) + df = DataFrame({"x": range(len(idx))}, index=idx) + result = df.drop(index=[("a", "a")]) + expected = DataFrame( + {"x": [2, 3]}, index=MultiIndex.from_tuples([("b", "a"), ("b", "a")]) + ) + tm.assert_frame_equal(result, expected) + + def test_drop_with_duplicate_columns(self): + df = DataFrame( + [[1, 5, 7.0], [1, 5, 7.0], [1, 5, 7.0]], columns=["bar", "a", "a"] + ) + result = df.drop(["a"], axis=1) + expected = DataFrame([[1], [1], [1]], columns=["bar"]) + tm.assert_frame_equal(result, expected) + result = df.drop("a", axis=1) + tm.assert_frame_equal(result, expected) + + def test_drop_with_duplicate_columns2(self): + # drop buggy GH#6240 + df = DataFrame( + { + "A": np.random.default_rng(2).standard_normal(5), + "B": np.random.default_rng(2).standard_normal(5), + "C": np.random.default_rng(2).standard_normal(5), + "D": ["a", "b", "c", "d", "e"], + } + ) + + expected = df.take([0, 1, 1], axis=1) + df2 = df.take([2, 0, 1, 2, 1], axis=1) + result = df2.drop("C", axis=1) + tm.assert_frame_equal(result, expected) + + def test_drop_inplace_no_leftover_column_reference(self): + # GH 13934 + df = DataFrame({"a": [1, 2, 3]}, columns=Index(["a"], dtype="object")) + a = df.a + df.drop(["a"], axis=1, inplace=True) + tm.assert_index_equal(df.columns, Index([], dtype="object")) + a -= a.mean() + tm.assert_index_equal(df.columns, Index([], dtype="object")) + + def test_drop_level_missing_label_multiindex(self): + # GH 18561 + df = DataFrame(index=MultiIndex.from_product([range(3), range(3)])) + with pytest.raises(KeyError, match="labels \\[5\\] not found in level"): + df.drop(5, level=0) + + @pytest.mark.parametrize("idx, level", [(["a", "b"], 0), (["a"], None)]) + def test_drop_index_ea_dtype(self, any_numeric_ea_dtype, idx, level): + # GH#45860 + df = DataFrame( + {"a": [1, 2, 2, pd.NA], "b": 100}, dtype=any_numeric_ea_dtype + ).set_index(idx) + result = df.drop(Index([2, pd.NA]), level=level) + expected = DataFrame( + {"a": [1], "b": 100}, dtype=any_numeric_ea_dtype + ).set_index(idx) + tm.assert_frame_equal(result, expected) + + def test_drop_parse_strings_datetime_index(self): + # GH #5355 + df = DataFrame( + {"a": [1, 2], "b": [1, 2]}, + index=[Timestamp("2000-01-03"), Timestamp("2000-01-04")], + ) + result = df.drop("2000-01-03", axis=0) + expected = DataFrame({"a": [2], "b": [2]}, index=[Timestamp("2000-01-04")]) + tm.assert_frame_equal(result, expected) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/test_drop_duplicates.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/test_drop_duplicates.py new file mode 100644 index 0000000000000000000000000000000000000000..6bea97b2cf189d81b99996cc8cc78a3b92f7afc0 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/test_drop_duplicates.py @@ -0,0 +1,473 @@ +from datetime import datetime +import re + +import numpy as np +import pytest + +from pandas import ( + DataFrame, + NaT, + concat, +) +import pandas._testing as tm + + +@pytest.mark.parametrize("subset", ["a", ["a"], ["a", "B"]]) +def test_drop_duplicates_with_misspelled_column_name(subset): + # GH 19730 + df = DataFrame({"A": [0, 0, 1], "B": [0, 0, 1], "C": [0, 0, 1]}) + msg = re.escape("Index(['a'], dtype=") + + with pytest.raises(KeyError, match=msg): + df.drop_duplicates(subset) + + +def test_drop_duplicates(): + df = DataFrame( + { + "AAA": ["foo", "bar", "foo", "bar", "foo", "bar", "bar", "foo"], + "B": ["one", "one", "two", "two", "two", "two", "one", "two"], + "C": [1, 1, 2, 2, 2, 2, 1, 2], + "D": range(8), + } + ) + # single column + result = df.drop_duplicates("AAA") + expected = df[:2] + tm.assert_frame_equal(result, expected) + + result = df.drop_duplicates("AAA", keep="last") + expected = df.loc[[6, 7]] + tm.assert_frame_equal(result, expected) + + result = df.drop_duplicates("AAA", keep=False) + expected = df.loc[[]] + tm.assert_frame_equal(result, expected) + assert len(result) == 0 + + # multi column + expected = df.loc[[0, 1, 2, 3]] + result = df.drop_duplicates(np.array(["AAA", "B"])) + tm.assert_frame_equal(result, expected) + result = df.drop_duplicates(["AAA", "B"]) + tm.assert_frame_equal(result, expected) + + result = df.drop_duplicates(("AAA", "B"), keep="last") + expected = df.loc[[0, 5, 6, 7]] + tm.assert_frame_equal(result, expected) + + result = df.drop_duplicates(("AAA", "B"), keep=False) + expected = df.loc[[0]] + tm.assert_frame_equal(result, expected) + + # consider everything + df2 = df.loc[:, ["AAA", "B", "C"]] + + result = df2.drop_duplicates() + # in this case only + expected = df2.drop_duplicates(["AAA", "B"]) + tm.assert_frame_equal(result, expected) + + result = df2.drop_duplicates(keep="last") + expected = df2.drop_duplicates(["AAA", "B"], keep="last") + tm.assert_frame_equal(result, expected) + + result = df2.drop_duplicates(keep=False) + expected = df2.drop_duplicates(["AAA", "B"], keep=False) + tm.assert_frame_equal(result, expected) + + # integers + result = df.drop_duplicates("C") + expected = df.iloc[[0, 2]] + tm.assert_frame_equal(result, expected) + result = df.drop_duplicates("C", keep="last") + expected = df.iloc[[-2, -1]] + tm.assert_frame_equal(result, expected) + + df["E"] = df["C"].astype("int8") + result = df.drop_duplicates("E") + expected = df.iloc[[0, 2]] + tm.assert_frame_equal(result, expected) + result = df.drop_duplicates("E", keep="last") + expected = df.iloc[[-2, -1]] + tm.assert_frame_equal(result, expected) + + # GH 11376 + df = DataFrame({"x": [7, 6, 3, 3, 4, 8, 0], "y": [0, 6, 5, 5, 9, 1, 2]}) + expected = df.loc[df.index != 3] + tm.assert_frame_equal(df.drop_duplicates(), expected) + + df = DataFrame([[1, 0], [0, 2]]) + tm.assert_frame_equal(df.drop_duplicates(), df) + + df = DataFrame([[-2, 0], [0, -4]]) + tm.assert_frame_equal(df.drop_duplicates(), df) + + x = np.iinfo(np.int64).max / 3 * 2 + df = DataFrame([[-x, x], [0, x + 4]]) + tm.assert_frame_equal(df.drop_duplicates(), df) + + df = DataFrame([[-x, x], [x, x + 4]]) + tm.assert_frame_equal(df.drop_duplicates(), df) + + # GH 11864 + df = DataFrame([i] * 9 for i in range(16)) + df = concat([df, DataFrame([[1] + [0] * 8])], ignore_index=True) + + for keep in ["first", "last", False]: + assert df.duplicated(keep=keep).sum() == 0 + + +def test_drop_duplicates_with_duplicate_column_names(): + # GH17836 + df = DataFrame([[1, 2, 5], [3, 4, 6], [3, 4, 7]], columns=["a", "a", "b"]) + + result0 = df.drop_duplicates() + tm.assert_frame_equal(result0, df) + + result1 = df.drop_duplicates("a") + expected1 = df[:2] + tm.assert_frame_equal(result1, expected1) + + +def test_drop_duplicates_for_take_all(): + df = DataFrame( + { + "AAA": ["foo", "bar", "baz", "bar", "foo", "bar", "qux", "foo"], + "B": ["one", "one", "two", "two", "two", "two", "one", "two"], + "C": [1, 1, 2, 2, 2, 2, 1, 2], + "D": range(8), + } + ) + # single column + result = df.drop_duplicates("AAA") + expected = df.iloc[[0, 1, 2, 6]] + tm.assert_frame_equal(result, expected) + + result = df.drop_duplicates("AAA", keep="last") + expected = df.iloc[[2, 5, 6, 7]] + tm.assert_frame_equal(result, expected) + + result = df.drop_duplicates("AAA", keep=False) + expected = df.iloc[[2, 6]] + tm.assert_frame_equal(result, expected) + + # multiple columns + result = df.drop_duplicates(["AAA", "B"]) + expected = df.iloc[[0, 1, 2, 3, 4, 6]] + tm.assert_frame_equal(result, expected) + + result = df.drop_duplicates(["AAA", "B"], keep="last") + expected = df.iloc[[0, 1, 2, 5, 6, 7]] + tm.assert_frame_equal(result, expected) + + result = df.drop_duplicates(["AAA", "B"], keep=False) + expected = df.iloc[[0, 1, 2, 6]] + tm.assert_frame_equal(result, expected) + + +def test_drop_duplicates_tuple(): + df = DataFrame( + { + ("AA", "AB"): ["foo", "bar", "foo", "bar", "foo", "bar", "bar", "foo"], + "B": ["one", "one", "two", "two", "two", "two", "one", "two"], + "C": [1, 1, 2, 2, 2, 2, 1, 2], + "D": range(8), + } + ) + # single column + result = df.drop_duplicates(("AA", "AB")) + expected = df[:2] + tm.assert_frame_equal(result, expected) + + result = df.drop_duplicates(("AA", "AB"), keep="last") + expected = df.loc[[6, 7]] + tm.assert_frame_equal(result, expected) + + result = df.drop_duplicates(("AA", "AB"), keep=False) + expected = df.loc[[]] # empty df + assert len(result) == 0 + tm.assert_frame_equal(result, expected) + + # multi column + expected = df.loc[[0, 1, 2, 3]] + result = df.drop_duplicates((("AA", "AB"), "B")) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize( + "df", + [ + DataFrame(), + DataFrame(columns=[]), + DataFrame(columns=["A", "B", "C"]), + DataFrame(index=[]), + DataFrame(index=["A", "B", "C"]), + ], +) +def test_drop_duplicates_empty(df): + # GH 20516 + result = df.drop_duplicates() + tm.assert_frame_equal(result, df) + + result = df.copy() + result.drop_duplicates(inplace=True) + tm.assert_frame_equal(result, df) + + +def test_drop_duplicates_NA(): + # none + df = DataFrame( + { + "A": [None, None, "foo", "bar", "foo", "bar", "bar", "foo"], + "B": ["one", "one", "two", "two", "two", "two", "one", "two"], + "C": [1.0, np.nan, np.nan, np.nan, 1.0, 1.0, 1, 1.0], + "D": range(8), + } + ) + # single column + result = df.drop_duplicates("A") + expected = df.loc[[0, 2, 3]] + tm.assert_frame_equal(result, expected) + + result = df.drop_duplicates("A", keep="last") + expected = df.loc[[1, 6, 7]] + tm.assert_frame_equal(result, expected) + + result = df.drop_duplicates("A", keep=False) + expected = df.loc[[]] # empty df + tm.assert_frame_equal(result, expected) + assert len(result) == 0 + + # multi column + result = df.drop_duplicates(["A", "B"]) + expected = df.loc[[0, 2, 3, 6]] + tm.assert_frame_equal(result, expected) + + result = df.drop_duplicates(["A", "B"], keep="last") + expected = df.loc[[1, 5, 6, 7]] + tm.assert_frame_equal(result, expected) + + result = df.drop_duplicates(["A", "B"], keep=False) + expected = df.loc[[6]] + tm.assert_frame_equal(result, expected) + + # nan + df = DataFrame( + { + "A": ["foo", "bar", "foo", "bar", "foo", "bar", "bar", "foo"], + "B": ["one", "one", "two", "two", "two", "two", "one", "two"], + "C": [1.0, np.nan, np.nan, np.nan, 1.0, 1.0, 1, 1.0], + "D": range(8), + } + ) + # single column + result = df.drop_duplicates("C") + expected = df[:2] + tm.assert_frame_equal(result, expected) + + result = df.drop_duplicates("C", keep="last") + expected = df.loc[[3, 7]] + tm.assert_frame_equal(result, expected) + + result = df.drop_duplicates("C", keep=False) + expected = df.loc[[]] # empty df + tm.assert_frame_equal(result, expected) + assert len(result) == 0 + + # multi column + result = df.drop_duplicates(["C", "B"]) + expected = df.loc[[0, 1, 2, 4]] + tm.assert_frame_equal(result, expected) + + result = df.drop_duplicates(["C", "B"], keep="last") + expected = df.loc[[1, 3, 6, 7]] + tm.assert_frame_equal(result, expected) + + result = df.drop_duplicates(["C", "B"], keep=False) + expected = df.loc[[1]] + tm.assert_frame_equal(result, expected) + + +def test_drop_duplicates_NA_for_take_all(): + # none + df = DataFrame( + { + "A": [None, None, "foo", "bar", "foo", "baz", "bar", "qux"], + "C": [1.0, np.nan, np.nan, np.nan, 1.0, 2.0, 3, 1.0], + } + ) + + # single column + result = df.drop_duplicates("A") + expected = df.iloc[[0, 2, 3, 5, 7]] + tm.assert_frame_equal(result, expected) + + result = df.drop_duplicates("A", keep="last") + expected = df.iloc[[1, 4, 5, 6, 7]] + tm.assert_frame_equal(result, expected) + + result = df.drop_duplicates("A", keep=False) + expected = df.iloc[[5, 7]] + tm.assert_frame_equal(result, expected) + + # nan + + # single column + result = df.drop_duplicates("C") + expected = df.iloc[[0, 1, 5, 6]] + tm.assert_frame_equal(result, expected) + + result = df.drop_duplicates("C", keep="last") + expected = df.iloc[[3, 5, 6, 7]] + tm.assert_frame_equal(result, expected) + + result = df.drop_duplicates("C", keep=False) + expected = df.iloc[[5, 6]] + tm.assert_frame_equal(result, expected) + + +def test_drop_duplicates_inplace(): + orig = DataFrame( + { + "A": ["foo", "bar", "foo", "bar", "foo", "bar", "bar", "foo"], + "B": ["one", "one", "two", "two", "two", "two", "one", "two"], + "C": [1, 1, 2, 2, 2, 2, 1, 2], + "D": range(8), + } + ) + # single column + df = orig.copy() + return_value = df.drop_duplicates("A", inplace=True) + expected = orig[:2] + result = df + tm.assert_frame_equal(result, expected) + assert return_value is None + + df = orig.copy() + return_value = df.drop_duplicates("A", keep="last", inplace=True) + expected = orig.loc[[6, 7]] + result = df + tm.assert_frame_equal(result, expected) + assert return_value is None + + df = orig.copy() + return_value = df.drop_duplicates("A", keep=False, inplace=True) + expected = orig.loc[[]] + result = df + tm.assert_frame_equal(result, expected) + assert len(df) == 0 + assert return_value is None + + # multi column + df = orig.copy() + return_value = df.drop_duplicates(["A", "B"], inplace=True) + expected = orig.loc[[0, 1, 2, 3]] + result = df + tm.assert_frame_equal(result, expected) + assert return_value is None + + df = orig.copy() + return_value = df.drop_duplicates(["A", "B"], keep="last", inplace=True) + expected = orig.loc[[0, 5, 6, 7]] + result = df + tm.assert_frame_equal(result, expected) + assert return_value is None + + df = orig.copy() + return_value = df.drop_duplicates(["A", "B"], keep=False, inplace=True) + expected = orig.loc[[0]] + result = df + tm.assert_frame_equal(result, expected) + assert return_value is None + + # consider everything + orig2 = orig.loc[:, ["A", "B", "C"]].copy() + + df2 = orig2.copy() + return_value = df2.drop_duplicates(inplace=True) + # in this case only + expected = orig2.drop_duplicates(["A", "B"]) + result = df2 + tm.assert_frame_equal(result, expected) + assert return_value is None + + df2 = orig2.copy() + return_value = df2.drop_duplicates(keep="last", inplace=True) + expected = orig2.drop_duplicates(["A", "B"], keep="last") + result = df2 + tm.assert_frame_equal(result, expected) + assert return_value is None + + df2 = orig2.copy() + return_value = df2.drop_duplicates(keep=False, inplace=True) + expected = orig2.drop_duplicates(["A", "B"], keep=False) + result = df2 + tm.assert_frame_equal(result, expected) + assert return_value is None + + +@pytest.mark.parametrize("inplace", [True, False]) +@pytest.mark.parametrize( + "origin_dict, output_dict, ignore_index, output_index", + [ + ({"A": [2, 2, 3]}, {"A": [2, 3]}, True, [0, 1]), + ({"A": [2, 2, 3]}, {"A": [2, 3]}, False, [0, 2]), + ({"A": [2, 2, 3], "B": [2, 2, 4]}, {"A": [2, 3], "B": [2, 4]}, True, [0, 1]), + ({"A": [2, 2, 3], "B": [2, 2, 4]}, {"A": [2, 3], "B": [2, 4]}, False, [0, 2]), + ], +) +def test_drop_duplicates_ignore_index( + inplace, origin_dict, output_dict, ignore_index, output_index +): + # GH 30114 + df = DataFrame(origin_dict) + expected = DataFrame(output_dict, index=output_index) + + if inplace: + result_df = df.copy() + result_df.drop_duplicates(ignore_index=ignore_index, inplace=inplace) + else: + result_df = df.drop_duplicates(ignore_index=ignore_index, inplace=inplace) + + tm.assert_frame_equal(result_df, expected) + tm.assert_frame_equal(df, DataFrame(origin_dict)) + + +def test_drop_duplicates_null_in_object_column(nulls_fixture): + # https://github.com/pandas-dev/pandas/issues/32992 + df = DataFrame([[1, nulls_fixture], [2, "a"]], dtype=object) + result = df.drop_duplicates() + tm.assert_frame_equal(result, df) + + +def test_drop_duplicates_series_vs_dataframe(keep): + # GH#14192 + df = DataFrame( + { + "a": [1, 1, 1, "one", "one"], + "b": [2, 2, np.nan, np.nan, np.nan], + "c": [3, 3, np.nan, np.nan, "three"], + "d": [1, 2, 3, 4, 4], + "e": [ + datetime(2015, 1, 1), + datetime(2015, 1, 1), + datetime(2015, 2, 1), + NaT, + NaT, + ], + } + ) + for column in df.columns: + dropped_frame = df[[column]].drop_duplicates(keep=keep) + dropped_series = df[column].drop_duplicates(keep=keep) + tm.assert_frame_equal(dropped_frame, dropped_series.to_frame()) + + +@pytest.mark.parametrize("arg", [[1], 1, "True", [], 0]) +def test_drop_duplicates_non_boolean_ignore_index(arg): + # GH#38274 + df = DataFrame({"a": [1, 2, 1, 3]}) + msg = '^For argument "ignore_index" expected type bool, received type .*.$' + with pytest.raises(ValueError, match=msg): + df.drop_duplicates(ignore_index=arg) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/test_droplevel.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/test_droplevel.py new file mode 100644 index 0000000000000000000000000000000000000000..e1302d4b73f2b9c8e74b06c70ec29a92c1e48723 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/test_droplevel.py @@ -0,0 +1,36 @@ +import pytest + +from pandas import ( + DataFrame, + Index, + MultiIndex, +) +import pandas._testing as tm + + +class TestDropLevel: + def test_droplevel(self, frame_or_series): + # GH#20342 + cols = MultiIndex.from_tuples( + [("c", "e"), ("d", "f")], names=["level_1", "level_2"] + ) + mi = MultiIndex.from_tuples([(1, 2), (5, 6), (9, 10)], names=["a", "b"]) + df = DataFrame([[3, 4], [7, 8], [11, 12]], index=mi, columns=cols) + if frame_or_series is not DataFrame: + df = df.iloc[:, 0] + + # test that dropping of a level in index works + expected = df.reset_index("a", drop=True) + result = df.droplevel("a", axis="index") + tm.assert_equal(result, expected) + + if frame_or_series is DataFrame: + # test that dropping of a level in columns works + expected = df.copy() + expected.columns = Index(["c", "d"], name="level_1") + result = df.droplevel("level_2", axis="columns") + tm.assert_equal(result, expected) + else: + # test that droplevel raises ValueError on axis != 0 + with pytest.raises(ValueError, match="No axis named columns"): + df.droplevel(1, axis="columns") diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/test_dropna.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/test_dropna.py new file mode 100644 index 0000000000000000000000000000000000000000..7899b4aeac3fdef6548f3aadf76ff7718418f089 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/test_dropna.py @@ -0,0 +1,285 @@ +import datetime + +import dateutil +import numpy as np +import pytest + +import pandas as pd +from pandas import ( + DataFrame, + Series, +) +import pandas._testing as tm + + +class TestDataFrameMissingData: + def test_dropEmptyRows(self, float_frame): + N = len(float_frame.index) + mat = np.random.default_rng(2).standard_normal(N) + mat[:5] = np.nan + + frame = DataFrame({"foo": mat}, index=float_frame.index) + original = Series(mat, index=float_frame.index, name="foo") + expected = original.dropna() + inplace_frame1, inplace_frame2 = frame.copy(), frame.copy() + + smaller_frame = frame.dropna(how="all") + # check that original was preserved + tm.assert_series_equal(frame["foo"], original) + return_value = inplace_frame1.dropna(how="all", inplace=True) + tm.assert_series_equal(smaller_frame["foo"], expected) + tm.assert_series_equal(inplace_frame1["foo"], expected) + assert return_value is None + + smaller_frame = frame.dropna(how="all", subset=["foo"]) + return_value = inplace_frame2.dropna(how="all", subset=["foo"], inplace=True) + tm.assert_series_equal(smaller_frame["foo"], expected) + tm.assert_series_equal(inplace_frame2["foo"], expected) + assert return_value is None + + def test_dropIncompleteRows(self, float_frame): + N = len(float_frame.index) + mat = np.random.default_rng(2).standard_normal(N) + mat[:5] = np.nan + + frame = DataFrame({"foo": mat}, index=float_frame.index) + frame["bar"] = 5 + original = Series(mat, index=float_frame.index, name="foo") + inp_frame1, inp_frame2 = frame.copy(), frame.copy() + + smaller_frame = frame.dropna() + tm.assert_series_equal(frame["foo"], original) + return_value = inp_frame1.dropna(inplace=True) + + exp = Series(mat[5:], index=float_frame.index[5:], name="foo") + tm.assert_series_equal(smaller_frame["foo"], exp) + tm.assert_series_equal(inp_frame1["foo"], exp) + assert return_value is None + + samesize_frame = frame.dropna(subset=["bar"]) + tm.assert_series_equal(frame["foo"], original) + assert (frame["bar"] == 5).all() + return_value = inp_frame2.dropna(subset=["bar"], inplace=True) + tm.assert_index_equal(samesize_frame.index, float_frame.index) + tm.assert_index_equal(inp_frame2.index, float_frame.index) + assert return_value is None + + def test_dropna(self): + df = DataFrame(np.random.default_rng(2).standard_normal((6, 4))) + df.iloc[:2, 2] = np.nan + + dropped = df.dropna(axis=1) + expected = df.loc[:, [0, 1, 3]] + inp = df.copy() + return_value = inp.dropna(axis=1, inplace=True) + tm.assert_frame_equal(dropped, expected) + tm.assert_frame_equal(inp, expected) + assert return_value is None + + dropped = df.dropna(axis=0) + expected = df.loc[list(range(2, 6))] + inp = df.copy() + return_value = inp.dropna(axis=0, inplace=True) + tm.assert_frame_equal(dropped, expected) + tm.assert_frame_equal(inp, expected) + assert return_value is None + + # threshold + dropped = df.dropna(axis=1, thresh=5) + expected = df.loc[:, [0, 1, 3]] + inp = df.copy() + return_value = inp.dropna(axis=1, thresh=5, inplace=True) + tm.assert_frame_equal(dropped, expected) + tm.assert_frame_equal(inp, expected) + assert return_value is None + + dropped = df.dropna(axis=0, thresh=4) + expected = df.loc[range(2, 6)] + inp = df.copy() + return_value = inp.dropna(axis=0, thresh=4, inplace=True) + tm.assert_frame_equal(dropped, expected) + tm.assert_frame_equal(inp, expected) + assert return_value is None + + dropped = df.dropna(axis=1, thresh=4) + tm.assert_frame_equal(dropped, df) + + dropped = df.dropna(axis=1, thresh=3) + tm.assert_frame_equal(dropped, df) + + # subset + dropped = df.dropna(axis=0, subset=[0, 1, 3]) + inp = df.copy() + return_value = inp.dropna(axis=0, subset=[0, 1, 3], inplace=True) + tm.assert_frame_equal(dropped, df) + tm.assert_frame_equal(inp, df) + assert return_value is None + + # all + dropped = df.dropna(axis=1, how="all") + tm.assert_frame_equal(dropped, df) + + df[2] = np.nan + dropped = df.dropna(axis=1, how="all") + expected = df.loc[:, [0, 1, 3]] + tm.assert_frame_equal(dropped, expected) + + # bad input + msg = "No axis named 3 for object type DataFrame" + with pytest.raises(ValueError, match=msg): + df.dropna(axis=3) + + def test_drop_and_dropna_caching(self): + # tst that cacher updates + original = Series([1, 2, np.nan], name="A") + expected = Series([1, 2], dtype=original.dtype, name="A") + df = DataFrame({"A": original.values.copy()}) + df2 = df.copy() + df["A"].dropna() + tm.assert_series_equal(df["A"], original) + + ser = df["A"] + return_value = ser.dropna(inplace=True) + tm.assert_series_equal(ser, expected) + tm.assert_series_equal(df["A"], original) + assert return_value is None + + df2["A"].drop([1]) + tm.assert_series_equal(df2["A"], original) + + ser = df2["A"] + return_value = ser.drop([1], inplace=True) + tm.assert_series_equal(ser, original.drop([1])) + tm.assert_series_equal(df2["A"], original) + assert return_value is None + + def test_dropna_corner(self, float_frame): + # bad input + msg = "invalid how option: foo" + with pytest.raises(ValueError, match=msg): + float_frame.dropna(how="foo") + # non-existent column - 8303 + with pytest.raises(KeyError, match=r"^\['X'\]$"): + float_frame.dropna(subset=["A", "X"]) + + def test_dropna_multiple_axes(self): + df = DataFrame( + [ + [1, np.nan, 2, 3], + [4, np.nan, 5, 6], + [np.nan, np.nan, np.nan, np.nan], + [7, np.nan, 8, 9], + ] + ) + + # GH20987 + with pytest.raises(TypeError, match="supplying multiple axes"): + df.dropna(how="all", axis=[0, 1]) + with pytest.raises(TypeError, match="supplying multiple axes"): + df.dropna(how="all", axis=(0, 1)) + + inp = df.copy() + with pytest.raises(TypeError, match="supplying multiple axes"): + inp.dropna(how="all", axis=(0, 1), inplace=True) + + def test_dropna_tz_aware_datetime(self): + # GH13407 + df = DataFrame() + dt1 = datetime.datetime(2015, 1, 1, tzinfo=dateutil.tz.tzutc()) + dt2 = datetime.datetime(2015, 2, 2, tzinfo=dateutil.tz.tzutc()) + df["Time"] = [dt1] + result = df.dropna(axis=0) + expected = DataFrame({"Time": [dt1]}) + tm.assert_frame_equal(result, expected) + + # Ex2 + df = DataFrame({"Time": [dt1, None, np.nan, dt2]}) + result = df.dropna(axis=0) + expected = DataFrame([dt1, dt2], columns=["Time"], index=[0, 3]) + tm.assert_frame_equal(result, expected) + + def test_dropna_categorical_interval_index(self): + # GH 25087 + ii = pd.IntervalIndex.from_breaks([0, 2.78, 3.14, 6.28]) + ci = pd.CategoricalIndex(ii) + df = DataFrame({"A": list("abc")}, index=ci) + + expected = df + result = df.dropna() + tm.assert_frame_equal(result, expected) + + def test_dropna_with_duplicate_columns(self): + df = DataFrame( + { + "A": np.random.default_rng(2).standard_normal(5), + "B": np.random.default_rng(2).standard_normal(5), + "C": np.random.default_rng(2).standard_normal(5), + "D": ["a", "b", "c", "d", "e"], + } + ) + df.iloc[2, [0, 1, 2]] = np.nan + df.iloc[0, 0] = np.nan + df.iloc[1, 1] = np.nan + df.iloc[:, 3] = np.nan + expected = df.dropna(subset=["A", "B", "C"], how="all") + expected.columns = ["A", "A", "B", "C"] + + df.columns = ["A", "A", "B", "C"] + + result = df.dropna(subset=["A", "C"], how="all") + tm.assert_frame_equal(result, expected) + + def test_set_single_column_subset(self): + # GH 41021 + df = DataFrame({"A": [1, 2, 3], "B": list("abc"), "C": [4, np.nan, 5]}) + expected = DataFrame( + {"A": [1, 3], "B": list("ac"), "C": [4.0, 5.0]}, index=[0, 2] + ) + result = df.dropna(subset="C") + tm.assert_frame_equal(result, expected) + + def test_single_column_not_present_in_axis(self): + # GH 41021 + df = DataFrame({"A": [1, 2, 3]}) + + # Column not present + with pytest.raises(KeyError, match="['D']"): + df.dropna(subset="D", axis=0) + + def test_subset_is_nparray(self): + # GH 41021 + df = DataFrame({"A": [1, 2, np.nan], "B": list("abc"), "C": [4, np.nan, 5]}) + expected = DataFrame({"A": [1.0], "B": ["a"], "C": [4.0]}) + result = df.dropna(subset=np.array(["A", "C"])) + tm.assert_frame_equal(result, expected) + + def test_no_nans_in_frame(self, axis): + # GH#41965 + df = DataFrame([[1, 2], [3, 4]], columns=pd.RangeIndex(0, 2)) + expected = df.copy() + result = df.dropna(axis=axis) + tm.assert_frame_equal(result, expected, check_index_type=True) + + def test_how_thresh_param_incompatible(self): + # GH46575 + df = DataFrame([1, 2, pd.NA]) + msg = "You cannot set both the how and thresh arguments at the same time" + with pytest.raises(TypeError, match=msg): + df.dropna(how="all", thresh=2) + + with pytest.raises(TypeError, match=msg): + df.dropna(how="any", thresh=2) + + with pytest.raises(TypeError, match=msg): + df.dropna(how=None, thresh=None) + + @pytest.mark.parametrize("val", [1, 1.5]) + def test_dropna_ignore_index(self, val): + # GH#31725 + df = DataFrame({"a": [1, 2, val]}, index=[3, 2, 1]) + result = df.dropna(ignore_index=True) + expected = DataFrame({"a": [1, 2, val]}) + tm.assert_frame_equal(result, expected) + + df.dropna(ignore_index=True, inplace=True) + tm.assert_frame_equal(df, expected) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/test_dtypes.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/test_dtypes.py new file mode 100644 index 0000000000000000000000000000000000000000..ab632ac17318eb76710fa504ac4558f59534e643 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/test_dtypes.py @@ -0,0 +1,153 @@ +from datetime import timedelta + +import numpy as np +import pytest + +from pandas.core.dtypes.dtypes import DatetimeTZDtype + +import pandas as pd +from pandas import ( + DataFrame, + Series, + date_range, + option_context, +) +import pandas._testing as tm + + +class TestDataFrameDataTypes: + def test_empty_frame_dtypes(self): + empty_df = DataFrame() + tm.assert_series_equal(empty_df.dtypes, Series(dtype=object)) + + nocols_df = DataFrame(index=[1, 2, 3]) + tm.assert_series_equal(nocols_df.dtypes, Series(dtype=object)) + + norows_df = DataFrame(columns=list("abc")) + tm.assert_series_equal(norows_df.dtypes, Series(object, index=list("abc"))) + + norows_int_df = DataFrame(columns=list("abc")).astype(np.int32) + tm.assert_series_equal( + norows_int_df.dtypes, Series(np.dtype("int32"), index=list("abc")) + ) + + df = DataFrame({"a": 1, "b": True, "c": 1.0}, index=[1, 2, 3]) + ex_dtypes = Series({"a": np.int64, "b": np.bool_, "c": np.float64}) + tm.assert_series_equal(df.dtypes, ex_dtypes) + + # same but for empty slice of df + tm.assert_series_equal(df[:0].dtypes, ex_dtypes) + + def test_datetime_with_tz_dtypes(self): + tzframe = DataFrame( + { + "A": date_range("20130101", periods=3), + "B": date_range("20130101", periods=3, tz="US/Eastern"), + "C": date_range("20130101", periods=3, tz="CET"), + } + ) + tzframe.iloc[1, 1] = pd.NaT + tzframe.iloc[1, 2] = pd.NaT + result = tzframe.dtypes.sort_index() + expected = Series( + [ + np.dtype("datetime64[ns]"), + DatetimeTZDtype("ns", "US/Eastern"), + DatetimeTZDtype("ns", "CET"), + ], + ["A", "B", "C"], + ) + + tm.assert_series_equal(result, expected) + + def test_dtypes_are_correct_after_column_slice(self): + # GH6525 + df = DataFrame(index=range(5), columns=list("abc"), dtype=np.float64) + tm.assert_series_equal( + df.dtypes, + Series({"a": np.float64, "b": np.float64, "c": np.float64}), + ) + tm.assert_series_equal(df.iloc[:, 2:].dtypes, Series({"c": np.float64})) + tm.assert_series_equal( + df.dtypes, + Series({"a": np.float64, "b": np.float64, "c": np.float64}), + ) + + @pytest.mark.parametrize( + "data", + [pd.NA, True], + ) + def test_dtypes_are_correct_after_groupby_last(self, data): + # GH46409 + df = DataFrame( + {"id": [1, 2, 3, 4], "test": [True, pd.NA, data, False]} + ).convert_dtypes() + result = df.groupby("id").last().test + expected = df.set_index("id").test + assert result.dtype == pd.BooleanDtype() + tm.assert_series_equal(expected, result) + + def test_dtypes_gh8722(self, float_string_frame): + float_string_frame["bool"] = float_string_frame["A"] > 0 + result = float_string_frame.dtypes + expected = Series( + {k: v.dtype for k, v in float_string_frame.items()}, index=result.index + ) + tm.assert_series_equal(result, expected) + + # compat, GH 8722 + msg = "use_inf_as_na option is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + with option_context("use_inf_as_na", True): + df = DataFrame([[1]]) + result = df.dtypes + tm.assert_series_equal(result, Series({0: np.dtype("int64")})) + + def test_dtypes_timedeltas(self): + df = DataFrame( + { + "A": Series(date_range("2012-1-1", periods=3, freq="D")), + "B": Series([timedelta(days=i) for i in range(3)]), + } + ) + result = df.dtypes + expected = Series( + [np.dtype("datetime64[ns]"), np.dtype("timedelta64[ns]")], index=list("AB") + ) + tm.assert_series_equal(result, expected) + + df["C"] = df["A"] + df["B"] + result = df.dtypes + expected = Series( + [ + np.dtype("datetime64[ns]"), + np.dtype("timedelta64[ns]"), + np.dtype("datetime64[ns]"), + ], + index=list("ABC"), + ) + tm.assert_series_equal(result, expected) + + # mixed int types + df["D"] = 1 + result = df.dtypes + expected = Series( + [ + np.dtype("datetime64[ns]"), + np.dtype("timedelta64[ns]"), + np.dtype("datetime64[ns]"), + np.dtype("int64"), + ], + index=list("ABCD"), + ) + tm.assert_series_equal(result, expected) + + def test_frame_apply_np_array_return_type(self, using_infer_string): + # GH 35517 + df = DataFrame([["foo"]]) + result = df.apply(lambda col: np.array("bar")) + if using_infer_string: + expected = Series([np.array(["bar"])]) + else: + expected = Series(["bar"]) + tm.assert_series_equal(result, expected) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/test_duplicated.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/test_duplicated.py new file mode 100644 index 0000000000000000000000000000000000000000..6052b61ea8db5b8c81c879250129a81634a33de0 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/test_duplicated.py @@ -0,0 +1,117 @@ +import re +import sys + +import numpy as np +import pytest + +from pandas import ( + DataFrame, + Series, + date_range, +) +import pandas._testing as tm + + +@pytest.mark.parametrize("subset", ["a", ["a"], ["a", "B"]]) +def test_duplicated_with_misspelled_column_name(subset): + # GH 19730 + df = DataFrame({"A": [0, 0, 1], "B": [0, 0, 1], "C": [0, 0, 1]}) + msg = re.escape("Index(['a'], dtype=") + + with pytest.raises(KeyError, match=msg): + df.duplicated(subset) + + +def test_duplicated_implemented_no_recursion(): + # gh-21524 + # Ensure duplicated isn't implemented using recursion that + # can fail on wide frames + df = DataFrame(np.random.default_rng(2).integers(0, 1000, (10, 1000))) + rec_limit = sys.getrecursionlimit() + try: + sys.setrecursionlimit(100) + result = df.duplicated() + finally: + sys.setrecursionlimit(rec_limit) + + # Then duplicates produce the bool Series as a result and don't fail during + # calculation. Actual values doesn't matter here, though usually it's all + # False in this case + assert isinstance(result, Series) + assert result.dtype == np.bool_ + + +@pytest.mark.parametrize( + "keep, expected", + [ + ("first", Series([False, False, True, False, True])), + ("last", Series([True, True, False, False, False])), + (False, Series([True, True, True, False, True])), + ], +) +def test_duplicated_keep(keep, expected): + df = DataFrame({"A": [0, 1, 1, 2, 0], "B": ["a", "b", "b", "c", "a"]}) + + result = df.duplicated(keep=keep) + tm.assert_series_equal(result, expected) + + +@pytest.mark.xfail(reason="GH#21720; nan/None falsely considered equal") +@pytest.mark.parametrize( + "keep, expected", + [ + ("first", Series([False, False, True, False, True])), + ("last", Series([True, True, False, False, False])), + (False, Series([True, True, True, False, True])), + ], +) +def test_duplicated_nan_none(keep, expected): + df = DataFrame({"C": [np.nan, 3, 3, None, np.nan], "x": 1}, dtype=object) + + result = df.duplicated(keep=keep) + tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize("subset", [None, ["A", "B"], "A"]) +def test_duplicated_subset(subset, keep): + df = DataFrame( + { + "A": [0, 1, 1, 2, 0], + "B": ["a", "b", "b", "c", "a"], + "C": [np.nan, 3, 3, None, np.nan], + } + ) + + if subset is None: + subset = list(df.columns) + elif isinstance(subset, str): + # need to have a DataFrame, not a Series + # -> select columns with singleton list, not string + subset = [subset] + + expected = df[subset].duplicated(keep=keep) + result = df.duplicated(keep=keep, subset=subset) + tm.assert_series_equal(result, expected) + + +def test_duplicated_on_empty_frame(): + # GH 25184 + + df = DataFrame(columns=["a", "b"]) + dupes = df.duplicated("a") + + result = df[dupes] + expected = df.copy() + tm.assert_frame_equal(result, expected) + + +def test_frame_datetime64_duplicated(): + dates = date_range("2010-07-01", end="2010-08-05") + + tst = DataFrame({"symbol": "AAA", "date": dates}) + result = tst.duplicated(["date", "symbol"]) + assert (-result).all() + + tst = DataFrame({"date": dates}) + result = tst.date.duplicated() + assert (-result).all() diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/test_equals.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/test_equals.py new file mode 100644 index 0000000000000000000000000000000000000000..d0b9d96cafa0db15203cb3057517571a178b25db --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/test_equals.py @@ -0,0 +1,85 @@ +import numpy as np + +from pandas import ( + DataFrame, + date_range, +) +import pandas._testing as tm + + +class TestEquals: + def test_dataframe_not_equal(self): + # see GH#28839 + df1 = DataFrame({"a": [1, 2], "b": ["s", "d"]}) + df2 = DataFrame({"a": ["s", "d"], "b": [1, 2]}) + assert df1.equals(df2) is False + + def test_equals_different_blocks(self, using_array_manager, using_infer_string): + # GH#9330 + df0 = DataFrame({"A": ["x", "y"], "B": [1, 2], "C": ["w", "z"]}) + df1 = df0.reset_index()[["A", "B", "C"]] + if not using_array_manager and not using_infer_string: + # this assert verifies that the above operations have + # induced a block rearrangement + assert df0._mgr.blocks[0].dtype != df1._mgr.blocks[0].dtype + + # do the real tests + tm.assert_frame_equal(df0, df1) + assert df0.equals(df1) + assert df1.equals(df0) + + def test_equals(self): + # Add object dtype column with nans + index = np.random.default_rng(2).random(10) + df1 = DataFrame( + np.random.default_rng(2).random(10), index=index, columns=["floats"] + ) + df1["text"] = "the sky is so blue. we could use more chocolate.".split() + df1["start"] = date_range("2000-1-1", periods=10, freq="min") + df1["end"] = date_range("2000-1-1", periods=10, freq="D") + df1["diff"] = df1["end"] - df1["start"] + # Explicitly cast to object, to avoid implicit cast when setting np.nan + df1["bool"] = (np.arange(10) % 3 == 0).astype(object) + df1.loc[::2] = np.nan + df2 = df1.copy() + assert df1["text"].equals(df2["text"]) + assert df1["start"].equals(df2["start"]) + assert df1["end"].equals(df2["end"]) + assert df1["diff"].equals(df2["diff"]) + assert df1["bool"].equals(df2["bool"]) + assert df1.equals(df2) + assert not df1.equals(object) + + # different dtype + different = df1.copy() + different["floats"] = different["floats"].astype("float32") + assert not df1.equals(different) + + # different index + different_index = -index + different = df2.set_index(different_index) + assert not df1.equals(different) + + # different columns + different = df2.copy() + different.columns = df2.columns[::-1] + assert not df1.equals(different) + + # DatetimeIndex + index = date_range("2000-1-1", periods=10, freq="min") + df1 = df1.set_index(index) + df2 = df1.copy() + assert df1.equals(df2) + + # MultiIndex + df3 = df1.set_index(["text"], append=True) + df2 = df1.set_index(["text"], append=True) + assert df3.equals(df2) + + df2 = df1.set_index(["floats"], append=True) + assert not df3.equals(df2) + + # NaN in index + df3 = df1.set_index(["floats"], append=True) + df2 = df1.set_index(["floats"], append=True) + assert df3.equals(df2) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/test_explode.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/test_explode.py new file mode 100644 index 0000000000000000000000000000000000000000..5cd54db62d7832004a797d4e28557807254ff486 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/test_explode.py @@ -0,0 +1,303 @@ +import re + +import numpy as np +import pytest + +import pandas as pd +import pandas._testing as tm + + +def test_error(): + df = pd.DataFrame( + {"A": pd.Series([[0, 1, 2], np.nan, [], (3, 4)], index=list("abcd")), "B": 1} + ) + with pytest.raises( + ValueError, match="column must be a scalar, tuple, or list thereof" + ): + df.explode([list("AA")]) + + with pytest.raises(ValueError, match="column must be unique"): + df.explode(list("AA")) + + df.columns = list("AA") + with pytest.raises( + ValueError, + match=re.escape("DataFrame columns must be unique. Duplicate columns: ['A']"), + ): + df.explode("A") + + +@pytest.mark.parametrize( + "input_subset, error_message", + [ + ( + list("AC"), + "columns must have matching element counts", + ), + ( + [], + "column must be nonempty", + ), + ( + list("AC"), + "columns must have matching element counts", + ), + ], +) +def test_error_multi_columns(input_subset, error_message): + # GH 39240 + df = pd.DataFrame( + { + "A": [[0, 1, 2], np.nan, [], (3, 4)], + "B": 1, + "C": [["a", "b", "c"], "foo", [], ["d", "e", "f"]], + }, + index=list("abcd"), + ) + with pytest.raises(ValueError, match=error_message): + df.explode(input_subset) + + +@pytest.mark.parametrize( + "scalar", + ["a", 0, 1.5, pd.Timedelta("1 days"), pd.Timestamp("2019-12-31")], +) +def test_basic(scalar): + df = pd.DataFrame( + {scalar: pd.Series([[0, 1, 2], np.nan, [], (3, 4)], index=list("abcd")), "B": 1} + ) + result = df.explode(scalar) + expected = pd.DataFrame( + { + scalar: pd.Series( + [0, 1, 2, np.nan, np.nan, 3, 4], index=list("aaabcdd"), dtype=object + ), + "B": 1, + } + ) + tm.assert_frame_equal(result, expected) + + +def test_multi_index_rows(): + df = pd.DataFrame( + {"A": np.array([[0, 1, 2], np.nan, [], (3, 4)], dtype=object), "B": 1}, + index=pd.MultiIndex.from_tuples([("a", 1), ("a", 2), ("b", 1), ("b", 2)]), + ) + + result = df.explode("A") + expected = pd.DataFrame( + { + "A": pd.Series( + [0, 1, 2, np.nan, np.nan, 3, 4], + index=pd.MultiIndex.from_tuples( + [ + ("a", 1), + ("a", 1), + ("a", 1), + ("a", 2), + ("b", 1), + ("b", 2), + ("b", 2), + ] + ), + dtype=object, + ), + "B": 1, + } + ) + tm.assert_frame_equal(result, expected) + + +def test_multi_index_columns(): + df = pd.DataFrame( + {("A", 1): np.array([[0, 1, 2], np.nan, [], (3, 4)], dtype=object), ("A", 2): 1} + ) + + result = df.explode(("A", 1)) + expected = pd.DataFrame( + { + ("A", 1): pd.Series( + [0, 1, 2, np.nan, np.nan, 3, 4], + index=pd.Index([0, 0, 0, 1, 2, 3, 3]), + dtype=object, + ), + ("A", 2): 1, + } + ) + tm.assert_frame_equal(result, expected) + + +def test_usecase(): + # explode a single column + # gh-10511 + df = pd.DataFrame( + [[11, range(5), 10], [22, range(3), 20]], columns=list("ABC") + ).set_index("C") + result = df.explode("B") + + expected = pd.DataFrame( + { + "A": [11, 11, 11, 11, 11, 22, 22, 22], + "B": np.array([0, 1, 2, 3, 4, 0, 1, 2], dtype=object), + "C": [10, 10, 10, 10, 10, 20, 20, 20], + }, + columns=list("ABC"), + ).set_index("C") + + tm.assert_frame_equal(result, expected) + + # gh-8517 + df = pd.DataFrame( + [["2014-01-01", "Alice", "A B"], ["2014-01-02", "Bob", "C D"]], + columns=["dt", "name", "text"], + ) + result = df.assign(text=df.text.str.split(" ")).explode("text") + expected = pd.DataFrame( + [ + ["2014-01-01", "Alice", "A"], + ["2014-01-01", "Alice", "B"], + ["2014-01-02", "Bob", "C"], + ["2014-01-02", "Bob", "D"], + ], + columns=["dt", "name", "text"], + index=[0, 0, 1, 1], + ) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize( + "input_dict, input_index, expected_dict, expected_index", + [ + ( + {"col1": [[1, 2], [3, 4]], "col2": ["foo", "bar"]}, + [0, 0], + {"col1": [1, 2, 3, 4], "col2": ["foo", "foo", "bar", "bar"]}, + [0, 0, 0, 0], + ), + ( + {"col1": [[1, 2], [3, 4]], "col2": ["foo", "bar"]}, + pd.Index([0, 0], name="my_index"), + {"col1": [1, 2, 3, 4], "col2": ["foo", "foo", "bar", "bar"]}, + pd.Index([0, 0, 0, 0], name="my_index"), + ), + ( + {"col1": [[1, 2], [3, 4]], "col2": ["foo", "bar"]}, + pd.MultiIndex.from_arrays( + [[0, 0], [1, 1]], names=["my_first_index", "my_second_index"] + ), + {"col1": [1, 2, 3, 4], "col2": ["foo", "foo", "bar", "bar"]}, + pd.MultiIndex.from_arrays( + [[0, 0, 0, 0], [1, 1, 1, 1]], + names=["my_first_index", "my_second_index"], + ), + ), + ( + {"col1": [[1, 2], [3, 4]], "col2": ["foo", "bar"]}, + pd.MultiIndex.from_arrays([[0, 0], [1, 1]], names=["my_index", None]), + {"col1": [1, 2, 3, 4], "col2": ["foo", "foo", "bar", "bar"]}, + pd.MultiIndex.from_arrays( + [[0, 0, 0, 0], [1, 1, 1, 1]], names=["my_index", None] + ), + ), + ], +) +def test_duplicate_index(input_dict, input_index, expected_dict, expected_index): + # GH 28005 + df = pd.DataFrame(input_dict, index=input_index, dtype=object) + result = df.explode("col1") + expected = pd.DataFrame(expected_dict, index=expected_index, dtype=object) + tm.assert_frame_equal(result, expected) + + +def test_ignore_index(): + # GH 34932 + df = pd.DataFrame({"id": range(0, 20, 10), "values": [list("ab"), list("cd")]}) + result = df.explode("values", ignore_index=True) + expected = pd.DataFrame( + {"id": [0, 0, 10, 10], "values": list("abcd")}, index=[0, 1, 2, 3] + ) + tm.assert_frame_equal(result, expected) + + +def test_explode_sets(): + # https://github.com/pandas-dev/pandas/issues/35614 + df = pd.DataFrame({"a": [{"x", "y"}], "b": [1]}, index=[1]) + result = df.explode(column="a").sort_values(by="a") + expected = pd.DataFrame({"a": ["x", "y"], "b": [1, 1]}, index=[1, 1]) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize( + "input_subset, expected_dict, expected_index", + [ + ( + list("AC"), + { + "A": pd.Series( + [0, 1, 2, np.nan, np.nan, 3, 4, np.nan], + index=list("aaabcdde"), + dtype=object, + ), + "B": 1, + "C": ["a", "b", "c", "foo", np.nan, "d", "e", np.nan], + }, + list("aaabcdde"), + ), + ( + list("A"), + { + "A": pd.Series( + [0, 1, 2, np.nan, np.nan, 3, 4, np.nan], + index=list("aaabcdde"), + dtype=object, + ), + "B": 1, + "C": [ + ["a", "b", "c"], + ["a", "b", "c"], + ["a", "b", "c"], + "foo", + [], + ["d", "e"], + ["d", "e"], + np.nan, + ], + }, + list("aaabcdde"), + ), + ], +) +def test_multi_columns(input_subset, expected_dict, expected_index): + # GH 39240 + df = pd.DataFrame( + { + "A": [[0, 1, 2], np.nan, [], (3, 4), np.nan], + "B": 1, + "C": [["a", "b", "c"], "foo", [], ["d", "e"], np.nan], + }, + index=list("abcde"), + ) + result = df.explode(input_subset) + expected = pd.DataFrame(expected_dict, expected_index) + tm.assert_frame_equal(result, expected) + + +def test_multi_columns_nan_empty(): + # GH 46084 + df = pd.DataFrame( + { + "A": [[0, 1], [5], [], [2, 3]], + "B": [9, 8, 7, 6], + "C": [[1, 2], np.nan, [], [3, 4]], + } + ) + result = df.explode(["A", "C"]) + expected = pd.DataFrame( + { + "A": np.array([0, 1, 5, np.nan, 2, 3], dtype=object), + "B": [9, 9, 8, 7, 6, 6], + "C": np.array([1, 2, np.nan, np.nan, 3, 4], dtype=object), + }, + index=[0, 0, 1, 2, 3, 3], + ) + tm.assert_frame_equal(result, expected) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/test_fillna.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/test_fillna.py new file mode 100644 index 0000000000000000000000000000000000000000..89c50a8c21e1c89dae9be9e267215c346474ffad --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/test_fillna.py @@ -0,0 +1,932 @@ +import numpy as np +import pytest + +from pandas._config import using_pyarrow_string_dtype + +import pandas.util._test_decorators as td + +from pandas import ( + Categorical, + DataFrame, + DatetimeIndex, + NaT, + PeriodIndex, + Series, + TimedeltaIndex, + Timestamp, + date_range, + to_datetime, +) +import pandas._testing as tm +from pandas.tests.frame.common import _check_mixed_float + + +class TestFillNA: + def test_fillna_dict_inplace_nonunique_columns( + self, using_copy_on_write, warn_copy_on_write + ): + df = DataFrame( + {"A": [np.nan] * 3, "B": [NaT, Timestamp(1), NaT], "C": [np.nan, "foo", 2]} + ) + df.columns = ["A", "A", "A"] + orig = df[:] + + # TODO(CoW-warn) better warning message + with tm.assert_cow_warning(warn_copy_on_write): + df.fillna({"A": 2}, inplace=True) + # The first and third columns can be set inplace, while the second cannot. + + expected = DataFrame( + {"A": [2.0] * 3, "B": [2, Timestamp(1), 2], "C": [2, "foo", 2]} + ) + expected.columns = ["A", "A", "A"] + tm.assert_frame_equal(df, expected) + + # TODO: what's the expected/desired behavior with CoW? + if not using_copy_on_write: + assert tm.shares_memory(df.iloc[:, 0], orig.iloc[:, 0]) + assert not tm.shares_memory(df.iloc[:, 1], orig.iloc[:, 1]) + if not using_copy_on_write: + assert tm.shares_memory(df.iloc[:, 2], orig.iloc[:, 2]) + + @td.skip_array_manager_not_yet_implemented + def test_fillna_on_column_view(self, using_copy_on_write): + # GH#46149 avoid unnecessary copies + arr = np.full((40, 50), np.nan) + df = DataFrame(arr, copy=False) + + if using_copy_on_write: + with tm.raises_chained_assignment_error(): + df[0].fillna(-1, inplace=True) + assert np.isnan(arr[:, 0]).all() + else: + with tm.assert_produces_warning(FutureWarning, match="inplace method"): + df[0].fillna(-1, inplace=True) + assert (arr[:, 0] == -1).all() + + # i.e. we didn't create a new 49-column block + assert len(df._mgr.arrays) == 1 + assert np.shares_memory(df.values, arr) + + def test_fillna_datetime(self, datetime_frame): + tf = datetime_frame + tf.loc[tf.index[:5], "A"] = np.nan + tf.loc[tf.index[-5:], "A"] = np.nan + + zero_filled = datetime_frame.fillna(0) + assert (zero_filled.loc[zero_filled.index[:5], "A"] == 0).all() + + msg = "DataFrame.fillna with 'method' is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + padded = datetime_frame.fillna(method="pad") + assert np.isnan(padded.loc[padded.index[:5], "A"]).all() + assert ( + padded.loc[padded.index[-5:], "A"] == padded.loc[padded.index[-5], "A"] + ).all() + + msg = "Must specify a fill 'value' or 'method'" + with pytest.raises(ValueError, match=msg): + datetime_frame.fillna() + msg = "Cannot specify both 'value' and 'method'" + with pytest.raises(ValueError, match=msg): + datetime_frame.fillna(5, method="ffill") + + @pytest.mark.xfail(using_pyarrow_string_dtype(), reason="can't fill 0 in string") + def test_fillna_mixed_type(self, float_string_frame): + mf = float_string_frame + mf.loc[mf.index[5:20], "foo"] = np.nan + mf.loc[mf.index[-10:], "A"] = np.nan + # TODO: make stronger assertion here, GH 25640 + mf.fillna(value=0) + msg = "DataFrame.fillna with 'method' is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + mf.fillna(method="pad") + + def test_fillna_mixed_float(self, mixed_float_frame): + # mixed numeric (but no float16) + mf = mixed_float_frame.reindex(columns=["A", "B", "D"]) + mf.loc[mf.index[-10:], "A"] = np.nan + result = mf.fillna(value=0) + _check_mixed_float(result, dtype={"C": None}) + + msg = "DataFrame.fillna with 'method' is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = mf.fillna(method="pad") + _check_mixed_float(result, dtype={"C": None}) + + def test_fillna_empty(self, using_copy_on_write): + if using_copy_on_write: + pytest.skip("condition is unnecessary complex and is deprecated anyway") + # empty frame (GH#2778) + df = DataFrame(columns=["x"]) + for m in ["pad", "backfill"]: + msg = "Series.fillna with 'method' is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + df.x.fillna(method=m, inplace=True) + df.x.fillna(method=m) + + def test_fillna_different_dtype(self, using_infer_string): + # with different dtype (GH#3386) + df = DataFrame( + [["a", "a", np.nan, "a"], ["b", "b", np.nan, "b"], ["c", "c", np.nan, "c"]] + ) + + if using_infer_string: + with tm.assert_produces_warning(FutureWarning, match="Downcasting"): + result = df.fillna({2: "foo"}) + else: + result = df.fillna({2: "foo"}) + expected = DataFrame( + [["a", "a", "foo", "a"], ["b", "b", "foo", "b"], ["c", "c", "foo", "c"]] + ) + tm.assert_frame_equal(result, expected) + + if using_infer_string: + with tm.assert_produces_warning(FutureWarning, match="Downcasting"): + return_value = df.fillna({2: "foo"}, inplace=True) + else: + return_value = df.fillna({2: "foo"}, inplace=True) + tm.assert_frame_equal(df, expected) + assert return_value is None + + def test_fillna_limit_and_value(self): + # limit and value + df = DataFrame(np.random.default_rng(2).standard_normal((10, 3))) + df.iloc[2:7, 0] = np.nan + df.iloc[3:5, 2] = np.nan + + expected = df.copy() + expected.iloc[2, 0] = 999 + expected.iloc[3, 2] = 999 + result = df.fillna(999, limit=1) + tm.assert_frame_equal(result, expected) + + def test_fillna_datelike(self): + # with datelike + # GH#6344 + df = DataFrame( + { + "Date": [NaT, Timestamp("2014-1-1")], + "Date2": [Timestamp("2013-1-1"), NaT], + } + ) + + expected = df.copy() + expected["Date"] = expected["Date"].fillna(df.loc[df.index[0], "Date2"]) + result = df.fillna(value={"Date": df["Date2"]}) + tm.assert_frame_equal(result, expected) + + def test_fillna_tzaware(self): + # with timezone + # GH#15855 + df = DataFrame({"A": [Timestamp("2012-11-11 00:00:00+01:00"), NaT]}) + exp = DataFrame( + { + "A": [ + Timestamp("2012-11-11 00:00:00+01:00"), + Timestamp("2012-11-11 00:00:00+01:00"), + ] + } + ) + msg = "DataFrame.fillna with 'method' is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + res = df.fillna(method="pad") + tm.assert_frame_equal(res, exp) + + df = DataFrame({"A": [NaT, Timestamp("2012-11-11 00:00:00+01:00")]}) + exp = DataFrame( + { + "A": [ + Timestamp("2012-11-11 00:00:00+01:00"), + Timestamp("2012-11-11 00:00:00+01:00"), + ] + } + ) + msg = "DataFrame.fillna with 'method' is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + res = df.fillna(method="bfill") + tm.assert_frame_equal(res, exp) + + def test_fillna_tzaware_different_column(self): + # with timezone in another column + # GH#15522 + df = DataFrame( + { + "A": date_range("20130101", periods=4, tz="US/Eastern"), + "B": [1, 2, np.nan, np.nan], + } + ) + msg = "DataFrame.fillna with 'method' is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = df.fillna(method="pad") + expected = DataFrame( + { + "A": date_range("20130101", periods=4, tz="US/Eastern"), + "B": [1.0, 2.0, 2.0, 2.0], + } + ) + tm.assert_frame_equal(result, expected) + + def test_na_actions_categorical(self): + cat = Categorical([1, 2, 3, np.nan], categories=[1, 2, 3]) + vals = ["a", "b", np.nan, "d"] + df = DataFrame({"cats": cat, "vals": vals}) + cat2 = Categorical([1, 2, 3, 3], categories=[1, 2, 3]) + vals2 = ["a", "b", "b", "d"] + df_exp_fill = DataFrame({"cats": cat2, "vals": vals2}) + cat3 = Categorical([1, 2, 3], categories=[1, 2, 3]) + vals3 = ["a", "b", np.nan] + df_exp_drop_cats = DataFrame({"cats": cat3, "vals": vals3}) + cat4 = Categorical([1, 2], categories=[1, 2, 3]) + vals4 = ["a", "b"] + df_exp_drop_all = DataFrame({"cats": cat4, "vals": vals4}) + + # fillna + res = df.fillna(value={"cats": 3, "vals": "b"}) + tm.assert_frame_equal(res, df_exp_fill) + + msg = "Cannot setitem on a Categorical with a new category" + with pytest.raises(TypeError, match=msg): + df.fillna(value={"cats": 4, "vals": "c"}) + + msg = "DataFrame.fillna with 'method' is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + res = df.fillna(method="pad") + tm.assert_frame_equal(res, df_exp_fill) + + # dropna + res = df.dropna(subset=["cats"]) + tm.assert_frame_equal(res, df_exp_drop_cats) + + res = df.dropna() + tm.assert_frame_equal(res, df_exp_drop_all) + + # make sure that fillna takes missing values into account + c = Categorical([np.nan, "b", np.nan], categories=["a", "b"]) + df = DataFrame({"cats": c, "vals": [1, 2, 3]}) + + cat_exp = Categorical(["a", "b", "a"], categories=["a", "b"]) + df_exp = DataFrame({"cats": cat_exp, "vals": [1, 2, 3]}) + + res = df.fillna("a") + tm.assert_frame_equal(res, df_exp) + + def test_fillna_categorical_nan(self): + # GH#14021 + # np.nan should always be a valid filler + cat = Categorical([np.nan, 2, np.nan]) + val = Categorical([np.nan, np.nan, np.nan]) + df = DataFrame({"cats": cat, "vals": val}) + + # GH#32950 df.median() is poorly behaved because there is no + # Categorical.median + median = Series({"cats": 2.0, "vals": np.nan}) + + res = df.fillna(median) + v_exp = [np.nan, np.nan, np.nan] + df_exp = DataFrame({"cats": [2, 2, 2], "vals": v_exp}, dtype="category") + tm.assert_frame_equal(res, df_exp) + + result = df.cats.fillna(np.nan) + tm.assert_series_equal(result, df.cats) + + result = df.vals.fillna(np.nan) + tm.assert_series_equal(result, df.vals) + + idx = DatetimeIndex( + ["2011-01-01 09:00", "2016-01-01 23:45", "2011-01-01 09:00", NaT, NaT] + ) + df = DataFrame({"a": Categorical(idx)}) + tm.assert_frame_equal(df.fillna(value=NaT), df) + + idx = PeriodIndex(["2011-01", "2011-01", "2011-01", NaT, NaT], freq="M") + df = DataFrame({"a": Categorical(idx)}) + tm.assert_frame_equal(df.fillna(value=NaT), df) + + idx = TimedeltaIndex(["1 days", "2 days", "1 days", NaT, NaT]) + df = DataFrame({"a": Categorical(idx)}) + tm.assert_frame_equal(df.fillna(value=NaT), df) + + def test_fillna_downcast(self): + # GH#15277 + # infer int64 from float64 + df = DataFrame({"a": [1.0, np.nan]}) + msg = "The 'downcast' keyword in fillna is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = df.fillna(0, downcast="infer") + expected = DataFrame({"a": [1, 0]}) + tm.assert_frame_equal(result, expected) + + # infer int64 from float64 when fillna value is a dict + df = DataFrame({"a": [1.0, np.nan]}) + with tm.assert_produces_warning(FutureWarning, match=msg): + result = df.fillna({"a": 0}, downcast="infer") + expected = DataFrame({"a": [1, 0]}) + tm.assert_frame_equal(result, expected) + + def test_fillna_downcast_false(self, frame_or_series): + # GH#45603 preserve object dtype with downcast=False + obj = frame_or_series([1, 2, 3], dtype="object") + msg = "The 'downcast' keyword in fillna" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = obj.fillna("", downcast=False) + tm.assert_equal(result, obj) + + def test_fillna_downcast_noop(self, frame_or_series): + # GH#45423 + # Two relevant paths: + # 1) not _can_hold_na (e.g. integer) + # 2) _can_hold_na + noop + not can_hold_element + + obj = frame_or_series([1, 2, 3], dtype=np.int64) + + msg = "The 'downcast' keyword in fillna" + with tm.assert_produces_warning(FutureWarning, match=msg): + # GH#40988 + res = obj.fillna("foo", downcast=np.dtype(np.int32)) + expected = obj.astype(np.int32) + tm.assert_equal(res, expected) + + obj2 = obj.astype(np.float64) + with tm.assert_produces_warning(FutureWarning, match=msg): + res2 = obj2.fillna("foo", downcast="infer") + expected2 = obj # get back int64 + tm.assert_equal(res2, expected2) + + with tm.assert_produces_warning(FutureWarning, match=msg): + # GH#40988 + res3 = obj2.fillna("foo", downcast=np.dtype(np.int32)) + tm.assert_equal(res3, expected) + + @pytest.mark.parametrize("columns", [["A", "A", "B"], ["A", "A"]]) + def test_fillna_dictlike_value_duplicate_colnames(self, columns): + # GH#43476 + df = DataFrame(np.nan, index=[0, 1], columns=columns) + with tm.assert_produces_warning(None): + result = df.fillna({"A": 0}) + + expected = df.copy() + expected["A"] = 0.0 + tm.assert_frame_equal(result, expected) + + def test_fillna_dtype_conversion(self, using_infer_string): + # make sure that fillna on an empty frame works + df = DataFrame(index=["A", "B", "C"], columns=[1, 2, 3, 4, 5]) + result = df.dtypes + expected = Series([np.dtype("object")] * 5, index=[1, 2, 3, 4, 5]) + tm.assert_series_equal(result, expected) + + msg = "Downcasting object dtype arrays" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = df.fillna(1) + expected = DataFrame(1, index=["A", "B", "C"], columns=[1, 2, 3, 4, 5]) + tm.assert_frame_equal(result, expected) + + # empty block + df = DataFrame(index=range(3), columns=["A", "B"], dtype="float64") + if using_infer_string: + with tm.assert_produces_warning(FutureWarning, match="Downcasting"): + result = df.fillna("nan") + else: + result = df.fillna("nan") + expected = DataFrame("nan", index=range(3), columns=["A", "B"]) + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize("val", ["", 1, np.nan, 1.0]) + def test_fillna_dtype_conversion_equiv_replace(self, val): + df = DataFrame({"A": [1, np.nan], "B": [1.0, 2.0]}) + expected = df.replace(np.nan, val) + result = df.fillna(val) + tm.assert_frame_equal(result, expected) + + def test_fillna_datetime_columns(self): + # GH#7095 + df = DataFrame( + { + "A": [-1, -2, np.nan], + "B": date_range("20130101", periods=3), + "C": ["foo", "bar", None], + "D": ["foo2", "bar2", None], + }, + index=date_range("20130110", periods=3), + ) + result = df.fillna("?") + expected = DataFrame( + { + "A": [-1, -2, "?"], + "B": date_range("20130101", periods=3), + "C": ["foo", "bar", "?"], + "D": ["foo2", "bar2", "?"], + }, + index=date_range("20130110", periods=3), + ) + tm.assert_frame_equal(result, expected) + + df = DataFrame( + { + "A": [-1, -2, np.nan], + "B": [Timestamp("2013-01-01"), Timestamp("2013-01-02"), NaT], + "C": ["foo", "bar", None], + "D": ["foo2", "bar2", None], + }, + index=date_range("20130110", periods=3), + ) + result = df.fillna("?") + expected = DataFrame( + { + "A": [-1, -2, "?"], + "B": [Timestamp("2013-01-01"), Timestamp("2013-01-02"), "?"], + "C": ["foo", "bar", "?"], + "D": ["foo2", "bar2", "?"], + }, + index=date_range("20130110", periods=3), + ) + tm.assert_frame_equal(result, expected) + + def test_ffill(self, datetime_frame): + datetime_frame.loc[datetime_frame.index[:5], "A"] = np.nan + datetime_frame.loc[datetime_frame.index[-5:], "A"] = np.nan + + msg = "DataFrame.fillna with 'method' is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + alt = datetime_frame.fillna(method="ffill") + tm.assert_frame_equal(datetime_frame.ffill(), alt) + + def test_bfill(self, datetime_frame): + datetime_frame.loc[datetime_frame.index[:5], "A"] = np.nan + datetime_frame.loc[datetime_frame.index[-5:], "A"] = np.nan + + msg = "DataFrame.fillna with 'method' is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + alt = datetime_frame.fillna(method="bfill") + + tm.assert_frame_equal(datetime_frame.bfill(), alt) + + def test_frame_pad_backfill_limit(self): + index = np.arange(10) + df = DataFrame(np.random.default_rng(2).standard_normal((10, 4)), index=index) + + result = df[:2].reindex(index, method="pad", limit=5) + + msg = "DataFrame.fillna with 'method' is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + expected = df[:2].reindex(index).fillna(method="pad") + expected.iloc[-3:] = np.nan + tm.assert_frame_equal(result, expected) + + result = df[-2:].reindex(index, method="backfill", limit=5) + + with tm.assert_produces_warning(FutureWarning, match=msg): + expected = df[-2:].reindex(index).fillna(method="backfill") + expected.iloc[:3] = np.nan + tm.assert_frame_equal(result, expected) + + def test_frame_fillna_limit(self): + index = np.arange(10) + df = DataFrame(np.random.default_rng(2).standard_normal((10, 4)), index=index) + + result = df[:2].reindex(index) + msg = "DataFrame.fillna with 'method' is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = result.fillna(method="pad", limit=5) + + with tm.assert_produces_warning(FutureWarning, match=msg): + expected = df[:2].reindex(index).fillna(method="pad") + expected.iloc[-3:] = np.nan + tm.assert_frame_equal(result, expected) + + result = df[-2:].reindex(index) + with tm.assert_produces_warning(FutureWarning, match=msg): + result = result.fillna(method="backfill", limit=5) + + with tm.assert_produces_warning(FutureWarning, match=msg): + expected = df[-2:].reindex(index).fillna(method="backfill") + expected.iloc[:3] = np.nan + tm.assert_frame_equal(result, expected) + + def test_fillna_skip_certain_blocks(self): + # don't try to fill boolean, int blocks + + df = DataFrame(np.random.default_rng(2).standard_normal((10, 4)).astype(int)) + + # it works! + df.fillna(np.nan) + + @pytest.mark.parametrize("type", [int, float]) + def test_fillna_positive_limit(self, type): + df = DataFrame(np.random.default_rng(2).standard_normal((10, 4))).astype(type) + + msg = "Limit must be greater than 0" + with pytest.raises(ValueError, match=msg): + df.fillna(0, limit=-5) + + @pytest.mark.parametrize("type", [int, float]) + def test_fillna_integer_limit(self, type): + df = DataFrame(np.random.default_rng(2).standard_normal((10, 4))).astype(type) + + msg = "Limit must be an integer" + with pytest.raises(ValueError, match=msg): + df.fillna(0, limit=0.5) + + def test_fillna_inplace(self): + df = DataFrame(np.random.default_rng(2).standard_normal((10, 4))) + df.loc[:4, 1] = np.nan + df.loc[-4:, 3] = np.nan + + expected = df.fillna(value=0) + assert expected is not df + + df.fillna(value=0, inplace=True) + tm.assert_frame_equal(df, expected) + + expected = df.fillna(value={0: 0}, inplace=True) + assert expected is None + + df.loc[:4, 1] = np.nan + df.loc[-4:, 3] = np.nan + msg = "DataFrame.fillna with 'method' is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + expected = df.fillna(method="ffill") + assert expected is not df + + with tm.assert_produces_warning(FutureWarning, match=msg): + df.fillna(method="ffill", inplace=True) + tm.assert_frame_equal(df, expected) + + def test_fillna_dict_series(self): + df = DataFrame( + { + "a": [np.nan, 1, 2, np.nan, np.nan], + "b": [1, 2, 3, np.nan, np.nan], + "c": [np.nan, 1, 2, 3, 4], + } + ) + + result = df.fillna({"a": 0, "b": 5}) + + expected = df.copy() + expected["a"] = expected["a"].fillna(0) + expected["b"] = expected["b"].fillna(5) + tm.assert_frame_equal(result, expected) + + # it works + result = df.fillna({"a": 0, "b": 5, "d": 7}) + + # Series treated same as dict + result = df.fillna(df.max()) + expected = df.fillna(df.max().to_dict()) + tm.assert_frame_equal(result, expected) + + # disable this for now + with pytest.raises(NotImplementedError, match="column by column"): + df.fillna(df.max(1), axis=1) + + def test_fillna_dataframe(self): + # GH#8377 + df = DataFrame( + { + "a": [np.nan, 1, 2, np.nan, np.nan], + "b": [1, 2, 3, np.nan, np.nan], + "c": [np.nan, 1, 2, 3, 4], + }, + index=list("VWXYZ"), + ) + + # df2 may have different index and columns + df2 = DataFrame( + { + "a": [np.nan, 10, 20, 30, 40], + "b": [50, 60, 70, 80, 90], + "foo": ["bar"] * 5, + }, + index=list("VWXuZ"), + ) + + result = df.fillna(df2) + + # only those columns and indices which are shared get filled + expected = DataFrame( + { + "a": [np.nan, 1, 2, np.nan, 40], + "b": [1, 2, 3, np.nan, 90], + "c": [np.nan, 1, 2, 3, 4], + }, + index=list("VWXYZ"), + ) + + tm.assert_frame_equal(result, expected) + + def test_fillna_columns(self): + arr = np.random.default_rng(2).standard_normal((10, 10)) + arr[:, ::2] = np.nan + df = DataFrame(arr) + + msg = "DataFrame.fillna with 'method' is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = df.fillna(method="ffill", axis=1) + with tm.assert_produces_warning(FutureWarning, match=msg): + expected = df.T.fillna(method="pad").T + tm.assert_frame_equal(result, expected) + + df.insert(6, "foo", 5) + with tm.assert_produces_warning(FutureWarning, match=msg): + result = df.fillna(method="ffill", axis=1) + with tm.assert_produces_warning(FutureWarning, match=msg): + expected = df.astype(float).fillna(method="ffill", axis=1) + tm.assert_frame_equal(result, expected) + + def test_fillna_invalid_method(self, float_frame): + with pytest.raises(ValueError, match="ffil"): + float_frame.fillna(method="ffil") + + def test_fillna_invalid_value(self, float_frame): + # list + msg = '"value" parameter must be a scalar or dict, but you passed a "{}"' + with pytest.raises(TypeError, match=msg.format("list")): + float_frame.fillna([1, 2]) + # tuple + with pytest.raises(TypeError, match=msg.format("tuple")): + float_frame.fillna((1, 2)) + # frame with series + msg = ( + '"value" parameter must be a scalar, dict or Series, but you ' + 'passed a "DataFrame"' + ) + with pytest.raises(TypeError, match=msg): + float_frame.iloc[:, 0].fillna(float_frame) + + def test_fillna_col_reordering(self): + cols = ["COL." + str(i) for i in range(5, 0, -1)] + data = np.random.default_rng(2).random((20, 5)) + df = DataFrame(index=range(20), columns=cols, data=data) + msg = "DataFrame.fillna with 'method' is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + filled = df.fillna(method="ffill") + assert df.columns.tolist() == filled.columns.tolist() + + @pytest.mark.xfail(using_pyarrow_string_dtype(), reason="can't fill 0 in string") + def test_fill_corner(self, float_frame, float_string_frame): + mf = float_string_frame + mf.loc[mf.index[5:20], "foo"] = np.nan + mf.loc[mf.index[-10:], "A"] = np.nan + + filled = float_string_frame.fillna(value=0) + assert (filled.loc[filled.index[5:20], "foo"] == 0).all() + del float_string_frame["foo"] + + float_frame.reindex(columns=[]).fillna(value=0) + + def test_fillna_downcast_dict(self): + # GH#40809 + df = DataFrame({"col1": [1, np.nan]}) + + msg = "The 'downcast' keyword in fillna" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = df.fillna({"col1": 2}, downcast={"col1": "int64"}) + expected = DataFrame({"col1": [1, 2]}) + tm.assert_frame_equal(result, expected) + + def test_fillna_with_columns_and_limit(self): + # GH40989 + df = DataFrame( + [ + [np.nan, 2, np.nan, 0], + [3, 4, np.nan, 1], + [np.nan, np.nan, np.nan, 5], + [np.nan, 3, np.nan, 4], + ], + columns=list("ABCD"), + ) + result = df.fillna(axis=1, value=100, limit=1) + result2 = df.fillna(axis=1, value=100, limit=2) + + expected = DataFrame( + { + "A": Series([100, 3, 100, 100], dtype="float64"), + "B": [2, 4, np.nan, 3], + "C": [np.nan, 100, np.nan, np.nan], + "D": Series([0, 1, 5, 4], dtype="float64"), + }, + index=[0, 1, 2, 3], + ) + expected2 = DataFrame( + { + "A": Series([100, 3, 100, 100], dtype="float64"), + "B": Series([2, 4, 100, 3], dtype="float64"), + "C": [100, 100, np.nan, 100], + "D": Series([0, 1, 5, 4], dtype="float64"), + }, + index=[0, 1, 2, 3], + ) + + tm.assert_frame_equal(result, expected) + tm.assert_frame_equal(result2, expected2) + + def test_fillna_datetime_inplace(self): + # GH#48863 + df = DataFrame( + { + "date1": to_datetime(["2018-05-30", None]), + "date2": to_datetime(["2018-09-30", None]), + } + ) + expected = df.copy() + df.fillna(np.nan, inplace=True) + tm.assert_frame_equal(df, expected) + + def test_fillna_inplace_with_columns_limit_and_value(self): + # GH40989 + df = DataFrame( + [ + [np.nan, 2, np.nan, 0], + [3, 4, np.nan, 1], + [np.nan, np.nan, np.nan, 5], + [np.nan, 3, np.nan, 4], + ], + columns=list("ABCD"), + ) + + expected = df.fillna(axis=1, value=100, limit=1) + assert expected is not df + + df.fillna(axis=1, value=100, limit=1, inplace=True) + tm.assert_frame_equal(df, expected) + + @td.skip_array_manager_invalid_test + @pytest.mark.parametrize("val", [-1, {"x": -1, "y": -1}]) + def test_inplace_dict_update_view( + self, val, using_copy_on_write, warn_copy_on_write + ): + # GH#47188 + df = DataFrame({"x": [np.nan, 2], "y": [np.nan, 2]}) + df_orig = df.copy() + result_view = df[:] + with tm.assert_cow_warning(warn_copy_on_write): + df.fillna(val, inplace=True) + expected = DataFrame({"x": [-1, 2.0], "y": [-1.0, 2]}) + tm.assert_frame_equal(df, expected) + if using_copy_on_write: + tm.assert_frame_equal(result_view, df_orig) + else: + tm.assert_frame_equal(result_view, expected) + + def test_single_block_df_with_horizontal_axis(self): + # GH 47713 + df = DataFrame( + { + "col1": [5, 0, np.nan, 10, np.nan], + "col2": [7, np.nan, np.nan, 5, 3], + "col3": [12, np.nan, 1, 2, 0], + "col4": [np.nan, 1, 1, np.nan, 18], + } + ) + result = df.fillna(50, limit=1, axis=1) + expected = DataFrame( + [ + [5.0, 7.0, 12.0, 50.0], + [0.0, 50.0, np.nan, 1.0], + [50.0, np.nan, 1.0, 1.0], + [10.0, 5.0, 2.0, 50.0], + [50.0, 3.0, 0.0, 18.0], + ], + columns=["col1", "col2", "col3", "col4"], + ) + tm.assert_frame_equal(result, expected) + + def test_fillna_with_multi_index_frame(self): + # GH 47649 + pdf = DataFrame( + { + ("x", "a"): [np.nan, 2.0, 3.0], + ("x", "b"): [1.0, 2.0, np.nan], + ("y", "c"): [1.0, 2.0, np.nan], + } + ) + expected = DataFrame( + { + ("x", "a"): [-1.0, 2.0, 3.0], + ("x", "b"): [1.0, 2.0, -1.0], + ("y", "c"): [1.0, 2.0, np.nan], + } + ) + tm.assert_frame_equal(pdf.fillna({"x": -1}), expected) + tm.assert_frame_equal(pdf.fillna({"x": -1, ("x", "b"): -2}), expected) + + expected = DataFrame( + { + ("x", "a"): [-1.0, 2.0, 3.0], + ("x", "b"): [1.0, 2.0, -2.0], + ("y", "c"): [1.0, 2.0, np.nan], + } + ) + tm.assert_frame_equal(pdf.fillna({("x", "b"): -2, "x": -1}), expected) + + +def test_fillna_nonconsolidated_frame(): + # https://github.com/pandas-dev/pandas/issues/36495 + df = DataFrame( + [ + [1, 1, 1, 1.0], + [2, 2, 2, 2.0], + [3, 3, 3, 3.0], + ], + columns=["i1", "i2", "i3", "f1"], + ) + df_nonconsol = df.pivot(index="i1", columns="i2") + result = df_nonconsol.fillna(0) + assert result.isna().sum().sum() == 0 + + +def test_fillna_nones_inplace(): + # GH 48480 + df = DataFrame( + [[None, None], [None, None]], + columns=["A", "B"], + ) + msg = "Downcasting object dtype arrays" + with tm.assert_produces_warning(FutureWarning, match=msg): + df.fillna(value={"A": 1, "B": 2}, inplace=True) + + expected = DataFrame([[1, 2], [1, 2]], columns=["A", "B"]) + tm.assert_frame_equal(df, expected) + + +@pytest.mark.parametrize("func", ["pad", "backfill"]) +def test_pad_backfill_deprecated(func): + # GH#33396 + df = DataFrame({"a": [1, 2, 3]}) + with tm.assert_produces_warning(FutureWarning): + getattr(df, func)() + + +@pytest.mark.parametrize( + "data, expected_data, method, kwargs", + ( + ( + [np.nan, np.nan, 3, np.nan, np.nan, np.nan, 7, np.nan, np.nan], + [np.nan, np.nan, 3.0, 3.0, 3.0, 3.0, 7.0, np.nan, np.nan], + "ffill", + {"limit_area": "inside"}, + ), + ( + [np.nan, np.nan, 3, np.nan, np.nan, np.nan, 7, np.nan, np.nan], + [np.nan, np.nan, 3.0, 3.0, np.nan, np.nan, 7.0, np.nan, np.nan], + "ffill", + {"limit_area": "inside", "limit": 1}, + ), + ( + [np.nan, np.nan, 3, np.nan, np.nan, np.nan, 7, np.nan, np.nan], + [np.nan, np.nan, 3.0, np.nan, np.nan, np.nan, 7.0, 7.0, 7.0], + "ffill", + {"limit_area": "outside"}, + ), + ( + [np.nan, np.nan, 3, np.nan, np.nan, np.nan, 7, np.nan, np.nan], + [np.nan, np.nan, 3.0, np.nan, np.nan, np.nan, 7.0, 7.0, np.nan], + "ffill", + {"limit_area": "outside", "limit": 1}, + ), + ( + [np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan], + [np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan], + "ffill", + {"limit_area": "outside", "limit": 1}, + ), + ( + range(5), + range(5), + "ffill", + {"limit_area": "outside", "limit": 1}, + ), + ( + [np.nan, np.nan, 3, np.nan, np.nan, np.nan, 7, np.nan, np.nan], + [np.nan, np.nan, 3.0, 7.0, 7.0, 7.0, 7.0, np.nan, np.nan], + "bfill", + {"limit_area": "inside"}, + ), + ( + [np.nan, np.nan, 3, np.nan, np.nan, np.nan, 7, np.nan, np.nan], + [np.nan, np.nan, 3.0, np.nan, np.nan, 7.0, 7.0, np.nan, np.nan], + "bfill", + {"limit_area": "inside", "limit": 1}, + ), + ( + [np.nan, np.nan, 3, np.nan, np.nan, np.nan, 7, np.nan, np.nan], + [3.0, 3.0, 3.0, np.nan, np.nan, np.nan, 7.0, np.nan, np.nan], + "bfill", + {"limit_area": "outside"}, + ), + ( + [np.nan, np.nan, 3, np.nan, np.nan, np.nan, 7, np.nan, np.nan], + [np.nan, 3.0, 3.0, np.nan, np.nan, np.nan, 7.0, np.nan, np.nan], + "bfill", + {"limit_area": "outside", "limit": 1}, + ), + ), +) +def test_ffill_bfill_limit_area(data, expected_data, method, kwargs): + # GH#56492 + df = DataFrame(data) + expected = DataFrame(expected_data) + result = getattr(df, method)(**kwargs) + tm.assert_frame_equal(result, expected) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/test_filter.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/test_filter.py new file mode 100644 index 0000000000000000000000000000000000000000..9d5e6876bb08c2929e6a54c18b865e2720c1424d --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/test_filter.py @@ -0,0 +1,153 @@ +import numpy as np +import pytest + +import pandas as pd +from pandas import DataFrame +import pandas._testing as tm + + +class TestDataFrameFilter: + def test_filter(self, float_frame, float_string_frame): + # Items + filtered = float_frame.filter(["A", "B", "E"]) + assert len(filtered.columns) == 2 + assert "E" not in filtered + + filtered = float_frame.filter(["A", "B", "E"], axis="columns") + assert len(filtered.columns) == 2 + assert "E" not in filtered + + # Other axis + idx = float_frame.index[0:4] + filtered = float_frame.filter(idx, axis="index") + expected = float_frame.reindex(index=idx) + tm.assert_frame_equal(filtered, expected) + + # like + fcopy = float_frame.copy() + fcopy["AA"] = 1 + + filtered = fcopy.filter(like="A") + assert len(filtered.columns) == 2 + assert "AA" in filtered + + # like with ints in column names + df = DataFrame(0.0, index=[0, 1, 2], columns=[0, 1, "_A", "_B"]) + filtered = df.filter(like="_") + assert len(filtered.columns) == 2 + + # regex with ints in column names + # from PR #10384 + df = DataFrame(0.0, index=[0, 1, 2], columns=["A1", 1, "B", 2, "C"]) + expected = DataFrame( + 0.0, index=[0, 1, 2], columns=pd.Index([1, 2], dtype=object) + ) + filtered = df.filter(regex="^[0-9]+$") + tm.assert_frame_equal(filtered, expected) + + expected = DataFrame(0.0, index=[0, 1, 2], columns=[0, "0", 1, "1"]) + # shouldn't remove anything + filtered = expected.filter(regex="^[0-9]+$") + tm.assert_frame_equal(filtered, expected) + + # pass in None + with pytest.raises(TypeError, match="Must pass"): + float_frame.filter() + with pytest.raises(TypeError, match="Must pass"): + float_frame.filter(items=None) + with pytest.raises(TypeError, match="Must pass"): + float_frame.filter(axis=1) + + # test mutually exclusive arguments + with pytest.raises(TypeError, match="mutually exclusive"): + float_frame.filter(items=["one", "three"], regex="e$", like="bbi") + with pytest.raises(TypeError, match="mutually exclusive"): + float_frame.filter(items=["one", "three"], regex="e$", axis=1) + with pytest.raises(TypeError, match="mutually exclusive"): + float_frame.filter(items=["one", "three"], regex="e$") + with pytest.raises(TypeError, match="mutually exclusive"): + float_frame.filter(items=["one", "three"], like="bbi", axis=0) + with pytest.raises(TypeError, match="mutually exclusive"): + float_frame.filter(items=["one", "three"], like="bbi") + + # objects + filtered = float_string_frame.filter(like="foo") + assert "foo" in filtered + + # unicode columns, won't ascii-encode + df = float_frame.rename(columns={"B": "\u2202"}) + filtered = df.filter(like="C") + assert "C" in filtered + + def test_filter_regex_search(self, float_frame): + fcopy = float_frame.copy() + fcopy["AA"] = 1 + + # regex + filtered = fcopy.filter(regex="[A]+") + assert len(filtered.columns) == 2 + assert "AA" in filtered + + # doesn't have to be at beginning + df = DataFrame( + {"aBBa": [1, 2], "BBaBB": [1, 2], "aCCa": [1, 2], "aCCaBB": [1, 2]} + ) + + result = df.filter(regex="BB") + exp = df[[x for x in df.columns if "BB" in x]] + tm.assert_frame_equal(result, exp) + + @pytest.mark.parametrize( + "name,expected", + [ + ("a", DataFrame({"a": [1, 2]})), + ("a", DataFrame({"a": [1, 2]})), + ("あ", DataFrame({"あ": [3, 4]})), + ], + ) + def test_filter_unicode(self, name, expected): + # GH13101 + df = DataFrame({"a": [1, 2], "あ": [3, 4]}) + + tm.assert_frame_equal(df.filter(like=name), expected) + tm.assert_frame_equal(df.filter(regex=name), expected) + + @pytest.mark.parametrize("name", ["a", "a"]) + def test_filter_bytestring(self, name): + # GH13101 + df = DataFrame({b"a": [1, 2], b"b": [3, 4]}) + expected = DataFrame({b"a": [1, 2]}) + + tm.assert_frame_equal(df.filter(like=name), expected) + tm.assert_frame_equal(df.filter(regex=name), expected) + + def test_filter_corner(self): + empty = DataFrame() + + result = empty.filter([]) + tm.assert_frame_equal(result, empty) + + result = empty.filter(like="foo") + tm.assert_frame_equal(result, empty) + + def test_filter_regex_non_string(self): + # GH#5798 trying to filter on non-string columns should drop, + # not raise + df = DataFrame(np.random.default_rng(2).random((3, 2)), columns=["STRING", 123]) + result = df.filter(regex="STRING") + expected = df[["STRING"]] + tm.assert_frame_equal(result, expected) + + def test_filter_keep_order(self): + # GH#54980 + df = DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]}) + result = df.filter(items=["B", "A"]) + expected = df[["B", "A"]] + tm.assert_frame_equal(result, expected) + + def test_filter_different_dtype(self): + # GH#54980 + df = DataFrame({1: [1, 2, 3], 2: [4, 5, 6]}) + result = df.filter(items=["B", "A"]) + expected = df[[]] + tm.assert_frame_equal(result, expected) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/test_first_and_last.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/test_first_and_last.py new file mode 100644 index 0000000000000000000000000000000000000000..212e56442ee07460d61c4ef0b790e7bb193f9b3e --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/test_first_and_last.py @@ -0,0 +1,143 @@ +""" +Note: includes tests for `last` +""" +import numpy as np +import pytest + +import pandas as pd +from pandas import ( + DataFrame, + Index, + bdate_range, + date_range, +) +import pandas._testing as tm + +deprecated_msg = "first is deprecated" +last_deprecated_msg = "last is deprecated" + + +class TestFirst: + def test_first_subset(self, frame_or_series): + ts = DataFrame( + np.random.default_rng(2).standard_normal((100, 4)), + columns=Index(list("ABCD"), dtype=object), + index=date_range("2000-01-01", periods=100, freq="12h"), + ) + ts = tm.get_obj(ts, frame_or_series) + with tm.assert_produces_warning(FutureWarning, match=deprecated_msg): + result = ts.first("10d") + assert len(result) == 20 + + ts = DataFrame( + np.random.default_rng(2).standard_normal((100, 4)), + columns=Index(list("ABCD"), dtype=object), + index=date_range("2000-01-01", periods=100, freq="D"), + ) + ts = tm.get_obj(ts, frame_or_series) + with tm.assert_produces_warning(FutureWarning, match=deprecated_msg): + result = ts.first("10d") + assert len(result) == 10 + + with tm.assert_produces_warning(FutureWarning, match=deprecated_msg): + result = ts.first("3ME") + expected = ts[:"3/31/2000"] + tm.assert_equal(result, expected) + + with tm.assert_produces_warning(FutureWarning, match=deprecated_msg): + result = ts.first("21D") + expected = ts[:21] + tm.assert_equal(result, expected) + + with tm.assert_produces_warning(FutureWarning, match=deprecated_msg): + result = ts[:0].first("3ME") + tm.assert_equal(result, ts[:0]) + + def test_first_last_raises(self, frame_or_series): + # GH#20725 + obj = DataFrame([[1, 2, 3], [4, 5, 6]]) + obj = tm.get_obj(obj, frame_or_series) + + msg = "'first' only supports a DatetimeIndex index" + with tm.assert_produces_warning( + FutureWarning, match=deprecated_msg + ), pytest.raises( + TypeError, match=msg + ): # index is not a DatetimeIndex + obj.first("1D") + + msg = "'last' only supports a DatetimeIndex index" + with tm.assert_produces_warning( + FutureWarning, match=last_deprecated_msg + ), pytest.raises( + TypeError, match=msg + ): # index is not a DatetimeIndex + obj.last("1D") + + def test_last_subset(self, frame_or_series): + ts = DataFrame( + np.random.default_rng(2).standard_normal((100, 4)), + columns=Index(list("ABCD"), dtype=object), + index=date_range("2000-01-01", periods=100, freq="12h"), + ) + ts = tm.get_obj(ts, frame_or_series) + with tm.assert_produces_warning(FutureWarning, match=last_deprecated_msg): + result = ts.last("10d") + assert len(result) == 20 + + ts = DataFrame( + np.random.default_rng(2).standard_normal((30, 4)), + columns=Index(list("ABCD"), dtype=object), + index=date_range("2000-01-01", periods=30, freq="D"), + ) + ts = tm.get_obj(ts, frame_or_series) + with tm.assert_produces_warning(FutureWarning, match=last_deprecated_msg): + result = ts.last("10d") + assert len(result) == 10 + + with tm.assert_produces_warning(FutureWarning, match=last_deprecated_msg): + result = ts.last("21D") + expected = ts["2000-01-10":] + tm.assert_equal(result, expected) + + with tm.assert_produces_warning(FutureWarning, match=last_deprecated_msg): + result = ts.last("21D") + expected = ts[-21:] + tm.assert_equal(result, expected) + + with tm.assert_produces_warning(FutureWarning, match=last_deprecated_msg): + result = ts[:0].last("3ME") + tm.assert_equal(result, ts[:0]) + + @pytest.mark.parametrize("start, periods", [("2010-03-31", 1), ("2010-03-30", 2)]) + def test_first_with_first_day_last_of_month(self, frame_or_series, start, periods): + # GH#29623 + x = frame_or_series([1] * 100, index=bdate_range(start, periods=100)) + with tm.assert_produces_warning(FutureWarning, match=deprecated_msg): + result = x.first("1ME") + expected = frame_or_series( + [1] * periods, index=bdate_range(start, periods=periods) + ) + tm.assert_equal(result, expected) + + def test_first_with_first_day_end_of_frq_n_greater_one(self, frame_or_series): + # GH#29623 + x = frame_or_series([1] * 100, index=bdate_range("2010-03-31", periods=100)) + with tm.assert_produces_warning(FutureWarning, match=deprecated_msg): + result = x.first("2ME") + expected = frame_or_series( + [1] * 23, index=bdate_range("2010-03-31", "2010-04-30") + ) + tm.assert_equal(result, expected) + + def test_empty_not_input(self): + # GH#51032 + df = DataFrame(index=pd.DatetimeIndex([])) + with tm.assert_produces_warning(FutureWarning, match=last_deprecated_msg): + result = df.last(offset=1) + + with tm.assert_produces_warning(FutureWarning, match=deprecated_msg): + result = df.first(offset=1) + + tm.assert_frame_equal(df, result) + assert df is not result diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/test_first_valid_index.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/test_first_valid_index.py new file mode 100644 index 0000000000000000000000000000000000000000..2e27f1aa7170058be9cf267984da6d3e3338dc85 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/test_first_valid_index.py @@ -0,0 +1,78 @@ +""" +Includes test for last_valid_index. +""" +import numpy as np +import pytest + +from pandas import ( + DataFrame, + Index, + Series, + date_range, +) + + +class TestFirstValidIndex: + def test_first_valid_index_single_nan(self, frame_or_series): + # GH#9752 Series/DataFrame should both return None, not raise + obj = frame_or_series([np.nan]) + + assert obj.first_valid_index() is None + assert obj.iloc[:0].first_valid_index() is None + + @pytest.mark.parametrize( + "empty", [DataFrame(), Series(dtype=object), Series([], index=[], dtype=object)] + ) + def test_first_valid_index_empty(self, empty): + # GH#12800 + assert empty.last_valid_index() is None + assert empty.first_valid_index() is None + + @pytest.mark.parametrize( + "data,idx,expected_first,expected_last", + [ + ({"A": [1, 2, 3]}, [1, 1, 2], 1, 2), + ({"A": [1, 2, 3]}, [1, 2, 2], 1, 2), + ({"A": [1, 2, 3, 4]}, ["d", "d", "d", "d"], "d", "d"), + ({"A": [1, np.nan, 3]}, [1, 1, 2], 1, 2), + ({"A": [np.nan, np.nan, 3]}, [1, 1, 2], 2, 2), + ({"A": [1, np.nan, 3]}, [1, 2, 2], 1, 2), + ], + ) + def test_first_last_valid_frame(self, data, idx, expected_first, expected_last): + # GH#21441 + df = DataFrame(data, index=idx) + assert expected_first == df.first_valid_index() + assert expected_last == df.last_valid_index() + + @pytest.mark.parametrize( + "index", + [Index([str(i) for i in range(20)]), date_range("2020-01-01", periods=20)], + ) + def test_first_last_valid(self, index): + mat = np.random.default_rng(2).standard_normal(len(index)) + mat[:5] = np.nan + mat[-5:] = np.nan + + frame = DataFrame({"foo": mat}, index=index) + assert frame.first_valid_index() == frame.index[5] + assert frame.last_valid_index() == frame.index[-6] + + ser = frame["foo"] + assert ser.first_valid_index() == frame.index[5] + assert ser.last_valid_index() == frame.index[-6] + + @pytest.mark.parametrize( + "index", + [Index([str(i) for i in range(10)]), date_range("2020-01-01", periods=10)], + ) + def test_first_last_valid_all_nan(self, index): + # GH#17400: no valid entries + frame = DataFrame(np.nan, columns=["foo"], index=index) + + assert frame.last_valid_index() is None + assert frame.first_valid_index() is None + + ser = frame["foo"] + assert ser.first_valid_index() is None + assert ser.last_valid_index() is None diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/test_get_numeric_data.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/test_get_numeric_data.py new file mode 100644 index 0000000000000000000000000000000000000000..c5d32d56d03c1174fda097ee973678b3e3e73d27 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/test_get_numeric_data.py @@ -0,0 +1,102 @@ +import numpy as np + +import pandas as pd +from pandas import ( + Categorical, + DataFrame, + Index, + Series, + Timestamp, +) +import pandas._testing as tm +from pandas.core.arrays import IntervalArray + + +class TestGetNumericData: + def test_get_numeric_data_preserve_dtype(self): + # get the numeric data + obj = DataFrame({"A": [1, "2", 3.0]}, columns=Index(["A"], dtype="object")) + result = obj._get_numeric_data() + expected = DataFrame(dtype=object, index=pd.RangeIndex(3), columns=[]) + tm.assert_frame_equal(result, expected) + + def test_get_numeric_data(self, using_infer_string): + datetime64name = np.dtype("M8[s]").name + objectname = np.dtype(np.object_).name + + df = DataFrame( + {"a": 1.0, "b": 2, "c": "foo", "f": Timestamp("20010102")}, + index=np.arange(10), + ) + result = df.dtypes + expected = Series( + [ + np.dtype("float64"), + np.dtype("int64"), + np.dtype(objectname) if not using_infer_string else "string", + np.dtype(datetime64name), + ], + index=["a", "b", "c", "f"], + ) + tm.assert_series_equal(result, expected) + + df = DataFrame( + { + "a": 1.0, + "b": 2, + "c": "foo", + "d": np.array([1.0] * 10, dtype="float32"), + "e": np.array([1] * 10, dtype="int32"), + "f": np.array([1] * 10, dtype="int16"), + "g": Timestamp("20010102"), + }, + index=np.arange(10), + ) + + result = df._get_numeric_data() + expected = df.loc[:, ["a", "b", "d", "e", "f"]] + tm.assert_frame_equal(result, expected) + + only_obj = df.loc[:, ["c", "g"]] + result = only_obj._get_numeric_data() + expected = df.loc[:, []] + tm.assert_frame_equal(result, expected) + + df = DataFrame.from_dict({"a": [1, 2], "b": ["foo", "bar"], "c": [np.pi, np.e]}) + result = df._get_numeric_data() + expected = DataFrame.from_dict({"a": [1, 2], "c": [np.pi, np.e]}) + tm.assert_frame_equal(result, expected) + + df = result.copy() + result = df._get_numeric_data() + expected = df + tm.assert_frame_equal(result, expected) + + def test_get_numeric_data_mixed_dtype(self): + # numeric and object columns + + df = DataFrame( + { + "a": [1, 2, 3], + "b": [True, False, True], + "c": ["foo", "bar", "baz"], + "d": [None, None, None], + "e": [3.14, 0.577, 2.773], + } + ) + result = df._get_numeric_data() + tm.assert_index_equal(result.columns, Index(["a", "b", "e"])) + + def test_get_numeric_data_extension_dtype(self): + # GH#22290 + df = DataFrame( + { + "A": pd.array([-10, np.nan, 0, 10, 20, 30], dtype="Int64"), + "B": Categorical(list("abcabc")), + "C": pd.array([0, 1, 2, 3, np.nan, 5], dtype="UInt8"), + "D": IntervalArray.from_breaks(range(7)), + } + ) + result = df._get_numeric_data() + expected = df.loc[:, ["A", "C"]] + tm.assert_frame_equal(result, expected) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/test_head_tail.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/test_head_tail.py new file mode 100644 index 0000000000000000000000000000000000000000..9363c4d79983f0530bc17666aec7ec8609fb93e4 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/test_head_tail.py @@ -0,0 +1,57 @@ +import numpy as np + +from pandas import DataFrame +import pandas._testing as tm + + +def test_head_tail_generic(index, frame_or_series): + # GH#5370 + + ndim = 2 if frame_or_series is DataFrame else 1 + shape = (len(index),) * ndim + vals = np.random.default_rng(2).standard_normal(shape) + obj = frame_or_series(vals, index=index) + + tm.assert_equal(obj.head(), obj.iloc[:5]) + tm.assert_equal(obj.tail(), obj.iloc[-5:]) + + # 0-len + tm.assert_equal(obj.head(0), obj.iloc[0:0]) + tm.assert_equal(obj.tail(0), obj.iloc[0:0]) + + # bounded + tm.assert_equal(obj.head(len(obj) + 1), obj) + tm.assert_equal(obj.tail(len(obj) + 1), obj) + + # neg index + tm.assert_equal(obj.head(-3), obj.head(len(index) - 3)) + tm.assert_equal(obj.tail(-3), obj.tail(len(index) - 3)) + + +def test_head_tail(float_frame): + tm.assert_frame_equal(float_frame.head(), float_frame[:5]) + tm.assert_frame_equal(float_frame.tail(), float_frame[-5:]) + + tm.assert_frame_equal(float_frame.head(0), float_frame[0:0]) + tm.assert_frame_equal(float_frame.tail(0), float_frame[0:0]) + + tm.assert_frame_equal(float_frame.head(-1), float_frame[:-1]) + tm.assert_frame_equal(float_frame.tail(-1), float_frame[1:]) + tm.assert_frame_equal(float_frame.head(1), float_frame[:1]) + tm.assert_frame_equal(float_frame.tail(1), float_frame[-1:]) + # with a float index + df = float_frame.copy() + df.index = np.arange(len(float_frame)) + 0.1 + tm.assert_frame_equal(df.head(), df.iloc[:5]) + tm.assert_frame_equal(df.tail(), df.iloc[-5:]) + tm.assert_frame_equal(df.head(0), df[0:0]) + tm.assert_frame_equal(df.tail(0), df[0:0]) + tm.assert_frame_equal(df.head(-1), df.iloc[:-1]) + tm.assert_frame_equal(df.tail(-1), df.iloc[1:]) + + +def test_head_tail_empty(): + # test empty dataframe + empty_df = DataFrame() + tm.assert_frame_equal(empty_df.tail(), empty_df) + tm.assert_frame_equal(empty_df.head(), empty_df) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/test_infer_objects.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/test_infer_objects.py new file mode 100644 index 0000000000000000000000000000000000000000..a824a615b5c297c13afeedeba600c1a0ba986695 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/test_infer_objects.py @@ -0,0 +1,42 @@ +from datetime import datetime + +from pandas import DataFrame +import pandas._testing as tm + + +class TestInferObjects: + def test_infer_objects(self): + # GH#11221 + df = DataFrame( + { + "a": ["a", 1, 2, 3], + "b": ["b", 2.0, 3.0, 4.1], + "c": [ + "c", + datetime(2016, 1, 1), + datetime(2016, 1, 2), + datetime(2016, 1, 3), + ], + "d": [1, 2, 3, "d"], + }, + columns=["a", "b", "c", "d"], + ) + df = df.iloc[1:].infer_objects() + + assert df["a"].dtype == "int64" + assert df["b"].dtype == "float64" + assert df["c"].dtype == "M8[ns]" + assert df["d"].dtype == "object" + + expected = DataFrame( + { + "a": [1, 2, 3], + "b": [2.0, 3.0, 4.1], + "c": [datetime(2016, 1, 1), datetime(2016, 1, 2), datetime(2016, 1, 3)], + "d": [2, 3, "d"], + }, + columns=["a", "b", "c", "d"], + ) + # reconstruct frame to verify inference is same + result = df.reset_index(drop=True) + tm.assert_frame_equal(result, expected) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/test_info.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/test_info.py new file mode 100644 index 0000000000000000000000000000000000000000..fcb7677f03f279fe35b9ebfc103bbd59f1073076 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/test_info.py @@ -0,0 +1,565 @@ +from io import StringIO +import re +from string import ascii_uppercase +import sys +import textwrap + +import numpy as np +import pytest + +from pandas.compat import ( + IS64, + PYPY, +) + +from pandas import ( + CategoricalIndex, + DataFrame, + MultiIndex, + Series, + date_range, + option_context, +) +import pandas._testing as tm + + +@pytest.fixture +def duplicate_columns_frame(): + """Dataframe with duplicate column names.""" + return DataFrame( + np.random.default_rng(2).standard_normal((1500, 4)), + columns=["a", "a", "b", "b"], + ) + + +def test_info_empty(): + # GH #45494 + df = DataFrame() + buf = StringIO() + df.info(buf=buf) + result = buf.getvalue() + expected = textwrap.dedent( + """\ + + RangeIndex: 0 entries + Empty DataFrame\n""" + ) + assert result == expected + + +def test_info_categorical_column_smoke_test(): + n = 2500 + df = DataFrame({"int64": np.random.default_rng(2).integers(100, size=n, dtype=int)}) + df["category"] = Series( + np.array(list("abcdefghij")).take( + np.random.default_rng(2).integers(0, 10, size=n, dtype=int) + ) + ).astype("category") + df.isna() + buf = StringIO() + df.info(buf=buf) + + df2 = df[df["category"] == "d"] + buf = StringIO() + df2.info(buf=buf) + + +@pytest.mark.parametrize( + "fixture_func_name", + [ + "int_frame", + "float_frame", + "datetime_frame", + "duplicate_columns_frame", + "float_string_frame", + ], +) +def test_info_smoke_test(fixture_func_name, request): + frame = request.getfixturevalue(fixture_func_name) + buf = StringIO() + frame.info(buf=buf) + result = buf.getvalue().splitlines() + assert len(result) > 10 + + buf = StringIO() + frame.info(buf=buf, verbose=False) + + +def test_info_smoke_test2(float_frame): + # pretty useless test, used to be mixed into the repr tests + buf = StringIO() + float_frame.reindex(columns=["A"]).info(verbose=False, buf=buf) + float_frame.reindex(columns=["A", "B"]).info(verbose=False, buf=buf) + + # no columns or index + DataFrame().info(buf=buf) + + +@pytest.mark.parametrize( + "num_columns, max_info_columns, verbose", + [ + (10, 100, True), + (10, 11, True), + (10, 10, True), + (10, 9, False), + (10, 1, False), + ], +) +def test_info_default_verbose_selection(num_columns, max_info_columns, verbose): + frame = DataFrame(np.random.default_rng(2).standard_normal((5, num_columns))) + with option_context("display.max_info_columns", max_info_columns): + io_default = StringIO() + frame.info(buf=io_default) + result = io_default.getvalue() + + io_explicit = StringIO() + frame.info(buf=io_explicit, verbose=verbose) + expected = io_explicit.getvalue() + + assert result == expected + + +def test_info_verbose_check_header_separator_body(): + buf = StringIO() + size = 1001 + start = 5 + frame = DataFrame(np.random.default_rng(2).standard_normal((3, size))) + frame.info(verbose=True, buf=buf) + + res = buf.getvalue() + header = " # Column Dtype \n--- ------ ----- " + assert header in res + + frame.info(verbose=True, buf=buf) + buf.seek(0) + lines = buf.readlines() + assert len(lines) > 0 + + for i, line in enumerate(lines): + if start <= i < start + size: + line_nr = f" {i - start} " + assert line.startswith(line_nr) + + +@pytest.mark.parametrize( + "size, header_exp, separator_exp, first_line_exp, last_line_exp", + [ + ( + 4, + " # Column Non-Null Count Dtype ", + "--- ------ -------------- ----- ", + " 0 0 3 non-null float64", + " 3 3 3 non-null float64", + ), + ( + 11, + " # Column Non-Null Count Dtype ", + "--- ------ -------------- ----- ", + " 0 0 3 non-null float64", + " 10 10 3 non-null float64", + ), + ( + 101, + " # Column Non-Null Count Dtype ", + "--- ------ -------------- ----- ", + " 0 0 3 non-null float64", + " 100 100 3 non-null float64", + ), + ( + 1001, + " # Column Non-Null Count Dtype ", + "--- ------ -------------- ----- ", + " 0 0 3 non-null float64", + " 1000 1000 3 non-null float64", + ), + ( + 10001, + " # Column Non-Null Count Dtype ", + "--- ------ -------------- ----- ", + " 0 0 3 non-null float64", + " 10000 10000 3 non-null float64", + ), + ], +) +def test_info_verbose_with_counts_spacing( + size, header_exp, separator_exp, first_line_exp, last_line_exp +): + """Test header column, spacer, first line and last line in verbose mode.""" + frame = DataFrame(np.random.default_rng(2).standard_normal((3, size))) + with StringIO() as buf: + frame.info(verbose=True, show_counts=True, buf=buf) + all_lines = buf.getvalue().splitlines() + # Here table would contain only header, separator and table lines + # dframe repr, index summary, memory usage and dtypes are excluded + table = all_lines[3:-2] + header, separator, first_line, *rest, last_line = table + assert header == header_exp + assert separator == separator_exp + assert first_line == first_line_exp + assert last_line == last_line_exp + + +def test_info_memory(): + # https://github.com/pandas-dev/pandas/issues/21056 + df = DataFrame({"a": Series([1, 2], dtype="i8")}) + buf = StringIO() + df.info(buf=buf) + result = buf.getvalue() + bytes = float(df.memory_usage().sum()) + expected = textwrap.dedent( + f"""\ + + RangeIndex: 2 entries, 0 to 1 + Data columns (total 1 columns): + # Column Non-Null Count Dtype + --- ------ -------------- ----- + 0 a 2 non-null int64 + dtypes: int64(1) + memory usage: {bytes} bytes + """ + ) + assert result == expected + + +def test_info_wide(): + io = StringIO() + df = DataFrame(np.random.default_rng(2).standard_normal((5, 101))) + df.info(buf=io) + + io = StringIO() + df.info(buf=io, max_cols=101) + result = io.getvalue() + assert len(result.splitlines()) > 100 + + expected = result + with option_context("display.max_info_columns", 101): + io = StringIO() + df.info(buf=io) + result = io.getvalue() + assert result == expected + + +def test_info_duplicate_columns_shows_correct_dtypes(): + # GH11761 + io = StringIO() + frame = DataFrame([[1, 2.0]], columns=["a", "a"]) + frame.info(buf=io) + lines = io.getvalue().splitlines(True) + assert " 0 a 1 non-null int64 \n" == lines[5] + assert " 1 a 1 non-null float64\n" == lines[6] + + +def test_info_shows_column_dtypes(): + dtypes = [ + "int64", + "float64", + "datetime64[ns]", + "timedelta64[ns]", + "complex128", + "object", + "bool", + ] + data = {} + n = 10 + for i, dtype in enumerate(dtypes): + data[i] = np.random.default_rng(2).integers(2, size=n).astype(dtype) + df = DataFrame(data) + buf = StringIO() + df.info(buf=buf) + res = buf.getvalue() + header = ( + " # Column Non-Null Count Dtype \n" + "--- ------ -------------- ----- " + ) + assert header in res + for i, dtype in enumerate(dtypes): + name = f" {i:d} {i:d} {n:d} non-null {dtype}" + assert name in res + + +def test_info_max_cols(): + df = DataFrame(np.random.default_rng(2).standard_normal((10, 5))) + for len_, verbose in [(5, None), (5, False), (12, True)]: + # For verbose always ^ setting ^ summarize ^ full output + with option_context("max_info_columns", 4): + buf = StringIO() + df.info(buf=buf, verbose=verbose) + res = buf.getvalue() + assert len(res.strip().split("\n")) == len_ + + for len_, verbose in [(12, None), (5, False), (12, True)]: + # max_cols not exceeded + with option_context("max_info_columns", 5): + buf = StringIO() + df.info(buf=buf, verbose=verbose) + res = buf.getvalue() + assert len(res.strip().split("\n")) == len_ + + for len_, max_cols in [(12, 5), (5, 4)]: + # setting truncates + with option_context("max_info_columns", 4): + buf = StringIO() + df.info(buf=buf, max_cols=max_cols) + res = buf.getvalue() + assert len(res.strip().split("\n")) == len_ + + # setting wouldn't truncate + with option_context("max_info_columns", 5): + buf = StringIO() + df.info(buf=buf, max_cols=max_cols) + res = buf.getvalue() + assert len(res.strip().split("\n")) == len_ + + +def test_info_memory_usage(): + # Ensure memory usage is displayed, when asserted, on the last line + dtypes = [ + "int64", + "float64", + "datetime64[ns]", + "timedelta64[ns]", + "complex128", + "object", + "bool", + ] + data = {} + n = 10 + for i, dtype in enumerate(dtypes): + data[i] = np.random.default_rng(2).integers(2, size=n).astype(dtype) + df = DataFrame(data) + buf = StringIO() + + # display memory usage case + df.info(buf=buf, memory_usage=True) + res = buf.getvalue().splitlines() + assert "memory usage: " in res[-1] + + # do not display memory usage case + df.info(buf=buf, memory_usage=False) + res = buf.getvalue().splitlines() + assert "memory usage: " not in res[-1] + + df.info(buf=buf, memory_usage=True) + res = buf.getvalue().splitlines() + + # memory usage is a lower bound, so print it as XYZ+ MB + assert re.match(r"memory usage: [^+]+\+", res[-1]) + + df.iloc[:, :5].info(buf=buf, memory_usage=True) + res = buf.getvalue().splitlines() + + # excluded column with object dtype, so estimate is accurate + assert not re.match(r"memory usage: [^+]+\+", res[-1]) + + # Test a DataFrame with duplicate columns + dtypes = ["int64", "int64", "int64", "float64"] + data = {} + n = 100 + for i, dtype in enumerate(dtypes): + data[i] = np.random.default_rng(2).integers(2, size=n).astype(dtype) + df = DataFrame(data) + df.columns = dtypes + + df_with_object_index = DataFrame({"a": [1]}, index=["foo"]) + df_with_object_index.info(buf=buf, memory_usage=True) + res = buf.getvalue().splitlines() + assert re.match(r"memory usage: [^+]+\+", res[-1]) + + df_with_object_index.info(buf=buf, memory_usage="deep") + res = buf.getvalue().splitlines() + assert re.match(r"memory usage: [^+]+$", res[-1]) + + # Ensure df size is as expected + # (cols * rows * bytes) + index size + df_size = df.memory_usage().sum() + exp_size = len(dtypes) * n * 8 + df.index.nbytes + assert df_size == exp_size + + # Ensure number of cols in memory_usage is the same as df + size_df = np.size(df.columns.values) + 1 # index=True; default + assert size_df == np.size(df.memory_usage()) + + # assert deep works only on object + assert df.memory_usage().sum() == df.memory_usage(deep=True).sum() + + # test for validity + DataFrame(1, index=["a"], columns=["A"]).memory_usage(index=True) + DataFrame(1, index=["a"], columns=["A"]).index.nbytes + df = DataFrame( + data=1, index=MultiIndex.from_product([["a"], range(1000)]), columns=["A"] + ) + df.index.nbytes + df.memory_usage(index=True) + df.index.values.nbytes + + mem = df.memory_usage(deep=True).sum() + assert mem > 0 + + +@pytest.mark.skipif(PYPY, reason="on PyPy deep=True doesn't change result") +def test_info_memory_usage_deep_not_pypy(): + df_with_object_index = DataFrame({"a": [1]}, index=["foo"]) + assert ( + df_with_object_index.memory_usage(index=True, deep=True).sum() + > df_with_object_index.memory_usage(index=True).sum() + ) + + df_object = DataFrame({"a": ["a"]}) + assert df_object.memory_usage(deep=True).sum() > df_object.memory_usage().sum() + + +@pytest.mark.xfail(not PYPY, reason="on PyPy deep=True does not change result") +def test_info_memory_usage_deep_pypy(): + df_with_object_index = DataFrame({"a": [1]}, index=["foo"]) + assert ( + df_with_object_index.memory_usage(index=True, deep=True).sum() + == df_with_object_index.memory_usage(index=True).sum() + ) + + df_object = DataFrame({"a": ["a"]}) + assert df_object.memory_usage(deep=True).sum() == df_object.memory_usage().sum() + + +@pytest.mark.skipif(PYPY, reason="PyPy getsizeof() fails by design") +def test_usage_via_getsizeof(): + df = DataFrame( + data=1, index=MultiIndex.from_product([["a"], range(1000)]), columns=["A"] + ) + mem = df.memory_usage(deep=True).sum() + # sys.getsizeof will call the .memory_usage with + # deep=True, and add on some GC overhead + diff = mem - sys.getsizeof(df) + assert abs(diff) < 100 + + +def test_info_memory_usage_qualified(): + buf = StringIO() + df = DataFrame(1, columns=list("ab"), index=[1, 2, 3]) + df.info(buf=buf) + assert "+" not in buf.getvalue() + + buf = StringIO() + df = DataFrame(1, columns=list("ab"), index=list("ABC")) + df.info(buf=buf) + assert "+" in buf.getvalue() + + buf = StringIO() + df = DataFrame( + 1, columns=list("ab"), index=MultiIndex.from_product([range(3), range(3)]) + ) + df.info(buf=buf) + assert "+" not in buf.getvalue() + + buf = StringIO() + df = DataFrame( + 1, columns=list("ab"), index=MultiIndex.from_product([range(3), ["foo", "bar"]]) + ) + df.info(buf=buf) + assert "+" in buf.getvalue() + + +def test_info_memory_usage_bug_on_multiindex(): + # GH 14308 + # memory usage introspection should not materialize .values + + def memory_usage(f): + return f.memory_usage(deep=True).sum() + + N = 100 + M = len(ascii_uppercase) + index = MultiIndex.from_product( + [list(ascii_uppercase), date_range("20160101", periods=N)], + names=["id", "date"], + ) + df = DataFrame( + {"value": np.random.default_rng(2).standard_normal(N * M)}, index=index + ) + + unstacked = df.unstack("id") + assert df.values.nbytes == unstacked.values.nbytes + assert memory_usage(df) > memory_usage(unstacked) + + # high upper bound + assert memory_usage(unstacked) - memory_usage(df) < 2000 + + +def test_info_categorical(): + # GH14298 + idx = CategoricalIndex(["a", "b"]) + df = DataFrame(np.zeros((2, 2)), index=idx, columns=idx) + + buf = StringIO() + df.info(buf=buf) + + +@pytest.mark.xfail(not IS64, reason="GH 36579: fail on 32-bit system") +def test_info_int_columns(): + # GH#37245 + df = DataFrame({1: [1, 2], 2: [2, 3]}, index=["A", "B"]) + buf = StringIO() + df.info(show_counts=True, buf=buf) + result = buf.getvalue() + expected = textwrap.dedent( + """\ + + Index: 2 entries, A to B + Data columns (total 2 columns): + # Column Non-Null Count Dtype + --- ------ -------------- ----- + 0 1 2 non-null int64 + 1 2 2 non-null int64 + dtypes: int64(2) + memory usage: 48.0+ bytes + """ + ) + assert result == expected + + +def test_memory_usage_empty_no_warning(): + # GH#50066 + df = DataFrame(index=["a", "b"]) + with tm.assert_produces_warning(None): + result = df.memory_usage() + expected = Series(16 if IS64 else 8, index=["Index"]) + tm.assert_series_equal(result, expected) + + +@pytest.mark.single_cpu +def test_info_compute_numba(): + # GH#51922 + pytest.importorskip("numba") + df = DataFrame([[1, 2], [3, 4]]) + + with option_context("compute.use_numba", True): + buf = StringIO() + df.info(buf=buf) + result = buf.getvalue() + + buf = StringIO() + df.info(buf=buf) + expected = buf.getvalue() + assert result == expected + + +@pytest.mark.parametrize( + "row, columns, show_counts, result", + [ + [20, 20, None, True], + [20, 20, True, True], + [20, 20, False, False], + [5, 5, None, False], + [5, 5, True, False], + [5, 5, False, False], + ], +) +def test_info_show_counts(row, columns, show_counts, result): + # Explicit cast to float to avoid implicit cast when setting nan + df = DataFrame(1, columns=range(10), index=range(10)).astype({1: "float"}) + df.iloc[1, 1] = np.nan + + with option_context( + "display.max_info_rows", row, "display.max_info_columns", columns + ): + with StringIO() as buf: + df.info(buf=buf, show_counts=show_counts) + assert ("non-null" in buf.getvalue()) is result diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/test_interpolate.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/test_interpolate.py new file mode 100644 index 0000000000000000000000000000000000000000..252b950004bea6494b686ba409c5b5563a456a67 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/test_interpolate.py @@ -0,0 +1,548 @@ +import numpy as np +import pytest + +from pandas._config import using_pyarrow_string_dtype + +from pandas.errors import ChainedAssignmentError +import pandas.util._test_decorators as td + +from pandas import ( + DataFrame, + NaT, + Series, + date_range, +) +import pandas._testing as tm + + +class TestDataFrameInterpolate: + def test_interpolate_complex(self): + # GH#53635 + ser = Series([complex("1+1j"), float("nan"), complex("2+2j")]) + assert ser.dtype.kind == "c" + + res = ser.interpolate() + expected = Series([ser[0], ser[0] * 1.5, ser[2]]) + tm.assert_series_equal(res, expected) + + df = ser.to_frame() + res = df.interpolate() + expected = expected.to_frame() + tm.assert_frame_equal(res, expected) + + def test_interpolate_datetimelike_values(self, frame_or_series): + # GH#11312, GH#51005 + orig = Series(date_range("2012-01-01", periods=5)) + ser = orig.copy() + ser[2] = NaT + + res = frame_or_series(ser).interpolate() + expected = frame_or_series(orig) + tm.assert_equal(res, expected) + + # datetime64tz cast + ser_tz = ser.dt.tz_localize("US/Pacific") + res_tz = frame_or_series(ser_tz).interpolate() + expected_tz = frame_or_series(orig.dt.tz_localize("US/Pacific")) + tm.assert_equal(res_tz, expected_tz) + + # timedelta64 cast + ser_td = ser - ser[0] + res_td = frame_or_series(ser_td).interpolate() + expected_td = frame_or_series(orig - orig[0]) + tm.assert_equal(res_td, expected_td) + + def test_interpolate_inplace(self, frame_or_series, using_array_manager, request): + # GH#44749 + if using_array_manager and frame_or_series is DataFrame: + mark = pytest.mark.xfail(reason=".values-based in-place check is invalid") + request.applymarker(mark) + + obj = frame_or_series([1, np.nan, 2]) + orig = obj.values + + obj.interpolate(inplace=True) + expected = frame_or_series([1, 1.5, 2]) + tm.assert_equal(obj, expected) + + # check we operated *actually* inplace + assert np.shares_memory(orig, obj.values) + assert orig.squeeze()[1] == 1.5 + + @pytest.mark.xfail( + using_pyarrow_string_dtype(), reason="interpolate doesn't work for string" + ) + def test_interp_basic(self, using_copy_on_write): + df = DataFrame( + { + "A": [1, 2, np.nan, 4], + "B": [1, 4, 9, np.nan], + "C": [1, 2, 3, 5], + "D": list("abcd"), + } + ) + expected = DataFrame( + { + "A": [1.0, 2.0, 3.0, 4.0], + "B": [1.0, 4.0, 9.0, 9.0], + "C": [1, 2, 3, 5], + "D": list("abcd"), + } + ) + msg = "DataFrame.interpolate with object dtype" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = df.interpolate() + tm.assert_frame_equal(result, expected) + + # check we didn't operate inplace GH#45791 + cvalues = df["C"]._values + dvalues = df["D"].values + if using_copy_on_write: + assert np.shares_memory(cvalues, result["C"]._values) + assert np.shares_memory(dvalues, result["D"]._values) + else: + assert not np.shares_memory(cvalues, result["C"]._values) + assert not np.shares_memory(dvalues, result["D"]._values) + + with tm.assert_produces_warning(FutureWarning, match=msg): + res = df.interpolate(inplace=True) + assert res is None + tm.assert_frame_equal(df, expected) + + # check we DID operate inplace + assert np.shares_memory(df["C"]._values, cvalues) + assert np.shares_memory(df["D"]._values, dvalues) + + @pytest.mark.xfail( + using_pyarrow_string_dtype(), reason="interpolate doesn't work for string" + ) + def test_interp_basic_with_non_range_index(self, using_infer_string): + df = DataFrame( + { + "A": [1, 2, np.nan, 4], + "B": [1, 4, 9, np.nan], + "C": [1, 2, 3, 5], + "D": list("abcd"), + } + ) + + msg = "DataFrame.interpolate with object dtype" + warning = FutureWarning if not using_infer_string else None + with tm.assert_produces_warning(warning, match=msg): + result = df.set_index("C").interpolate() + expected = df.set_index("C") + expected.loc[3, "A"] = 3 + expected.loc[5, "B"] = 9 + tm.assert_frame_equal(result, expected) + + def test_interp_empty(self): + # https://github.com/pandas-dev/pandas/issues/35598 + df = DataFrame() + result = df.interpolate() + assert result is not df + expected = df + tm.assert_frame_equal(result, expected) + + def test_interp_bad_method(self): + df = DataFrame( + { + "A": [1, 2, np.nan, 4], + "B": [1, 4, 9, np.nan], + "C": [1, 2, 3, 5], + } + ) + msg = ( + r"method must be one of \['linear', 'time', 'index', 'values', " + r"'nearest', 'zero', 'slinear', 'quadratic', 'cubic', " + r"'barycentric', 'krogh', 'spline', 'polynomial', " + r"'from_derivatives', 'piecewise_polynomial', 'pchip', 'akima', " + r"'cubicspline'\]. Got 'not_a_method' instead." + ) + with pytest.raises(ValueError, match=msg): + df.interpolate(method="not_a_method") + + def test_interp_combo(self): + df = DataFrame( + { + "A": [1.0, 2.0, np.nan, 4.0], + "B": [1, 4, 9, np.nan], + "C": [1, 2, 3, 5], + "D": list("abcd"), + } + ) + + result = df["A"].interpolate() + expected = Series([1.0, 2.0, 3.0, 4.0], name="A") + tm.assert_series_equal(result, expected) + + msg = "The 'downcast' keyword in Series.interpolate is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = df["A"].interpolate(downcast="infer") + expected = Series([1, 2, 3, 4], name="A") + tm.assert_series_equal(result, expected) + + def test_inerpolate_invalid_downcast(self): + # GH#53103 + df = DataFrame( + { + "A": [1.0, 2.0, np.nan, 4.0], + "B": [1, 4, 9, np.nan], + "C": [1, 2, 3, 5], + "D": list("abcd"), + } + ) + + msg = "downcast must be either None or 'infer'" + msg2 = "The 'downcast' keyword in DataFrame.interpolate is deprecated" + msg3 = "The 'downcast' keyword in Series.interpolate is deprecated" + with pytest.raises(ValueError, match=msg): + with tm.assert_produces_warning(FutureWarning, match=msg2): + df.interpolate(downcast="int64") + with pytest.raises(ValueError, match=msg): + with tm.assert_produces_warning(FutureWarning, match=msg3): + df["A"].interpolate(downcast="int64") + + def test_interp_nan_idx(self): + df = DataFrame({"A": [1, 2, np.nan, 4], "B": [np.nan, 2, 3, 4]}) + df = df.set_index("A") + msg = ( + "Interpolation with NaNs in the index has not been implemented. " + "Try filling those NaNs before interpolating." + ) + with pytest.raises(NotImplementedError, match=msg): + df.interpolate(method="values") + + def test_interp_various(self): + pytest.importorskip("scipy") + df = DataFrame( + {"A": [1, 2, np.nan, 4, 5, np.nan, 7], "C": [1, 2, 3, 5, 8, 13, 21]} + ) + df = df.set_index("C") + expected = df.copy() + result = df.interpolate(method="polynomial", order=1) + + expected.loc[3, "A"] = 2.66666667 + expected.loc[13, "A"] = 5.76923076 + tm.assert_frame_equal(result, expected) + + result = df.interpolate(method="cubic") + # GH #15662. + expected.loc[3, "A"] = 2.81547781 + expected.loc[13, "A"] = 5.52964175 + tm.assert_frame_equal(result, expected) + + result = df.interpolate(method="nearest") + expected.loc[3, "A"] = 2 + expected.loc[13, "A"] = 5 + tm.assert_frame_equal(result, expected, check_dtype=False) + + result = df.interpolate(method="quadratic") + expected.loc[3, "A"] = 2.82150771 + expected.loc[13, "A"] = 6.12648668 + tm.assert_frame_equal(result, expected) + + result = df.interpolate(method="slinear") + expected.loc[3, "A"] = 2.66666667 + expected.loc[13, "A"] = 5.76923077 + tm.assert_frame_equal(result, expected) + + result = df.interpolate(method="zero") + expected.loc[3, "A"] = 2.0 + expected.loc[13, "A"] = 5 + tm.assert_frame_equal(result, expected, check_dtype=False) + + def test_interp_alt_scipy(self): + pytest.importorskip("scipy") + df = DataFrame( + {"A": [1, 2, np.nan, 4, 5, np.nan, 7], "C": [1, 2, 3, 5, 8, 13, 21]} + ) + result = df.interpolate(method="barycentric") + expected = df.copy() + expected.loc[2, "A"] = 3 + expected.loc[5, "A"] = 6 + tm.assert_frame_equal(result, expected) + + msg = "The 'downcast' keyword in DataFrame.interpolate is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = df.interpolate(method="barycentric", downcast="infer") + tm.assert_frame_equal(result, expected.astype(np.int64)) + + result = df.interpolate(method="krogh") + expectedk = df.copy() + expectedk["A"] = expected["A"] + tm.assert_frame_equal(result, expectedk) + + result = df.interpolate(method="pchip") + expected.loc[2, "A"] = 3 + expected.loc[5, "A"] = 6.0 + + tm.assert_frame_equal(result, expected) + + def test_interp_rowwise(self): + df = DataFrame( + { + 0: [1, 2, np.nan, 4], + 1: [2, 3, 4, np.nan], + 2: [np.nan, 4, 5, 6], + 3: [4, np.nan, 6, 7], + 4: [1, 2, 3, 4], + } + ) + result = df.interpolate(axis=1) + expected = df.copy() + expected.loc[3, 1] = 5 + expected.loc[0, 2] = 3 + expected.loc[1, 3] = 3 + expected[4] = expected[4].astype(np.float64) + tm.assert_frame_equal(result, expected) + + result = df.interpolate(axis=1, method="values") + tm.assert_frame_equal(result, expected) + + result = df.interpolate(axis=0) + expected = df.interpolate() + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize( + "axis_name, axis_number", + [ + pytest.param("rows", 0, id="rows_0"), + pytest.param("index", 0, id="index_0"), + pytest.param("columns", 1, id="columns_1"), + ], + ) + def test_interp_axis_names(self, axis_name, axis_number): + # GH 29132: test axis names + data = {0: [0, np.nan, 6], 1: [1, np.nan, 7], 2: [2, 5, 8]} + + df = DataFrame(data, dtype=np.float64) + result = df.interpolate(axis=axis_name, method="linear") + expected = df.interpolate(axis=axis_number, method="linear") + tm.assert_frame_equal(result, expected) + + def test_rowwise_alt(self): + df = DataFrame( + { + 0: [0, 0.5, 1.0, np.nan, 4, 8, np.nan, np.nan, 64], + 1: [1, 2, 3, 4, 3, 2, 1, 0, -1], + } + ) + df.interpolate(axis=0) + # TODO: assert something? + + @pytest.mark.parametrize( + "check_scipy", [False, pytest.param(True, marks=td.skip_if_no("scipy"))] + ) + def test_interp_leading_nans(self, check_scipy): + df = DataFrame( + {"A": [np.nan, np.nan, 0.5, 0.25, 0], "B": [np.nan, -3, -3.5, np.nan, -4]} + ) + result = df.interpolate() + expected = df.copy() + expected.loc[3, "B"] = -3.75 + tm.assert_frame_equal(result, expected) + + if check_scipy: + result = df.interpolate(method="polynomial", order=1) + tm.assert_frame_equal(result, expected) + + def test_interp_raise_on_only_mixed(self, axis): + df = DataFrame( + { + "A": [1, 2, np.nan, 4], + "B": ["a", "b", "c", "d"], + "C": [np.nan, 2, 5, 7], + "D": [np.nan, np.nan, 9, 9], + "E": [1, 2, 3, 4], + } + ) + msg = ( + "Cannot interpolate with all object-dtype columns " + "in the DataFrame. Try setting at least one " + "column to a numeric dtype." + ) + with pytest.raises(TypeError, match=msg): + df.astype("object").interpolate(axis=axis) + + def test_interp_raise_on_all_object_dtype(self): + # GH 22985 + df = DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]}, dtype="object") + msg = ( + "Cannot interpolate with all object-dtype columns " + "in the DataFrame. Try setting at least one " + "column to a numeric dtype." + ) + with pytest.raises(TypeError, match=msg): + df.interpolate() + + def test_interp_inplace(self, using_copy_on_write): + df = DataFrame({"a": [1.0, 2.0, np.nan, 4.0]}) + expected = DataFrame({"a": [1.0, 2.0, 3.0, 4.0]}) + expected_cow = df.copy() + result = df.copy() + + if using_copy_on_write: + with tm.raises_chained_assignment_error(): + return_value = result["a"].interpolate(inplace=True) + assert return_value is None + tm.assert_frame_equal(result, expected_cow) + else: + with tm.assert_produces_warning(FutureWarning, match="inplace method"): + return_value = result["a"].interpolate(inplace=True) + assert return_value is None + tm.assert_frame_equal(result, expected) + + result = df.copy() + msg = "The 'downcast' keyword in Series.interpolate is deprecated" + + if using_copy_on_write: + with tm.assert_produces_warning( + (FutureWarning, ChainedAssignmentError), match=msg + ): + return_value = result["a"].interpolate(inplace=True, downcast="infer") + assert return_value is None + tm.assert_frame_equal(result, expected_cow) + else: + with tm.assert_produces_warning(FutureWarning, match=msg): + return_value = result["a"].interpolate(inplace=True, downcast="infer") + assert return_value is None + tm.assert_frame_equal(result, expected.astype("int64")) + + def test_interp_inplace_row(self): + # GH 10395 + result = DataFrame( + {"a": [1.0, 2.0, 3.0, 4.0], "b": [np.nan, 2.0, 3.0, 4.0], "c": [3, 2, 2, 2]} + ) + expected = result.interpolate(method="linear", axis=1, inplace=False) + return_value = result.interpolate(method="linear", axis=1, inplace=True) + assert return_value is None + tm.assert_frame_equal(result, expected) + + def test_interp_ignore_all_good(self): + # GH + df = DataFrame( + { + "A": [1, 2, np.nan, 4], + "B": [1, 2, 3, 4], + "C": [1.0, 2.0, np.nan, 4.0], + "D": [1.0, 2.0, 3.0, 4.0], + } + ) + expected = DataFrame( + { + "A": np.array([1, 2, 3, 4], dtype="float64"), + "B": np.array([1, 2, 3, 4], dtype="int64"), + "C": np.array([1.0, 2.0, 3, 4.0], dtype="float64"), + "D": np.array([1.0, 2.0, 3.0, 4.0], dtype="float64"), + } + ) + + msg = "The 'downcast' keyword in DataFrame.interpolate is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = df.interpolate(downcast=None) + tm.assert_frame_equal(result, expected) + + # all good + with tm.assert_produces_warning(FutureWarning, match=msg): + result = df[["B", "D"]].interpolate(downcast=None) + tm.assert_frame_equal(result, df[["B", "D"]]) + + def test_interp_time_inplace_axis(self): + # GH 9687 + periods = 5 + idx = date_range(start="2014-01-01", periods=periods) + data = np.random.default_rng(2).random((periods, periods)) + data[data < 0.5] = np.nan + expected = DataFrame(index=idx, columns=idx, data=data) + + result = expected.interpolate(axis=0, method="time") + return_value = expected.interpolate(axis=0, method="time", inplace=True) + assert return_value is None + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize("axis_name, axis_number", [("index", 0), ("columns", 1)]) + def test_interp_string_axis(self, axis_name, axis_number): + # https://github.com/pandas-dev/pandas/issues/25190 + x = np.linspace(0, 100, 1000) + y = np.sin(x) + df = DataFrame( + data=np.tile(y, (10, 1)), index=np.arange(10), columns=x + ).reindex(columns=x * 1.005) + result = df.interpolate(method="linear", axis=axis_name) + expected = df.interpolate(method="linear", axis=axis_number) + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize("multiblock", [True, False]) + @pytest.mark.parametrize("method", ["ffill", "bfill", "pad"]) + def test_interp_fillna_methods( + self, request, axis, multiblock, method, using_array_manager + ): + # GH 12918 + if using_array_manager and axis in (1, "columns"): + # TODO(ArrayManager) support axis=1 + td.mark_array_manager_not_yet_implemented(request) + + df = DataFrame( + { + "A": [1.0, 2.0, 3.0, 4.0, np.nan, 5.0], + "B": [2.0, 4.0, 6.0, np.nan, 8.0, 10.0], + "C": [3.0, 6.0, 9.0, np.nan, np.nan, 30.0], + } + ) + if multiblock: + df["D"] = np.nan + df["E"] = 1.0 + + method2 = method if method != "pad" else "ffill" + expected = getattr(df, method2)(axis=axis) + msg = f"DataFrame.interpolate with method={method} is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = df.interpolate(method=method, axis=axis) + tm.assert_frame_equal(result, expected) + + def test_interpolate_empty_df(self): + # GH#53199 + df = DataFrame() + expected = df.copy() + result = df.interpolate(inplace=True) + assert result is None + tm.assert_frame_equal(df, expected) + + def test_interpolate_ea(self, any_int_ea_dtype): + # GH#55347 + df = DataFrame({"a": [1, None, None, None, 3]}, dtype=any_int_ea_dtype) + orig = df.copy() + result = df.interpolate(limit=2) + expected = DataFrame({"a": [1, 1.5, 2.0, None, 3]}, dtype="Float64") + tm.assert_frame_equal(result, expected) + tm.assert_frame_equal(df, orig) + + @pytest.mark.parametrize( + "dtype", + [ + "Float64", + "Float32", + pytest.param("float32[pyarrow]", marks=td.skip_if_no("pyarrow")), + pytest.param("float64[pyarrow]", marks=td.skip_if_no("pyarrow")), + ], + ) + def test_interpolate_ea_float(self, dtype): + # GH#55347 + df = DataFrame({"a": [1, None, None, None, 3]}, dtype=dtype) + orig = df.copy() + result = df.interpolate(limit=2) + expected = DataFrame({"a": [1, 1.5, 2.0, None, 3]}, dtype=dtype) + tm.assert_frame_equal(result, expected) + tm.assert_frame_equal(df, orig) + + @pytest.mark.parametrize( + "dtype", + ["int64", "uint64", "int32", "int16", "int8", "uint32", "uint16", "uint8"], + ) + def test_interpolate_arrow(self, dtype): + # GH#55347 + pytest.importorskip("pyarrow") + df = DataFrame({"a": [1, None, None, None, 3]}, dtype=dtype + "[pyarrow]") + result = df.interpolate(limit=2) + expected = DataFrame({"a": [1, 1.5, 2.0, None, 3]}, dtype="float64[pyarrow]") + tm.assert_frame_equal(result, expected) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/test_is_homogeneous_dtype.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/test_is_homogeneous_dtype.py new file mode 100644 index 0000000000000000000000000000000000000000..1fe28cb8eb8562d116ed3306a7576a06c9c50450 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/test_is_homogeneous_dtype.py @@ -0,0 +1,58 @@ +import numpy as np +import pytest + +import pandas.util._test_decorators as td + +from pandas import ( + Categorical, + DataFrame, +) + +# _is_homogeneous_type always returns True for ArrayManager +pytestmark = td.skip_array_manager_invalid_test + + +@pytest.mark.parametrize( + "data, expected", + [ + # empty + (DataFrame(), True), + # multi-same + (DataFrame({"A": [1, 2], "B": [1, 2]}), True), + # multi-object + ( + DataFrame( + { + "A": np.array([1, 2], dtype=object), + "B": np.array(["a", "b"], dtype=object), + }, + dtype="object", + ), + True, + ), + # multi-extension + ( + DataFrame({"A": Categorical(["a", "b"]), "B": Categorical(["a", "b"])}), + True, + ), + # differ types + (DataFrame({"A": [1, 2], "B": [1.0, 2.0]}), False), + # differ sizes + ( + DataFrame( + { + "A": np.array([1, 2], dtype=np.int32), + "B": np.array([1, 2], dtype=np.int64), + } + ), + False, + ), + # multi-extension differ + ( + DataFrame({"A": Categorical(["a", "b"]), "B": Categorical(["b", "c"])}), + False, + ), + ], +) +def test_is_homogeneous_type(data, expected): + assert data._is_homogeneous_type is expected diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/test_isetitem.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/test_isetitem.py new file mode 100644 index 0000000000000000000000000000000000000000..69f394afb65191fe4cc52519fbc52959d2e1dd76 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/test_isetitem.py @@ -0,0 +1,50 @@ +import pytest + +from pandas import ( + DataFrame, + Series, +) +import pandas._testing as tm + + +class TestDataFrameSetItem: + def test_isetitem_ea_df(self): + # GH#49922 + df = DataFrame([[1, 2, 3], [4, 5, 6]]) + rhs = DataFrame([[11, 12], [13, 14]], dtype="Int64") + + df.isetitem([0, 1], rhs) + expected = DataFrame( + { + 0: Series([11, 13], dtype="Int64"), + 1: Series([12, 14], dtype="Int64"), + 2: [3, 6], + } + ) + tm.assert_frame_equal(df, expected) + + def test_isetitem_ea_df_scalar_indexer(self): + # GH#49922 + df = DataFrame([[1, 2, 3], [4, 5, 6]]) + rhs = DataFrame([[11], [13]], dtype="Int64") + + df.isetitem(2, rhs) + expected = DataFrame( + { + 0: [1, 4], + 1: [2, 5], + 2: Series([11, 13], dtype="Int64"), + } + ) + tm.assert_frame_equal(df, expected) + + def test_isetitem_dimension_mismatch(self): + # GH#51701 + df = DataFrame({"a": [1, 2], "b": [3, 4], "c": [5, 6]}) + value = df.copy() + with pytest.raises(ValueError, match="Got 2 positions but value has 3 columns"): + df.isetitem([1, 2], value) + + value = df.copy() + with pytest.raises(ValueError, match="Got 2 positions but value has 1 columns"): + df.isetitem([1, 2], value[["a"]]) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/test_isin.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/test_isin.py new file mode 100644 index 0000000000000000000000000000000000000000..b4511aad27a93bd2d9411ac5cdb427196dbf9dda --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/test_isin.py @@ -0,0 +1,227 @@ +import numpy as np +import pytest + +import pandas as pd +from pandas import ( + DataFrame, + MultiIndex, + Series, +) +import pandas._testing as tm + + +class TestDataFrameIsIn: + def test_isin(self): + # GH#4211 + df = DataFrame( + { + "vals": [1, 2, 3, 4], + "ids": ["a", "b", "f", "n"], + "ids2": ["a", "n", "c", "n"], + }, + index=["foo", "bar", "baz", "qux"], + ) + other = ["a", "b", "c"] + + result = df.isin(other) + expected = DataFrame([df.loc[s].isin(other) for s in df.index]) + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize("empty", [[], Series(dtype=object), np.array([])]) + def test_isin_empty(self, empty): + # GH#16991 + df = DataFrame({"A": ["a", "b", "c"], "B": ["a", "e", "f"]}) + expected = DataFrame(False, df.index, df.columns) + + result = df.isin(empty) + tm.assert_frame_equal(result, expected) + + def test_isin_dict(self): + df = DataFrame({"A": ["a", "b", "c"], "B": ["a", "e", "f"]}) + d = {"A": ["a"]} + + expected = DataFrame(False, df.index, df.columns) + expected.loc[0, "A"] = True + + result = df.isin(d) + tm.assert_frame_equal(result, expected) + + # non unique columns + df = DataFrame({"A": ["a", "b", "c"], "B": ["a", "e", "f"]}) + df.columns = ["A", "A"] + expected = DataFrame(False, df.index, df.columns) + expected.loc[0, "A"] = True + result = df.isin(d) + tm.assert_frame_equal(result, expected) + + def test_isin_with_string_scalar(self): + # GH#4763 + df = DataFrame( + { + "vals": [1, 2, 3, 4], + "ids": ["a", "b", "f", "n"], + "ids2": ["a", "n", "c", "n"], + }, + index=["foo", "bar", "baz", "qux"], + ) + msg = ( + r"only list-like or dict-like objects are allowed " + r"to be passed to DataFrame.isin\(\), you passed a 'str'" + ) + with pytest.raises(TypeError, match=msg): + df.isin("a") + + with pytest.raises(TypeError, match=msg): + df.isin("aaa") + + def test_isin_df(self): + df1 = DataFrame({"A": [1, 2, 3, 4], "B": [2, np.nan, 4, 4]}) + df2 = DataFrame({"A": [0, 2, 12, 4], "B": [2, np.nan, 4, 5]}) + expected = DataFrame(False, df1.index, df1.columns) + result = df1.isin(df2) + expected.loc[[1, 3], "A"] = True + expected.loc[[0, 2], "B"] = True + tm.assert_frame_equal(result, expected) + + # partial overlapping columns + df2.columns = ["A", "C"] + result = df1.isin(df2) + expected["B"] = False + tm.assert_frame_equal(result, expected) + + def test_isin_tuples(self): + # GH#16394 + df = DataFrame({"A": [1, 2, 3], "B": ["a", "b", "f"]}) + df["C"] = list(zip(df["A"], df["B"])) + result = df["C"].isin([(1, "a")]) + tm.assert_series_equal(result, Series([True, False, False], name="C")) + + def test_isin_df_dupe_values(self): + df1 = DataFrame({"A": [1, 2, 3, 4], "B": [2, np.nan, 4, 4]}) + # just cols duped + df2 = DataFrame([[0, 2], [12, 4], [2, np.nan], [4, 5]], columns=["B", "B"]) + msg = r"cannot compute isin with a duplicate axis\." + with pytest.raises(ValueError, match=msg): + df1.isin(df2) + + # just index duped + df2 = DataFrame( + [[0, 2], [12, 4], [2, np.nan], [4, 5]], + columns=["A", "B"], + index=[0, 0, 1, 1], + ) + with pytest.raises(ValueError, match=msg): + df1.isin(df2) + + # cols and index: + df2.columns = ["B", "B"] + with pytest.raises(ValueError, match=msg): + df1.isin(df2) + + def test_isin_dupe_self(self): + other = DataFrame({"A": [1, 0, 1, 0], "B": [1, 1, 0, 0]}) + df = DataFrame([[1, 1], [1, 0], [0, 0]], columns=["A", "A"]) + result = df.isin(other) + expected = DataFrame(False, index=df.index, columns=df.columns) + expected.loc[0] = True + expected.iloc[1, 1] = True + tm.assert_frame_equal(result, expected) + + def test_isin_against_series(self): + df = DataFrame( + {"A": [1, 2, 3, 4], "B": [2, np.nan, 4, 4]}, index=["a", "b", "c", "d"] + ) + s = Series([1, 3, 11, 4], index=["a", "b", "c", "d"]) + expected = DataFrame(False, index=df.index, columns=df.columns) + expected.loc["a", "A"] = True + expected.loc["d"] = True + result = df.isin(s) + tm.assert_frame_equal(result, expected) + + def test_isin_multiIndex(self): + idx = MultiIndex.from_tuples( + [ + (0, "a", "foo"), + (0, "a", "bar"), + (0, "b", "bar"), + (0, "b", "baz"), + (2, "a", "foo"), + (2, "a", "bar"), + (2, "c", "bar"), + (2, "c", "baz"), + (1, "b", "foo"), + (1, "b", "bar"), + (1, "c", "bar"), + (1, "c", "baz"), + ] + ) + df1 = DataFrame({"A": np.ones(12), "B": np.zeros(12)}, index=idx) + df2 = DataFrame( + { + "A": [1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1], + "B": [1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1], + } + ) + # against regular index + expected = DataFrame(False, index=df1.index, columns=df1.columns) + result = df1.isin(df2) + tm.assert_frame_equal(result, expected) + + df2.index = idx + expected = df2.values.astype(bool) + expected[:, 1] = ~expected[:, 1] + expected = DataFrame(expected, columns=["A", "B"], index=idx) + + result = df1.isin(df2) + tm.assert_frame_equal(result, expected) + + def test_isin_empty_datetimelike(self): + # GH#15473 + df1_ts = DataFrame({"date": pd.to_datetime(["2014-01-01", "2014-01-02"])}) + df1_td = DataFrame({"date": [pd.Timedelta(1, "s"), pd.Timedelta(2, "s")]}) + df2 = DataFrame({"date": []}) + df3 = DataFrame() + + expected = DataFrame({"date": [False, False]}) + + result = df1_ts.isin(df2) + tm.assert_frame_equal(result, expected) + result = df1_ts.isin(df3) + tm.assert_frame_equal(result, expected) + + result = df1_td.isin(df2) + tm.assert_frame_equal(result, expected) + result = df1_td.isin(df3) + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize( + "values", + [ + DataFrame({"a": [1, 2, 3]}, dtype="category"), + Series([1, 2, 3], dtype="category"), + ], + ) + def test_isin_category_frame(self, values): + # GH#34256 + df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) + expected = DataFrame({"a": [True, True, True], "b": [False, False, False]}) + + result = df.isin(values) + tm.assert_frame_equal(result, expected) + + def test_isin_read_only(self): + # https://github.com/pandas-dev/pandas/issues/37174 + arr = np.array([1, 2, 3]) + arr.setflags(write=False) + df = DataFrame([1, 2, 3]) + result = df.isin(arr) + expected = DataFrame([True, True, True]) + tm.assert_frame_equal(result, expected) + + def test_isin_not_lossy(self): + # GH 53514 + val = 1666880195890293744 + df = DataFrame({"a": [val], "b": [1.0]}) + result = df.isin([val]) + expected = DataFrame({"a": [True], "b": [False]}) + tm.assert_frame_equal(result, expected) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/test_iterrows.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/test_iterrows.py new file mode 100644 index 0000000000000000000000000000000000000000..0bd0bed76dc9dea5df4d0afb76ebaf0760a23ecc --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/test_iterrows.py @@ -0,0 +1,16 @@ +from pandas import ( + DataFrame, + Timedelta, +) + + +def test_no_overflow_of_freq_and_time_in_dataframe(): + # GH 35665 + df = DataFrame( + { + "some_string": ["2222Y3"], + "time": [Timedelta("0 days 00:00:00.990000")], + } + ) + for _, row in df.iterrows(): + assert row.dtype == "object" diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/test_join.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/test_join.py new file mode 100644 index 0000000000000000000000000000000000000000..735f6c50ab739dba04ebc49fec73cfa3147fc661 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/test_join.py @@ -0,0 +1,576 @@ +from datetime import datetime + +import numpy as np +import pytest + +from pandas.errors import MergeError + +import pandas as pd +from pandas import ( + DataFrame, + Index, + MultiIndex, + date_range, + period_range, +) +import pandas._testing as tm +from pandas.core.reshape.concat import concat + + +@pytest.fixture +def frame_with_period_index(): + return DataFrame( + data=np.arange(20).reshape(4, 5), + columns=list("abcde"), + index=period_range(start="2000", freq="Y", periods=4), + ) + + +@pytest.fixture +def left(): + return DataFrame({"a": [20, 10, 0]}, index=[2, 1, 0]) + + +@pytest.fixture +def right(): + return DataFrame({"b": [300, 100, 200]}, index=[3, 1, 2]) + + +@pytest.fixture +def left_no_dup(): + return DataFrame( + {"a": ["a", "b", "c", "d"], "b": ["cat", "dog", "weasel", "horse"]}, + index=range(4), + ) + + +@pytest.fixture +def right_no_dup(): + return DataFrame( + { + "a": ["a", "b", "c", "d", "e"], + "c": ["meow", "bark", "um... weasel noise?", "nay", "chirp"], + }, + index=range(5), + ).set_index("a") + + +@pytest.fixture +def left_w_dups(left_no_dup): + return concat( + [left_no_dup, DataFrame({"a": ["a"], "b": ["cow"]}, index=[3])], sort=True + ) + + +@pytest.fixture +def right_w_dups(right_no_dup): + return concat( + [right_no_dup, DataFrame({"a": ["e"], "c": ["moo"]}, index=[3])] + ).set_index("a") + + +@pytest.mark.parametrize( + "how, sort, expected", + [ + ("inner", False, DataFrame({"a": [20, 10], "b": [200, 100]}, index=[2, 1])), + ("inner", True, DataFrame({"a": [10, 20], "b": [100, 200]}, index=[1, 2])), + ( + "left", + False, + DataFrame({"a": [20, 10, 0], "b": [200, 100, np.nan]}, index=[2, 1, 0]), + ), + ( + "left", + True, + DataFrame({"a": [0, 10, 20], "b": [np.nan, 100, 200]}, index=[0, 1, 2]), + ), + ( + "right", + False, + DataFrame({"a": [np.nan, 10, 20], "b": [300, 100, 200]}, index=[3, 1, 2]), + ), + ( + "right", + True, + DataFrame({"a": [10, 20, np.nan], "b": [100, 200, 300]}, index=[1, 2, 3]), + ), + ( + "outer", + False, + DataFrame( + {"a": [0, 10, 20, np.nan], "b": [np.nan, 100, 200, 300]}, + index=[0, 1, 2, 3], + ), + ), + ( + "outer", + True, + DataFrame( + {"a": [0, 10, 20, np.nan], "b": [np.nan, 100, 200, 300]}, + index=[0, 1, 2, 3], + ), + ), + ], +) +def test_join(left, right, how, sort, expected): + result = left.join(right, how=how, sort=sort, validate="1:1") + tm.assert_frame_equal(result, expected) + + +def test_suffix_on_list_join(): + first = DataFrame({"key": [1, 2, 3, 4, 5]}) + second = DataFrame({"key": [1, 8, 3, 2, 5], "v1": [1, 2, 3, 4, 5]}) + third = DataFrame({"keys": [5, 2, 3, 4, 1], "v2": [1, 2, 3, 4, 5]}) + + # check proper errors are raised + msg = "Suffixes not supported when joining multiple DataFrames" + with pytest.raises(ValueError, match=msg): + first.join([second], lsuffix="y") + with pytest.raises(ValueError, match=msg): + first.join([second, third], rsuffix="x") + with pytest.raises(ValueError, match=msg): + first.join([second, third], lsuffix="y", rsuffix="x") + with pytest.raises(ValueError, match="Indexes have overlapping values"): + first.join([second, third]) + + # no errors should be raised + arr_joined = first.join([third]) + norm_joined = first.join(third) + tm.assert_frame_equal(arr_joined, norm_joined) + + +def test_join_invalid_validate(left_no_dup, right_no_dup): + # GH 46622 + # Check invalid arguments + msg = ( + '"invalid" is not a valid argument. ' + "Valid arguments are:\n" + '- "1:1"\n' + '- "1:m"\n' + '- "m:1"\n' + '- "m:m"\n' + '- "one_to_one"\n' + '- "one_to_many"\n' + '- "many_to_one"\n' + '- "many_to_many"' + ) + with pytest.raises(ValueError, match=msg): + left_no_dup.merge(right_no_dup, on="a", validate="invalid") + + +@pytest.mark.parametrize("dtype", ["object", "string[pyarrow]"]) +def test_join_on_single_col_dup_on_right(left_no_dup, right_w_dups, dtype): + # GH 46622 + # Dups on right allowed by one_to_many constraint + if dtype == "string[pyarrow]": + pytest.importorskip("pyarrow") + left_no_dup = left_no_dup.astype(dtype) + right_w_dups.index = right_w_dups.index.astype(dtype) + left_no_dup.join( + right_w_dups, + on="a", + validate="one_to_many", + ) + + # Dups on right not allowed by one_to_one constraint + msg = "Merge keys are not unique in right dataset; not a one-to-one merge" + with pytest.raises(MergeError, match=msg): + left_no_dup.join( + right_w_dups, + on="a", + validate="one_to_one", + ) + + +def test_join_on_single_col_dup_on_left(left_w_dups, right_no_dup): + # GH 46622 + # Dups on left allowed by many_to_one constraint + left_w_dups.join( + right_no_dup, + on="a", + validate="many_to_one", + ) + + # Dups on left not allowed by one_to_one constraint + msg = "Merge keys are not unique in left dataset; not a one-to-one merge" + with pytest.raises(MergeError, match=msg): + left_w_dups.join( + right_no_dup, + on="a", + validate="one_to_one", + ) + + +def test_join_on_single_col_dup_on_both(left_w_dups, right_w_dups): + # GH 46622 + # Dups on both allowed by many_to_many constraint + left_w_dups.join(right_w_dups, on="a", validate="many_to_many") + + # Dups on both not allowed by many_to_one constraint + msg = "Merge keys are not unique in right dataset; not a many-to-one merge" + with pytest.raises(MergeError, match=msg): + left_w_dups.join( + right_w_dups, + on="a", + validate="many_to_one", + ) + + # Dups on both not allowed by one_to_many constraint + msg = "Merge keys are not unique in left dataset; not a one-to-many merge" + with pytest.raises(MergeError, match=msg): + left_w_dups.join( + right_w_dups, + on="a", + validate="one_to_many", + ) + + +def test_join_on_multi_col_check_dup(): + # GH 46622 + # Two column join, dups in both, but jointly no dups + left = DataFrame( + { + "a": ["a", "a", "b", "b"], + "b": [0, 1, 0, 1], + "c": ["cat", "dog", "weasel", "horse"], + }, + index=range(4), + ).set_index(["a", "b"]) + + right = DataFrame( + { + "a": ["a", "a", "b"], + "b": [0, 1, 0], + "d": ["meow", "bark", "um... weasel noise?"], + }, + index=range(3), + ).set_index(["a", "b"]) + + expected_multi = DataFrame( + { + "a": ["a", "a", "b"], + "b": [0, 1, 0], + "c": ["cat", "dog", "weasel"], + "d": ["meow", "bark", "um... weasel noise?"], + }, + index=range(3), + ).set_index(["a", "b"]) + + # Jointly no dups allowed by one_to_one constraint + result = left.join(right, how="inner", validate="1:1") + tm.assert_frame_equal(result, expected_multi) + + +def test_join_index(float_frame): + # left / right + + f = float_frame.loc[float_frame.index[:10], ["A", "B"]] + f2 = float_frame.loc[float_frame.index[5:], ["C", "D"]].iloc[::-1] + + joined = f.join(f2) + tm.assert_index_equal(f.index, joined.index) + expected_columns = Index(["A", "B", "C", "D"]) + tm.assert_index_equal(joined.columns, expected_columns) + + joined = f.join(f2, how="left") + tm.assert_index_equal(joined.index, f.index) + tm.assert_index_equal(joined.columns, expected_columns) + + joined = f.join(f2, how="right") + tm.assert_index_equal(joined.index, f2.index) + tm.assert_index_equal(joined.columns, expected_columns) + + # inner + + joined = f.join(f2, how="inner") + tm.assert_index_equal(joined.index, f.index[5:10]) + tm.assert_index_equal(joined.columns, expected_columns) + + # outer + + joined = f.join(f2, how="outer") + tm.assert_index_equal(joined.index, float_frame.index.sort_values()) + tm.assert_index_equal(joined.columns, expected_columns) + + with pytest.raises(ValueError, match="join method"): + f.join(f2, how="foo") + + # corner case - overlapping columns + msg = "columns overlap but no suffix" + for how in ("outer", "left", "inner"): + with pytest.raises(ValueError, match=msg): + float_frame.join(float_frame, how=how) + + +def test_join_index_more(float_frame): + af = float_frame.loc[:, ["A", "B"]] + bf = float_frame.loc[::2, ["C", "D"]] + + expected = af.copy() + expected["C"] = float_frame["C"][::2] + expected["D"] = float_frame["D"][::2] + + result = af.join(bf) + tm.assert_frame_equal(result, expected) + + result = af.join(bf, how="right") + tm.assert_frame_equal(result, expected[::2]) + + result = bf.join(af, how="right") + tm.assert_frame_equal(result, expected.loc[:, result.columns]) + + +def test_join_index_series(float_frame): + df = float_frame.copy() + ser = df.pop(float_frame.columns[-1]) + joined = df.join(ser) + + tm.assert_frame_equal(joined, float_frame) + + ser.name = None + with pytest.raises(ValueError, match="must have a name"): + df.join(ser) + + +def test_join_overlap(float_frame): + df1 = float_frame.loc[:, ["A", "B", "C"]] + df2 = float_frame.loc[:, ["B", "C", "D"]] + + joined = df1.join(df2, lsuffix="_df1", rsuffix="_df2") + df1_suf = df1.loc[:, ["B", "C"]].add_suffix("_df1") + df2_suf = df2.loc[:, ["B", "C"]].add_suffix("_df2") + + no_overlap = float_frame.loc[:, ["A", "D"]] + expected = df1_suf.join(df2_suf).join(no_overlap) + + # column order not necessarily sorted + tm.assert_frame_equal(joined, expected.loc[:, joined.columns]) + + +def test_join_period_index(frame_with_period_index): + other = frame_with_period_index.rename(columns=lambda key: f"{key}{key}") + + joined_values = np.concatenate([frame_with_period_index.values] * 2, axis=1) + + joined_cols = frame_with_period_index.columns.append(other.columns) + + joined = frame_with_period_index.join(other) + expected = DataFrame( + data=joined_values, columns=joined_cols, index=frame_with_period_index.index + ) + + tm.assert_frame_equal(joined, expected) + + +def test_join_left_sequence_non_unique_index(): + # https://github.com/pandas-dev/pandas/issues/19607 + df1 = DataFrame({"a": [0, 10, 20]}, index=[1, 2, 3]) + df2 = DataFrame({"b": [100, 200, 300]}, index=[4, 3, 2]) + df3 = DataFrame({"c": [400, 500, 600]}, index=[2, 2, 4]) + + joined = df1.join([df2, df3], how="left") + + expected = DataFrame( + { + "a": [0, 10, 10, 20], + "b": [np.nan, 300, 300, 200], + "c": [np.nan, 400, 500, np.nan], + }, + index=[1, 2, 2, 3], + ) + + tm.assert_frame_equal(joined, expected) + + +def test_join_list_series(float_frame): + # GH#46850 + # Join a DataFrame with a list containing both a Series and a DataFrame + left = float_frame.A.to_frame() + right = [float_frame.B, float_frame[["C", "D"]]] + result = left.join(right) + tm.assert_frame_equal(result, float_frame) + + +@pytest.mark.parametrize("sort_kw", [True, False]) +def test_suppress_future_warning_with_sort_kw(sort_kw): + a = DataFrame({"col1": [1, 2]}, index=["c", "a"]) + + b = DataFrame({"col2": [4, 5]}, index=["b", "a"]) + + c = DataFrame({"col3": [7, 8]}, index=["a", "b"]) + + expected = DataFrame( + { + "col1": {"a": 2.0, "b": float("nan"), "c": 1.0}, + "col2": {"a": 5.0, "b": 4.0, "c": float("nan")}, + "col3": {"a": 7.0, "b": 8.0, "c": float("nan")}, + } + ) + if sort_kw is False: + expected = expected.reindex(index=["c", "a", "b"]) + + with tm.assert_produces_warning(None): + result = a.join([b, c], how="outer", sort=sort_kw) + tm.assert_frame_equal(result, expected) + + +class TestDataFrameJoin: + def test_join(self, multiindex_dataframe_random_data): + frame = multiindex_dataframe_random_data + + a = frame.loc[frame.index[:5], ["A"]] + b = frame.loc[frame.index[2:], ["B", "C"]] + + joined = a.join(b, how="outer").reindex(frame.index) + expected = frame.copy().values.copy() + expected[np.isnan(joined.values)] = np.nan + expected = DataFrame(expected, index=frame.index, columns=frame.columns) + + assert not np.isnan(joined.values).all() + + tm.assert_frame_equal(joined, expected) + + def test_join_segfault(self): + # GH#1532 + df1 = DataFrame({"a": [1, 1], "b": [1, 2], "x": [1, 2]}) + df2 = DataFrame({"a": [2, 2], "b": [1, 2], "y": [1, 2]}) + df1 = df1.set_index(["a", "b"]) + df2 = df2.set_index(["a", "b"]) + # it works! + for how in ["left", "right", "outer"]: + df1.join(df2, how=how) + + def test_join_str_datetime(self): + str_dates = ["20120209", "20120222"] + dt_dates = [datetime(2012, 2, 9), datetime(2012, 2, 22)] + + A = DataFrame(str_dates, index=range(2), columns=["aa"]) + C = DataFrame([[1, 2], [3, 4]], index=str_dates, columns=dt_dates) + + tst = A.join(C, on="aa") + + assert len(tst.columns) == 3 + + def test_join_multiindex_leftright(self): + # GH 10741 + df1 = DataFrame( + [ + ["a", "x", 0.471780], + ["a", "y", 0.774908], + ["a", "z", 0.563634], + ["b", "x", -0.353756], + ["b", "y", 0.368062], + ["b", "z", -1.721840], + ["c", "x", 1], + ["c", "y", 2], + ["c", "z", 3], + ], + columns=["first", "second", "value1"], + ).set_index(["first", "second"]) + + df2 = DataFrame([["a", 10], ["b", 20]], columns=["first", "value2"]).set_index( + ["first"] + ) + + exp = DataFrame( + [ + [0.471780, 10], + [0.774908, 10], + [0.563634, 10], + [-0.353756, 20], + [0.368062, 20], + [-1.721840, 20], + [1.000000, np.nan], + [2.000000, np.nan], + [3.000000, np.nan], + ], + index=df1.index, + columns=["value1", "value2"], + ) + + # these must be the same results (but columns are flipped) + tm.assert_frame_equal(df1.join(df2, how="left"), exp) + tm.assert_frame_equal(df2.join(df1, how="right"), exp[["value2", "value1"]]) + + exp_idx = MultiIndex.from_product( + [["a", "b"], ["x", "y", "z"]], names=["first", "second"] + ) + exp = DataFrame( + [ + [0.471780, 10], + [0.774908, 10], + [0.563634, 10], + [-0.353756, 20], + [0.368062, 20], + [-1.721840, 20], + ], + index=exp_idx, + columns=["value1", "value2"], + ) + + tm.assert_frame_equal(df1.join(df2, how="right"), exp) + tm.assert_frame_equal(df2.join(df1, how="left"), exp[["value2", "value1"]]) + + def test_join_multiindex_dates(self): + # GH 33692 + date = pd.Timestamp(2000, 1, 1).date() + + df1_index = MultiIndex.from_tuples([(0, date)], names=["index_0", "date"]) + df1 = DataFrame({"col1": [0]}, index=df1_index) + df2_index = MultiIndex.from_tuples([(0, date)], names=["index_0", "date"]) + df2 = DataFrame({"col2": [0]}, index=df2_index) + df3_index = MultiIndex.from_tuples([(0, date)], names=["index_0", "date"]) + df3 = DataFrame({"col3": [0]}, index=df3_index) + + result = df1.join([df2, df3]) + + expected_index = MultiIndex.from_tuples([(0, date)], names=["index_0", "date"]) + expected = DataFrame( + {"col1": [0], "col2": [0], "col3": [0]}, index=expected_index + ) + + tm.assert_equal(result, expected) + + def test_merge_join_different_levels_raises(self): + # GH#9455 + # GH 40993: For raising, enforced in 2.0 + + # first dataframe + df1 = DataFrame(columns=["a", "b"], data=[[1, 11], [0, 22]]) + + # second dataframe + columns = MultiIndex.from_tuples([("a", ""), ("c", "c1")]) + df2 = DataFrame(columns=columns, data=[[1, 33], [0, 44]]) + + # merge + with pytest.raises( + MergeError, match="Not allowed to merge between different levels" + ): + pd.merge(df1, df2, on="a") + + # join, see discussion in GH#12219 + with pytest.raises( + MergeError, match="Not allowed to merge between different levels" + ): + df1.join(df2, on="a") + + def test_frame_join_tzaware(self): + test1 = DataFrame( + np.zeros((6, 3)), + index=date_range( + "2012-11-15 00:00:00", periods=6, freq="100ms", tz="US/Central" + ), + ) + test2 = DataFrame( + np.zeros((3, 3)), + index=date_range( + "2012-11-15 00:00:00", periods=3, freq="250ms", tz="US/Central" + ), + columns=range(3, 6), + ) + + result = test1.join(test2, how="outer") + expected = test1.index.union(test2.index) + + tm.assert_index_equal(result.index, expected) + assert result.index.tz.zone == "US/Central" diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/test_map.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/test_map.py new file mode 100644 index 0000000000000000000000000000000000000000..03681c3df844e058e147a026e45c226469f38f9d --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/test_map.py @@ -0,0 +1,216 @@ +from datetime import datetime + +import numpy as np +import pytest + +import pandas as pd +from pandas import ( + DataFrame, + Series, + Timestamp, + date_range, +) +import pandas._testing as tm + +from pandas.tseries.offsets import BDay + + +def test_map(float_frame): + result = float_frame.map(lambda x: x * 2) + tm.assert_frame_equal(result, float_frame * 2) + float_frame.map(type) + + # GH 465: function returning tuples + result = float_frame.map(lambda x: (x, x))["A"].iloc[0] + assert isinstance(result, tuple) + + +@pytest.mark.parametrize("val", [1, 1.0]) +def test_map_float_object_conversion(val): + # GH 2909: object conversion to float in constructor? + df = DataFrame(data=[val, "a"]) + result = df.map(lambda x: x).dtypes[0] + assert result == object + + +@pytest.mark.parametrize("na_action", [None, "ignore"]) +def test_map_keeps_dtype(na_action): + # GH52219 + arr = Series(["a", np.nan, "b"]) + sparse_arr = arr.astype(pd.SparseDtype(object)) + df = DataFrame(data={"a": arr, "b": sparse_arr}) + + def func(x): + return str.upper(x) if not pd.isna(x) else x + + result = df.map(func, na_action=na_action) + + expected_sparse = pd.array(["A", np.nan, "B"], dtype=pd.SparseDtype(object)) + expected_arr = expected_sparse.astype(object) + expected = DataFrame({"a": expected_arr, "b": expected_sparse}) + + tm.assert_frame_equal(result, expected) + + result_empty = df.iloc[:0, :].map(func, na_action=na_action) + expected_empty = expected.iloc[:0, :] + tm.assert_frame_equal(result_empty, expected_empty) + + +def test_map_str(): + # GH 2786 + df = DataFrame(np.random.default_rng(2).random((3, 4))) + df2 = df.copy() + cols = ["a", "a", "a", "a"] + df.columns = cols + + expected = df2.map(str) + expected.columns = cols + result = df.map(str) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize( + "col, val", + [["datetime", Timestamp("20130101")], ["timedelta", pd.Timedelta("1 min")]], +) +def test_map_datetimelike(col, val): + # datetime/timedelta + df = DataFrame(np.random.default_rng(2).random((3, 4))) + df[col] = val + result = df.map(str) + assert result.loc[0, col] == str(df.loc[0, col]) + + +@pytest.mark.parametrize( + "expected", + [ + DataFrame(), + DataFrame(columns=list("ABC")), + DataFrame(index=list("ABC")), + DataFrame({"A": [], "B": [], "C": []}), + ], +) +@pytest.mark.parametrize("func", [round, lambda x: x]) +def test_map_empty(expected, func): + # GH 8222 + result = expected.map(func) + tm.assert_frame_equal(result, expected) + + +def test_map_kwargs(): + # GH 40652 + result = DataFrame([[1, 2], [3, 4]]).map(lambda x, y: x + y, y=2) + expected = DataFrame([[3, 4], [5, 6]]) + tm.assert_frame_equal(result, expected) + + +def test_map_na_ignore(float_frame): + # GH 23803 + strlen_frame = float_frame.map(lambda x: len(str(x))) + float_frame_with_na = float_frame.copy() + mask = np.random.default_rng(2).integers(0, 2, size=float_frame.shape, dtype=bool) + float_frame_with_na[mask] = pd.NA + strlen_frame_na_ignore = float_frame_with_na.map( + lambda x: len(str(x)), na_action="ignore" + ) + # Set float64 type to avoid upcast when setting NA below + strlen_frame_with_na = strlen_frame.copy().astype("float64") + strlen_frame_with_na[mask] = pd.NA + tm.assert_frame_equal(strlen_frame_na_ignore, strlen_frame_with_na) + + +def test_map_box_timestamps(): + # GH 2689, GH 2627 + ser = Series(date_range("1/1/2000", periods=10)) + + def func(x): + return (x.hour, x.day, x.month) + + # it works! + DataFrame(ser).map(func) + + +def test_map_box(): + # ufunc will not be boxed. Same test cases as the test_map_box + df = DataFrame( + { + "a": [Timestamp("2011-01-01"), Timestamp("2011-01-02")], + "b": [ + Timestamp("2011-01-01", tz="US/Eastern"), + Timestamp("2011-01-02", tz="US/Eastern"), + ], + "c": [pd.Timedelta("1 days"), pd.Timedelta("2 days")], + "d": [ + pd.Period("2011-01-01", freq="M"), + pd.Period("2011-01-02", freq="M"), + ], + } + ) + + result = df.map(lambda x: type(x).__name__) + expected = DataFrame( + { + "a": ["Timestamp", "Timestamp"], + "b": ["Timestamp", "Timestamp"], + "c": ["Timedelta", "Timedelta"], + "d": ["Period", "Period"], + } + ) + tm.assert_frame_equal(result, expected) + + +def test_frame_map_dont_convert_datetime64(): + df = DataFrame({"x1": [datetime(1996, 1, 1)]}) + + df = df.map(lambda x: x + BDay()) + df = df.map(lambda x: x + BDay()) + + result = df.x1.dtype + assert result == "M8[ns]" + + +def test_map_function_runs_once(): + df = DataFrame({"a": [1, 2, 3]}) + values = [] # Save values function is applied to + + def reducing_function(val): + values.append(val) + + def non_reducing_function(val): + values.append(val) + return val + + for func in [reducing_function, non_reducing_function]: + del values[:] + + df.map(func) + assert values == df.a.to_list() + + +def test_map_type(): + # GH 46719 + df = DataFrame( + {"col1": [3, "string", float], "col2": [0.25, datetime(2020, 1, 1), np.nan]}, + index=["a", "b", "c"], + ) + + result = df.map(type) + expected = DataFrame( + {"col1": [int, str, type], "col2": [float, datetime, float]}, + index=["a", "b", "c"], + ) + tm.assert_frame_equal(result, expected) + + +def test_map_invalid_na_action(float_frame): + # GH 23803 + with pytest.raises(ValueError, match="na_action must be .*Got 'abc'"): + float_frame.map(lambda x: len(str(x)), na_action="abc") + + +def test_applymap_deprecated(): + # GH52353 + df = DataFrame({"a": [1, 2, 3]}) + msg = "DataFrame.applymap has been deprecated. Use DataFrame.map instead." + with tm.assert_produces_warning(FutureWarning, match=msg): + df.applymap(lambda x: x) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/test_matmul.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/test_matmul.py new file mode 100644 index 0000000000000000000000000000000000000000..be9462b64fa1b919b13772e9d07727258931b952 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/test_matmul.py @@ -0,0 +1,98 @@ +import operator + +import numpy as np +import pytest + +from pandas import ( + DataFrame, + Index, + Series, +) +import pandas._testing as tm + + +class TestMatMul: + def test_matmul(self): + # matmul test is for GH#10259 + a = DataFrame( + np.random.default_rng(2).standard_normal((3, 4)), + index=["a", "b", "c"], + columns=["p", "q", "r", "s"], + ) + b = DataFrame( + np.random.default_rng(2).standard_normal((4, 2)), + index=["p", "q", "r", "s"], + columns=["one", "two"], + ) + + # DataFrame @ DataFrame + result = operator.matmul(a, b) + expected = DataFrame( + np.dot(a.values, b.values), index=["a", "b", "c"], columns=["one", "two"] + ) + tm.assert_frame_equal(result, expected) + + # DataFrame @ Series + result = operator.matmul(a, b.one) + expected = Series(np.dot(a.values, b.one.values), index=["a", "b", "c"]) + tm.assert_series_equal(result, expected) + + # np.array @ DataFrame + result = operator.matmul(a.values, b) + assert isinstance(result, DataFrame) + assert result.columns.equals(b.columns) + assert result.index.equals(Index(range(3))) + expected = np.dot(a.values, b.values) + tm.assert_almost_equal(result.values, expected) + + # nested list @ DataFrame (__rmatmul__) + result = operator.matmul(a.values.tolist(), b) + expected = DataFrame( + np.dot(a.values, b.values), index=["a", "b", "c"], columns=["one", "two"] + ) + tm.assert_almost_equal(result.values, expected.values) + + # mixed dtype DataFrame @ DataFrame + a["q"] = a.q.round().astype(int) + result = operator.matmul(a, b) + expected = DataFrame( + np.dot(a.values, b.values), index=["a", "b", "c"], columns=["one", "two"] + ) + tm.assert_frame_equal(result, expected) + + # different dtypes DataFrame @ DataFrame + a = a.astype(int) + result = operator.matmul(a, b) + expected = DataFrame( + np.dot(a.values, b.values), index=["a", "b", "c"], columns=["one", "two"] + ) + tm.assert_frame_equal(result, expected) + + # unaligned + df = DataFrame( + np.random.default_rng(2).standard_normal((3, 4)), + index=[1, 2, 3], + columns=range(4), + ) + df2 = DataFrame( + np.random.default_rng(2).standard_normal((5, 3)), + index=range(5), + columns=[1, 2, 3], + ) + + with pytest.raises(ValueError, match="aligned"): + operator.matmul(df, df2) + + def test_matmul_message_shapes(self): + # GH#21581 exception message should reflect original shapes, + # not transposed shapes + a = np.random.default_rng(2).random((10, 4)) + b = np.random.default_rng(2).random((5, 3)) + + df = DataFrame(b) + + msg = r"shapes \(10, 4\) and \(5, 3\) not aligned" + with pytest.raises(ValueError, match=msg): + a @ df + with pytest.raises(ValueError, match=msg): + a.tolist() @ df diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/test_nlargest.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/test_nlargest.py new file mode 100644 index 0000000000000000000000000000000000000000..3ba893501914acf31043a700eff4ef556f0cc5aa --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/test_nlargest.py @@ -0,0 +1,250 @@ +""" +Note: for naming purposes, most tests are title with as e.g. "test_nlargest_foo" +but are implicitly also testing nsmallest_foo. +""" +from string import ascii_lowercase + +import numpy as np +import pytest + +import pandas as pd +import pandas._testing as tm +from pandas.util.version import Version + + +@pytest.fixture +def df_duplicates(): + return pd.DataFrame( + {"a": [1, 2, 3, 4, 4], "b": [1, 1, 1, 1, 1], "c": [0, 1, 2, 5, 4]}, + index=[0, 0, 1, 1, 1], + ) + + +@pytest.fixture +def df_strings(): + return pd.DataFrame( + { + "a": np.random.default_rng(2).permutation(10), + "b": list(ascii_lowercase[:10]), + "c": np.random.default_rng(2).permutation(10).astype("float64"), + } + ) + + +@pytest.fixture +def df_main_dtypes(): + return pd.DataFrame( + { + "group": [1, 1, 2], + "int": [1, 2, 3], + "float": [4.0, 5.0, 6.0], + "string": list("abc"), + "category_string": pd.Series(list("abc")).astype("category"), + "category_int": [7, 8, 9], + "datetime": pd.date_range("20130101", periods=3), + "datetimetz": pd.date_range("20130101", periods=3, tz="US/Eastern"), + "timedelta": pd.timedelta_range("1 s", periods=3, freq="s"), + }, + columns=[ + "group", + "int", + "float", + "string", + "category_string", + "category_int", + "datetime", + "datetimetz", + "timedelta", + ], + ) + + +class TestNLargestNSmallest: + # ---------------------------------------------------------------------- + # Top / bottom + @pytest.mark.parametrize( + "order", + [ + ["a"], + ["c"], + ["a", "b"], + ["a", "c"], + ["b", "a"], + ["b", "c"], + ["a", "b", "c"], + ["c", "a", "b"], + ["c", "b", "a"], + ["b", "c", "a"], + ["b", "a", "c"], + # dups! + ["b", "c", "c"], + ], + ) + @pytest.mark.parametrize("n", range(1, 11)) + def test_nlargest_n(self, df_strings, nselect_method, n, order): + # GH#10393 + df = df_strings + if "b" in order: + error_msg = ( + f"Column 'b' has dtype (object|string), " + f"cannot use method '{nselect_method}' with this dtype" + ) + with pytest.raises(TypeError, match=error_msg): + getattr(df, nselect_method)(n, order) + else: + ascending = nselect_method == "nsmallest" + result = getattr(df, nselect_method)(n, order) + expected = df.sort_values(order, ascending=ascending).head(n) + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize( + "columns", [["group", "category_string"], ["group", "string"]] + ) + def test_nlargest_error(self, df_main_dtypes, nselect_method, columns): + df = df_main_dtypes + col = columns[1] + error_msg = ( + f"Column '{col}' has dtype {df[col].dtype}, " + f"cannot use method '{nselect_method}' with this dtype" + ) + # escape some characters that may be in the repr + error_msg = ( + error_msg.replace("(", "\\(") + .replace(")", "\\)") + .replace("[", "\\[") + .replace("]", "\\]") + ) + with pytest.raises(TypeError, match=error_msg): + getattr(df, nselect_method)(2, columns) + + def test_nlargest_all_dtypes(self, df_main_dtypes): + df = df_main_dtypes + df.nsmallest(2, list(set(df) - {"category_string", "string"})) + df.nlargest(2, list(set(df) - {"category_string", "string"})) + + def test_nlargest_duplicates_on_starter_columns(self): + # regression test for GH#22752 + + df = pd.DataFrame({"a": [2, 2, 2, 1, 1, 1], "b": [1, 2, 3, 3, 2, 1]}) + + result = df.nlargest(4, columns=["a", "b"]) + expected = pd.DataFrame( + {"a": [2, 2, 2, 1], "b": [3, 2, 1, 3]}, index=[2, 1, 0, 3] + ) + tm.assert_frame_equal(result, expected) + + result = df.nsmallest(4, columns=["a", "b"]) + expected = pd.DataFrame( + {"a": [1, 1, 1, 2], "b": [1, 2, 3, 1]}, index=[5, 4, 3, 0] + ) + tm.assert_frame_equal(result, expected) + + def test_nlargest_n_identical_values(self): + # GH#15297 + df = pd.DataFrame({"a": [1] * 5, "b": [1, 2, 3, 4, 5]}) + + result = df.nlargest(3, "a") + expected = pd.DataFrame({"a": [1] * 3, "b": [1, 2, 3]}, index=[0, 1, 2]) + tm.assert_frame_equal(result, expected) + + result = df.nsmallest(3, "a") + expected = pd.DataFrame({"a": [1] * 3, "b": [1, 2, 3]}) + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize( + "order", + [["a", "b", "c"], ["c", "b", "a"], ["a"], ["b"], ["a", "b"], ["c", "b"]], + ) + @pytest.mark.parametrize("n", range(1, 6)) + def test_nlargest_n_duplicate_index(self, df_duplicates, n, order, request): + # GH#13412 + + df = df_duplicates + result = df.nsmallest(n, order) + expected = df.sort_values(order).head(n) + tm.assert_frame_equal(result, expected) + + result = df.nlargest(n, order) + expected = df.sort_values(order, ascending=False).head(n) + if Version(np.__version__) >= Version("1.25") and ( + (order == ["a"] and n in (1, 2, 3, 4)) or (order == ["a", "b"]) and n == 5 + ): + request.applymarker( + pytest.mark.xfail( + reason=( + "pandas default unstable sorting of duplicates" + "issue with numpy>=1.25 with AVX instructions" + ), + strict=False, + ) + ) + tm.assert_frame_equal(result, expected) + + def test_nlargest_duplicate_keep_all_ties(self): + # GH#16818 + df = pd.DataFrame( + {"a": [5, 4, 4, 2, 3, 3, 3, 3], "b": [10, 9, 8, 7, 5, 50, 10, 20]} + ) + result = df.nlargest(4, "a", keep="all") + expected = pd.DataFrame( + { + "a": {0: 5, 1: 4, 2: 4, 4: 3, 5: 3, 6: 3, 7: 3}, + "b": {0: 10, 1: 9, 2: 8, 4: 5, 5: 50, 6: 10, 7: 20}, + } + ) + tm.assert_frame_equal(result, expected) + + result = df.nsmallest(2, "a", keep="all") + expected = pd.DataFrame( + { + "a": {3: 2, 4: 3, 5: 3, 6: 3, 7: 3}, + "b": {3: 7, 4: 5, 5: 50, 6: 10, 7: 20}, + } + ) + tm.assert_frame_equal(result, expected) + + def test_nlargest_multiindex_column_lookup(self): + # Check whether tuples are correctly treated as multi-level lookups. + # GH#23033 + df = pd.DataFrame( + columns=pd.MultiIndex.from_product([["x"], ["a", "b"]]), + data=[[0.33, 0.13], [0.86, 0.25], [0.25, 0.70], [0.85, 0.91]], + ) + + # nsmallest + result = df.nsmallest(3, ("x", "a")) + expected = df.iloc[[2, 0, 3]] + tm.assert_frame_equal(result, expected) + + # nlargest + result = df.nlargest(3, ("x", "b")) + expected = df.iloc[[3, 2, 1]] + tm.assert_frame_equal(result, expected) + + def test_nlargest_nan(self): + # GH#43060 + df = pd.DataFrame([np.nan, np.nan, 0, 1, 2, 3]) + result = df.nlargest(5, 0) + expected = df.sort_values(0, ascending=False).head(5) + tm.assert_frame_equal(result, expected) + + def test_nsmallest_nan_after_n_element(self): + # GH#46589 + df = pd.DataFrame( + { + "a": [1, 2, 3, 4, 5, None, 7], + "b": [7, 6, 5, 4, 3, 2, 1], + "c": [1, 1, 2, 2, 3, 3, 3], + }, + index=range(7), + ) + result = df.nsmallest(5, columns=["a", "b"]) + expected = pd.DataFrame( + { + "a": [1, 2, 3, 4, 5], + "b": [7, 6, 5, 4, 3], + "c": [1, 1, 2, 2, 3], + }, + index=range(5), + ).astype({"a": "float"}) + tm.assert_frame_equal(result, expected) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/test_pct_change.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/test_pct_change.py new file mode 100644 index 0000000000000000000000000000000000000000..92b66e12d4356ca33d6351b092e3655541c9e8bb --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/test_pct_change.py @@ -0,0 +1,180 @@ +import numpy as np +import pytest + +from pandas import ( + DataFrame, + Series, +) +import pandas._testing as tm + + +class TestDataFramePctChange: + @pytest.mark.parametrize( + "periods, fill_method, limit, exp", + [ + (1, "ffill", None, [np.nan, np.nan, np.nan, 1, 1, 1.5, 0, 0]), + (1, "ffill", 1, [np.nan, np.nan, np.nan, 1, 1, 1.5, 0, np.nan]), + (1, "bfill", None, [np.nan, 0, 0, 1, 1, 1.5, np.nan, np.nan]), + (1, "bfill", 1, [np.nan, np.nan, 0, 1, 1, 1.5, np.nan, np.nan]), + (-1, "ffill", None, [np.nan, np.nan, -0.5, -0.5, -0.6, 0, 0, np.nan]), + (-1, "ffill", 1, [np.nan, np.nan, -0.5, -0.5, -0.6, 0, np.nan, np.nan]), + (-1, "bfill", None, [0, 0, -0.5, -0.5, -0.6, np.nan, np.nan, np.nan]), + (-1, "bfill", 1, [np.nan, 0, -0.5, -0.5, -0.6, np.nan, np.nan, np.nan]), + ], + ) + def test_pct_change_with_nas( + self, periods, fill_method, limit, exp, frame_or_series + ): + vals = [np.nan, np.nan, 1, 2, 4, 10, np.nan, np.nan] + obj = frame_or_series(vals) + + msg = ( + "The 'fill_method' keyword being not None and the 'limit' keyword in " + f"{type(obj).__name__}.pct_change are deprecated" + ) + with tm.assert_produces_warning(FutureWarning, match=msg): + res = obj.pct_change(periods=periods, fill_method=fill_method, limit=limit) + tm.assert_equal(res, frame_or_series(exp)) + + def test_pct_change_numeric(self): + # GH#11150 + pnl = DataFrame( + [np.arange(0, 40, 10), np.arange(0, 40, 10), np.arange(0, 40, 10)] + ).astype(np.float64) + pnl.iat[1, 0] = np.nan + pnl.iat[1, 1] = np.nan + pnl.iat[2, 3] = 60 + + msg = ( + "The 'fill_method' keyword being not None and the 'limit' keyword in " + "DataFrame.pct_change are deprecated" + ) + + for axis in range(2): + expected = pnl.ffill(axis=axis) / pnl.ffill(axis=axis).shift(axis=axis) - 1 + + with tm.assert_produces_warning(FutureWarning, match=msg): + result = pnl.pct_change(axis=axis, fill_method="pad") + tm.assert_frame_equal(result, expected) + + def test_pct_change(self, datetime_frame): + msg = ( + "The 'fill_method' keyword being not None and the 'limit' keyword in " + "DataFrame.pct_change are deprecated" + ) + + rs = datetime_frame.pct_change(fill_method=None) + tm.assert_frame_equal(rs, datetime_frame / datetime_frame.shift(1) - 1) + + rs = datetime_frame.pct_change(2) + filled = datetime_frame.ffill() + tm.assert_frame_equal(rs, filled / filled.shift(2) - 1) + + with tm.assert_produces_warning(FutureWarning, match=msg): + rs = datetime_frame.pct_change(fill_method="bfill", limit=1) + filled = datetime_frame.bfill(limit=1) + tm.assert_frame_equal(rs, filled / filled.shift(1) - 1) + + rs = datetime_frame.pct_change(freq="5D") + filled = datetime_frame.ffill() + tm.assert_frame_equal( + rs, (filled / filled.shift(freq="5D") - 1).reindex_like(filled) + ) + + def test_pct_change_shift_over_nas(self): + s = Series([1.0, 1.5, np.nan, 2.5, 3.0]) + + df = DataFrame({"a": s, "b": s}) + + msg = "The default fill_method='pad' in DataFrame.pct_change is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + chg = df.pct_change() + + expected = Series([np.nan, 0.5, 0.0, 2.5 / 1.5 - 1, 0.2]) + edf = DataFrame({"a": expected, "b": expected}) + tm.assert_frame_equal(chg, edf) + + @pytest.mark.parametrize( + "freq, periods, fill_method, limit", + [ + ("5B", 5, None, None), + ("3B", 3, None, None), + ("3B", 3, "bfill", None), + ("7B", 7, "pad", 1), + ("7B", 7, "bfill", 3), + ("14B", 14, None, None), + ], + ) + def test_pct_change_periods_freq( + self, datetime_frame, freq, periods, fill_method, limit + ): + msg = ( + "The 'fill_method' keyword being not None and the 'limit' keyword in " + "DataFrame.pct_change are deprecated" + ) + + # GH#7292 + with tm.assert_produces_warning(FutureWarning, match=msg): + rs_freq = datetime_frame.pct_change( + freq=freq, fill_method=fill_method, limit=limit + ) + with tm.assert_produces_warning(FutureWarning, match=msg): + rs_periods = datetime_frame.pct_change( + periods, fill_method=fill_method, limit=limit + ) + tm.assert_frame_equal(rs_freq, rs_periods) + + empty_ts = DataFrame(index=datetime_frame.index, columns=datetime_frame.columns) + with tm.assert_produces_warning(FutureWarning, match=msg): + rs_freq = empty_ts.pct_change( + freq=freq, fill_method=fill_method, limit=limit + ) + with tm.assert_produces_warning(FutureWarning, match=msg): + rs_periods = empty_ts.pct_change( + periods, fill_method=fill_method, limit=limit + ) + tm.assert_frame_equal(rs_freq, rs_periods) + + +@pytest.mark.parametrize("fill_method", ["pad", "ffill", None]) +def test_pct_change_with_duplicated_indices(fill_method): + # GH30463 + data = DataFrame( + {0: [np.nan, 1, 2, 3, 9, 18], 1: [0, 1, np.nan, 3, 9, 18]}, index=["a", "b"] * 3 + ) + + warn = None if fill_method is None else FutureWarning + msg = ( + "The 'fill_method' keyword being not None and the 'limit' keyword in " + "DataFrame.pct_change are deprecated" + ) + with tm.assert_produces_warning(warn, match=msg): + result = data.pct_change(fill_method=fill_method) + + if fill_method is None: + second_column = [np.nan, np.inf, np.nan, np.nan, 2.0, 1.0] + else: + second_column = [np.nan, np.inf, 0.0, 2.0, 2.0, 1.0] + expected = DataFrame( + {0: [np.nan, np.nan, 1.0, 0.5, 2.0, 1.0], 1: second_column}, + index=["a", "b"] * 3, + ) + tm.assert_frame_equal(result, expected) + + +def test_pct_change_none_beginning_no_warning(): + # GH#54481 + df = DataFrame( + [ + [1, None], + [2, 1], + [3, 2], + [4, 3], + [5, 4], + ] + ) + result = df.pct_change() + expected = DataFrame( + {0: [np.nan, 1, 0.5, 1 / 3, 0.25], 1: [np.nan, np.nan, 1, 0.5, 1 / 3]} + ) + tm.assert_frame_equal(result, expected) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/test_pipe.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/test_pipe.py new file mode 100644 index 0000000000000000000000000000000000000000..5bcc4360487f38491e2ae9f4c79d837e72ed0f6d --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/test_pipe.py @@ -0,0 +1,39 @@ +import pytest + +from pandas import ( + DataFrame, + Series, +) +import pandas._testing as tm + + +class TestPipe: + def test_pipe(self, frame_or_series): + obj = DataFrame({"A": [1, 2, 3]}) + expected = DataFrame({"A": [1, 4, 9]}) + if frame_or_series is Series: + obj = obj["A"] + expected = expected["A"] + + f = lambda x, y: x**y + result = obj.pipe(f, 2) + tm.assert_equal(result, expected) + + def test_pipe_tuple(self, frame_or_series): + obj = DataFrame({"A": [1, 2, 3]}) + obj = tm.get_obj(obj, frame_or_series) + + f = lambda x, y: y + result = obj.pipe((f, "y"), 0) + tm.assert_equal(result, obj) + + def test_pipe_tuple_error(self, frame_or_series): + obj = DataFrame({"A": [1, 2, 3]}) + obj = tm.get_obj(obj, frame_or_series) + + f = lambda x, y: y + + msg = "y is both the pipe target and a keyword argument" + + with pytest.raises(ValueError, match=msg): + obj.pipe((f, "y"), x=1, y=0) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/test_pop.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/test_pop.py new file mode 100644 index 0000000000000000000000000000000000000000..3eb058015cd3da081e3c34954c0bd3229337de31 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/test_pop.py @@ -0,0 +1,72 @@ +import numpy as np + +from pandas import ( + DataFrame, + MultiIndex, + Series, +) +import pandas._testing as tm + + +class TestDataFramePop: + def test_pop(self, float_frame, warn_copy_on_write): + float_frame.columns.name = "baz" + + float_frame.pop("A") + assert "A" not in float_frame + + float_frame["foo"] = "bar" + float_frame.pop("foo") + assert "foo" not in float_frame + assert float_frame.columns.name == "baz" + + # gh-10912: inplace ops cause caching issue + a = DataFrame([[1, 2, 3], [4, 5, 6]], columns=["A", "B", "C"], index=["X", "Y"]) + b = a.pop("B") + with tm.assert_cow_warning(warn_copy_on_write): + b += 1 + + # original frame + expected = DataFrame([[1, 3], [4, 6]], columns=["A", "C"], index=["X", "Y"]) + tm.assert_frame_equal(a, expected) + + # result + expected = Series([2, 5], index=["X", "Y"], name="B") + 1 + tm.assert_series_equal(b, expected) + + def test_pop_non_unique_cols(self): + df = DataFrame({0: [0, 1], 1: [0, 1], 2: [4, 5]}) + df.columns = ["a", "b", "a"] + + res = df.pop("a") + assert type(res) == DataFrame + assert len(res) == 2 + assert len(df.columns) == 1 + assert "b" in df.columns + assert "a" not in df.columns + assert len(df.index) == 2 + + def test_mixed_depth_pop(self): + arrays = [ + ["a", "top", "top", "routine1", "routine1", "routine2"], + ["", "OD", "OD", "result1", "result2", "result1"], + ["", "wx", "wy", "", "", ""], + ] + + tuples = sorted(zip(*arrays)) + index = MultiIndex.from_tuples(tuples) + df = DataFrame(np.random.default_rng(2).standard_normal((4, 6)), columns=index) + + df1 = df.copy() + df2 = df.copy() + result = df1.pop("a") + expected = df2.pop(("a", "", "")) + tm.assert_series_equal(expected, result, check_names=False) + tm.assert_frame_equal(df1, df2) + assert result.name == "a" + + expected = df1["top"] + df1 = df1.drop(["top"], axis=1) + result = df2.pop("top") + tm.assert_frame_equal(expected, result) + tm.assert_frame_equal(df1, df2) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/test_quantile.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/test_quantile.py new file mode 100644 index 0000000000000000000000000000000000000000..0f27eae1a3bfcb963bd3e5ba33056a6635a2d2bc --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/test_quantile.py @@ -0,0 +1,972 @@ +import numpy as np +import pytest + +import pandas as pd +from pandas import ( + DataFrame, + Index, + Series, + Timestamp, +) +import pandas._testing as tm + + +@pytest.fixture( + params=[["linear", "single"], ["nearest", "table"]], ids=lambda x: "-".join(x) +) +def interp_method(request): + """(interpolation, method) arguments for quantile""" + return request.param + + +class TestDataFrameQuantile: + @pytest.mark.parametrize( + "df,expected", + [ + [ + DataFrame( + { + 0: Series(pd.arrays.SparseArray([1, 2])), + 1: Series(pd.arrays.SparseArray([3, 4])), + } + ), + Series([1.5, 3.5], name=0.5), + ], + [ + DataFrame(Series([0.0, None, 1.0, 2.0], dtype="Sparse[float]")), + Series([1.0], name=0.5), + ], + ], + ) + def test_quantile_sparse(self, df, expected): + # GH#17198 + # GH#24600 + result = df.quantile() + expected = expected.astype("Sparse[float]") + tm.assert_series_equal(result, expected) + + def test_quantile( + self, datetime_frame, interp_method, using_array_manager, request + ): + interpolation, method = interp_method + df = datetime_frame + result = df.quantile( + 0.1, axis=0, numeric_only=True, interpolation=interpolation, method=method + ) + expected = Series( + [np.percentile(df[col], 10) for col in df.columns], + index=df.columns, + name=0.1, + ) + if interpolation == "linear": + # np.percentile values only comparable to linear interpolation + tm.assert_series_equal(result, expected) + else: + tm.assert_index_equal(result.index, expected.index) + request.applymarker( + pytest.mark.xfail( + using_array_manager, reason="Name set incorrectly for arraymanager" + ) + ) + assert result.name == expected.name + + result = df.quantile( + 0.9, axis=1, numeric_only=True, interpolation=interpolation, method=method + ) + expected = Series( + [np.percentile(df.loc[date], 90) for date in df.index], + index=df.index, + name=0.9, + ) + if interpolation == "linear": + # np.percentile values only comparable to linear interpolation + tm.assert_series_equal(result, expected) + else: + tm.assert_index_equal(result.index, expected.index) + request.applymarker( + pytest.mark.xfail( + using_array_manager, reason="Name set incorrectly for arraymanager" + ) + ) + assert result.name == expected.name + + def test_empty(self, interp_method): + interpolation, method = interp_method + q = DataFrame({"x": [], "y": []}).quantile( + 0.1, axis=0, numeric_only=True, interpolation=interpolation, method=method + ) + assert np.isnan(q["x"]) and np.isnan(q["y"]) + + def test_non_numeric_exclusion(self, interp_method, request, using_array_manager): + interpolation, method = interp_method + df = DataFrame({"col1": ["A", "A", "B", "B"], "col2": [1, 2, 3, 4]}) + rs = df.quantile( + 0.5, numeric_only=True, interpolation=interpolation, method=method + ) + xp = df.median(numeric_only=True).rename(0.5) + if interpolation == "nearest": + xp = (xp + 0.5).astype(np.int64) + if method == "table" and using_array_manager: + request.applymarker(pytest.mark.xfail(reason="Axis name incorrectly set.")) + tm.assert_series_equal(rs, xp) + + def test_axis(self, interp_method, request, using_array_manager): + # axis + interpolation, method = interp_method + df = DataFrame({"A": [1, 2, 3], "B": [2, 3, 4]}, index=[1, 2, 3]) + result = df.quantile(0.5, axis=1, interpolation=interpolation, method=method) + expected = Series([1.5, 2.5, 3.5], index=[1, 2, 3], name=0.5) + if interpolation == "nearest": + expected = expected.astype(np.int64) + if method == "table" and using_array_manager: + request.applymarker(pytest.mark.xfail(reason="Axis name incorrectly set.")) + tm.assert_series_equal(result, expected) + + result = df.quantile( + [0.5, 0.75], axis=1, interpolation=interpolation, method=method + ) + expected = DataFrame( + {1: [1.5, 1.75], 2: [2.5, 2.75], 3: [3.5, 3.75]}, index=[0.5, 0.75] + ) + if interpolation == "nearest": + expected.iloc[0, :] -= 0.5 + expected.iloc[1, :] += 0.25 + expected = expected.astype(np.int64) + tm.assert_frame_equal(result, expected, check_index_type=True) + + def test_axis_numeric_only_true(self, interp_method, request, using_array_manager): + # We may want to break API in the future to change this + # so that we exclude non-numeric along the same axis + # See GH #7312 + interpolation, method = interp_method + df = DataFrame([[1, 2, 3], ["a", "b", 4]]) + result = df.quantile( + 0.5, axis=1, numeric_only=True, interpolation=interpolation, method=method + ) + expected = Series([3.0, 4.0], index=[0, 1], name=0.5) + if interpolation == "nearest": + expected = expected.astype(np.int64) + if method == "table" and using_array_manager: + request.applymarker(pytest.mark.xfail(reason="Axis name incorrectly set.")) + tm.assert_series_equal(result, expected) + + def test_quantile_date_range(self, interp_method, request, using_array_manager): + # GH 2460 + interpolation, method = interp_method + dti = pd.date_range("2016-01-01", periods=3, tz="US/Pacific") + ser = Series(dti) + df = DataFrame(ser) + + result = df.quantile( + numeric_only=False, interpolation=interpolation, method=method + ) + expected = Series( + ["2016-01-02 00:00:00"], name=0.5, dtype="datetime64[ns, US/Pacific]" + ) + if method == "table" and using_array_manager: + request.applymarker(pytest.mark.xfail(reason="Axis name incorrectly set.")) + + tm.assert_series_equal(result, expected) + + def test_quantile_axis_mixed(self, interp_method, request, using_array_manager): + # mixed on axis=1 + interpolation, method = interp_method + df = DataFrame( + { + "A": [1, 2, 3], + "B": [2.0, 3.0, 4.0], + "C": pd.date_range("20130101", periods=3), + "D": ["foo", "bar", "baz"], + } + ) + result = df.quantile( + 0.5, axis=1, numeric_only=True, interpolation=interpolation, method=method + ) + expected = Series([1.5, 2.5, 3.5], name=0.5) + if interpolation == "nearest": + expected -= 0.5 + if method == "table" and using_array_manager: + request.applymarker(pytest.mark.xfail(reason="Axis name incorrectly set.")) + tm.assert_series_equal(result, expected) + + # must raise + msg = "'<' not supported between instances of 'Timestamp' and 'float'" + with pytest.raises(TypeError, match=msg): + df.quantile(0.5, axis=1, numeric_only=False) + + def test_quantile_axis_parameter(self, interp_method, request, using_array_manager): + # GH 9543/9544 + interpolation, method = interp_method + if method == "table" and using_array_manager: + request.applymarker(pytest.mark.xfail(reason="Axis name incorrectly set.")) + df = DataFrame({"A": [1, 2, 3], "B": [2, 3, 4]}, index=[1, 2, 3]) + + result = df.quantile(0.5, axis=0, interpolation=interpolation, method=method) + + expected = Series([2.0, 3.0], index=["A", "B"], name=0.5) + if interpolation == "nearest": + expected = expected.astype(np.int64) + tm.assert_series_equal(result, expected) + + expected = df.quantile( + 0.5, axis="index", interpolation=interpolation, method=method + ) + if interpolation == "nearest": + expected = expected.astype(np.int64) + tm.assert_series_equal(result, expected) + + result = df.quantile(0.5, axis=1, interpolation=interpolation, method=method) + + expected = Series([1.5, 2.5, 3.5], index=[1, 2, 3], name=0.5) + if interpolation == "nearest": + expected = expected.astype(np.int64) + tm.assert_series_equal(result, expected) + + result = df.quantile( + 0.5, axis="columns", interpolation=interpolation, method=method + ) + tm.assert_series_equal(result, expected) + + msg = "No axis named -1 for object type DataFrame" + with pytest.raises(ValueError, match=msg): + df.quantile(0.1, axis=-1, interpolation=interpolation, method=method) + msg = "No axis named column for object type DataFrame" + with pytest.raises(ValueError, match=msg): + df.quantile(0.1, axis="column") + + def test_quantile_interpolation(self): + # see gh-10174 + + # interpolation method other than default linear + df = DataFrame({"A": [1, 2, 3], "B": [2, 3, 4]}, index=[1, 2, 3]) + result = df.quantile(0.5, axis=1, interpolation="nearest") + expected = Series([1, 2, 3], index=[1, 2, 3], name=0.5) + tm.assert_series_equal(result, expected) + + # cross-check interpolation=nearest results in original dtype + exp = np.percentile( + np.array([[1, 2, 3], [2, 3, 4]]), + 0.5, + axis=0, + method="nearest", + ) + expected = Series(exp, index=[1, 2, 3], name=0.5, dtype="int64") + tm.assert_series_equal(result, expected) + + # float + df = DataFrame({"A": [1.0, 2.0, 3.0], "B": [2.0, 3.0, 4.0]}, index=[1, 2, 3]) + result = df.quantile(0.5, axis=1, interpolation="nearest") + expected = Series([1.0, 2.0, 3.0], index=[1, 2, 3], name=0.5) + tm.assert_series_equal(result, expected) + exp = np.percentile( + np.array([[1.0, 2.0, 3.0], [2.0, 3.0, 4.0]]), + 0.5, + axis=0, + method="nearest", + ) + expected = Series(exp, index=[1, 2, 3], name=0.5, dtype="float64") + tm.assert_series_equal(result, expected) + + # axis + result = df.quantile([0.5, 0.75], axis=1, interpolation="lower") + expected = DataFrame( + {1: [1.0, 1.0], 2: [2.0, 2.0], 3: [3.0, 3.0]}, index=[0.5, 0.75] + ) + tm.assert_frame_equal(result, expected) + + # test degenerate case + df = DataFrame({"x": [], "y": []}) + q = df.quantile(0.1, axis=0, interpolation="higher") + assert np.isnan(q["x"]) and np.isnan(q["y"]) + + # multi + df = DataFrame([[1, 1, 1], [2, 2, 2], [3, 3, 3]], columns=["a", "b", "c"]) + result = df.quantile([0.25, 0.5], interpolation="midpoint") + + # https://github.com/numpy/numpy/issues/7163 + expected = DataFrame( + [[1.5, 1.5, 1.5], [2.0, 2.0, 2.0]], + index=[0.25, 0.5], + columns=["a", "b", "c"], + ) + tm.assert_frame_equal(result, expected) + + def test_quantile_interpolation_datetime(self, datetime_frame): + # see gh-10174 + + # interpolation = linear (default case) + df = datetime_frame + q = df.quantile(0.1, axis=0, numeric_only=True, interpolation="linear") + assert q["A"] == np.percentile(df["A"], 10) + + def test_quantile_interpolation_int(self, int_frame): + # see gh-10174 + + df = int_frame + # interpolation = linear (default case) + q = df.quantile(0.1) + assert q["A"] == np.percentile(df["A"], 10) + + # test with and without interpolation keyword + q1 = df.quantile(0.1, axis=0, interpolation="linear") + assert q1["A"] == np.percentile(df["A"], 10) + tm.assert_series_equal(q, q1) + + def test_quantile_multi(self, interp_method, request, using_array_manager): + interpolation, method = interp_method + df = DataFrame([[1, 1, 1], [2, 2, 2], [3, 3, 3]], columns=["a", "b", "c"]) + result = df.quantile([0.25, 0.5], interpolation=interpolation, method=method) + expected = DataFrame( + [[1.5, 1.5, 1.5], [2.0, 2.0, 2.0]], + index=[0.25, 0.5], + columns=["a", "b", "c"], + ) + if interpolation == "nearest": + expected = expected.astype(np.int64) + if method == "table" and using_array_manager: + request.applymarker(pytest.mark.xfail(reason="Axis name incorrectly set.")) + tm.assert_frame_equal(result, expected) + + def test_quantile_multi_axis_1(self, interp_method, request, using_array_manager): + interpolation, method = interp_method + df = DataFrame([[1, 1, 1], [2, 2, 2], [3, 3, 3]], columns=["a", "b", "c"]) + result = df.quantile( + [0.25, 0.5], axis=1, interpolation=interpolation, method=method + ) + expected = DataFrame( + [[1.0, 2.0, 3.0]] * 2, index=[0.25, 0.5], columns=[0, 1, 2] + ) + if interpolation == "nearest": + expected = expected.astype(np.int64) + if method == "table" and using_array_manager: + request.applymarker(pytest.mark.xfail(reason="Axis name incorrectly set.")) + tm.assert_frame_equal(result, expected) + + def test_quantile_multi_empty(self, interp_method): + interpolation, method = interp_method + result = DataFrame({"x": [], "y": []}).quantile( + [0.1, 0.9], axis=0, interpolation=interpolation, method=method + ) + expected = DataFrame( + {"x": [np.nan, np.nan], "y": [np.nan, np.nan]}, index=[0.1, 0.9] + ) + tm.assert_frame_equal(result, expected) + + def test_quantile_datetime(self, unit): + dti = pd.to_datetime(["2010", "2011"]).as_unit(unit) + df = DataFrame({"a": dti, "b": [0, 5]}) + + # exclude datetime + result = df.quantile(0.5, numeric_only=True) + expected = Series([2.5], index=["b"], name=0.5) + tm.assert_series_equal(result, expected) + + # datetime + result = df.quantile(0.5, numeric_only=False) + expected = Series( + [Timestamp("2010-07-02 12:00:00"), 2.5], index=["a", "b"], name=0.5 + ) + tm.assert_series_equal(result, expected) + + # datetime w/ multi + result = df.quantile([0.5], numeric_only=False) + expected = DataFrame( + {"a": Timestamp("2010-07-02 12:00:00").as_unit(unit), "b": 2.5}, + index=[0.5], + ) + tm.assert_frame_equal(result, expected) + + # axis = 1 + df["c"] = pd.to_datetime(["2011", "2012"]).as_unit(unit) + result = df[["a", "c"]].quantile(0.5, axis=1, numeric_only=False) + expected = Series( + [Timestamp("2010-07-02 12:00:00"), Timestamp("2011-07-02 12:00:00")], + index=[0, 1], + name=0.5, + dtype=f"M8[{unit}]", + ) + tm.assert_series_equal(result, expected) + + result = df[["a", "c"]].quantile([0.5], axis=1, numeric_only=False) + expected = DataFrame( + [[Timestamp("2010-07-02 12:00:00"), Timestamp("2011-07-02 12:00:00")]], + index=[0.5], + columns=[0, 1], + dtype=f"M8[{unit}]", + ) + tm.assert_frame_equal(result, expected) + + # empty when numeric_only=True + result = df[["a", "c"]].quantile(0.5, numeric_only=True) + expected = Series([], index=[], dtype=np.float64, name=0.5) + tm.assert_series_equal(result, expected) + + result = df[["a", "c"]].quantile([0.5], numeric_only=True) + expected = DataFrame(index=[0.5], columns=[]) + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize( + "dtype", + [ + "datetime64[ns]", + "datetime64[ns, US/Pacific]", + "timedelta64[ns]", + "Period[D]", + ], + ) + def test_quantile_dt64_empty(self, dtype, interp_method): + # GH#41544 + interpolation, method = interp_method + df = DataFrame(columns=["a", "b"], dtype=dtype) + + res = df.quantile( + 0.5, axis=1, numeric_only=False, interpolation=interpolation, method=method + ) + expected = Series([], index=[], name=0.5, dtype=dtype) + tm.assert_series_equal(res, expected) + + # no columns in result, so no dtype preservation + res = df.quantile( + [0.5], + axis=1, + numeric_only=False, + interpolation=interpolation, + method=method, + ) + expected = DataFrame(index=[0.5], columns=[]) + tm.assert_frame_equal(res, expected) + + @pytest.mark.parametrize("invalid", [-1, 2, [0.5, -1], [0.5, 2]]) + def test_quantile_invalid(self, invalid, datetime_frame, interp_method): + msg = "percentiles should all be in the interval \\[0, 1\\]" + interpolation, method = interp_method + with pytest.raises(ValueError, match=msg): + datetime_frame.quantile(invalid, interpolation=interpolation, method=method) + + def test_quantile_box(self, interp_method, request, using_array_manager): + interpolation, method = interp_method + if method == "table" and using_array_manager: + request.applymarker(pytest.mark.xfail(reason="Axis name incorrectly set.")) + df = DataFrame( + { + "A": [ + Timestamp("2011-01-01"), + Timestamp("2011-01-02"), + Timestamp("2011-01-03"), + ], + "B": [ + Timestamp("2011-01-01", tz="US/Eastern"), + Timestamp("2011-01-02", tz="US/Eastern"), + Timestamp("2011-01-03", tz="US/Eastern"), + ], + "C": [ + pd.Timedelta("1 days"), + pd.Timedelta("2 days"), + pd.Timedelta("3 days"), + ], + } + ) + + res = df.quantile( + 0.5, numeric_only=False, interpolation=interpolation, method=method + ) + + exp = Series( + [ + Timestamp("2011-01-02"), + Timestamp("2011-01-02", tz="US/Eastern"), + pd.Timedelta("2 days"), + ], + name=0.5, + index=["A", "B", "C"], + ) + tm.assert_series_equal(res, exp) + + res = df.quantile( + [0.5], numeric_only=False, interpolation=interpolation, method=method + ) + exp = DataFrame( + [ + [ + Timestamp("2011-01-02"), + Timestamp("2011-01-02", tz="US/Eastern"), + pd.Timedelta("2 days"), + ] + ], + index=[0.5], + columns=["A", "B", "C"], + ) + tm.assert_frame_equal(res, exp) + + def test_quantile_box_nat(self): + # DatetimeLikeBlock may be consolidated and contain NaT in different loc + df = DataFrame( + { + "A": [ + Timestamp("2011-01-01"), + pd.NaT, + Timestamp("2011-01-02"), + Timestamp("2011-01-03"), + ], + "a": [ + Timestamp("2011-01-01"), + Timestamp("2011-01-02"), + pd.NaT, + Timestamp("2011-01-03"), + ], + "B": [ + Timestamp("2011-01-01", tz="US/Eastern"), + pd.NaT, + Timestamp("2011-01-02", tz="US/Eastern"), + Timestamp("2011-01-03", tz="US/Eastern"), + ], + "b": [ + Timestamp("2011-01-01", tz="US/Eastern"), + Timestamp("2011-01-02", tz="US/Eastern"), + pd.NaT, + Timestamp("2011-01-03", tz="US/Eastern"), + ], + "C": [ + pd.Timedelta("1 days"), + pd.Timedelta("2 days"), + pd.Timedelta("3 days"), + pd.NaT, + ], + "c": [ + pd.NaT, + pd.Timedelta("1 days"), + pd.Timedelta("2 days"), + pd.Timedelta("3 days"), + ], + }, + columns=list("AaBbCc"), + ) + + res = df.quantile(0.5, numeric_only=False) + exp = Series( + [ + Timestamp("2011-01-02"), + Timestamp("2011-01-02"), + Timestamp("2011-01-02", tz="US/Eastern"), + Timestamp("2011-01-02", tz="US/Eastern"), + pd.Timedelta("2 days"), + pd.Timedelta("2 days"), + ], + name=0.5, + index=list("AaBbCc"), + ) + tm.assert_series_equal(res, exp) + + res = df.quantile([0.5], numeric_only=False) + exp = DataFrame( + [ + [ + Timestamp("2011-01-02"), + Timestamp("2011-01-02"), + Timestamp("2011-01-02", tz="US/Eastern"), + Timestamp("2011-01-02", tz="US/Eastern"), + pd.Timedelta("2 days"), + pd.Timedelta("2 days"), + ] + ], + index=[0.5], + columns=list("AaBbCc"), + ) + tm.assert_frame_equal(res, exp) + + def test_quantile_nan(self, interp_method, request, using_array_manager): + interpolation, method = interp_method + if method == "table" and using_array_manager: + request.applymarker(pytest.mark.xfail(reason="Axis name incorrectly set.")) + # GH 14357 - float block where some cols have missing values + df = DataFrame({"a": np.arange(1, 6.0), "b": np.arange(1, 6.0)}) + df.iloc[-1, 1] = np.nan + + res = df.quantile(0.5, interpolation=interpolation, method=method) + exp = Series( + [3.0, 2.5 if interpolation == "linear" else 3.0], index=["a", "b"], name=0.5 + ) + tm.assert_series_equal(res, exp) + + res = df.quantile([0.5, 0.75], interpolation=interpolation, method=method) + exp = DataFrame( + { + "a": [3.0, 4.0], + "b": [2.5, 3.25] if interpolation == "linear" else [3.0, 4.0], + }, + index=[0.5, 0.75], + ) + tm.assert_frame_equal(res, exp) + + res = df.quantile(0.5, axis=1, interpolation=interpolation, method=method) + exp = Series(np.arange(1.0, 6.0), name=0.5) + tm.assert_series_equal(res, exp) + + res = df.quantile( + [0.5, 0.75], axis=1, interpolation=interpolation, method=method + ) + exp = DataFrame([np.arange(1.0, 6.0)] * 2, index=[0.5, 0.75]) + if interpolation == "nearest": + exp.iloc[1, -1] = np.nan + tm.assert_frame_equal(res, exp) + + # full-nan column + df["b"] = np.nan + + res = df.quantile(0.5, interpolation=interpolation, method=method) + exp = Series([3.0, np.nan], index=["a", "b"], name=0.5) + tm.assert_series_equal(res, exp) + + res = df.quantile([0.5, 0.75], interpolation=interpolation, method=method) + exp = DataFrame({"a": [3.0, 4.0], "b": [np.nan, np.nan]}, index=[0.5, 0.75]) + tm.assert_frame_equal(res, exp) + + def test_quantile_nat(self, interp_method, request, using_array_manager, unit): + interpolation, method = interp_method + if method == "table" and using_array_manager: + request.applymarker(pytest.mark.xfail(reason="Axis name incorrectly set.")) + # full NaT column + df = DataFrame({"a": [pd.NaT, pd.NaT, pd.NaT]}, dtype=f"M8[{unit}]") + + res = df.quantile( + 0.5, numeric_only=False, interpolation=interpolation, method=method + ) + exp = Series([pd.NaT], index=["a"], name=0.5, dtype=f"M8[{unit}]") + tm.assert_series_equal(res, exp) + + res = df.quantile( + [0.5], numeric_only=False, interpolation=interpolation, method=method + ) + exp = DataFrame({"a": [pd.NaT]}, index=[0.5], dtype=f"M8[{unit}]") + tm.assert_frame_equal(res, exp) + + # mixed non-null / full null column + df = DataFrame( + { + "a": [ + Timestamp("2012-01-01"), + Timestamp("2012-01-02"), + Timestamp("2012-01-03"), + ], + "b": [pd.NaT, pd.NaT, pd.NaT], + }, + dtype=f"M8[{unit}]", + ) + + res = df.quantile( + 0.5, numeric_only=False, interpolation=interpolation, method=method + ) + exp = Series( + [Timestamp("2012-01-02"), pd.NaT], + index=["a", "b"], + name=0.5, + dtype=f"M8[{unit}]", + ) + tm.assert_series_equal(res, exp) + + res = df.quantile( + [0.5], numeric_only=False, interpolation=interpolation, method=method + ) + exp = DataFrame( + [[Timestamp("2012-01-02"), pd.NaT]], + index=[0.5], + columns=["a", "b"], + dtype=f"M8[{unit}]", + ) + tm.assert_frame_equal(res, exp) + + def test_quantile_empty_no_rows_floats(self, interp_method): + interpolation, method = interp_method + + df = DataFrame(columns=["a", "b"], dtype="float64") + + res = df.quantile(0.5, interpolation=interpolation, method=method) + exp = Series([np.nan, np.nan], index=["a", "b"], name=0.5) + tm.assert_series_equal(res, exp) + + res = df.quantile([0.5], interpolation=interpolation, method=method) + exp = DataFrame([[np.nan, np.nan]], columns=["a", "b"], index=[0.5]) + tm.assert_frame_equal(res, exp) + + res = df.quantile(0.5, axis=1, interpolation=interpolation, method=method) + exp = Series([], index=[], dtype="float64", name=0.5) + tm.assert_series_equal(res, exp) + + res = df.quantile([0.5], axis=1, interpolation=interpolation, method=method) + exp = DataFrame(columns=[], index=[0.5]) + tm.assert_frame_equal(res, exp) + + def test_quantile_empty_no_rows_ints(self, interp_method): + interpolation, method = interp_method + df = DataFrame(columns=["a", "b"], dtype="int64") + + res = df.quantile(0.5, interpolation=interpolation, method=method) + exp = Series([np.nan, np.nan], index=["a", "b"], name=0.5) + tm.assert_series_equal(res, exp) + + def test_quantile_empty_no_rows_dt64(self, interp_method): + interpolation, method = interp_method + # datetimes + df = DataFrame(columns=["a", "b"], dtype="datetime64[ns]") + + res = df.quantile( + 0.5, numeric_only=False, interpolation=interpolation, method=method + ) + exp = Series( + [pd.NaT, pd.NaT], index=["a", "b"], dtype="datetime64[ns]", name=0.5 + ) + tm.assert_series_equal(res, exp) + + # Mixed dt64/dt64tz + df["a"] = df["a"].dt.tz_localize("US/Central") + res = df.quantile( + 0.5, numeric_only=False, interpolation=interpolation, method=method + ) + exp = exp.astype(object) + if interpolation == "nearest": + # GH#18463 TODO: would we prefer NaTs here? + msg = "The 'downcast' keyword in fillna is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + exp = exp.fillna(np.nan, downcast=False) + tm.assert_series_equal(res, exp) + + # both dt64tz + df["b"] = df["b"].dt.tz_localize("US/Central") + res = df.quantile( + 0.5, numeric_only=False, interpolation=interpolation, method=method + ) + exp = exp.astype(df["b"].dtype) + tm.assert_series_equal(res, exp) + + def test_quantile_empty_no_columns(self, interp_method): + # GH#23925 _get_numeric_data may drop all columns + interpolation, method = interp_method + df = DataFrame(pd.date_range("1/1/18", periods=5)) + df.columns.name = "captain tightpants" + result = df.quantile( + 0.5, numeric_only=True, interpolation=interpolation, method=method + ) + expected = Series([], index=[], name=0.5, dtype=np.float64) + expected.index.name = "captain tightpants" + tm.assert_series_equal(result, expected) + + result = df.quantile( + [0.5], numeric_only=True, interpolation=interpolation, method=method + ) + expected = DataFrame([], index=[0.5], columns=[]) + expected.columns.name = "captain tightpants" + tm.assert_frame_equal(result, expected) + + def test_quantile_item_cache( + self, using_array_manager, interp_method, using_copy_on_write + ): + # previous behavior incorrect retained an invalid _item_cache entry + interpolation, method = interp_method + df = DataFrame( + np.random.default_rng(2).standard_normal((4, 3)), columns=["A", "B", "C"] + ) + df["D"] = df["A"] * 2 + ser = df["A"] + if not using_array_manager: + assert len(df._mgr.blocks) == 2 + + df.quantile(numeric_only=False, interpolation=interpolation, method=method) + + if using_copy_on_write: + ser.iloc[0] = 99 + assert df.iloc[0, 0] == df["A"][0] + assert df.iloc[0, 0] != 99 + else: + ser.values[0] = 99 + assert df.iloc[0, 0] == df["A"][0] + assert df.iloc[0, 0] == 99 + + def test_invalid_method(self): + with pytest.raises(ValueError, match="Invalid method: foo"): + DataFrame(range(1)).quantile(0.5, method="foo") + + def test_table_invalid_interpolation(self): + with pytest.raises(ValueError, match="Invalid interpolation: foo"): + DataFrame(range(1)).quantile(0.5, method="table", interpolation="foo") + + +class TestQuantileExtensionDtype: + # TODO: tests for axis=1? + # TODO: empty case? + + @pytest.fixture( + params=[ + pytest.param( + pd.IntervalIndex.from_breaks(range(10)), + marks=pytest.mark.xfail(reason="raises when trying to add Intervals"), + ), + pd.period_range("2016-01-01", periods=9, freq="D"), + pd.date_range("2016-01-01", periods=9, tz="US/Pacific"), + pd.timedelta_range("1 Day", periods=9), + pd.array(np.arange(9), dtype="Int64"), + pd.array(np.arange(9), dtype="Float64"), + ], + ids=lambda x: str(x.dtype), + ) + def index(self, request): + # NB: not actually an Index object + idx = request.param + idx.name = "A" + return idx + + @pytest.fixture + def obj(self, index, frame_or_series): + # bc index is not always an Index (yet), we need to re-patch .name + obj = frame_or_series(index).copy() + + if frame_or_series is Series: + obj.name = "A" + else: + obj.columns = ["A"] + return obj + + def compute_quantile(self, obj, qs): + if isinstance(obj, Series): + result = obj.quantile(qs) + else: + result = obj.quantile(qs, numeric_only=False) + return result + + def test_quantile_ea(self, request, obj, index): + # result should be invariant to shuffling + indexer = np.arange(len(index), dtype=np.intp) + np.random.default_rng(2).shuffle(indexer) + obj = obj.iloc[indexer] + + qs = [0.5, 0, 1] + result = self.compute_quantile(obj, qs) + + exp_dtype = index.dtype + if index.dtype == "Int64": + # match non-nullable casting behavior + exp_dtype = "Float64" + + # expected here assumes len(index) == 9 + expected = Series( + [index[4], index[0], index[-1]], dtype=exp_dtype, index=qs, name="A" + ) + expected = type(obj)(expected) + + tm.assert_equal(result, expected) + + def test_quantile_ea_with_na(self, obj, index): + obj.iloc[0] = index._na_value + obj.iloc[-1] = index._na_value + + # result should be invariant to shuffling + indexer = np.arange(len(index), dtype=np.intp) + np.random.default_rng(2).shuffle(indexer) + obj = obj.iloc[indexer] + + qs = [0.5, 0, 1] + result = self.compute_quantile(obj, qs) + + # expected here assumes len(index) == 9 + expected = Series( + [index[4], index[1], index[-2]], dtype=index.dtype, index=qs, name="A" + ) + expected = type(obj)(expected) + tm.assert_equal(result, expected) + + def test_quantile_ea_all_na(self, request, obj, index): + obj.iloc[:] = index._na_value + # Check dtypes were preserved; this was once a problem see GH#39763 + assert np.all(obj.dtypes == index.dtype) + + # result should be invariant to shuffling + indexer = np.arange(len(index), dtype=np.intp) + np.random.default_rng(2).shuffle(indexer) + obj = obj.iloc[indexer] + + qs = [0.5, 0, 1] + result = self.compute_quantile(obj, qs) + + expected = index.take([-1, -1, -1], allow_fill=True, fill_value=index._na_value) + expected = Series(expected, index=qs, name="A") + expected = type(obj)(expected) + tm.assert_equal(result, expected) + + def test_quantile_ea_scalar(self, request, obj, index): + # scalar qs + + # result should be invariant to shuffling + indexer = np.arange(len(index), dtype=np.intp) + np.random.default_rng(2).shuffle(indexer) + obj = obj.iloc[indexer] + + qs = 0.5 + result = self.compute_quantile(obj, qs) + + exp_dtype = index.dtype + if index.dtype == "Int64": + exp_dtype = "Float64" + + expected = Series({"A": index[4]}, dtype=exp_dtype, name=0.5) + if isinstance(obj, Series): + expected = expected["A"] + assert result == expected + else: + tm.assert_series_equal(result, expected) + + @pytest.mark.parametrize( + "dtype, expected_data, expected_index, axis", + [ + ["float64", [], [], 1], + ["int64", [], [], 1], + ["float64", [np.nan, np.nan], ["a", "b"], 0], + ["int64", [np.nan, np.nan], ["a", "b"], 0], + ], + ) + def test_empty_numeric(self, dtype, expected_data, expected_index, axis): + # GH 14564 + df = DataFrame(columns=["a", "b"], dtype=dtype) + result = df.quantile(0.5, axis=axis) + expected = Series( + expected_data, name=0.5, index=Index(expected_index), dtype="float64" + ) + tm.assert_series_equal(result, expected) + + @pytest.mark.parametrize( + "dtype, expected_data, expected_index, axis, expected_dtype", + [ + ["datetime64[ns]", [], [], 1, "datetime64[ns]"], + ["datetime64[ns]", [pd.NaT, pd.NaT], ["a", "b"], 0, "datetime64[ns]"], + ], + ) + def test_empty_datelike( + self, dtype, expected_data, expected_index, axis, expected_dtype + ): + # GH 14564 + df = DataFrame(columns=["a", "b"], dtype=dtype) + result = df.quantile(0.5, axis=axis, numeric_only=False) + expected = Series( + expected_data, name=0.5, index=Index(expected_index), dtype=expected_dtype + ) + tm.assert_series_equal(result, expected) + + @pytest.mark.parametrize( + "expected_data, expected_index, axis", + [ + [[np.nan, np.nan], range(2), 1], + [[], [], 0], + ], + ) + def test_datelike_numeric_only(self, expected_data, expected_index, axis): + # GH 14564 + df = DataFrame( + { + "a": pd.to_datetime(["2010", "2011"]), + "b": [0, 5], + "c": pd.to_datetime(["2011", "2012"]), + } + ) + result = df[["a", "c"]].quantile(0.5, axis=axis, numeric_only=True) + expected = Series( + expected_data, name=0.5, index=Index(expected_index), dtype=np.float64 + ) + tm.assert_series_equal(result, expected) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/test_rank.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/test_rank.py new file mode 100644 index 0000000000000000000000000000000000000000..8d7a0b373f5f8ad16b8e35167a1c9b71f0d21753 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/test_rank.py @@ -0,0 +1,510 @@ +from datetime import ( + datetime, + timedelta, +) + +import numpy as np +import pytest + +from pandas._libs.algos import ( + Infinity, + NegInfinity, +) + +from pandas import ( + DataFrame, + Index, + Series, +) +import pandas._testing as tm + + +class TestRank: + s = Series([1, 3, 4, 2, np.nan, 2, 1, 5, np.nan, 3]) + df = DataFrame({"A": s, "B": s}) + + results = { + "average": np.array([1.5, 5.5, 7.0, 3.5, np.nan, 3.5, 1.5, 8.0, np.nan, 5.5]), + "min": np.array([1, 5, 7, 3, np.nan, 3, 1, 8, np.nan, 5]), + "max": np.array([2, 6, 7, 4, np.nan, 4, 2, 8, np.nan, 6]), + "first": np.array([1, 5, 7, 3, np.nan, 4, 2, 8, np.nan, 6]), + "dense": np.array([1, 3, 4, 2, np.nan, 2, 1, 5, np.nan, 3]), + } + + @pytest.fixture(params=["average", "min", "max", "first", "dense"]) + def method(self, request): + """ + Fixture for trying all rank methods + """ + return request.param + + def test_rank(self, float_frame): + sp_stats = pytest.importorskip("scipy.stats") + + float_frame.loc[::2, "A"] = np.nan + float_frame.loc[::3, "B"] = np.nan + float_frame.loc[::4, "C"] = np.nan + float_frame.loc[::5, "D"] = np.nan + + ranks0 = float_frame.rank() + ranks1 = float_frame.rank(1) + mask = np.isnan(float_frame.values) + + fvals = float_frame.fillna(np.inf).values + + exp0 = np.apply_along_axis(sp_stats.rankdata, 0, fvals) + exp0[mask] = np.nan + + exp1 = np.apply_along_axis(sp_stats.rankdata, 1, fvals) + exp1[mask] = np.nan + + tm.assert_almost_equal(ranks0.values, exp0) + tm.assert_almost_equal(ranks1.values, exp1) + + # integers + df = DataFrame( + np.random.default_rng(2).integers(0, 5, size=40).reshape((10, 4)) + ) + + result = df.rank() + exp = df.astype(float).rank() + tm.assert_frame_equal(result, exp) + + result = df.rank(1) + exp = df.astype(float).rank(1) + tm.assert_frame_equal(result, exp) + + def test_rank2(self): + df = DataFrame([[1, 3, 2], [1, 2, 3]]) + expected = DataFrame([[1.0, 3.0, 2.0], [1, 2, 3]]) / 3.0 + result = df.rank(1, pct=True) + tm.assert_frame_equal(result, expected) + + df = DataFrame([[1, 3, 2], [1, 2, 3]]) + expected = df.rank(0) / 2.0 + result = df.rank(0, pct=True) + tm.assert_frame_equal(result, expected) + + df = DataFrame([["b", "c", "a"], ["a", "c", "b"]]) + expected = DataFrame([[2.0, 3.0, 1.0], [1, 3, 2]]) + result = df.rank(1, numeric_only=False) + tm.assert_frame_equal(result, expected) + + expected = DataFrame([[2.0, 1.5, 1.0], [1, 1.5, 2]]) + result = df.rank(0, numeric_only=False) + tm.assert_frame_equal(result, expected) + + df = DataFrame([["b", np.nan, "a"], ["a", "c", "b"]]) + expected = DataFrame([[2.0, np.nan, 1.0], [1.0, 3.0, 2.0]]) + result = df.rank(1, numeric_only=False) + tm.assert_frame_equal(result, expected) + + expected = DataFrame([[2.0, np.nan, 1.0], [1.0, 1.0, 2.0]]) + result = df.rank(0, numeric_only=False) + tm.assert_frame_equal(result, expected) + + # f7u12, this does not work without extensive workaround + data = [ + [datetime(2001, 1, 5), np.nan, datetime(2001, 1, 2)], + [datetime(2000, 1, 2), datetime(2000, 1, 3), datetime(2000, 1, 1)], + ] + df = DataFrame(data) + + # check the rank + expected = DataFrame([[2.0, np.nan, 1.0], [2.0, 3.0, 1.0]]) + result = df.rank(1, numeric_only=False, ascending=True) + tm.assert_frame_equal(result, expected) + + expected = DataFrame([[1.0, np.nan, 2.0], [2.0, 1.0, 3.0]]) + result = df.rank(1, numeric_only=False, ascending=False) + tm.assert_frame_equal(result, expected) + + df = DataFrame({"a": [1e-20, -5, 1e-20 + 1e-40, 10, 1e60, 1e80, 1e-30]}) + exp = DataFrame({"a": [3.5, 1.0, 3.5, 5.0, 6.0, 7.0, 2.0]}) + tm.assert_frame_equal(df.rank(), exp) + + def test_rank_does_not_mutate(self): + # GH#18521 + # Check rank does not mutate DataFrame + df = DataFrame( + np.random.default_rng(2).standard_normal((10, 3)), dtype="float64" + ) + expected = df.copy() + df.rank() + result = df + tm.assert_frame_equal(result, expected) + + def test_rank_mixed_frame(self, float_string_frame): + float_string_frame["datetime"] = datetime.now() + float_string_frame["timedelta"] = timedelta(days=1, seconds=1) + + float_string_frame.rank(numeric_only=False) + with pytest.raises(TypeError, match="not supported between instances of"): + float_string_frame.rank(axis=1) + + def test_rank_na_option(self, float_frame): + sp_stats = pytest.importorskip("scipy.stats") + + float_frame.loc[::2, "A"] = np.nan + float_frame.loc[::3, "B"] = np.nan + float_frame.loc[::4, "C"] = np.nan + float_frame.loc[::5, "D"] = np.nan + + # bottom + ranks0 = float_frame.rank(na_option="bottom") + ranks1 = float_frame.rank(1, na_option="bottom") + + fvals = float_frame.fillna(np.inf).values + + exp0 = np.apply_along_axis(sp_stats.rankdata, 0, fvals) + exp1 = np.apply_along_axis(sp_stats.rankdata, 1, fvals) + + tm.assert_almost_equal(ranks0.values, exp0) + tm.assert_almost_equal(ranks1.values, exp1) + + # top + ranks0 = float_frame.rank(na_option="top") + ranks1 = float_frame.rank(1, na_option="top") + + fval0 = float_frame.fillna((float_frame.min() - 1).to_dict()).values + fval1 = float_frame.T + fval1 = fval1.fillna((fval1.min() - 1).to_dict()).T + fval1 = fval1.fillna(np.inf).values + + exp0 = np.apply_along_axis(sp_stats.rankdata, 0, fval0) + exp1 = np.apply_along_axis(sp_stats.rankdata, 1, fval1) + + tm.assert_almost_equal(ranks0.values, exp0) + tm.assert_almost_equal(ranks1.values, exp1) + + # descending + + # bottom + ranks0 = float_frame.rank(na_option="top", ascending=False) + ranks1 = float_frame.rank(1, na_option="top", ascending=False) + + fvals = float_frame.fillna(np.inf).values + + exp0 = np.apply_along_axis(sp_stats.rankdata, 0, -fvals) + exp1 = np.apply_along_axis(sp_stats.rankdata, 1, -fvals) + + tm.assert_almost_equal(ranks0.values, exp0) + tm.assert_almost_equal(ranks1.values, exp1) + + # descending + + # top + ranks0 = float_frame.rank(na_option="bottom", ascending=False) + ranks1 = float_frame.rank(1, na_option="bottom", ascending=False) + + fval0 = float_frame.fillna((float_frame.min() - 1).to_dict()).values + fval1 = float_frame.T + fval1 = fval1.fillna((fval1.min() - 1).to_dict()).T + fval1 = fval1.fillna(np.inf).values + + exp0 = np.apply_along_axis(sp_stats.rankdata, 0, -fval0) + exp1 = np.apply_along_axis(sp_stats.rankdata, 1, -fval1) + + tm.assert_numpy_array_equal(ranks0.values, exp0) + tm.assert_numpy_array_equal(ranks1.values, exp1) + + # bad values throw error + msg = "na_option must be one of 'keep', 'top', or 'bottom'" + + with pytest.raises(ValueError, match=msg): + float_frame.rank(na_option="bad", ascending=False) + + # invalid type + with pytest.raises(ValueError, match=msg): + float_frame.rank(na_option=True, ascending=False) + + def test_rank_axis(self): + # check if using axes' names gives the same result + df = DataFrame([[2, 1], [4, 3]]) + tm.assert_frame_equal(df.rank(axis=0), df.rank(axis="index")) + tm.assert_frame_equal(df.rank(axis=1), df.rank(axis="columns")) + + @pytest.mark.parametrize("ax", [0, 1]) + @pytest.mark.parametrize("m", ["average", "min", "max", "first", "dense"]) + def test_rank_methods_frame(self, ax, m): + sp_stats = pytest.importorskip("scipy.stats") + + xs = np.random.default_rng(2).integers(0, 21, (100, 26)) + xs = (xs - 10.0) / 10.0 + cols = [chr(ord("z") - i) for i in range(xs.shape[1])] + + for vals in [xs, xs + 1e6, xs * 1e-6]: + df = DataFrame(vals, columns=cols) + + result = df.rank(axis=ax, method=m) + sprank = np.apply_along_axis( + sp_stats.rankdata, ax, vals, m if m != "first" else "ordinal" + ) + sprank = sprank.astype(np.float64) + expected = DataFrame(sprank, columns=cols).astype("float64") + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize("dtype", ["O", "f8", "i8"]) + def test_rank_descending(self, method, dtype): + if "i" in dtype: + df = self.df.dropna().astype(dtype) + else: + df = self.df.astype(dtype) + + res = df.rank(ascending=False) + expected = (df.max() - df).rank() + tm.assert_frame_equal(res, expected) + + expected = (df.max() - df).rank(method=method) + + if dtype != "O": + res2 = df.rank(method=method, ascending=False, numeric_only=True) + tm.assert_frame_equal(res2, expected) + + res3 = df.rank(method=method, ascending=False, numeric_only=False) + tm.assert_frame_equal(res3, expected) + + @pytest.mark.parametrize("axis", [0, 1]) + @pytest.mark.parametrize("dtype", [None, object]) + def test_rank_2d_tie_methods(self, method, axis, dtype): + df = self.df + + def _check2d(df, expected, method="average", axis=0): + exp_df = DataFrame({"A": expected, "B": expected}) + + if axis == 1: + df = df.T + exp_df = exp_df.T + + result = df.rank(method=method, axis=axis) + tm.assert_frame_equal(result, exp_df) + + frame = df if dtype is None else df.astype(dtype) + _check2d(frame, self.results[method], method=method, axis=axis) + + @pytest.mark.parametrize( + "method,exp", + [ + ("dense", [[1.0, 1.0, 1.0], [1.0, 0.5, 2.0 / 3], [1.0, 0.5, 1.0 / 3]]), + ( + "min", + [ + [1.0 / 3, 1.0, 1.0], + [1.0 / 3, 1.0 / 3, 2.0 / 3], + [1.0 / 3, 1.0 / 3, 1.0 / 3], + ], + ), + ( + "max", + [[1.0, 1.0, 1.0], [1.0, 2.0 / 3, 2.0 / 3], [1.0, 2.0 / 3, 1.0 / 3]], + ), + ( + "average", + [[2.0 / 3, 1.0, 1.0], [2.0 / 3, 0.5, 2.0 / 3], [2.0 / 3, 0.5, 1.0 / 3]], + ), + ( + "first", + [ + [1.0 / 3, 1.0, 1.0], + [2.0 / 3, 1.0 / 3, 2.0 / 3], + [3.0 / 3, 2.0 / 3, 1.0 / 3], + ], + ), + ], + ) + def test_rank_pct_true(self, method, exp): + # see gh-15630. + + df = DataFrame([[2012, 66, 3], [2012, 65, 2], [2012, 65, 1]]) + result = df.rank(method=method, pct=True) + + expected = DataFrame(exp) + tm.assert_frame_equal(result, expected) + + @pytest.mark.single_cpu + def test_pct_max_many_rows(self): + # GH 18271 + df = DataFrame( + {"A": np.arange(2**24 + 1), "B": np.arange(2**24 + 1, 0, -1)} + ) + result = df.rank(pct=True).max() + assert (result == 1).all() + + @pytest.mark.parametrize( + "contents,dtype", + [ + ( + [ + -np.inf, + -50, + -1, + -1e-20, + -1e-25, + -1e-50, + 0, + 1e-40, + 1e-20, + 1e-10, + 2, + 40, + np.inf, + ], + "float64", + ), + ( + [ + -np.inf, + -50, + -1, + -1e-20, + -1e-25, + -1e-45, + 0, + 1e-40, + 1e-20, + 1e-10, + 2, + 40, + np.inf, + ], + "float32", + ), + ([np.iinfo(np.uint8).min, 1, 2, 100, np.iinfo(np.uint8).max], "uint8"), + ( + [ + np.iinfo(np.int64).min, + -100, + 0, + 1, + 9999, + 100000, + 1e10, + np.iinfo(np.int64).max, + ], + "int64", + ), + ([NegInfinity(), "1", "A", "BA", "Ba", "C", Infinity()], "object"), + ( + [datetime(2001, 1, 1), datetime(2001, 1, 2), datetime(2001, 1, 5)], + "datetime64", + ), + ], + ) + def test_rank_inf_and_nan(self, contents, dtype, frame_or_series): + dtype_na_map = { + "float64": np.nan, + "float32": np.nan, + "object": None, + "datetime64": np.datetime64("nat"), + } + # Insert nans at random positions if underlying dtype has missing + # value. Then adjust the expected order by adding nans accordingly + # This is for testing whether rank calculation is affected + # when values are interwined with nan values. + values = np.array(contents, dtype=dtype) + exp_order = np.array(range(len(values)), dtype="float64") + 1.0 + if dtype in dtype_na_map: + na_value = dtype_na_map[dtype] + nan_indices = np.random.default_rng(2).choice(range(len(values)), 5) + values = np.insert(values, nan_indices, na_value) + exp_order = np.insert(exp_order, nan_indices, np.nan) + + # Shuffle the testing array and expected results in the same way + random_order = np.random.default_rng(2).permutation(len(values)) + obj = frame_or_series(values[random_order]) + expected = frame_or_series(exp_order[random_order], dtype="float64") + result = obj.rank() + tm.assert_equal(result, expected) + + def test_df_series_inf_nan_consistency(self): + # GH#32593 + index = [5, 4, 3, 2, 1, 6, 7, 8, 9, 10] + col1 = [5, 4, 3, 5, 8, 5, 2, 1, 6, 6] + col2 = [5, 4, np.nan, 5, 8, 5, np.inf, np.nan, 6, -np.inf] + df = DataFrame( + data={ + "col1": col1, + "col2": col2, + }, + index=index, + dtype="f8", + ) + df_result = df.rank() + + series_result = df.copy() + series_result["col1"] = df["col1"].rank() + series_result["col2"] = df["col2"].rank() + + tm.assert_frame_equal(df_result, series_result) + + def test_rank_both_inf(self): + # GH#32593 + df = DataFrame({"a": [-np.inf, 0, np.inf]}) + expected = DataFrame({"a": [1.0, 2.0, 3.0]}) + result = df.rank() + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize( + "na_option,ascending,expected", + [ + ("top", True, [3.0, 1.0, 2.0]), + ("top", False, [2.0, 1.0, 3.0]), + ("bottom", True, [2.0, 3.0, 1.0]), + ("bottom", False, [1.0, 3.0, 2.0]), + ], + ) + def test_rank_inf_nans_na_option( + self, frame_or_series, method, na_option, ascending, expected + ): + obj = frame_or_series([np.inf, np.nan, -np.inf]) + result = obj.rank(method=method, na_option=na_option, ascending=ascending) + expected = frame_or_series(expected) + tm.assert_equal(result, expected) + + @pytest.mark.parametrize( + "na_option,ascending,expected", + [ + ("bottom", True, [1.0, 2.0, 4.0, 3.0]), + ("bottom", False, [1.0, 2.0, 4.0, 3.0]), + ("top", True, [2.0, 3.0, 1.0, 4.0]), + ("top", False, [2.0, 3.0, 1.0, 4.0]), + ], + ) + def test_rank_object_first( + self, frame_or_series, na_option, ascending, expected, using_infer_string + ): + obj = frame_or_series(["foo", "foo", None, "foo"]) + result = obj.rank(method="first", na_option=na_option, ascending=ascending) + expected = frame_or_series(expected) + if using_infer_string and isinstance(obj, Series): + expected = expected.astype("uint64") + tm.assert_equal(result, expected) + + @pytest.mark.parametrize( + "data,expected", + [ + ( + {"a": [1, 2, "a"], "b": [4, 5, 6]}, + DataFrame({"b": [1.0, 2.0, 3.0]}, columns=Index(["b"], dtype=object)), + ), + ({"a": [1, 2, "a"]}, DataFrame(index=range(3), columns=[])), + ], + ) + def test_rank_mixed_axis_zero(self, data, expected): + df = DataFrame(data, columns=Index(list(data.keys()), dtype=object)) + with pytest.raises(TypeError, match="'<' not supported between instances of"): + df.rank() + result = df.rank(numeric_only=True) + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize( + "dtype, exp_dtype", + [("string[pyarrow]", "Int64"), ("string[pyarrow_numpy]", "float64")], + ) + def test_rank_string_dtype(self, dtype, exp_dtype): + # GH#55362 + pytest.importorskip("pyarrow") + obj = Series(["foo", "foo", None, "foo"], dtype=dtype) + result = obj.rank(method="first") + expected = Series([1, 2, None, 3], dtype=exp_dtype) + tm.assert_series_equal(result, expected) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/test_reindex.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/test_reindex.py new file mode 100644 index 0000000000000000000000000000000000000000..d862e14ce86cbc2bc0e74e5e9bd768c2f2eb285c --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/test_reindex.py @@ -0,0 +1,1327 @@ +from datetime import ( + datetime, + timedelta, +) +import inspect + +import numpy as np +import pytest + +from pandas._libs.tslibs.timezones import dateutil_gettz as gettz +from pandas.compat import ( + IS64, + is_platform_windows, +) +from pandas.compat.numpy import np_version_gt2 +import pandas.util._test_decorators as td + +import pandas as pd +from pandas import ( + Categorical, + CategoricalIndex, + DataFrame, + Index, + MultiIndex, + Series, + date_range, + isna, +) +import pandas._testing as tm +from pandas.api.types import CategoricalDtype + + +class TestReindexSetIndex: + # Tests that check both reindex and set_index + + def test_dti_set_index_reindex_datetimeindex(self): + # GH#6631 + df = DataFrame(np.random.default_rng(2).random(6)) + idx1 = date_range("2011/01/01", periods=6, freq="ME", tz="US/Eastern") + idx2 = date_range("2013", periods=6, freq="YE", tz="Asia/Tokyo") + + df = df.set_index(idx1) + tm.assert_index_equal(df.index, idx1) + df = df.reindex(idx2) + tm.assert_index_equal(df.index, idx2) + + def test_dti_set_index_reindex_freq_with_tz(self): + # GH#11314 with tz + index = date_range( + datetime(2015, 10, 1), datetime(2015, 10, 1, 23), freq="h", tz="US/Eastern" + ) + df = DataFrame( + np.random.default_rng(2).standard_normal((24, 1)), + columns=["a"], + index=index, + ) + new_index = date_range( + datetime(2015, 10, 2), datetime(2015, 10, 2, 23), freq="h", tz="US/Eastern" + ) + + result = df.set_index(new_index) + assert result.index.freq == index.freq + + def test_set_reset_index_intervalindex(self): + df = DataFrame({"A": range(10)}) + ser = pd.cut(df.A, 5) + df["B"] = ser + df = df.set_index("B") + + df = df.reset_index() + + def test_setitem_reset_index_dtypes(self): + # GH 22060 + df = DataFrame(columns=["a", "b", "c"]).astype( + {"a": "datetime64[ns]", "b": np.int64, "c": np.float64} + ) + df1 = df.set_index(["a"]) + df1["d"] = [] + result = df1.reset_index() + expected = DataFrame(columns=["a", "b", "c", "d"], index=range(0)).astype( + {"a": "datetime64[ns]", "b": np.int64, "c": np.float64, "d": np.float64} + ) + tm.assert_frame_equal(result, expected) + + df2 = df.set_index(["a", "b"]) + df2["d"] = [] + result = df2.reset_index() + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize( + "timezone, year, month, day, hour", + [["America/Chicago", 2013, 11, 3, 1], ["America/Santiago", 2021, 4, 3, 23]], + ) + def test_reindex_timestamp_with_fold(self, timezone, year, month, day, hour): + # see gh-40817 + test_timezone = gettz(timezone) + transition_1 = pd.Timestamp( + year=year, + month=month, + day=day, + hour=hour, + minute=0, + fold=0, + tzinfo=test_timezone, + ) + transition_2 = pd.Timestamp( + year=year, + month=month, + day=day, + hour=hour, + minute=0, + fold=1, + tzinfo=test_timezone, + ) + df = ( + DataFrame({"index": [transition_1, transition_2], "vals": ["a", "b"]}) + .set_index("index") + .reindex(["1", "2"]) + ) + exp = DataFrame({"index": ["1", "2"], "vals": [np.nan, np.nan]}).set_index( + "index" + ) + exp = exp.astype(df.vals.dtype) + tm.assert_frame_equal( + df, + exp, + ) + + +class TestDataFrameSelectReindex: + # These are specific reindex-based tests; other indexing tests should go in + # test_indexing + + @pytest.mark.xfail( + not IS64 or (is_platform_windows() and not np_version_gt2), + reason="Passes int32 values to DatetimeArray in make_na_array on " + "windows, 32bit linux builds", + ) + @td.skip_array_manager_not_yet_implemented + def test_reindex_tzaware_fill_value(self): + # GH#52586 + df = DataFrame([[1]]) + + ts = pd.Timestamp("2023-04-10 17:32", tz="US/Pacific") + res = df.reindex([0, 1], axis=1, fill_value=ts) + assert res.dtypes[1] == pd.DatetimeTZDtype(unit="s", tz="US/Pacific") + expected = DataFrame({0: [1], 1: [ts]}) + expected[1] = expected[1].astype(res.dtypes[1]) + tm.assert_frame_equal(res, expected) + + per = ts.tz_localize(None).to_period("s") + res = df.reindex([0, 1], axis=1, fill_value=per) + assert res.dtypes[1] == pd.PeriodDtype("s") + expected = DataFrame({0: [1], 1: [per]}) + tm.assert_frame_equal(res, expected) + + interval = pd.Interval(ts, ts + pd.Timedelta(seconds=1)) + res = df.reindex([0, 1], axis=1, fill_value=interval) + assert res.dtypes[1] == pd.IntervalDtype("datetime64[s, US/Pacific]", "right") + expected = DataFrame({0: [1], 1: [interval]}) + expected[1] = expected[1].astype(res.dtypes[1]) + tm.assert_frame_equal(res, expected) + + def test_reindex_copies(self): + # based on asv time_reindex_axis1 + N = 10 + df = DataFrame(np.random.default_rng(2).standard_normal((N * 10, N))) + cols = np.arange(N) + np.random.default_rng(2).shuffle(cols) + + result = df.reindex(columns=cols, copy=True) + assert not np.shares_memory(result[0]._values, df[0]._values) + + # pass both columns and index + result2 = df.reindex(columns=cols, index=df.index, copy=True) + assert not np.shares_memory(result2[0]._values, df[0]._values) + + def test_reindex_copies_ea(self, using_copy_on_write): + # https://github.com/pandas-dev/pandas/pull/51197 + # also ensure to honor copy keyword for ExtensionDtypes + N = 10 + df = DataFrame( + np.random.default_rng(2).standard_normal((N * 10, N)), dtype="Float64" + ) + cols = np.arange(N) + np.random.default_rng(2).shuffle(cols) + + result = df.reindex(columns=cols, copy=True) + if using_copy_on_write: + assert np.shares_memory(result[0].array._data, df[0].array._data) + else: + assert not np.shares_memory(result[0].array._data, df[0].array._data) + + # pass both columns and index + result2 = df.reindex(columns=cols, index=df.index, copy=True) + if using_copy_on_write: + assert np.shares_memory(result2[0].array._data, df[0].array._data) + else: + assert not np.shares_memory(result2[0].array._data, df[0].array._data) + + @td.skip_array_manager_not_yet_implemented + def test_reindex_date_fill_value(self): + # passing date to dt64 is deprecated; enforced in 2.0 to cast to object + arr = date_range("2016-01-01", periods=6).values.reshape(3, 2) + df = DataFrame(arr, columns=["A", "B"], index=range(3)) + + ts = df.iloc[0, 0] + fv = ts.date() + + res = df.reindex(index=range(4), columns=["A", "B", "C"], fill_value=fv) + + expected = DataFrame( + {"A": df["A"].tolist() + [fv], "B": df["B"].tolist() + [fv], "C": [fv] * 4}, + dtype=object, + ) + tm.assert_frame_equal(res, expected) + + # only reindexing rows + res = df.reindex(index=range(4), fill_value=fv) + tm.assert_frame_equal(res, expected[["A", "B"]]) + + # same with a datetime-castable str + res = df.reindex( + index=range(4), columns=["A", "B", "C"], fill_value="2016-01-01" + ) + expected = DataFrame( + {"A": df["A"].tolist() + [ts], "B": df["B"].tolist() + [ts], "C": [ts] * 4}, + ) + tm.assert_frame_equal(res, expected) + + def test_reindex_with_multi_index(self): + # https://github.com/pandas-dev/pandas/issues/29896 + # tests for reindexing a multi-indexed DataFrame with a new MultiIndex + # + # confirms that we can reindex a multi-indexed DataFrame with a new + # MultiIndex object correctly when using no filling, backfilling, and + # padding + # + # The DataFrame, `df`, used in this test is: + # c + # a b + # -1 0 A + # 1 B + # 2 C + # 3 D + # 4 E + # 5 F + # 6 G + # 0 0 A + # 1 B + # 2 C + # 3 D + # 4 E + # 5 F + # 6 G + # 1 0 A + # 1 B + # 2 C + # 3 D + # 4 E + # 5 F + # 6 G + # + # and the other MultiIndex, `new_multi_index`, is: + # 0: 0 0.5 + # 1: 2.0 + # 2: 5.0 + # 3: 5.8 + df = DataFrame( + { + "a": [-1] * 7 + [0] * 7 + [1] * 7, + "b": list(range(7)) * 3, + "c": ["A", "B", "C", "D", "E", "F", "G"] * 3, + } + ).set_index(["a", "b"]) + new_index = [0.5, 2.0, 5.0, 5.8] + new_multi_index = MultiIndex.from_product([[0], new_index], names=["a", "b"]) + + # reindexing w/o a `method` value + reindexed = df.reindex(new_multi_index) + expected = DataFrame( + {"a": [0] * 4, "b": new_index, "c": [np.nan, "C", "F", np.nan]} + ).set_index(["a", "b"]) + tm.assert_frame_equal(expected, reindexed) + + # reindexing with backfilling + expected = DataFrame( + {"a": [0] * 4, "b": new_index, "c": ["B", "C", "F", "G"]} + ).set_index(["a", "b"]) + reindexed_with_backfilling = df.reindex(new_multi_index, method="bfill") + tm.assert_frame_equal(expected, reindexed_with_backfilling) + + reindexed_with_backfilling = df.reindex(new_multi_index, method="backfill") + tm.assert_frame_equal(expected, reindexed_with_backfilling) + + # reindexing with padding + expected = DataFrame( + {"a": [0] * 4, "b": new_index, "c": ["A", "C", "F", "F"]} + ).set_index(["a", "b"]) + reindexed_with_padding = df.reindex(new_multi_index, method="pad") + tm.assert_frame_equal(expected, reindexed_with_padding) + + reindexed_with_padding = df.reindex(new_multi_index, method="ffill") + tm.assert_frame_equal(expected, reindexed_with_padding) + + @pytest.mark.parametrize( + "method,expected_values", + [ + ("nearest", [0, 1, 1, 2]), + ("pad", [np.nan, 0, 1, 1]), + ("backfill", [0, 1, 2, 2]), + ], + ) + def test_reindex_methods(self, method, expected_values): + df = DataFrame({"x": list(range(5))}) + target = np.array([-0.1, 0.9, 1.1, 1.5]) + + expected = DataFrame({"x": expected_values}, index=target) + actual = df.reindex(target, method=method) + tm.assert_frame_equal(expected, actual) + + actual = df.reindex(target, method=method, tolerance=1) + tm.assert_frame_equal(expected, actual) + actual = df.reindex(target, method=method, tolerance=[1, 1, 1, 1]) + tm.assert_frame_equal(expected, actual) + + e2 = expected[::-1] + actual = df.reindex(target[::-1], method=method) + tm.assert_frame_equal(e2, actual) + + new_order = [3, 0, 2, 1] + e2 = expected.iloc[new_order] + actual = df.reindex(target[new_order], method=method) + tm.assert_frame_equal(e2, actual) + + switched_method = ( + "pad" if method == "backfill" else "backfill" if method == "pad" else method + ) + actual = df[::-1].reindex(target, method=switched_method) + tm.assert_frame_equal(expected, actual) + + def test_reindex_methods_nearest_special(self): + df = DataFrame({"x": list(range(5))}) + target = np.array([-0.1, 0.9, 1.1, 1.5]) + + expected = DataFrame({"x": [0, 1, 1, np.nan]}, index=target) + actual = df.reindex(target, method="nearest", tolerance=0.2) + tm.assert_frame_equal(expected, actual) + + expected = DataFrame({"x": [0, np.nan, 1, np.nan]}, index=target) + actual = df.reindex(target, method="nearest", tolerance=[0.5, 0.01, 0.4, 0.1]) + tm.assert_frame_equal(expected, actual) + + def test_reindex_nearest_tz(self, tz_aware_fixture): + # GH26683 + tz = tz_aware_fixture + idx = date_range("2019-01-01", periods=5, tz=tz) + df = DataFrame({"x": list(range(5))}, index=idx) + + expected = df.head(3) + actual = df.reindex(idx[:3], method="nearest") + tm.assert_frame_equal(expected, actual) + + def test_reindex_nearest_tz_empty_frame(self): + # https://github.com/pandas-dev/pandas/issues/31964 + dti = pd.DatetimeIndex(["2016-06-26 14:27:26+00:00"]) + df = DataFrame(index=pd.DatetimeIndex(["2016-07-04 14:00:59+00:00"])) + expected = DataFrame(index=dti) + result = df.reindex(dti, method="nearest") + tm.assert_frame_equal(result, expected) + + def test_reindex_frame_add_nat(self): + rng = date_range("1/1/2000 00:00:00", periods=10, freq="10s") + df = DataFrame( + {"A": np.random.default_rng(2).standard_normal(len(rng)), "B": rng} + ) + + result = df.reindex(range(15)) + assert np.issubdtype(result["B"].dtype, np.dtype("M8[ns]")) + + mask = isna(result)["B"] + assert mask[-5:].all() + assert not mask[:-5].any() + + @pytest.mark.parametrize( + "method, exp_values", + [("ffill", [0, 1, 2, 3]), ("bfill", [1.0, 2.0, 3.0, np.nan])], + ) + def test_reindex_frame_tz_ffill_bfill(self, frame_or_series, method, exp_values): + # GH#38566 + obj = frame_or_series( + [0, 1, 2, 3], + index=date_range("2020-01-01 00:00:00", periods=4, freq="h", tz="UTC"), + ) + new_index = date_range("2020-01-01 00:01:00", periods=4, freq="h", tz="UTC") + result = obj.reindex(new_index, method=method, tolerance=pd.Timedelta("1 hour")) + expected = frame_or_series(exp_values, index=new_index) + tm.assert_equal(result, expected) + + def test_reindex_limit(self): + # GH 28631 + data = [["A", "A", "A"], ["B", "B", "B"], ["C", "C", "C"], ["D", "D", "D"]] + exp_data = [ + ["A", "A", "A"], + ["B", "B", "B"], + ["C", "C", "C"], + ["D", "D", "D"], + ["D", "D", "D"], + [np.nan, np.nan, np.nan], + ] + df = DataFrame(data) + result = df.reindex([0, 1, 2, 3, 4, 5], method="ffill", limit=1) + expected = DataFrame(exp_data) + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize( + "idx, check_index_type", + [ + [["C", "B", "A"], True], + [["F", "C", "A", "D"], True], + [["A"], True], + [["A", "B", "C"], True], + [["C", "A", "B"], True], + [["C", "B"], True], + [["C", "A"], True], + [["A", "B"], True], + [["B", "A", "C"], True], + # reindex by these causes different MultiIndex levels + [["D", "F"], False], + [["A", "C", "B"], False], + ], + ) + def test_reindex_level_verify_first_level(self, idx, check_index_type): + df = DataFrame( + { + "jim": list("B" * 4 + "A" * 2 + "C" * 3), + "joe": list("abcdeabcd")[::-1], + "jolie": [10, 20, 30] * 3, + "joline": np.random.default_rng(2).integers(0, 1000, 9), + } + ) + icol = ["jim", "joe", "jolie"] + + def f(val): + return np.nonzero((df["jim"] == val).to_numpy())[0] + + i = np.concatenate(list(map(f, idx))) + left = df.set_index(icol).reindex(idx, level="jim") + right = df.iloc[i].set_index(icol) + tm.assert_frame_equal(left, right, check_index_type=check_index_type) + + @pytest.mark.parametrize( + "idx", + [ + ("mid",), + ("mid", "btm"), + ("mid", "btm", "top"), + ("mid",), + ("mid", "top"), + ("mid", "top", "btm"), + ("btm",), + ("btm", "mid"), + ("btm", "mid", "top"), + ("btm",), + ("btm", "top"), + ("btm", "top", "mid"), + ("top",), + ("top", "mid"), + ("top", "mid", "btm"), + ("top",), + ("top", "btm"), + ("top", "btm", "mid"), + ], + ) + def test_reindex_level_verify_first_level_repeats(self, idx): + df = DataFrame( + { + "jim": ["mid"] * 5 + ["btm"] * 8 + ["top"] * 7, + "joe": ["3rd"] * 2 + + ["1st"] * 3 + + ["2nd"] * 3 + + ["1st"] * 2 + + ["3rd"] * 3 + + ["1st"] * 2 + + ["3rd"] * 3 + + ["2nd"] * 2, + # this needs to be jointly unique with jim and joe or + # reindexing will fail ~1.5% of the time, this works + # out to needing unique groups of same size as joe + "jolie": np.concatenate( + [ + np.random.default_rng(2).choice(1000, x, replace=False) + for x in [2, 3, 3, 2, 3, 2, 3, 2] + ] + ), + "joline": np.random.default_rng(2).standard_normal(20).round(3) * 10, + } + ) + icol = ["jim", "joe", "jolie"] + + def f(val): + return np.nonzero((df["jim"] == val).to_numpy())[0] + + i = np.concatenate(list(map(f, idx))) + left = df.set_index(icol).reindex(idx, level="jim") + right = df.iloc[i].set_index(icol) + tm.assert_frame_equal(left, right) + + @pytest.mark.parametrize( + "idx, indexer", + [ + [ + ["1st", "2nd", "3rd"], + [2, 3, 4, 0, 1, 8, 9, 5, 6, 7, 10, 11, 12, 13, 14, 18, 19, 15, 16, 17], + ], + [ + ["3rd", "2nd", "1st"], + [0, 1, 2, 3, 4, 10, 11, 12, 5, 6, 7, 8, 9, 15, 16, 17, 18, 19, 13, 14], + ], + [["2nd", "3rd"], [0, 1, 5, 6, 7, 10, 11, 12, 18, 19, 15, 16, 17]], + [["3rd", "1st"], [0, 1, 2, 3, 4, 10, 11, 12, 8, 9, 15, 16, 17, 13, 14]], + ], + ) + def test_reindex_level_verify_repeats(self, idx, indexer): + df = DataFrame( + { + "jim": ["mid"] * 5 + ["btm"] * 8 + ["top"] * 7, + "joe": ["3rd"] * 2 + + ["1st"] * 3 + + ["2nd"] * 3 + + ["1st"] * 2 + + ["3rd"] * 3 + + ["1st"] * 2 + + ["3rd"] * 3 + + ["2nd"] * 2, + # this needs to be jointly unique with jim and joe or + # reindexing will fail ~1.5% of the time, this works + # out to needing unique groups of same size as joe + "jolie": np.concatenate( + [ + np.random.default_rng(2).choice(1000, x, replace=False) + for x in [2, 3, 3, 2, 3, 2, 3, 2] + ] + ), + "joline": np.random.default_rng(2).standard_normal(20).round(3) * 10, + } + ) + icol = ["jim", "joe", "jolie"] + left = df.set_index(icol).reindex(idx, level="joe") + right = df.iloc[indexer].set_index(icol) + tm.assert_frame_equal(left, right) + + @pytest.mark.parametrize( + "idx, indexer, check_index_type", + [ + [list("abcde"), [3, 2, 1, 0, 5, 4, 8, 7, 6], True], + [list("abcd"), [3, 2, 1, 0, 5, 8, 7, 6], True], + [list("abc"), [3, 2, 1, 8, 7, 6], True], + [list("eca"), [1, 3, 4, 6, 8], True], + [list("edc"), [0, 1, 4, 5, 6], True], + [list("eadbc"), [3, 0, 2, 1, 4, 5, 8, 7, 6], True], + [list("edwq"), [0, 4, 5], True], + [list("wq"), [], False], + ], + ) + def test_reindex_level_verify(self, idx, indexer, check_index_type): + df = DataFrame( + { + "jim": list("B" * 4 + "A" * 2 + "C" * 3), + "joe": list("abcdeabcd")[::-1], + "jolie": [10, 20, 30] * 3, + "joline": np.random.default_rng(2).integers(0, 1000, 9), + } + ) + icol = ["jim", "joe", "jolie"] + left = df.set_index(icol).reindex(idx, level="joe") + right = df.iloc[indexer].set_index(icol) + tm.assert_frame_equal(left, right, check_index_type=check_index_type) + + def test_non_monotonic_reindex_methods(self): + dr = date_range("2013-08-01", periods=6, freq="B") + data = np.random.default_rng(2).standard_normal((6, 1)) + df = DataFrame(data, index=dr, columns=list("A")) + df_rev = DataFrame(data, index=dr[[3, 4, 5] + [0, 1, 2]], columns=list("A")) + # index is not monotonic increasing or decreasing + msg = "index must be monotonic increasing or decreasing" + with pytest.raises(ValueError, match=msg): + df_rev.reindex(df.index, method="pad") + with pytest.raises(ValueError, match=msg): + df_rev.reindex(df.index, method="ffill") + with pytest.raises(ValueError, match=msg): + df_rev.reindex(df.index, method="bfill") + with pytest.raises(ValueError, match=msg): + df_rev.reindex(df.index, method="nearest") + + def test_reindex_sparse(self): + # https://github.com/pandas-dev/pandas/issues/35286 + df = DataFrame( + {"A": [0, 1], "B": pd.array([0, 1], dtype=pd.SparseDtype("int64", 0))} + ) + result = df.reindex([0, 2]) + expected = DataFrame( + { + "A": [0.0, np.nan], + "B": pd.array([0.0, np.nan], dtype=pd.SparseDtype("float64", 0.0)), + }, + index=[0, 2], + ) + tm.assert_frame_equal(result, expected) + + def test_reindex(self, float_frame, using_copy_on_write): + datetime_series = Series( + np.arange(30, dtype=np.float64), index=date_range("2020-01-01", periods=30) + ) + + newFrame = float_frame.reindex(datetime_series.index) + + for col in newFrame.columns: + for idx, val in newFrame[col].items(): + if idx in float_frame.index: + if np.isnan(val): + assert np.isnan(float_frame[col][idx]) + else: + assert val == float_frame[col][idx] + else: + assert np.isnan(val) + + for col, series in newFrame.items(): + tm.assert_index_equal(series.index, newFrame.index) + emptyFrame = float_frame.reindex(Index([])) + assert len(emptyFrame.index) == 0 + + # Cython code should be unit-tested directly + nonContigFrame = float_frame.reindex(datetime_series.index[::2]) + + for col in nonContigFrame.columns: + for idx, val in nonContigFrame[col].items(): + if idx in float_frame.index: + if np.isnan(val): + assert np.isnan(float_frame[col][idx]) + else: + assert val == float_frame[col][idx] + else: + assert np.isnan(val) + + for col, series in nonContigFrame.items(): + tm.assert_index_equal(series.index, nonContigFrame.index) + + # corner cases + + # Same index, copies values but not index if copy=False + newFrame = float_frame.reindex(float_frame.index, copy=False) + if using_copy_on_write: + assert newFrame.index.is_(float_frame.index) + else: + assert newFrame.index is float_frame.index + + # length zero + newFrame = float_frame.reindex([]) + assert newFrame.empty + assert len(newFrame.columns) == len(float_frame.columns) + + # length zero with columns reindexed with non-empty index + newFrame = float_frame.reindex([]) + newFrame = newFrame.reindex(float_frame.index) + assert len(newFrame.index) == len(float_frame.index) + assert len(newFrame.columns) == len(float_frame.columns) + + # pass non-Index + newFrame = float_frame.reindex(list(datetime_series.index)) + expected = datetime_series.index._with_freq(None) + tm.assert_index_equal(newFrame.index, expected) + + # copy with no axes + result = float_frame.reindex() + tm.assert_frame_equal(result, float_frame) + assert result is not float_frame + + def test_reindex_nan(self): + df = DataFrame( + [[1, 2], [3, 5], [7, 11], [9, 23]], + index=[2, np.nan, 1, 5], + columns=["joe", "jim"], + ) + + i, j = [np.nan, 5, 5, np.nan, 1, 2, np.nan], [1, 3, 3, 1, 2, 0, 1] + tm.assert_frame_equal(df.reindex(i), df.iloc[j]) + + df.index = df.index.astype("object") + tm.assert_frame_equal(df.reindex(i), df.iloc[j], check_index_type=False) + + # GH10388 + df = DataFrame( + { + "other": ["a", "b", np.nan, "c"], + "date": ["2015-03-22", np.nan, "2012-01-08", np.nan], + "amount": [2, 3, 4, 5], + } + ) + + df["date"] = pd.to_datetime(df.date) + df["delta"] = (pd.to_datetime("2015-06-18") - df["date"]).shift(1) + + left = df.set_index(["delta", "other", "date"]).reset_index() + right = df.reindex(columns=["delta", "other", "date", "amount"]) + tm.assert_frame_equal(left, right) + + def test_reindex_name_remains(self): + s = Series(np.random.default_rng(2).random(10)) + df = DataFrame(s, index=np.arange(len(s))) + i = Series(np.arange(10), name="iname") + + df = df.reindex(i) + assert df.index.name == "iname" + + df = df.reindex(Index(np.arange(10), name="tmpname")) + assert df.index.name == "tmpname" + + s = Series(np.random.default_rng(2).random(10)) + df = DataFrame(s.T, index=np.arange(len(s))) + i = Series(np.arange(10), name="iname") + df = df.reindex(columns=i) + assert df.columns.name == "iname" + + def test_reindex_int(self, int_frame): + smaller = int_frame.reindex(int_frame.index[::2]) + + assert smaller["A"].dtype == np.int64 + + bigger = smaller.reindex(int_frame.index) + assert bigger["A"].dtype == np.float64 + + smaller = int_frame.reindex(columns=["A", "B"]) + assert smaller["A"].dtype == np.int64 + + def test_reindex_columns(self, float_frame): + new_frame = float_frame.reindex(columns=["A", "B", "E"]) + + tm.assert_series_equal(new_frame["B"], float_frame["B"]) + assert np.isnan(new_frame["E"]).all() + assert "C" not in new_frame + + # Length zero + new_frame = float_frame.reindex(columns=[]) + assert new_frame.empty + + def test_reindex_columns_method(self): + # GH 14992, reindexing over columns ignored method + df = DataFrame( + data=[[11, 12, 13], [21, 22, 23], [31, 32, 33]], + index=[1, 2, 4], + columns=[1, 2, 4], + dtype=float, + ) + + # default method + result = df.reindex(columns=range(6)) + expected = DataFrame( + data=[ + [np.nan, 11, 12, np.nan, 13, np.nan], + [np.nan, 21, 22, np.nan, 23, np.nan], + [np.nan, 31, 32, np.nan, 33, np.nan], + ], + index=[1, 2, 4], + columns=range(6), + dtype=float, + ) + tm.assert_frame_equal(result, expected) + + # method='ffill' + result = df.reindex(columns=range(6), method="ffill") + expected = DataFrame( + data=[ + [np.nan, 11, 12, 12, 13, 13], + [np.nan, 21, 22, 22, 23, 23], + [np.nan, 31, 32, 32, 33, 33], + ], + index=[1, 2, 4], + columns=range(6), + dtype=float, + ) + tm.assert_frame_equal(result, expected) + + # method='bfill' + result = df.reindex(columns=range(6), method="bfill") + expected = DataFrame( + data=[ + [11, 11, 12, 13, 13, np.nan], + [21, 21, 22, 23, 23, np.nan], + [31, 31, 32, 33, 33, np.nan], + ], + index=[1, 2, 4], + columns=range(6), + dtype=float, + ) + tm.assert_frame_equal(result, expected) + + def test_reindex_axes(self): + # GH 3317, reindexing by both axes loses freq of the index + df = DataFrame( + np.ones((3, 3)), + index=[datetime(2012, 1, 1), datetime(2012, 1, 2), datetime(2012, 1, 3)], + columns=["a", "b", "c"], + ) + time_freq = date_range("2012-01-01", "2012-01-03", freq="d") + some_cols = ["a", "b"] + + index_freq = df.reindex(index=time_freq).index.freq + both_freq = df.reindex(index=time_freq, columns=some_cols).index.freq + seq_freq = df.reindex(index=time_freq).reindex(columns=some_cols).index.freq + assert index_freq == both_freq + assert index_freq == seq_freq + + def test_reindex_fill_value(self): + df = DataFrame(np.random.default_rng(2).standard_normal((10, 4))) + + # axis=0 + result = df.reindex(list(range(15))) + assert np.isnan(result.values[-5:]).all() + + result = df.reindex(range(15), fill_value=0) + expected = df.reindex(range(15)).fillna(0) + tm.assert_frame_equal(result, expected) + + # axis=1 + result = df.reindex(columns=range(5), fill_value=0.0) + expected = df.copy() + expected[4] = 0.0 + tm.assert_frame_equal(result, expected) + + result = df.reindex(columns=range(5), fill_value=0) + expected = df.copy() + expected[4] = 0 + tm.assert_frame_equal(result, expected) + + result = df.reindex(columns=range(5), fill_value="foo") + expected = df.copy() + expected[4] = "foo" + tm.assert_frame_equal(result, expected) + + # other dtypes + df["foo"] = "foo" + result = df.reindex(range(15), fill_value="0") + expected = df.reindex(range(15)).fillna("0") + tm.assert_frame_equal(result, expected) + + def test_reindex_uint_dtypes_fill_value(self, any_unsigned_int_numpy_dtype): + # GH#48184 + df = DataFrame({"a": [1, 2], "b": [1, 2]}, dtype=any_unsigned_int_numpy_dtype) + result = df.reindex(columns=list("abcd"), index=[0, 1, 2, 3], fill_value=10) + expected = DataFrame( + {"a": [1, 2, 10, 10], "b": [1, 2, 10, 10], "c": 10, "d": 10}, + dtype=any_unsigned_int_numpy_dtype, + ) + tm.assert_frame_equal(result, expected) + + def test_reindex_single_column_ea_index_and_columns(self, any_numeric_ea_dtype): + # GH#48190 + df = DataFrame({"a": [1, 2]}, dtype=any_numeric_ea_dtype) + result = df.reindex(columns=list("ab"), index=[0, 1, 2], fill_value=10) + expected = DataFrame( + {"a": Series([1, 2, 10], dtype=any_numeric_ea_dtype), "b": 10} + ) + tm.assert_frame_equal(result, expected) + + def test_reindex_dups(self): + # GH4746, reindex on duplicate index error messages + arr = np.random.default_rng(2).standard_normal(10) + df = DataFrame(arr, index=[1, 2, 3, 4, 5, 1, 2, 3, 4, 5]) + + # set index is ok + result = df.copy() + result.index = list(range(len(df))) + expected = DataFrame(arr, index=list(range(len(df)))) + tm.assert_frame_equal(result, expected) + + # reindex fails + msg = "cannot reindex on an axis with duplicate labels" + with pytest.raises(ValueError, match=msg): + df.reindex(index=list(range(len(df)))) + + def test_reindex_with_duplicate_columns(self): + # reindex is invalid! + df = DataFrame( + [[1, 5, 7.0], [1, 5, 7.0], [1, 5, 7.0]], columns=["bar", "a", "a"] + ) + msg = "cannot reindex on an axis with duplicate labels" + with pytest.raises(ValueError, match=msg): + df.reindex(columns=["bar"]) + with pytest.raises(ValueError, match=msg): + df.reindex(columns=["bar", "foo"]) + + def test_reindex_axis_style(self): + # https://github.com/pandas-dev/pandas/issues/12392 + df = DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]}) + expected = DataFrame( + {"A": [1, 2, np.nan], "B": [4, 5, np.nan]}, index=[0, 1, 3] + ) + result = df.reindex([0, 1, 3]) + tm.assert_frame_equal(result, expected) + + result = df.reindex([0, 1, 3], axis=0) + tm.assert_frame_equal(result, expected) + + result = df.reindex([0, 1, 3], axis="index") + tm.assert_frame_equal(result, expected) + + def test_reindex_positional_raises(self): + # https://github.com/pandas-dev/pandas/issues/12392 + # Enforced in 2.0 + df = DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]}) + msg = r"reindex\(\) takes from 1 to 2 positional arguments but 3 were given" + with pytest.raises(TypeError, match=msg): + df.reindex([0, 1], ["A", "B", "C"]) + + def test_reindex_axis_style_raises(self): + # https://github.com/pandas-dev/pandas/issues/12392 + df = DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]}) + with pytest.raises(TypeError, match="Cannot specify both 'axis'"): + df.reindex([0, 1], columns=["A"], axis=1) + + with pytest.raises(TypeError, match="Cannot specify both 'axis'"): + df.reindex([0, 1], columns=["A"], axis="index") + + with pytest.raises(TypeError, match="Cannot specify both 'axis'"): + df.reindex(index=[0, 1], axis="index") + + with pytest.raises(TypeError, match="Cannot specify both 'axis'"): + df.reindex(index=[0, 1], axis="columns") + + with pytest.raises(TypeError, match="Cannot specify both 'axis'"): + df.reindex(columns=[0, 1], axis="columns") + + with pytest.raises(TypeError, match="Cannot specify both 'axis'"): + df.reindex(index=[0, 1], columns=[0, 1], axis="columns") + + with pytest.raises(TypeError, match="Cannot specify all"): + df.reindex(labels=[0, 1], index=[0], columns=["A"]) + + # Mixing styles + with pytest.raises(TypeError, match="Cannot specify both 'axis'"): + df.reindex(index=[0, 1], axis="index") + + with pytest.raises(TypeError, match="Cannot specify both 'axis'"): + df.reindex(index=[0, 1], axis="columns") + + # Duplicates + with pytest.raises(TypeError, match="multiple values"): + df.reindex([0, 1], labels=[0, 1]) + + def test_reindex_single_named_indexer(self): + # https://github.com/pandas-dev/pandas/issues/12392 + df = DataFrame({"A": [1, 2, 3], "B": [1, 2, 3]}) + result = df.reindex([0, 1], columns=["A"]) + expected = DataFrame({"A": [1, 2]}) + tm.assert_frame_equal(result, expected) + + def test_reindex_api_equivalence(self): + # https://github.com/pandas-dev/pandas/issues/12392 + # equivalence of the labels/axis and index/columns API's + df = DataFrame( + [[1, 2, 3], [3, 4, 5], [5, 6, 7]], + index=["a", "b", "c"], + columns=["d", "e", "f"], + ) + + res1 = df.reindex(["b", "a"]) + res2 = df.reindex(index=["b", "a"]) + res3 = df.reindex(labels=["b", "a"]) + res4 = df.reindex(labels=["b", "a"], axis=0) + res5 = df.reindex(["b", "a"], axis=0) + for res in [res2, res3, res4, res5]: + tm.assert_frame_equal(res1, res) + + res1 = df.reindex(columns=["e", "d"]) + res2 = df.reindex(["e", "d"], axis=1) + res3 = df.reindex(labels=["e", "d"], axis=1) + for res in [res2, res3]: + tm.assert_frame_equal(res1, res) + + res1 = df.reindex(index=["b", "a"], columns=["e", "d"]) + res2 = df.reindex(columns=["e", "d"], index=["b", "a"]) + res3 = df.reindex(labels=["b", "a"], axis=0).reindex(labels=["e", "d"], axis=1) + for res in [res2, res3]: + tm.assert_frame_equal(res1, res) + + def test_reindex_boolean(self): + frame = DataFrame( + np.ones((10, 2), dtype=bool), index=np.arange(0, 20, 2), columns=[0, 2] + ) + + reindexed = frame.reindex(np.arange(10)) + assert reindexed.values.dtype == np.object_ + assert isna(reindexed[0][1]) + + reindexed = frame.reindex(columns=range(3)) + assert reindexed.values.dtype == np.object_ + assert isna(reindexed[1]).all() + + def test_reindex_objects(self, float_string_frame): + reindexed = float_string_frame.reindex(columns=["foo", "A", "B"]) + assert "foo" in reindexed + + reindexed = float_string_frame.reindex(columns=["A", "B"]) + assert "foo" not in reindexed + + def test_reindex_corner(self, int_frame): + index = Index(["a", "b", "c"]) + dm = DataFrame({}).reindex(index=[1, 2, 3]) + reindexed = dm.reindex(columns=index) + tm.assert_index_equal(reindexed.columns, index) + + # ints are weird + smaller = int_frame.reindex(columns=["A", "B", "E"]) + assert smaller["E"].dtype == np.float64 + + def test_reindex_with_nans(self): + df = DataFrame( + [[1, 2], [3, 4], [np.nan, np.nan], [7, 8], [9, 10]], + columns=["a", "b"], + index=[100.0, 101.0, np.nan, 102.0, 103.0], + ) + + result = df.reindex(index=[101.0, 102.0, 103.0]) + expected = df.iloc[[1, 3, 4]] + tm.assert_frame_equal(result, expected) + + result = df.reindex(index=[103.0]) + expected = df.iloc[[4]] + tm.assert_frame_equal(result, expected) + + result = df.reindex(index=[101.0]) + expected = df.iloc[[1]] + tm.assert_frame_equal(result, expected) + + def test_reindex_multi(self): + df = DataFrame(np.random.default_rng(2).standard_normal((3, 3))) + + result = df.reindex(index=range(4), columns=range(4)) + expected = df.reindex(list(range(4))).reindex(columns=range(4)) + + tm.assert_frame_equal(result, expected) + + df = DataFrame(np.random.default_rng(2).integers(0, 10, (3, 3))) + + result = df.reindex(index=range(4), columns=range(4)) + expected = df.reindex(list(range(4))).reindex(columns=range(4)) + + tm.assert_frame_equal(result, expected) + + df = DataFrame(np.random.default_rng(2).integers(0, 10, (3, 3))) + + result = df.reindex(index=range(2), columns=range(2)) + expected = df.reindex(range(2)).reindex(columns=range(2)) + + tm.assert_frame_equal(result, expected) + + df = DataFrame( + np.random.default_rng(2).standard_normal((5, 3)) + 1j, + columns=["a", "b", "c"], + ) + + result = df.reindex(index=[0, 1], columns=["a", "b"]) + expected = df.reindex([0, 1]).reindex(columns=["a", "b"]) + + tm.assert_frame_equal(result, expected) + + def test_reindex_multi_categorical_time(self): + # https://github.com/pandas-dev/pandas/issues/21390 + midx = MultiIndex.from_product( + [ + Categorical(["a", "b", "c"]), + Categorical(date_range("2012-01-01", periods=3, freq="h")), + ] + ) + df = DataFrame({"a": range(len(midx))}, index=midx) + df2 = df.iloc[[0, 1, 2, 3, 4, 5, 6, 8]] + + result = df2.reindex(midx) + expected = DataFrame({"a": [0, 1, 2, 3, 4, 5, 6, np.nan, 8]}, index=midx) + tm.assert_frame_equal(result, expected) + + def test_reindex_with_categoricalindex(self): + df = DataFrame( + { + "A": np.arange(3, dtype="int64"), + }, + index=CategoricalIndex( + list("abc"), dtype=CategoricalDtype(list("cabe")), name="B" + ), + ) + + # reindexing + # convert to a regular index + result = df.reindex(["a", "b", "e"]) + expected = DataFrame({"A": [0, 1, np.nan], "B": Series(list("abe"))}).set_index( + "B" + ) + tm.assert_frame_equal(result, expected, check_index_type=True) + + result = df.reindex(["a", "b"]) + expected = DataFrame({"A": [0, 1], "B": Series(list("ab"))}).set_index("B") + tm.assert_frame_equal(result, expected, check_index_type=True) + + result = df.reindex(["e"]) + expected = DataFrame({"A": [np.nan], "B": Series(["e"])}).set_index("B") + tm.assert_frame_equal(result, expected, check_index_type=True) + + result = df.reindex(["d"]) + expected = DataFrame({"A": [np.nan], "B": Series(["d"])}).set_index("B") + tm.assert_frame_equal(result, expected, check_index_type=True) + + # since we are actually reindexing with a Categorical + # then return a Categorical + cats = list("cabe") + + result = df.reindex(Categorical(["a", "e"], categories=cats)) + expected = DataFrame( + {"A": [0, np.nan], "B": Series(list("ae")).astype(CategoricalDtype(cats))} + ).set_index("B") + tm.assert_frame_equal(result, expected, check_index_type=True) + + result = df.reindex(Categorical(["a"], categories=cats)) + expected = DataFrame( + {"A": [0], "B": Series(list("a")).astype(CategoricalDtype(cats))} + ).set_index("B") + tm.assert_frame_equal(result, expected, check_index_type=True) + + result = df.reindex(["a", "b", "e"]) + expected = DataFrame({"A": [0, 1, np.nan], "B": Series(list("abe"))}).set_index( + "B" + ) + tm.assert_frame_equal(result, expected, check_index_type=True) + + result = df.reindex(["a", "b"]) + expected = DataFrame({"A": [0, 1], "B": Series(list("ab"))}).set_index("B") + tm.assert_frame_equal(result, expected, check_index_type=True) + + result = df.reindex(["e"]) + expected = DataFrame({"A": [np.nan], "B": Series(["e"])}).set_index("B") + tm.assert_frame_equal(result, expected, check_index_type=True) + + # give back the type of categorical that we received + result = df.reindex(Categorical(["a", "e"], categories=cats, ordered=True)) + expected = DataFrame( + { + "A": [0, np.nan], + "B": Series(list("ae")).astype(CategoricalDtype(cats, ordered=True)), + } + ).set_index("B") + tm.assert_frame_equal(result, expected, check_index_type=True) + + result = df.reindex(Categorical(["a", "d"], categories=["a", "d"])) + expected = DataFrame( + { + "A": [0, np.nan], + "B": Series(list("ad")).astype(CategoricalDtype(["a", "d"])), + } + ).set_index("B") + tm.assert_frame_equal(result, expected, check_index_type=True) + + df2 = DataFrame( + { + "A": np.arange(6, dtype="int64"), + }, + index=CategoricalIndex( + list("aabbca"), dtype=CategoricalDtype(list("cabe")), name="B" + ), + ) + # passed duplicate indexers are not allowed + msg = "cannot reindex on an axis with duplicate labels" + with pytest.raises(ValueError, match=msg): + df2.reindex(["a", "b"]) + + # args NotImplemented ATM + msg = r"argument {} is not implemented for CategoricalIndex\.reindex" + with pytest.raises(NotImplementedError, match=msg.format("method")): + df.reindex(["a"], method="ffill") + with pytest.raises(NotImplementedError, match=msg.format("level")): + df.reindex(["a"], level=1) + with pytest.raises(NotImplementedError, match=msg.format("limit")): + df.reindex(["a"], limit=2) + + def test_reindex_signature(self): + sig = inspect.signature(DataFrame.reindex) + parameters = set(sig.parameters) + assert parameters == { + "self", + "labels", + "index", + "columns", + "axis", + "limit", + "copy", + "level", + "method", + "fill_value", + "tolerance", + } + + def test_reindex_multiindex_ffill_added_rows(self): + # GH#23693 + # reindex added rows with nan values even when fill method was specified + mi = MultiIndex.from_tuples([("a", "b"), ("d", "e")]) + df = DataFrame([[0, 7], [3, 4]], index=mi, columns=["x", "y"]) + mi2 = MultiIndex.from_tuples([("a", "b"), ("d", "e"), ("h", "i")]) + result = df.reindex(mi2, axis=0, method="ffill") + expected = DataFrame([[0, 7], [3, 4], [3, 4]], index=mi2, columns=["x", "y"]) + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize( + "kwargs", + [ + {"method": "pad", "tolerance": timedelta(seconds=9)}, + {"method": "backfill", "tolerance": timedelta(seconds=9)}, + {"method": "nearest"}, + {"method": None}, + ], + ) + def test_reindex_empty_frame(self, kwargs): + # GH#27315 + idx = date_range(start="2020", freq="30s", periods=3) + df = DataFrame([], index=Index([], name="time"), columns=["a"]) + result = df.reindex(idx, **kwargs) + expected = DataFrame({"a": [np.nan] * 3}, index=idx, dtype=object) + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize( + "src_idx", + [ + Index([]), + CategoricalIndex([]), + ], + ) + @pytest.mark.parametrize( + "cat_idx", + [ + # No duplicates + Index([]), + CategoricalIndex([]), + Index(["A", "B"]), + CategoricalIndex(["A", "B"]), + # Duplicates: GH#38906 + Index(["A", "A"]), + CategoricalIndex(["A", "A"]), + ], + ) + def test_reindex_empty(self, src_idx, cat_idx): + df = DataFrame(columns=src_idx, index=["K"], dtype="f8") + + result = df.reindex(columns=cat_idx) + expected = DataFrame(index=["K"], columns=cat_idx, dtype="f8") + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize("dtype", ["m8[ns]", "M8[ns]"]) + def test_reindex_datetimelike_to_object(self, dtype): + # GH#39755 dont cast dt64/td64 to ints + mi = MultiIndex.from_product([list("ABCDE"), range(2)]) + + dti = date_range("2016-01-01", periods=10) + fv = np.timedelta64("NaT", "ns") + if dtype == "m8[ns]": + dti = dti - dti[0] + fv = np.datetime64("NaT", "ns") + + ser = Series(dti, index=mi) + ser[::3] = pd.NaT + + df = ser.unstack() + + index = df.index.append(Index([1])) + columns = df.columns.append(Index(["foo"])) + + res = df.reindex(index=index, columns=columns, fill_value=fv) + + expected = DataFrame( + { + 0: df[0].tolist() + [fv], + 1: df[1].tolist() + [fv], + "foo": np.array(["NaT"] * 6, dtype=fv.dtype), + }, + index=index, + ) + assert (res.dtypes[[0, 1]] == object).all() + assert res.iloc[0, 0] is pd.NaT + assert res.iloc[-1, 0] is fv + assert res.iloc[-1, 1] is fv + tm.assert_frame_equal(res, expected) + + @pytest.mark.parametrize( + "index_df,index_res,index_exp", + [ + ( + CategoricalIndex([], categories=["A"]), + Index(["A"]), + Index(["A"]), + ), + ( + CategoricalIndex([], categories=["A"]), + Index(["B"]), + Index(["B"]), + ), + ( + CategoricalIndex([], categories=["A"]), + CategoricalIndex(["A"]), + CategoricalIndex(["A"]), + ), + ( + CategoricalIndex([], categories=["A"]), + CategoricalIndex(["B"]), + CategoricalIndex(["B"]), + ), + ], + ) + def test_reindex_not_category(self, index_df, index_res, index_exp): + # GH#28690 + df = DataFrame(index=index_df) + result = df.reindex(index=index_res) + expected = DataFrame(index=index_exp) + tm.assert_frame_equal(result, expected) + + def test_invalid_method(self): + df = DataFrame({"A": [1, np.nan, 2]}) + + msg = "Invalid fill method" + with pytest.raises(ValueError, match=msg): + df.reindex([1, 0, 2], method="asfreq") diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/test_reindex_like.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/test_reindex_like.py new file mode 100644 index 0000000000000000000000000000000000000000..ce68ec28eec3dd85461fcecfe506524040f64542 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/test_reindex_like.py @@ -0,0 +1,39 @@ +import numpy as np +import pytest + +from pandas import DataFrame +import pandas._testing as tm + + +class TestDataFrameReindexLike: + def test_reindex_like(self, float_frame): + other = float_frame.reindex(index=float_frame.index[:10], columns=["C", "B"]) + + tm.assert_frame_equal(other, float_frame.reindex_like(other)) + + @pytest.mark.parametrize( + "method,expected_values", + [ + ("nearest", [0, 1, 1, 2]), + ("pad", [np.nan, 0, 1, 1]), + ("backfill", [0, 1, 2, 2]), + ], + ) + def test_reindex_like_methods(self, method, expected_values): + df = DataFrame({"x": list(range(5))}) + + result = df.reindex_like(df, method=method, tolerance=0) + tm.assert_frame_equal(df, result) + result = df.reindex_like(df, method=method, tolerance=[0, 0, 0, 0]) + tm.assert_frame_equal(df, result) + + def test_reindex_like_subclass(self): + # https://github.com/pandas-dev/pandas/issues/31925 + class MyDataFrame(DataFrame): + pass + + expected = DataFrame() + df = MyDataFrame() + result = df.reindex_like(expected) + + tm.assert_frame_equal(result, expected) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/test_rename.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/test_rename.py new file mode 100644 index 0000000000000000000000000000000000000000..c3bc96b44c80745d2b96a4c57f936778372affb8 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/test_rename.py @@ -0,0 +1,415 @@ +from collections import ChainMap +import inspect + +import numpy as np +import pytest + +from pandas import ( + DataFrame, + Index, + MultiIndex, + merge, +) +import pandas._testing as tm + + +class TestRename: + def test_rename_signature(self): + sig = inspect.signature(DataFrame.rename) + parameters = set(sig.parameters) + assert parameters == { + "self", + "mapper", + "index", + "columns", + "axis", + "inplace", + "copy", + "level", + "errors", + } + + def test_rename_mi(self, frame_or_series): + obj = frame_or_series( + [11, 21, 31], + index=MultiIndex.from_tuples([("A", x) for x in ["a", "B", "c"]]), + ) + obj.rename(str.lower) + + def test_rename(self, float_frame): + mapping = {"A": "a", "B": "b", "C": "c", "D": "d"} + + renamed = float_frame.rename(columns=mapping) + renamed2 = float_frame.rename(columns=str.lower) + + tm.assert_frame_equal(renamed, renamed2) + tm.assert_frame_equal( + renamed2.rename(columns=str.upper), float_frame, check_names=False + ) + + # index + data = {"A": {"foo": 0, "bar": 1}} + + df = DataFrame(data) + renamed = df.rename(index={"foo": "bar", "bar": "foo"}) + tm.assert_index_equal(renamed.index, Index(["bar", "foo"])) + + renamed = df.rename(index=str.upper) + tm.assert_index_equal(renamed.index, Index(["FOO", "BAR"])) + + # have to pass something + with pytest.raises(TypeError, match="must pass an index to rename"): + float_frame.rename() + + # partial columns + renamed = float_frame.rename(columns={"C": "foo", "D": "bar"}) + tm.assert_index_equal(renamed.columns, Index(["A", "B", "foo", "bar"])) + + # other axis + renamed = float_frame.T.rename(index={"C": "foo", "D": "bar"}) + tm.assert_index_equal(renamed.index, Index(["A", "B", "foo", "bar"])) + + # index with name + index = Index(["foo", "bar"], name="name") + renamer = DataFrame(data, index=index) + renamed = renamer.rename(index={"foo": "bar", "bar": "foo"}) + tm.assert_index_equal(renamed.index, Index(["bar", "foo"], name="name")) + assert renamed.index.name == renamer.index.name + + @pytest.mark.parametrize( + "args,kwargs", + [ + ((ChainMap({"A": "a"}, {"B": "b"}),), {"axis": "columns"}), + ((), {"columns": ChainMap({"A": "a"}, {"B": "b"})}), + ], + ) + def test_rename_chainmap(self, args, kwargs): + # see gh-23859 + colAData = range(1, 11) + colBdata = np.random.default_rng(2).standard_normal(10) + + df = DataFrame({"A": colAData, "B": colBdata}) + result = df.rename(*args, **kwargs) + + expected = DataFrame({"a": colAData, "b": colBdata}) + tm.assert_frame_equal(result, expected) + + def test_rename_multiindex(self): + tuples_index = [("foo1", "bar1"), ("foo2", "bar2")] + tuples_columns = [("fizz1", "buzz1"), ("fizz2", "buzz2")] + index = MultiIndex.from_tuples(tuples_index, names=["foo", "bar"]) + columns = MultiIndex.from_tuples(tuples_columns, names=["fizz", "buzz"]) + df = DataFrame([(0, 0), (1, 1)], index=index, columns=columns) + + # + # without specifying level -> across all levels + + renamed = df.rename( + index={"foo1": "foo3", "bar2": "bar3"}, + columns={"fizz1": "fizz3", "buzz2": "buzz3"}, + ) + new_index = MultiIndex.from_tuples( + [("foo3", "bar1"), ("foo2", "bar3")], names=["foo", "bar"] + ) + new_columns = MultiIndex.from_tuples( + [("fizz3", "buzz1"), ("fizz2", "buzz3")], names=["fizz", "buzz"] + ) + tm.assert_index_equal(renamed.index, new_index) + tm.assert_index_equal(renamed.columns, new_columns) + assert renamed.index.names == df.index.names + assert renamed.columns.names == df.columns.names + + # + # with specifying a level (GH13766) + + # dict + new_columns = MultiIndex.from_tuples( + [("fizz3", "buzz1"), ("fizz2", "buzz2")], names=["fizz", "buzz"] + ) + renamed = df.rename(columns={"fizz1": "fizz3", "buzz2": "buzz3"}, level=0) + tm.assert_index_equal(renamed.columns, new_columns) + renamed = df.rename(columns={"fizz1": "fizz3", "buzz2": "buzz3"}, level="fizz") + tm.assert_index_equal(renamed.columns, new_columns) + + new_columns = MultiIndex.from_tuples( + [("fizz1", "buzz1"), ("fizz2", "buzz3")], names=["fizz", "buzz"] + ) + renamed = df.rename(columns={"fizz1": "fizz3", "buzz2": "buzz3"}, level=1) + tm.assert_index_equal(renamed.columns, new_columns) + renamed = df.rename(columns={"fizz1": "fizz3", "buzz2": "buzz3"}, level="buzz") + tm.assert_index_equal(renamed.columns, new_columns) + + # function + func = str.upper + new_columns = MultiIndex.from_tuples( + [("FIZZ1", "buzz1"), ("FIZZ2", "buzz2")], names=["fizz", "buzz"] + ) + renamed = df.rename(columns=func, level=0) + tm.assert_index_equal(renamed.columns, new_columns) + renamed = df.rename(columns=func, level="fizz") + tm.assert_index_equal(renamed.columns, new_columns) + + new_columns = MultiIndex.from_tuples( + [("fizz1", "BUZZ1"), ("fizz2", "BUZZ2")], names=["fizz", "buzz"] + ) + renamed = df.rename(columns=func, level=1) + tm.assert_index_equal(renamed.columns, new_columns) + renamed = df.rename(columns=func, level="buzz") + tm.assert_index_equal(renamed.columns, new_columns) + + # index + new_index = MultiIndex.from_tuples( + [("foo3", "bar1"), ("foo2", "bar2")], names=["foo", "bar"] + ) + renamed = df.rename(index={"foo1": "foo3", "bar2": "bar3"}, level=0) + tm.assert_index_equal(renamed.index, new_index) + + def test_rename_nocopy(self, float_frame, using_copy_on_write, warn_copy_on_write): + renamed = float_frame.rename(columns={"C": "foo"}, copy=False) + + assert np.shares_memory(renamed["foo"]._values, float_frame["C"]._values) + + with tm.assert_cow_warning(warn_copy_on_write): + renamed.loc[:, "foo"] = 1.0 + if using_copy_on_write: + assert not (float_frame["C"] == 1.0).all() + else: + assert (float_frame["C"] == 1.0).all() + + def test_rename_inplace(self, float_frame): + float_frame.rename(columns={"C": "foo"}) + assert "C" in float_frame + assert "foo" not in float_frame + + c_values = float_frame["C"] + float_frame = float_frame.copy() + return_value = float_frame.rename(columns={"C": "foo"}, inplace=True) + assert return_value is None + + assert "C" not in float_frame + assert "foo" in float_frame + # GH 44153 + # Used to be id(float_frame["foo"]) != c_id, but flaky in the CI + assert float_frame["foo"] is not c_values + + def test_rename_bug(self): + # GH 5344 + # rename set ref_locs, and set_index was not resetting + df = DataFrame({0: ["foo", "bar"], 1: ["bah", "bas"], 2: [1, 2]}) + df = df.rename(columns={0: "a"}) + df = df.rename(columns={1: "b"}) + df = df.set_index(["a", "b"]) + df.columns = ["2001-01-01"] + expected = DataFrame( + [[1], [2]], + index=MultiIndex.from_tuples( + [("foo", "bah"), ("bar", "bas")], names=["a", "b"] + ), + columns=["2001-01-01"], + ) + tm.assert_frame_equal(df, expected) + + def test_rename_bug2(self): + # GH 19497 + # rename was changing Index to MultiIndex if Index contained tuples + + df = DataFrame(data=np.arange(3), index=[(0, 0), (1, 1), (2, 2)], columns=["a"]) + df = df.rename({(1, 1): (5, 4)}, axis="index") + expected = DataFrame( + data=np.arange(3), index=[(0, 0), (5, 4), (2, 2)], columns=["a"] + ) + tm.assert_frame_equal(df, expected) + + def test_rename_errors_raises(self): + df = DataFrame(columns=["A", "B", "C", "D"]) + with pytest.raises(KeyError, match="'E'] not found in axis"): + df.rename(columns={"A": "a", "E": "e"}, errors="raise") + + @pytest.mark.parametrize( + "mapper, errors, expected_columns", + [ + ({"A": "a", "E": "e"}, "ignore", ["a", "B", "C", "D"]), + ({"A": "a"}, "raise", ["a", "B", "C", "D"]), + (str.lower, "raise", ["a", "b", "c", "d"]), + ], + ) + def test_rename_errors(self, mapper, errors, expected_columns): + # GH 13473 + # rename now works with errors parameter + df = DataFrame(columns=["A", "B", "C", "D"]) + result = df.rename(columns=mapper, errors=errors) + expected = DataFrame(columns=expected_columns) + tm.assert_frame_equal(result, expected) + + def test_rename_objects(self, float_string_frame): + renamed = float_string_frame.rename(columns=str.upper) + + assert "FOO" in renamed + assert "foo" not in renamed + + def test_rename_axis_style(self): + # https://github.com/pandas-dev/pandas/issues/12392 + df = DataFrame({"A": [1, 2], "B": [1, 2]}, index=["X", "Y"]) + expected = DataFrame({"a": [1, 2], "b": [1, 2]}, index=["X", "Y"]) + + result = df.rename(str.lower, axis=1) + tm.assert_frame_equal(result, expected) + + result = df.rename(str.lower, axis="columns") + tm.assert_frame_equal(result, expected) + + result = df.rename({"A": "a", "B": "b"}, axis=1) + tm.assert_frame_equal(result, expected) + + result = df.rename({"A": "a", "B": "b"}, axis="columns") + tm.assert_frame_equal(result, expected) + + # Index + expected = DataFrame({"A": [1, 2], "B": [1, 2]}, index=["x", "y"]) + result = df.rename(str.lower, axis=0) + tm.assert_frame_equal(result, expected) + + result = df.rename(str.lower, axis="index") + tm.assert_frame_equal(result, expected) + + result = df.rename({"X": "x", "Y": "y"}, axis=0) + tm.assert_frame_equal(result, expected) + + result = df.rename({"X": "x", "Y": "y"}, axis="index") + tm.assert_frame_equal(result, expected) + + result = df.rename(mapper=str.lower, axis="index") + tm.assert_frame_equal(result, expected) + + def test_rename_mapper_multi(self): + df = DataFrame({"A": ["a", "b"], "B": ["c", "d"], "C": [1, 2]}).set_index( + ["A", "B"] + ) + result = df.rename(str.upper) + expected = df.rename(index=str.upper) + tm.assert_frame_equal(result, expected) + + def test_rename_positional_named(self): + # https://github.com/pandas-dev/pandas/issues/12392 + df = DataFrame({"a": [1, 2], "b": [1, 2]}, index=["X", "Y"]) + result = df.rename(index=str.lower, columns=str.upper) + expected = DataFrame({"A": [1, 2], "B": [1, 2]}, index=["x", "y"]) + tm.assert_frame_equal(result, expected) + + def test_rename_axis_style_raises(self): + # see gh-12392 + df = DataFrame({"A": [1, 2], "B": [1, 2]}, index=["0", "1"]) + + # Named target and axis + over_spec_msg = "Cannot specify both 'axis' and any of 'index' or 'columns'" + with pytest.raises(TypeError, match=over_spec_msg): + df.rename(index=str.lower, axis=1) + + with pytest.raises(TypeError, match=over_spec_msg): + df.rename(index=str.lower, axis="columns") + + with pytest.raises(TypeError, match=over_spec_msg): + df.rename(columns=str.lower, axis="columns") + + with pytest.raises(TypeError, match=over_spec_msg): + df.rename(index=str.lower, axis=0) + + # Multiple targets and axis + with pytest.raises(TypeError, match=over_spec_msg): + df.rename(str.lower, index=str.lower, axis="columns") + + # Too many targets + over_spec_msg = "Cannot specify both 'mapper' and any of 'index' or 'columns'" + with pytest.raises(TypeError, match=over_spec_msg): + df.rename(str.lower, index=str.lower, columns=str.lower) + + # Duplicates + with pytest.raises(TypeError, match="multiple values"): + df.rename(id, mapper=id) + + def test_rename_positional_raises(self): + # GH 29136 + df = DataFrame(columns=["A", "B"]) + msg = r"rename\(\) takes from 1 to 2 positional arguments" + + with pytest.raises(TypeError, match=msg): + df.rename(None, str.lower) + + def test_rename_no_mappings_raises(self): + # GH 29136 + df = DataFrame([[1]]) + msg = "must pass an index to rename" + with pytest.raises(TypeError, match=msg): + df.rename() + + with pytest.raises(TypeError, match=msg): + df.rename(None, index=None) + + with pytest.raises(TypeError, match=msg): + df.rename(None, columns=None) + + with pytest.raises(TypeError, match=msg): + df.rename(None, columns=None, index=None) + + def test_rename_mapper_and_positional_arguments_raises(self): + # GH 29136 + df = DataFrame([[1]]) + msg = "Cannot specify both 'mapper' and any of 'index' or 'columns'" + with pytest.raises(TypeError, match=msg): + df.rename({}, index={}) + + with pytest.raises(TypeError, match=msg): + df.rename({}, columns={}) + + with pytest.raises(TypeError, match=msg): + df.rename({}, columns={}, index={}) + + def test_rename_with_duplicate_columns(self): + # GH#4403 + df4 = DataFrame( + {"RT": [0.0454], "TClose": [22.02], "TExg": [0.0422]}, + index=MultiIndex.from_tuples( + [(600809, 20130331)], names=["STK_ID", "RPT_Date"] + ), + ) + + df5 = DataFrame( + { + "RPT_Date": [20120930, 20121231, 20130331], + "STK_ID": [600809] * 3, + "STK_Name": ["饡驦", "饡驦", "饡驦"], + "TClose": [38.05, 41.66, 30.01], + }, + index=MultiIndex.from_tuples( + [(600809, 20120930), (600809, 20121231), (600809, 20130331)], + names=["STK_ID", "RPT_Date"], + ), + ) + # TODO: can we construct this without merge? + k = merge(df4, df5, how="inner", left_index=True, right_index=True) + result = k.rename(columns={"TClose_x": "TClose", "TClose_y": "QT_Close"}) + + expected = DataFrame( + [[0.0454, 22.02, 0.0422, 20130331, 600809, "饡驦", 30.01]], + columns=[ + "RT", + "TClose", + "TExg", + "RPT_Date", + "STK_ID", + "STK_Name", + "QT_Close", + ], + ).set_index(["STK_ID", "RPT_Date"], drop=False) + tm.assert_frame_equal(result, expected) + + def test_rename_boolean_index(self): + df = DataFrame(np.arange(15).reshape(3, 5), columns=[False, True, 2, 3, 4]) + mapper = {0: "foo", 1: "bar", 2: "bah"} + res = df.rename(index=mapper) + exp = DataFrame( + np.arange(15).reshape(3, 5), + columns=[False, True, 2, 3, 4], + index=["foo", "bar", "bah"], + ) + tm.assert_frame_equal(res, exp) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/test_rename_axis.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/test_rename_axis.py new file mode 100644 index 0000000000000000000000000000000000000000..dd4a77c6509b8de7eb767bb44238004399c159a4 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/test_rename_axis.py @@ -0,0 +1,111 @@ +import numpy as np +import pytest + +from pandas import ( + DataFrame, + Index, + MultiIndex, +) +import pandas._testing as tm + + +class TestDataFrameRenameAxis: + def test_rename_axis_inplace(self, float_frame): + # GH#15704 + expected = float_frame.rename_axis("foo") + result = float_frame.copy() + return_value = no_return = result.rename_axis("foo", inplace=True) + assert return_value is None + + assert no_return is None + tm.assert_frame_equal(result, expected) + + expected = float_frame.rename_axis("bar", axis=1) + result = float_frame.copy() + return_value = no_return = result.rename_axis("bar", axis=1, inplace=True) + assert return_value is None + + assert no_return is None + tm.assert_frame_equal(result, expected) + + def test_rename_axis_raises(self): + # GH#17833 + df = DataFrame({"A": [1, 2], "B": [1, 2]}) + with pytest.raises(ValueError, match="Use `.rename`"): + df.rename_axis(id, axis=0) + + with pytest.raises(ValueError, match="Use `.rename`"): + df.rename_axis({0: 10, 1: 20}, axis=0) + + with pytest.raises(ValueError, match="Use `.rename`"): + df.rename_axis(id, axis=1) + + with pytest.raises(ValueError, match="Use `.rename`"): + df["A"].rename_axis(id) + + def test_rename_axis_mapper(self): + # GH#19978 + mi = MultiIndex.from_product([["a", "b", "c"], [1, 2]], names=["ll", "nn"]) + df = DataFrame( + {"x": list(range(len(mi))), "y": [i * 10 for i in range(len(mi))]}, index=mi + ) + + # Test for rename of the Index object of columns + result = df.rename_axis("cols", axis=1) + tm.assert_index_equal(result.columns, Index(["x", "y"], name="cols")) + + # Test for rename of the Index object of columns using dict + result = result.rename_axis(columns={"cols": "new"}, axis=1) + tm.assert_index_equal(result.columns, Index(["x", "y"], name="new")) + + # Test for renaming index using dict + result = df.rename_axis(index={"ll": "foo"}) + assert result.index.names == ["foo", "nn"] + + # Test for renaming index using a function + result = df.rename_axis(index=str.upper, axis=0) + assert result.index.names == ["LL", "NN"] + + # Test for renaming index providing complete list + result = df.rename_axis(index=["foo", "goo"]) + assert result.index.names == ["foo", "goo"] + + # Test for changing index and columns at same time + sdf = df.reset_index().set_index("nn").drop(columns=["ll", "y"]) + result = sdf.rename_axis(index="foo", columns="meh") + assert result.index.name == "foo" + assert result.columns.name == "meh" + + # Test different error cases + with pytest.raises(TypeError, match="Must pass"): + df.rename_axis(index="wrong") + + with pytest.raises(ValueError, match="Length of names"): + df.rename_axis(index=["wrong"]) + + with pytest.raises(TypeError, match="bogus"): + df.rename_axis(bogus=None) + + @pytest.mark.parametrize( + "kwargs, rename_index, rename_columns", + [ + ({"mapper": None, "axis": 0}, True, False), + ({"mapper": None, "axis": 1}, False, True), + ({"index": None}, True, False), + ({"columns": None}, False, True), + ({"index": None, "columns": None}, True, True), + ({}, False, False), + ], + ) + def test_rename_axis_none(self, kwargs, rename_index, rename_columns): + # GH 25034 + index = Index(list("abc"), name="foo") + columns = Index(["col1", "col2"], name="bar") + data = np.arange(6).reshape(3, 2) + df = DataFrame(data, index, columns) + + result = df.rename_axis(**kwargs) + expected_index = index.rename(None) if rename_index else index + expected_columns = columns.rename(None) if rename_columns else columns + expected = DataFrame(data, expected_index, expected_columns) + tm.assert_frame_equal(result, expected) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/test_reorder_levels.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/test_reorder_levels.py new file mode 100644 index 0000000000000000000000000000000000000000..5d6b65daae4d513b3d3333856a57a2199cb79ed0 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/test_reorder_levels.py @@ -0,0 +1,74 @@ +import numpy as np +import pytest + +from pandas import ( + DataFrame, + MultiIndex, +) +import pandas._testing as tm + + +class TestReorderLevels: + def test_reorder_levels(self, frame_or_series): + index = MultiIndex( + levels=[["bar"], ["one", "two", "three"], [0, 1]], + codes=[[0, 0, 0, 0, 0, 0], [0, 1, 2, 0, 1, 2], [0, 1, 0, 1, 0, 1]], + names=["L0", "L1", "L2"], + ) + df = DataFrame({"A": np.arange(6), "B": np.arange(6)}, index=index) + obj = tm.get_obj(df, frame_or_series) + + # no change, position + result = obj.reorder_levels([0, 1, 2]) + tm.assert_equal(obj, result) + + # no change, labels + result = obj.reorder_levels(["L0", "L1", "L2"]) + tm.assert_equal(obj, result) + + # rotate, position + result = obj.reorder_levels([1, 2, 0]) + e_idx = MultiIndex( + levels=[["one", "two", "three"], [0, 1], ["bar"]], + codes=[[0, 1, 2, 0, 1, 2], [0, 1, 0, 1, 0, 1], [0, 0, 0, 0, 0, 0]], + names=["L1", "L2", "L0"], + ) + expected = DataFrame({"A": np.arange(6), "B": np.arange(6)}, index=e_idx) + expected = tm.get_obj(expected, frame_or_series) + tm.assert_equal(result, expected) + + result = obj.reorder_levels([0, 0, 0]) + e_idx = MultiIndex( + levels=[["bar"], ["bar"], ["bar"]], + codes=[[0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0]], + names=["L0", "L0", "L0"], + ) + expected = DataFrame({"A": np.arange(6), "B": np.arange(6)}, index=e_idx) + expected = tm.get_obj(expected, frame_or_series) + tm.assert_equal(result, expected) + + result = obj.reorder_levels(["L0", "L0", "L0"]) + tm.assert_equal(result, expected) + + def test_reorder_levels_swaplevel_equivalence( + self, multiindex_year_month_day_dataframe_random_data + ): + ymd = multiindex_year_month_day_dataframe_random_data + + result = ymd.reorder_levels(["month", "day", "year"]) + expected = ymd.swaplevel(0, 1).swaplevel(1, 2) + tm.assert_frame_equal(result, expected) + + result = ymd["A"].reorder_levels(["month", "day", "year"]) + expected = ymd["A"].swaplevel(0, 1).swaplevel(1, 2) + tm.assert_series_equal(result, expected) + + result = ymd.T.reorder_levels(["month", "day", "year"], axis=1) + expected = ymd.T.swaplevel(0, 1, axis=1).swaplevel(1, 2, axis=1) + tm.assert_frame_equal(result, expected) + + with pytest.raises(TypeError, match="hierarchical axis"): + ymd.reorder_levels([1, 2], axis=1) + + with pytest.raises(IndexError, match="Too many levels"): + ymd.index.reorder_levels([1, 2, 3]) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/test_replace.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/test_replace.py new file mode 100644 index 0000000000000000000000000000000000000000..8bfa98042eb073d4b97bc7a5642c7922f9af6d08 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/test_replace.py @@ -0,0 +1,1726 @@ +from __future__ import annotations + +from datetime import datetime +import re + +import numpy as np +import pytest + +from pandas._config import using_pyarrow_string_dtype + +import pandas as pd +from pandas import ( + DataFrame, + Index, + Series, + Timestamp, + date_range, +) +import pandas._testing as tm + + +@pytest.fixture +def mix_ab() -> dict[str, list[int | str]]: + return {"a": list(range(4)), "b": list("ab..")} + + +@pytest.fixture +def mix_abc() -> dict[str, list[float | str]]: + return {"a": list(range(4)), "b": list("ab.."), "c": ["a", "b", np.nan, "d"]} + + +class TestDataFrameReplace: + @pytest.mark.xfail( + using_pyarrow_string_dtype(), reason="can't set float into string" + ) + def test_replace_inplace(self, datetime_frame, float_string_frame): + datetime_frame.loc[datetime_frame.index[:5], "A"] = np.nan + datetime_frame.loc[datetime_frame.index[-5:], "A"] = np.nan + + tsframe = datetime_frame.copy() + return_value = tsframe.replace(np.nan, 0, inplace=True) + assert return_value is None + tm.assert_frame_equal(tsframe, datetime_frame.fillna(0)) + + # mixed type + mf = float_string_frame + mf.iloc[5:20, mf.columns.get_loc("foo")] = np.nan + mf.iloc[-10:, mf.columns.get_loc("A")] = np.nan + + result = float_string_frame.replace(np.nan, 0) + expected = float_string_frame.fillna(value=0) + tm.assert_frame_equal(result, expected) + + tsframe = datetime_frame.copy() + return_value = tsframe.replace([np.nan], [0], inplace=True) + assert return_value is None + tm.assert_frame_equal(tsframe, datetime_frame.fillna(0)) + + @pytest.mark.parametrize( + "to_replace,values,expected", + [ + # lists of regexes and values + # list of [re1, re2, ..., reN] -> [v1, v2, ..., vN] + ( + [r"\s*\.\s*", r"e|f|g"], + [np.nan, "crap"], + { + "a": ["a", "b", np.nan, np.nan], + "b": ["crap"] * 3 + ["h"], + "c": ["h", "crap", "l", "o"], + }, + ), + # list of [re1, re2, ..., reN] -> [re1, re2, .., reN] + ( + [r"\s*(\.)\s*", r"(e|f|g)"], + [r"\1\1", r"\1_crap"], + { + "a": ["a", "b", "..", ".."], + "b": ["e_crap", "f_crap", "g_crap", "h"], + "c": ["h", "e_crap", "l", "o"], + }, + ), + # list of [re1, re2, ..., reN] -> [(re1 or v1), (re2 or v2), ..., (reN + # or vN)] + ( + [r"\s*(\.)\s*", r"e"], + [r"\1\1", r"crap"], + { + "a": ["a", "b", "..", ".."], + "b": ["crap", "f", "g", "h"], + "c": ["h", "crap", "l", "o"], + }, + ), + ], + ) + @pytest.mark.parametrize("inplace", [True, False]) + @pytest.mark.parametrize("use_value_regex_args", [True, False]) + def test_regex_replace_list_obj( + self, to_replace, values, expected, inplace, use_value_regex_args + ): + df = DataFrame({"a": list("ab.."), "b": list("efgh"), "c": list("helo")}) + + if use_value_regex_args: + result = df.replace(value=values, regex=to_replace, inplace=inplace) + else: + result = df.replace(to_replace, values, regex=True, inplace=inplace) + + if inplace: + assert result is None + result = df + + expected = DataFrame(expected) + tm.assert_frame_equal(result, expected) + + def test_regex_replace_list_mixed(self, mix_ab): + # mixed frame to make sure this doesn't break things + dfmix = DataFrame(mix_ab) + + # lists of regexes and values + # list of [re1, re2, ..., reN] -> [v1, v2, ..., vN] + to_replace_res = [r"\s*\.\s*", r"a"] + values = [np.nan, "crap"] + mix2 = {"a": list(range(4)), "b": list("ab.."), "c": list("halo")} + dfmix2 = DataFrame(mix2) + res = dfmix2.replace(to_replace_res, values, regex=True) + expec = DataFrame( + { + "a": mix2["a"], + "b": ["crap", "b", np.nan, np.nan], + "c": ["h", "crap", "l", "o"], + } + ) + tm.assert_frame_equal(res, expec) + + # list of [re1, re2, ..., reN] -> [re1, re2, .., reN] + to_replace_res = [r"\s*(\.)\s*", r"(a|b)"] + values = [r"\1\1", r"\1_crap"] + res = dfmix.replace(to_replace_res, values, regex=True) + expec = DataFrame({"a": mix_ab["a"], "b": ["a_crap", "b_crap", "..", ".."]}) + tm.assert_frame_equal(res, expec) + + # list of [re1, re2, ..., reN] -> [(re1 or v1), (re2 or v2), ..., (reN + # or vN)] + to_replace_res = [r"\s*(\.)\s*", r"a", r"(b)"] + values = [r"\1\1", r"crap", r"\1_crap"] + res = dfmix.replace(to_replace_res, values, regex=True) + expec = DataFrame({"a": mix_ab["a"], "b": ["crap", "b_crap", "..", ".."]}) + tm.assert_frame_equal(res, expec) + + to_replace_res = [r"\s*(\.)\s*", r"a", r"(b)"] + values = [r"\1\1", r"crap", r"\1_crap"] + res = dfmix.replace(regex=to_replace_res, value=values) + expec = DataFrame({"a": mix_ab["a"], "b": ["crap", "b_crap", "..", ".."]}) + tm.assert_frame_equal(res, expec) + + def test_regex_replace_list_mixed_inplace(self, mix_ab): + dfmix = DataFrame(mix_ab) + # the same inplace + # lists of regexes and values + # list of [re1, re2, ..., reN] -> [v1, v2, ..., vN] + to_replace_res = [r"\s*\.\s*", r"a"] + values = [np.nan, "crap"] + res = dfmix.copy() + return_value = res.replace(to_replace_res, values, inplace=True, regex=True) + assert return_value is None + expec = DataFrame({"a": mix_ab["a"], "b": ["crap", "b", np.nan, np.nan]}) + tm.assert_frame_equal(res, expec) + + # list of [re1, re2, ..., reN] -> [re1, re2, .., reN] + to_replace_res = [r"\s*(\.)\s*", r"(a|b)"] + values = [r"\1\1", r"\1_crap"] + res = dfmix.copy() + return_value = res.replace(to_replace_res, values, inplace=True, regex=True) + assert return_value is None + expec = DataFrame({"a": mix_ab["a"], "b": ["a_crap", "b_crap", "..", ".."]}) + tm.assert_frame_equal(res, expec) + + # list of [re1, re2, ..., reN] -> [(re1 or v1), (re2 or v2), ..., (reN + # or vN)] + to_replace_res = [r"\s*(\.)\s*", r"a", r"(b)"] + values = [r"\1\1", r"crap", r"\1_crap"] + res = dfmix.copy() + return_value = res.replace(to_replace_res, values, inplace=True, regex=True) + assert return_value is None + expec = DataFrame({"a": mix_ab["a"], "b": ["crap", "b_crap", "..", ".."]}) + tm.assert_frame_equal(res, expec) + + to_replace_res = [r"\s*(\.)\s*", r"a", r"(b)"] + values = [r"\1\1", r"crap", r"\1_crap"] + res = dfmix.copy() + return_value = res.replace(regex=to_replace_res, value=values, inplace=True) + assert return_value is None + expec = DataFrame({"a": mix_ab["a"], "b": ["crap", "b_crap", "..", ".."]}) + tm.assert_frame_equal(res, expec) + + def test_regex_replace_dict_mixed(self, mix_abc): + dfmix = DataFrame(mix_abc) + + # dicts + # single dict {re1: v1}, search the whole frame + # need test for this... + + # list of dicts {re1: v1, re2: v2, ..., re3: v3}, search the whole + # frame + res = dfmix.replace({"b": r"\s*\.\s*"}, {"b": np.nan}, regex=True) + res2 = dfmix.copy() + return_value = res2.replace( + {"b": r"\s*\.\s*"}, {"b": np.nan}, inplace=True, regex=True + ) + assert return_value is None + expec = DataFrame( + {"a": mix_abc["a"], "b": ["a", "b", np.nan, np.nan], "c": mix_abc["c"]} + ) + tm.assert_frame_equal(res, expec) + tm.assert_frame_equal(res2, expec) + + # list of dicts {re1: re11, re2: re12, ..., reN: re1N}, search the + # whole frame + res = dfmix.replace({"b": r"\s*(\.)\s*"}, {"b": r"\1ty"}, regex=True) + res2 = dfmix.copy() + return_value = res2.replace( + {"b": r"\s*(\.)\s*"}, {"b": r"\1ty"}, inplace=True, regex=True + ) + assert return_value is None + expec = DataFrame( + {"a": mix_abc["a"], "b": ["a", "b", ".ty", ".ty"], "c": mix_abc["c"]} + ) + tm.assert_frame_equal(res, expec) + tm.assert_frame_equal(res2, expec) + + res = dfmix.replace(regex={"b": r"\s*(\.)\s*"}, value={"b": r"\1ty"}) + res2 = dfmix.copy() + return_value = res2.replace( + regex={"b": r"\s*(\.)\s*"}, value={"b": r"\1ty"}, inplace=True + ) + assert return_value is None + expec = DataFrame( + {"a": mix_abc["a"], "b": ["a", "b", ".ty", ".ty"], "c": mix_abc["c"]} + ) + tm.assert_frame_equal(res, expec) + tm.assert_frame_equal(res2, expec) + + # scalar -> dict + # to_replace regex, {value: value} + expec = DataFrame( + {"a": mix_abc["a"], "b": [np.nan, "b", ".", "."], "c": mix_abc["c"]} + ) + res = dfmix.replace("a", {"b": np.nan}, regex=True) + res2 = dfmix.copy() + return_value = res2.replace("a", {"b": np.nan}, regex=True, inplace=True) + assert return_value is None + tm.assert_frame_equal(res, expec) + tm.assert_frame_equal(res2, expec) + + res = dfmix.replace("a", {"b": np.nan}, regex=True) + res2 = dfmix.copy() + return_value = res2.replace(regex="a", value={"b": np.nan}, inplace=True) + assert return_value is None + expec = DataFrame( + {"a": mix_abc["a"], "b": [np.nan, "b", ".", "."], "c": mix_abc["c"]} + ) + tm.assert_frame_equal(res, expec) + tm.assert_frame_equal(res2, expec) + + def test_regex_replace_dict_nested(self, mix_abc): + # nested dicts will not work until this is implemented for Series + dfmix = DataFrame(mix_abc) + res = dfmix.replace({"b": {r"\s*\.\s*": np.nan}}, regex=True) + res2 = dfmix.copy() + res4 = dfmix.copy() + return_value = res2.replace( + {"b": {r"\s*\.\s*": np.nan}}, inplace=True, regex=True + ) + assert return_value is None + res3 = dfmix.replace(regex={"b": {r"\s*\.\s*": np.nan}}) + return_value = res4.replace(regex={"b": {r"\s*\.\s*": np.nan}}, inplace=True) + assert return_value is None + expec = DataFrame( + {"a": mix_abc["a"], "b": ["a", "b", np.nan, np.nan], "c": mix_abc["c"]} + ) + tm.assert_frame_equal(res, expec) + tm.assert_frame_equal(res2, expec) + tm.assert_frame_equal(res3, expec) + tm.assert_frame_equal(res4, expec) + + def test_regex_replace_dict_nested_non_first_character( + self, any_string_dtype, using_infer_string + ): + # GH 25259 + dtype = any_string_dtype + df = DataFrame({"first": ["abc", "bca", "cab"]}, dtype=dtype) + if using_infer_string and any_string_dtype == "object": + with tm.assert_produces_warning(FutureWarning, match="Downcasting"): + result = df.replace({"a": "."}, regex=True) + expected = DataFrame({"first": [".bc", "bc.", "c.b"]}) + + else: + result = df.replace({"a": "."}, regex=True) + expected = DataFrame({"first": [".bc", "bc.", "c.b"]}, dtype=dtype) + tm.assert_frame_equal(result, expected) + + @pytest.mark.xfail( + using_pyarrow_string_dtype(), reason="can't set float into string" + ) + def test_regex_replace_dict_nested_gh4115(self): + df = DataFrame({"Type": ["Q", "T", "Q", "Q", "T"], "tmp": 2}) + expected = DataFrame({"Type": [0, 1, 0, 0, 1], "tmp": 2}) + msg = "Downcasting behavior in `replace`" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = df.replace({"Type": {"Q": 0, "T": 1}}) + tm.assert_frame_equal(result, expected) + + @pytest.mark.xfail( + using_pyarrow_string_dtype(), reason="can't set float into string" + ) + def test_regex_replace_list_to_scalar(self, mix_abc): + df = DataFrame(mix_abc) + expec = DataFrame( + { + "a": mix_abc["a"], + "b": np.array([np.nan] * 4), + "c": [np.nan, np.nan, np.nan, "d"], + } + ) + msg = "Downcasting behavior in `replace`" + with tm.assert_produces_warning(FutureWarning, match=msg): + res = df.replace([r"\s*\.\s*", "a|b"], np.nan, regex=True) + res2 = df.copy() + res3 = df.copy() + with tm.assert_produces_warning(FutureWarning, match=msg): + return_value = res2.replace( + [r"\s*\.\s*", "a|b"], np.nan, regex=True, inplace=True + ) + assert return_value is None + with tm.assert_produces_warning(FutureWarning, match=msg): + return_value = res3.replace( + regex=[r"\s*\.\s*", "a|b"], value=np.nan, inplace=True + ) + assert return_value is None + tm.assert_frame_equal(res, expec) + tm.assert_frame_equal(res2, expec) + tm.assert_frame_equal(res3, expec) + + @pytest.mark.xfail( + using_pyarrow_string_dtype(), reason="can't set float into string" + ) + def test_regex_replace_str_to_numeric(self, mix_abc): + # what happens when you try to replace a numeric value with a regex? + df = DataFrame(mix_abc) + res = df.replace(r"\s*\.\s*", 0, regex=True) + res2 = df.copy() + return_value = res2.replace(r"\s*\.\s*", 0, inplace=True, regex=True) + assert return_value is None + res3 = df.copy() + return_value = res3.replace(regex=r"\s*\.\s*", value=0, inplace=True) + assert return_value is None + expec = DataFrame({"a": mix_abc["a"], "b": ["a", "b", 0, 0], "c": mix_abc["c"]}) + tm.assert_frame_equal(res, expec) + tm.assert_frame_equal(res2, expec) + tm.assert_frame_equal(res3, expec) + + @pytest.mark.xfail( + using_pyarrow_string_dtype(), reason="can't set float into string" + ) + def test_regex_replace_regex_list_to_numeric(self, mix_abc): + df = DataFrame(mix_abc) + res = df.replace([r"\s*\.\s*", "b"], 0, regex=True) + res2 = df.copy() + return_value = res2.replace([r"\s*\.\s*", "b"], 0, regex=True, inplace=True) + assert return_value is None + res3 = df.copy() + return_value = res3.replace(regex=[r"\s*\.\s*", "b"], value=0, inplace=True) + assert return_value is None + expec = DataFrame( + {"a": mix_abc["a"], "b": ["a", 0, 0, 0], "c": ["a", 0, np.nan, "d"]} + ) + tm.assert_frame_equal(res, expec) + tm.assert_frame_equal(res2, expec) + tm.assert_frame_equal(res3, expec) + + def test_regex_replace_series_of_regexes(self, mix_abc): + df = DataFrame(mix_abc) + s1 = Series({"b": r"\s*\.\s*"}) + s2 = Series({"b": np.nan}) + res = df.replace(s1, s2, regex=True) + res2 = df.copy() + return_value = res2.replace(s1, s2, inplace=True, regex=True) + assert return_value is None + res3 = df.copy() + return_value = res3.replace(regex=s1, value=s2, inplace=True) + assert return_value is None + expec = DataFrame( + {"a": mix_abc["a"], "b": ["a", "b", np.nan, np.nan], "c": mix_abc["c"]} + ) + tm.assert_frame_equal(res, expec) + tm.assert_frame_equal(res2, expec) + tm.assert_frame_equal(res3, expec) + + def test_regex_replace_numeric_to_object_conversion(self, mix_abc): + df = DataFrame(mix_abc) + expec = DataFrame({"a": ["a", 1, 2, 3], "b": mix_abc["b"], "c": mix_abc["c"]}) + res = df.replace(0, "a") + tm.assert_frame_equal(res, expec) + assert res.a.dtype == np.object_ + + @pytest.mark.parametrize( + "to_replace", [{"": np.nan, ",": ""}, {",": "", "": np.nan}] + ) + def test_joint_simple_replace_and_regex_replace(self, to_replace): + # GH-39338 + df = DataFrame( + { + "col1": ["1,000", "a", "3"], + "col2": ["a", "", "b"], + "col3": ["a", "b", "c"], + } + ) + result = df.replace(regex=to_replace) + expected = DataFrame( + { + "col1": ["1000", "a", "3"], + "col2": ["a", np.nan, "b"], + "col3": ["a", "b", "c"], + } + ) + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize("metachar", ["[]", "()", r"\d", r"\w", r"\s"]) + def test_replace_regex_metachar(self, metachar): + df = DataFrame({"a": [metachar, "else"]}) + result = df.replace({"a": {metachar: "paren"}}) + expected = DataFrame({"a": ["paren", "else"]}) + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize( + "data,to_replace,expected", + [ + (["xax", "xbx"], {"a": "c", "b": "d"}, ["xcx", "xdx"]), + (["d", "", ""], {r"^\s*$": pd.NA}, ["d", pd.NA, pd.NA]), + ], + ) + def test_regex_replace_string_types( + self, + data, + to_replace, + expected, + frame_or_series, + any_string_dtype, + using_infer_string, + request, + ): + # GH-41333, GH-35977 + dtype = any_string_dtype + obj = frame_or_series(data, dtype=dtype) + if using_infer_string and any_string_dtype == "object": + if len(to_replace) > 1 and isinstance(obj, DataFrame): + request.node.add_marker( + pytest.mark.xfail( + reason="object input array that gets downcasted raises on " + "second pass" + ) + ) + with tm.assert_produces_warning(FutureWarning, match="Downcasting"): + result = obj.replace(to_replace, regex=True) + dtype = "string[pyarrow_numpy]" + else: + result = obj.replace(to_replace, regex=True) + expected = frame_or_series(expected, dtype=dtype) + + tm.assert_equal(result, expected) + + def test_replace(self, datetime_frame): + datetime_frame.loc[datetime_frame.index[:5], "A"] = np.nan + datetime_frame.loc[datetime_frame.index[-5:], "A"] = np.nan + + zero_filled = datetime_frame.replace(np.nan, -1e8) + tm.assert_frame_equal(zero_filled, datetime_frame.fillna(-1e8)) + tm.assert_frame_equal(zero_filled.replace(-1e8, np.nan), datetime_frame) + + datetime_frame.loc[datetime_frame.index[:5], "A"] = np.nan + datetime_frame.loc[datetime_frame.index[-5:], "A"] = np.nan + datetime_frame.loc[datetime_frame.index[:5], "B"] = -1e8 + + # empty + df = DataFrame(index=["a", "b"]) + tm.assert_frame_equal(df, df.replace(5, 7)) + + # GH 11698 + # test for mixed data types. + df = DataFrame( + [("-", pd.to_datetime("20150101")), ("a", pd.to_datetime("20150102"))] + ) + df1 = df.replace("-", np.nan) + expected_df = DataFrame( + [(np.nan, pd.to_datetime("20150101")), ("a", pd.to_datetime("20150102"))] + ) + tm.assert_frame_equal(df1, expected_df) + + def test_replace_list(self): + obj = {"a": list("ab.."), "b": list("efgh"), "c": list("helo")} + dfobj = DataFrame(obj) + + # lists of regexes and values + # list of [v1, v2, ..., vN] -> [v1, v2, ..., vN] + to_replace_res = [r".", r"e"] + values = [np.nan, "crap"] + res = dfobj.replace(to_replace_res, values) + expec = DataFrame( + { + "a": ["a", "b", np.nan, np.nan], + "b": ["crap", "f", "g", "h"], + "c": ["h", "crap", "l", "o"], + } + ) + tm.assert_frame_equal(res, expec) + + # list of [v1, v2, ..., vN] -> [v1, v2, .., vN] + to_replace_res = [r".", r"f"] + values = [r"..", r"crap"] + res = dfobj.replace(to_replace_res, values) + expec = DataFrame( + { + "a": ["a", "b", "..", ".."], + "b": ["e", "crap", "g", "h"], + "c": ["h", "e", "l", "o"], + } + ) + tm.assert_frame_equal(res, expec) + + def test_replace_with_empty_list(self, frame_or_series): + # GH 21977 + ser = Series([["a", "b"], [], np.nan, [1]]) + obj = DataFrame({"col": ser}) + obj = tm.get_obj(obj, frame_or_series) + expected = obj + result = obj.replace([], np.nan) + tm.assert_equal(result, expected) + + # GH 19266 + msg = ( + "NumPy boolean array indexing assignment cannot assign {size} " + "input values to the 1 output values where the mask is true" + ) + with pytest.raises(ValueError, match=msg.format(size=0)): + obj.replace({np.nan: []}) + with pytest.raises(ValueError, match=msg.format(size=2)): + obj.replace({np.nan: ["dummy", "alt"]}) + + def test_replace_series_dict(self): + # from GH 3064 + df = DataFrame({"zero": {"a": 0.0, "b": 1}, "one": {"a": 2.0, "b": 0}}) + result = df.replace(0, {"zero": 0.5, "one": 1.0}) + expected = DataFrame({"zero": {"a": 0.5, "b": 1}, "one": {"a": 2.0, "b": 1.0}}) + tm.assert_frame_equal(result, expected) + + result = df.replace(0, df.mean()) + tm.assert_frame_equal(result, expected) + + # series to series/dict + df = DataFrame({"zero": {"a": 0.0, "b": 1}, "one": {"a": 2.0, "b": 0}}) + s = Series({"zero": 0.0, "one": 2.0}) + result = df.replace(s, {"zero": 0.5, "one": 1.0}) + expected = DataFrame({"zero": {"a": 0.5, "b": 1}, "one": {"a": 1.0, "b": 0.0}}) + tm.assert_frame_equal(result, expected) + + result = df.replace(s, df.mean()) + tm.assert_frame_equal(result, expected) + + @pytest.mark.xfail( + using_pyarrow_string_dtype(), reason="can't set float into string" + ) + def test_replace_convert(self): + # gh 3907 + df = DataFrame([["foo", "bar", "bah"], ["bar", "foo", "bah"]]) + m = {"foo": 1, "bar": 2, "bah": 3} + msg = "Downcasting behavior in `replace` " + with tm.assert_produces_warning(FutureWarning, match=msg): + rep = df.replace(m) + expec = Series([np.int64] * 3) + res = rep.dtypes + tm.assert_series_equal(expec, res) + + @pytest.mark.xfail( + using_pyarrow_string_dtype(), reason="can't set float into string" + ) + def test_replace_mixed(self, float_string_frame): + mf = float_string_frame + mf.iloc[5:20, mf.columns.get_loc("foo")] = np.nan + mf.iloc[-10:, mf.columns.get_loc("A")] = np.nan + + result = float_string_frame.replace(np.nan, -18) + expected = float_string_frame.fillna(value=-18) + tm.assert_frame_equal(result, expected) + tm.assert_frame_equal(result.replace(-18, np.nan), float_string_frame) + + result = float_string_frame.replace(np.nan, -1e8) + expected = float_string_frame.fillna(value=-1e8) + tm.assert_frame_equal(result, expected) + tm.assert_frame_equal(result.replace(-1e8, np.nan), float_string_frame) + + def test_replace_mixed_int_block_upcasting(self): + # int block upcasting + df = DataFrame( + { + "A": Series([1.0, 2.0], dtype="float64"), + "B": Series([0, 1], dtype="int64"), + } + ) + expected = DataFrame( + { + "A": Series([1.0, 2.0], dtype="float64"), + "B": Series([0.5, 1], dtype="float64"), + } + ) + result = df.replace(0, 0.5) + tm.assert_frame_equal(result, expected) + + return_value = df.replace(0, 0.5, inplace=True) + assert return_value is None + tm.assert_frame_equal(df, expected) + + def test_replace_mixed_int_block_splitting(self): + # int block splitting + df = DataFrame( + { + "A": Series([1.0, 2.0], dtype="float64"), + "B": Series([0, 1], dtype="int64"), + "C": Series([1, 2], dtype="int64"), + } + ) + expected = DataFrame( + { + "A": Series([1.0, 2.0], dtype="float64"), + "B": Series([0.5, 1], dtype="float64"), + "C": Series([1, 2], dtype="int64"), + } + ) + result = df.replace(0, 0.5) + tm.assert_frame_equal(result, expected) + + def test_replace_mixed2(self, using_infer_string): + # to object block upcasting + df = DataFrame( + { + "A": Series([1.0, 2.0], dtype="float64"), + "B": Series([0, 1], dtype="int64"), + } + ) + expected = DataFrame( + { + "A": Series([1, "foo"], dtype="object"), + "B": Series([0, 1], dtype="int64"), + } + ) + result = df.replace(2, "foo") + tm.assert_frame_equal(result, expected) + + expected = DataFrame( + { + "A": Series(["foo", "bar"]), + "B": Series([0, "foo"], dtype="object"), + } + ) + if using_infer_string: + with tm.assert_produces_warning(FutureWarning, match="Downcasting"): + result = df.replace([1, 2], ["foo", "bar"]) + else: + result = df.replace([1, 2], ["foo", "bar"]) + tm.assert_frame_equal(result, expected) + + def test_replace_mixed3(self): + # test case from + df = DataFrame( + {"A": Series([3, 0], dtype="int64"), "B": Series([0, 3], dtype="int64")} + ) + result = df.replace(3, df.mean().to_dict()) + expected = df.copy().astype("float64") + m = df.mean() + expected.iloc[0, 0] = m.iloc[0] + expected.iloc[1, 1] = m.iloc[1] + tm.assert_frame_equal(result, expected) + + def test_replace_nullable_int_with_string_doesnt_cast(self): + # GH#25438 don't cast df['a'] to float64 + df = DataFrame({"a": [1, 2, 3, np.nan], "b": ["some", "strings", "here", "he"]}) + df["a"] = df["a"].astype("Int64") + + res = df.replace("", np.nan) + tm.assert_series_equal(res["a"], df["a"]) + + @pytest.mark.parametrize("dtype", ["boolean", "Int64", "Float64"]) + def test_replace_with_nullable_column(self, dtype): + # GH-44499 + nullable_ser = Series([1, 0, 1], dtype=dtype) + df = DataFrame({"A": ["A", "B", "x"], "B": nullable_ser}) + result = df.replace("x", "X") + expected = DataFrame({"A": ["A", "B", "X"], "B": nullable_ser}) + tm.assert_frame_equal(result, expected) + + def test_replace_simple_nested_dict(self): + df = DataFrame({"col": range(1, 5)}) + expected = DataFrame({"col": ["a", 2, 3, "b"]}) + + result = df.replace({"col": {1: "a", 4: "b"}}) + tm.assert_frame_equal(expected, result) + + # in this case, should be the same as the not nested version + result = df.replace({1: "a", 4: "b"}) + tm.assert_frame_equal(expected, result) + + def test_replace_simple_nested_dict_with_nonexistent_value(self): + df = DataFrame({"col": range(1, 5)}) + expected = DataFrame({"col": ["a", 2, 3, "b"]}) + + result = df.replace({-1: "-", 1: "a", 4: "b"}) + tm.assert_frame_equal(expected, result) + + result = df.replace({"col": {-1: "-", 1: "a", 4: "b"}}) + tm.assert_frame_equal(expected, result) + + def test_replace_NA_with_None(self): + # gh-45601 + df = DataFrame({"value": [42, None]}).astype({"value": "Int64"}) + result = df.replace({pd.NA: None}) + expected = DataFrame({"value": [42, None]}, dtype=object) + tm.assert_frame_equal(result, expected) + + def test_replace_NAT_with_None(self): + # gh-45836 + df = DataFrame([pd.NaT, pd.NaT]) + result = df.replace({pd.NaT: None, np.nan: None}) + expected = DataFrame([None, None]) + tm.assert_frame_equal(result, expected) + + def test_replace_with_None_keeps_categorical(self): + # gh-46634 + cat_series = Series(["b", "b", "b", "d"], dtype="category") + df = DataFrame( + { + "id": Series([5, 4, 3, 2], dtype="float64"), + "col": cat_series, + } + ) + result = df.replace({3: None}) + + expected = DataFrame( + { + "id": Series([5.0, 4.0, None, 2.0], dtype="object"), + "col": cat_series, + } + ) + tm.assert_frame_equal(result, expected) + + def test_replace_value_is_none(self, datetime_frame): + orig_value = datetime_frame.iloc[0, 0] + orig2 = datetime_frame.iloc[1, 0] + + datetime_frame.iloc[0, 0] = np.nan + datetime_frame.iloc[1, 0] = 1 + + result = datetime_frame.replace(to_replace={np.nan: 0}) + expected = datetime_frame.T.replace(to_replace={np.nan: 0}).T + tm.assert_frame_equal(result, expected) + + result = datetime_frame.replace(to_replace={np.nan: 0, 1: -1e8}) + tsframe = datetime_frame.copy() + tsframe.iloc[0, 0] = 0 + tsframe.iloc[1, 0] = -1e8 + expected = tsframe + tm.assert_frame_equal(expected, result) + datetime_frame.iloc[0, 0] = orig_value + datetime_frame.iloc[1, 0] = orig2 + + def test_replace_for_new_dtypes(self, datetime_frame): + # dtypes + tsframe = datetime_frame.copy().astype(np.float32) + tsframe.loc[tsframe.index[:5], "A"] = np.nan + tsframe.loc[tsframe.index[-5:], "A"] = np.nan + + zero_filled = tsframe.replace(np.nan, -1e8) + tm.assert_frame_equal(zero_filled, tsframe.fillna(-1e8)) + tm.assert_frame_equal(zero_filled.replace(-1e8, np.nan), tsframe) + + tsframe.loc[tsframe.index[:5], "A"] = np.nan + tsframe.loc[tsframe.index[-5:], "A"] = np.nan + tsframe.loc[tsframe.index[:5], "B"] = np.nan + msg = "DataFrame.fillna with 'method' is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + # TODO: what is this even testing? + result = tsframe.fillna(method="bfill") + tm.assert_frame_equal(result, tsframe.fillna(method="bfill")) + + @pytest.mark.parametrize( + "frame, to_replace, value, expected", + [ + (DataFrame({"ints": [1, 2, 3]}), 1, 0, DataFrame({"ints": [0, 2, 3]})), + ( + DataFrame({"ints": [1, 2, 3]}, dtype=np.int32), + 1, + 0, + DataFrame({"ints": [0, 2, 3]}, dtype=np.int32), + ), + ( + DataFrame({"ints": [1, 2, 3]}, dtype=np.int16), + 1, + 0, + DataFrame({"ints": [0, 2, 3]}, dtype=np.int16), + ), + ( + DataFrame({"bools": [True, False, True]}), + False, + True, + DataFrame({"bools": [True, True, True]}), + ), + ( + DataFrame({"complex": [1j, 2j, 3j]}), + 1j, + 0, + DataFrame({"complex": [0j, 2j, 3j]}), + ), + ( + DataFrame( + { + "datetime64": Index( + [ + datetime(2018, 5, 28), + datetime(2018, 7, 28), + datetime(2018, 5, 28), + ] + ) + } + ), + datetime(2018, 5, 28), + datetime(2018, 7, 28), + DataFrame({"datetime64": Index([datetime(2018, 7, 28)] * 3)}), + ), + # GH 20380 + ( + DataFrame({"dt": [datetime(3017, 12, 20)], "str": ["foo"]}), + "foo", + "bar", + DataFrame({"dt": [datetime(3017, 12, 20)], "str": ["bar"]}), + ), + # GH 36782 + ( + DataFrame({"dt": [datetime(2920, 10, 1)]}), + datetime(2920, 10, 1), + datetime(2020, 10, 1), + DataFrame({"dt": [datetime(2020, 10, 1)]}), + ), + ( + DataFrame( + { + "A": date_range("20130101", periods=3, tz="US/Eastern"), + "B": [0, np.nan, 2], + } + ), + Timestamp("20130102", tz="US/Eastern"), + Timestamp("20130104", tz="US/Eastern"), + DataFrame( + { + "A": pd.DatetimeIndex( + [ + Timestamp("20130101", tz="US/Eastern"), + Timestamp("20130104", tz="US/Eastern"), + Timestamp("20130103", tz="US/Eastern"), + ] + ).as_unit("ns"), + "B": [0, np.nan, 2], + } + ), + ), + # GH 35376 + ( + DataFrame([[1, 1.0], [2, 2.0]]), + 1.0, + 5, + DataFrame([[5, 5.0], [2, 2.0]]), + ), + ( + DataFrame([[1, 1.0], [2, 2.0]]), + 1, + 5, + DataFrame([[5, 5.0], [2, 2.0]]), + ), + ( + DataFrame([[1, 1.0], [2, 2.0]]), + 1.0, + 5.0, + DataFrame([[5, 5.0], [2, 2.0]]), + ), + ( + DataFrame([[1, 1.0], [2, 2.0]]), + 1, + 5.0, + DataFrame([[5, 5.0], [2, 2.0]]), + ), + ], + ) + def test_replace_dtypes(self, frame, to_replace, value, expected): + warn = None + if isinstance(to_replace, datetime) and to_replace.year == 2920: + warn = FutureWarning + msg = "Downcasting behavior in `replace` " + with tm.assert_produces_warning(warn, match=msg): + result = frame.replace(to_replace, value) + tm.assert_frame_equal(result, expected) + + def test_replace_input_formats_listlike(self): + # both dicts + to_rep = {"A": np.nan, "B": 0, "C": ""} + values = {"A": 0, "B": -1, "C": "missing"} + df = DataFrame( + {"A": [np.nan, 0, np.inf], "B": [0, 2, 5], "C": ["", "asdf", "fd"]} + ) + filled = df.replace(to_rep, values) + expected = {k: v.replace(to_rep[k], values[k]) for k, v in df.items()} + tm.assert_frame_equal(filled, DataFrame(expected)) + + result = df.replace([0, 2, 5], [5, 2, 0]) + expected = DataFrame( + {"A": [np.nan, 5, np.inf], "B": [5, 2, 0], "C": ["", "asdf", "fd"]} + ) + tm.assert_frame_equal(result, expected) + + # scalar to dict + values = {"A": 0, "B": -1, "C": "missing"} + df = DataFrame( + {"A": [np.nan, 0, np.nan], "B": [0, 2, 5], "C": ["", "asdf", "fd"]} + ) + filled = df.replace(np.nan, values) + expected = {k: v.replace(np.nan, values[k]) for k, v in df.items()} + tm.assert_frame_equal(filled, DataFrame(expected)) + + # list to list + to_rep = [np.nan, 0, ""] + values = [-2, -1, "missing"] + result = df.replace(to_rep, values) + expected = df.copy() + for rep, value in zip(to_rep, values): + return_value = expected.replace(rep, value, inplace=True) + assert return_value is None + tm.assert_frame_equal(result, expected) + + msg = r"Replacement lists must match in length\. Expecting 3 got 2" + with pytest.raises(ValueError, match=msg): + df.replace(to_rep, values[1:]) + + @pytest.mark.xfail( + using_pyarrow_string_dtype(), reason="can't set float into string" + ) + def test_replace_input_formats_scalar(self): + df = DataFrame( + {"A": [np.nan, 0, np.inf], "B": [0, 2, 5], "C": ["", "asdf", "fd"]} + ) + + # dict to scalar + to_rep = {"A": np.nan, "B": 0, "C": ""} + filled = df.replace(to_rep, 0) + expected = {k: v.replace(to_rep[k], 0) for k, v in df.items()} + tm.assert_frame_equal(filled, DataFrame(expected)) + + msg = "value argument must be scalar, dict, or Series" + with pytest.raises(TypeError, match=msg): + df.replace(to_rep, [np.nan, 0, ""]) + + # list to scalar + to_rep = [np.nan, 0, ""] + result = df.replace(to_rep, -1) + expected = df.copy() + for rep in to_rep: + return_value = expected.replace(rep, -1, inplace=True) + assert return_value is None + tm.assert_frame_equal(result, expected) + + def test_replace_limit(self): + # TODO + pass + + @pytest.mark.xfail( + using_pyarrow_string_dtype(), reason="can't set float into string" + ) + def test_replace_dict_no_regex(self): + answer = Series( + { + 0: "Strongly Agree", + 1: "Agree", + 2: "Neutral", + 3: "Disagree", + 4: "Strongly Disagree", + } + ) + weights = { + "Agree": 4, + "Disagree": 2, + "Neutral": 3, + "Strongly Agree": 5, + "Strongly Disagree": 1, + } + expected = Series({0: 5, 1: 4, 2: 3, 3: 2, 4: 1}) + msg = "Downcasting behavior in `replace` " + with tm.assert_produces_warning(FutureWarning, match=msg): + result = answer.replace(weights) + tm.assert_series_equal(result, expected) + + @pytest.mark.xfail( + using_pyarrow_string_dtype(), reason="can't set float into string" + ) + def test_replace_series_no_regex(self): + answer = Series( + { + 0: "Strongly Agree", + 1: "Agree", + 2: "Neutral", + 3: "Disagree", + 4: "Strongly Disagree", + } + ) + weights = Series( + { + "Agree": 4, + "Disagree": 2, + "Neutral": 3, + "Strongly Agree": 5, + "Strongly Disagree": 1, + } + ) + expected = Series({0: 5, 1: 4, 2: 3, 3: 2, 4: 1}) + msg = "Downcasting behavior in `replace` " + with tm.assert_produces_warning(FutureWarning, match=msg): + result = answer.replace(weights) + tm.assert_series_equal(result, expected) + + def test_replace_dict_tuple_list_ordering_remains_the_same(self): + df = DataFrame({"A": [np.nan, 1]}) + res1 = df.replace(to_replace={np.nan: 0, 1: -1e8}) + res2 = df.replace(to_replace=(1, np.nan), value=[-1e8, 0]) + res3 = df.replace(to_replace=[1, np.nan], value=[-1e8, 0]) + + expected = DataFrame({"A": [0, -1e8]}) + tm.assert_frame_equal(res1, res2) + tm.assert_frame_equal(res2, res3) + tm.assert_frame_equal(res3, expected) + + def test_replace_doesnt_replace_without_regex(self): + df = DataFrame( + { + "fol": [1, 2, 2, 3], + "T_opp": ["0", "vr", "0", "0"], + "T_Dir": ["0", "0", "0", "bt"], + "T_Enh": ["vo", "0", "0", "0"], + } + ) + res = df.replace({r"\D": 1}) + tm.assert_frame_equal(df, res) + + def test_replace_bool_with_string(self): + df = DataFrame({"a": [True, False], "b": list("ab")}) + result = df.replace(True, "a") + expected = DataFrame({"a": ["a", False], "b": df.b}) + tm.assert_frame_equal(result, expected) + + def test_replace_pure_bool_with_string_no_op(self): + df = DataFrame(np.random.default_rng(2).random((2, 2)) > 0.5) + result = df.replace("asdf", "fdsa") + tm.assert_frame_equal(df, result) + + def test_replace_bool_with_bool(self): + df = DataFrame(np.random.default_rng(2).random((2, 2)) > 0.5) + result = df.replace(False, True) + expected = DataFrame(np.ones((2, 2), dtype=bool)) + tm.assert_frame_equal(result, expected) + + def test_replace_with_dict_with_bool_keys(self): + df = DataFrame({0: [True, False], 1: [False, True]}) + result = df.replace({"asdf": "asdb", True: "yes"}) + expected = DataFrame({0: ["yes", False], 1: [False, "yes"]}) + tm.assert_frame_equal(result, expected) + + def test_replace_dict_strings_vs_ints(self): + # GH#34789 + df = DataFrame({"Y0": [1, 2], "Y1": [3, 4]}) + result = df.replace({"replace_string": "test"}) + + tm.assert_frame_equal(result, df) + + result = df["Y0"].replace({"replace_string": "test"}) + tm.assert_series_equal(result, df["Y0"]) + + def test_replace_truthy(self): + df = DataFrame({"a": [True, True]}) + r = df.replace([np.inf, -np.inf], np.nan) + e = df + tm.assert_frame_equal(r, e) + + def test_nested_dict_overlapping_keys_replace_int(self): + # GH 27660 keep behaviour consistent for simple dictionary and + # nested dictionary replacement + df = DataFrame({"a": list(range(1, 5))}) + + result = df.replace({"a": dict(zip(range(1, 5), range(2, 6)))}) + expected = df.replace(dict(zip(range(1, 5), range(2, 6)))) + tm.assert_frame_equal(result, expected) + + def test_nested_dict_overlapping_keys_replace_str(self): + # GH 27660 + a = np.arange(1, 5) + astr = a.astype(str) + bstr = np.arange(2, 6).astype(str) + df = DataFrame({"a": astr}) + result = df.replace(dict(zip(astr, bstr))) + expected = df.replace({"a": dict(zip(astr, bstr))}) + tm.assert_frame_equal(result, expected) + + @pytest.mark.xfail( + using_pyarrow_string_dtype(), reason="can't set float into string" + ) + def test_replace_swapping_bug(self, using_infer_string): + df = DataFrame({"a": [True, False, True]}) + res = df.replace({"a": {True: "Y", False: "N"}}) + expect = DataFrame({"a": ["Y", "N", "Y"]}) + tm.assert_frame_equal(res, expect) + + df = DataFrame({"a": [0, 1, 0]}) + res = df.replace({"a": {0: "Y", 1: "N"}}) + expect = DataFrame({"a": ["Y", "N", "Y"]}) + tm.assert_frame_equal(res, expect) + + @pytest.mark.xfail( + using_pyarrow_string_dtype(), reason="can't set float into string" + ) + def test_replace_period(self): + d = { + "fname": { + "out_augmented_AUG_2011.json": pd.Period(year=2011, month=8, freq="M"), + "out_augmented_JAN_2011.json": pd.Period(year=2011, month=1, freq="M"), + "out_augmented_MAY_2012.json": pd.Period(year=2012, month=5, freq="M"), + "out_augmented_SUBSIDY_WEEK.json": pd.Period( + year=2011, month=4, freq="M" + ), + "out_augmented_AUG_2012.json": pd.Period(year=2012, month=8, freq="M"), + "out_augmented_MAY_2011.json": pd.Period(year=2011, month=5, freq="M"), + "out_augmented_SEP_2013.json": pd.Period(year=2013, month=9, freq="M"), + } + } + + df = DataFrame( + [ + "out_augmented_AUG_2012.json", + "out_augmented_SEP_2013.json", + "out_augmented_SUBSIDY_WEEK.json", + "out_augmented_MAY_2012.json", + "out_augmented_MAY_2011.json", + "out_augmented_AUG_2011.json", + "out_augmented_JAN_2011.json", + ], + columns=["fname"], + ) + assert set(df.fname.values) == set(d["fname"].keys()) + + expected = DataFrame({"fname": [d["fname"][k] for k in df.fname.values]}) + assert expected.dtypes.iloc[0] == "Period[M]" + msg = "Downcasting behavior in `replace` " + with tm.assert_produces_warning(FutureWarning, match=msg): + result = df.replace(d) + tm.assert_frame_equal(result, expected) + + @pytest.mark.xfail( + using_pyarrow_string_dtype(), reason="can't set float into string" + ) + def test_replace_datetime(self): + d = { + "fname": { + "out_augmented_AUG_2011.json": Timestamp("2011-08"), + "out_augmented_JAN_2011.json": Timestamp("2011-01"), + "out_augmented_MAY_2012.json": Timestamp("2012-05"), + "out_augmented_SUBSIDY_WEEK.json": Timestamp("2011-04"), + "out_augmented_AUG_2012.json": Timestamp("2012-08"), + "out_augmented_MAY_2011.json": Timestamp("2011-05"), + "out_augmented_SEP_2013.json": Timestamp("2013-09"), + } + } + + df = DataFrame( + [ + "out_augmented_AUG_2012.json", + "out_augmented_SEP_2013.json", + "out_augmented_SUBSIDY_WEEK.json", + "out_augmented_MAY_2012.json", + "out_augmented_MAY_2011.json", + "out_augmented_AUG_2011.json", + "out_augmented_JAN_2011.json", + ], + columns=["fname"], + ) + assert set(df.fname.values) == set(d["fname"].keys()) + expected = DataFrame({"fname": [d["fname"][k] for k in df.fname.values]}) + msg = "Downcasting behavior in `replace` " + with tm.assert_produces_warning(FutureWarning, match=msg): + result = df.replace(d) + tm.assert_frame_equal(result, expected) + + def test_replace_datetimetz(self): + # GH 11326 + # behaving poorly when presented with a datetime64[ns, tz] + df = DataFrame( + { + "A": date_range("20130101", periods=3, tz="US/Eastern"), + "B": [0, np.nan, 2], + } + ) + result = df.replace(np.nan, 1) + expected = DataFrame( + { + "A": date_range("20130101", periods=3, tz="US/Eastern"), + "B": Series([0, 1, 2], dtype="float64"), + } + ) + tm.assert_frame_equal(result, expected) + + result = df.fillna(1) + tm.assert_frame_equal(result, expected) + + result = df.replace(0, np.nan) + expected = DataFrame( + { + "A": date_range("20130101", periods=3, tz="US/Eastern"), + "B": [np.nan, np.nan, 2], + } + ) + tm.assert_frame_equal(result, expected) + + result = df.replace( + Timestamp("20130102", tz="US/Eastern"), + Timestamp("20130104", tz="US/Eastern"), + ) + expected = DataFrame( + { + "A": [ + Timestamp("20130101", tz="US/Eastern"), + Timestamp("20130104", tz="US/Eastern"), + Timestamp("20130103", tz="US/Eastern"), + ], + "B": [0, np.nan, 2], + } + ) + expected["A"] = expected["A"].dt.as_unit("ns") + tm.assert_frame_equal(result, expected) + + result = df.copy() + result.iloc[1, 0] = np.nan + result = result.replace({"A": pd.NaT}, Timestamp("20130104", tz="US/Eastern")) + tm.assert_frame_equal(result, expected) + + # pre-2.0 this would coerce to object with mismatched tzs + result = df.copy() + result.iloc[1, 0] = np.nan + result = result.replace({"A": pd.NaT}, Timestamp("20130104", tz="US/Pacific")) + expected = DataFrame( + { + "A": [ + Timestamp("20130101", tz="US/Eastern"), + Timestamp("20130104", tz="US/Pacific").tz_convert("US/Eastern"), + Timestamp("20130103", tz="US/Eastern"), + ], + "B": [0, np.nan, 2], + } + ) + expected["A"] = expected["A"].dt.as_unit("ns") + tm.assert_frame_equal(result, expected) + + result = df.copy() + result.iloc[1, 0] = np.nan + result = result.replace({"A": np.nan}, Timestamp("20130104")) + expected = DataFrame( + { + "A": [ + Timestamp("20130101", tz="US/Eastern"), + Timestamp("20130104"), + Timestamp("20130103", tz="US/Eastern"), + ], + "B": [0, np.nan, 2], + } + ) + tm.assert_frame_equal(result, expected) + + def test_replace_with_empty_dictlike(self, mix_abc): + # GH 15289 + df = DataFrame(mix_abc) + tm.assert_frame_equal(df, df.replace({})) + tm.assert_frame_equal(df, df.replace(Series([], dtype=object))) + + tm.assert_frame_equal(df, df.replace({"b": {}})) + tm.assert_frame_equal(df, df.replace(Series({"b": {}}))) + + @pytest.mark.parametrize( + "to_replace, method, expected", + [ + (0, "bfill", {"A": [1, 1, 2], "B": [5, np.nan, 7], "C": ["a", "b", "c"]}), + ( + np.nan, + "bfill", + {"A": [0, 1, 2], "B": [5.0, 7.0, 7.0], "C": ["a", "b", "c"]}, + ), + ("d", "ffill", {"A": [0, 1, 2], "B": [5, np.nan, 7], "C": ["a", "b", "c"]}), + ( + [0, 2], + "bfill", + {"A": [1, 1, 2], "B": [5, np.nan, 7], "C": ["a", "b", "c"]}, + ), + ( + [1, 2], + "pad", + {"A": [0, 0, 0], "B": [5, np.nan, 7], "C": ["a", "b", "c"]}, + ), + ( + (1, 2), + "bfill", + {"A": [0, 2, 2], "B": [5, np.nan, 7], "C": ["a", "b", "c"]}, + ), + ( + ["b", "c"], + "ffill", + {"A": [0, 1, 2], "B": [5, np.nan, 7], "C": ["a", "a", "a"]}, + ), + ], + ) + def test_replace_method(self, to_replace, method, expected): + # GH 19632 + df = DataFrame({"A": [0, 1, 2], "B": [5, np.nan, 7], "C": ["a", "b", "c"]}) + + msg = "The 'method' keyword in DataFrame.replace is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = df.replace(to_replace=to_replace, value=None, method=method) + expected = DataFrame(expected) + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize( + "replace_dict, final_data", + [({"a": 1, "b": 1}, [[3, 3], [2, 2]]), ({"a": 1, "b": 2}, [[3, 1], [2, 3]])], + ) + def test_categorical_replace_with_dict(self, replace_dict, final_data): + # GH 26988 + df = DataFrame([[1, 1], [2, 2]], columns=["a", "b"], dtype="category") + + final_data = np.array(final_data) + + a = pd.Categorical(final_data[:, 0], categories=[3, 2]) + + ex_cat = [3, 2] if replace_dict["b"] == 1 else [1, 3] + b = pd.Categorical(final_data[:, 1], categories=ex_cat) + + expected = DataFrame({"a": a, "b": b}) + msg2 = "with CategoricalDtype is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg2): + result = df.replace(replace_dict, 3) + tm.assert_frame_equal(result, expected) + msg = ( + r"Attributes of DataFrame.iloc\[:, 0\] \(column name=\"a\"\) are " + "different" + ) + with pytest.raises(AssertionError, match=msg): + # ensure non-inplace call does not affect original + tm.assert_frame_equal(df, expected) + with tm.assert_produces_warning(FutureWarning, match=msg2): + return_value = df.replace(replace_dict, 3, inplace=True) + assert return_value is None + tm.assert_frame_equal(df, expected) + + @pytest.mark.parametrize( + "df, to_replace, exp", + [ + ( + {"col1": [1, 2, 3], "col2": [4, 5, 6]}, + {4: 5, 5: 6, 6: 7}, + {"col1": [1, 2, 3], "col2": [5, 6, 7]}, + ), + ( + {"col1": [1, 2, 3], "col2": ["4", "5", "6"]}, + {"4": "5", "5": "6", "6": "7"}, + {"col1": [1, 2, 3], "col2": ["5", "6", "7"]}, + ), + ], + ) + def test_replace_commutative(self, df, to_replace, exp): + # GH 16051 + # DataFrame.replace() overwrites when values are non-numeric + # also added to data frame whilst issue was for series + + df = DataFrame(df) + + expected = DataFrame(exp) + result = df.replace(to_replace) + tm.assert_frame_equal(result, expected) + + @pytest.mark.xfail( + using_pyarrow_string_dtype(), reason="can't set float into string" + ) + @pytest.mark.parametrize( + "replacer", + [ + Timestamp("20170827"), + np.int8(1), + np.int16(1), + np.float32(1), + np.float64(1), + ], + ) + def test_replace_replacer_dtype(self, replacer): + # GH26632 + df = DataFrame(["a"]) + msg = "Downcasting behavior in `replace` " + with tm.assert_produces_warning(FutureWarning, match=msg): + result = df.replace({"a": replacer, "b": replacer}) + expected = DataFrame([replacer]) + tm.assert_frame_equal(result, expected) + + def test_replace_after_convert_dtypes(self): + # GH31517 + df = DataFrame({"grp": [1, 2, 3, 4, 5]}, dtype="Int64") + result = df.replace(1, 10) + expected = DataFrame({"grp": [10, 2, 3, 4, 5]}, dtype="Int64") + tm.assert_frame_equal(result, expected) + + def test_replace_invalid_to_replace(self): + # GH 18634 + # API: replace() should raise an exception if invalid argument is given + df = DataFrame({"one": ["a", "b ", "c"], "two": ["d ", "e ", "f "]}) + msg = ( + r"Expecting 'to_replace' to be either a scalar, array-like, " + r"dict or None, got invalid type.*" + ) + msg2 = ( + "DataFrame.replace without 'value' and with non-dict-like " + "'to_replace' is deprecated" + ) + with pytest.raises(TypeError, match=msg): + with tm.assert_produces_warning(FutureWarning, match=msg2): + df.replace(lambda x: x.strip()) + + @pytest.mark.parametrize("dtype", ["float", "float64", "int64", "Int64", "boolean"]) + @pytest.mark.parametrize("value", [np.nan, pd.NA]) + def test_replace_no_replacement_dtypes(self, dtype, value): + # https://github.com/pandas-dev/pandas/issues/32988 + df = DataFrame(np.eye(2), dtype=dtype) + result = df.replace(to_replace=[None, -np.inf, np.inf], value=value) + tm.assert_frame_equal(result, df) + + @pytest.mark.parametrize("replacement", [np.nan, 5]) + def test_replace_with_duplicate_columns(self, replacement): + # GH 24798 + result = DataFrame({"A": [1, 2, 3], "A1": [4, 5, 6], "B": [7, 8, 9]}) + result.columns = list("AAB") + + expected = DataFrame( + {"A": [1, 2, 3], "A1": [4, 5, 6], "B": [replacement, 8, 9]} + ) + expected.columns = list("AAB") + + result["B"] = result["B"].replace(7, replacement) + + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize("value", [pd.Period("2020-01"), pd.Interval(0, 5)]) + def test_replace_ea_ignore_float(self, frame_or_series, value): + # GH#34871 + obj = DataFrame({"Per": [value] * 3}) + obj = tm.get_obj(obj, frame_or_series) + + expected = obj.copy() + result = obj.replace(1.0, 0.0) + tm.assert_equal(expected, result) + + def test_replace_value_category_type(self): + """ + Test for #23305: to ensure category dtypes are maintained + after replace with direct values + """ + + # create input data + input_dict = { + "col1": [1, 2, 3, 4], + "col2": ["a", "b", "c", "d"], + "col3": [1.5, 2.5, 3.5, 4.5], + "col4": ["cat1", "cat2", "cat3", "cat4"], + "col5": ["obj1", "obj2", "obj3", "obj4"], + } + # explicitly cast columns as category and order them + input_df = DataFrame(data=input_dict).astype( + {"col2": "category", "col4": "category"} + ) + input_df["col2"] = input_df["col2"].cat.reorder_categories( + ["a", "b", "c", "d"], ordered=True + ) + input_df["col4"] = input_df["col4"].cat.reorder_categories( + ["cat1", "cat2", "cat3", "cat4"], ordered=True + ) + + # create expected dataframe + expected_dict = { + "col1": [1, 2, 3, 4], + "col2": ["a", "b", "c", "z"], + "col3": [1.5, 2.5, 3.5, 4.5], + "col4": ["cat1", "catX", "cat3", "cat4"], + "col5": ["obj9", "obj2", "obj3", "obj4"], + } + # explicitly cast columns as category and order them + expected = DataFrame(data=expected_dict).astype( + {"col2": "category", "col4": "category"} + ) + expected["col2"] = expected["col2"].cat.reorder_categories( + ["a", "b", "c", "z"], ordered=True + ) + expected["col4"] = expected["col4"].cat.reorder_categories( + ["cat1", "catX", "cat3", "cat4"], ordered=True + ) + + # replace values in input dataframe + msg = ( + r"The behavior of Series\.replace \(and DataFrame.replace\) " + "with CategoricalDtype" + ) + with tm.assert_produces_warning(FutureWarning, match=msg): + input_df = input_df.replace("d", "z") + input_df = input_df.replace("obj1", "obj9") + result = input_df.replace("cat2", "catX") + + tm.assert_frame_equal(result, expected) + + def test_replace_dict_category_type(self): + """ + Test to ensure category dtypes are maintained + after replace with dict values + """ + # GH#35268, GH#44940 + + # create input dataframe + input_dict = {"col1": ["a"], "col2": ["obj1"], "col3": ["cat1"]} + # explicitly cast columns as category + input_df = DataFrame(data=input_dict).astype( + {"col1": "category", "col2": "category", "col3": "category"} + ) + + # create expected dataframe + expected_dict = {"col1": ["z"], "col2": ["obj9"], "col3": ["catX"]} + # explicitly cast columns as category + expected = DataFrame(data=expected_dict).astype( + {"col1": "category", "col2": "category", "col3": "category"} + ) + + # replace values in input dataframe using a dict + msg = ( + r"The behavior of Series\.replace \(and DataFrame.replace\) " + "with CategoricalDtype" + ) + with tm.assert_produces_warning(FutureWarning, match=msg): + result = input_df.replace({"a": "z", "obj1": "obj9", "cat1": "catX"}) + + tm.assert_frame_equal(result, expected) + + def test_replace_with_compiled_regex(self): + # https://github.com/pandas-dev/pandas/issues/35680 + df = DataFrame(["a", "b", "c"]) + regex = re.compile("^a$") + result = df.replace({regex: "z"}, regex=True) + expected = DataFrame(["z", "b", "c"]) + tm.assert_frame_equal(result, expected) + + def test_replace_intervals(self, using_infer_string): + # https://github.com/pandas-dev/pandas/issues/35931 + df = DataFrame({"a": [pd.Interval(0, 1), pd.Interval(0, 1)]}) + warning = FutureWarning if using_infer_string else None + with tm.assert_produces_warning(warning, match="Downcasting"): + result = df.replace({"a": {pd.Interval(0, 1): "x"}}) + expected = DataFrame({"a": ["x", "x"]}) + tm.assert_frame_equal(result, expected) + + def test_replace_unicode(self): + # GH: 16784 + columns_values_map = {"positive": {"正面": 1, "中立": 1, "负面": 0}} + df1 = DataFrame({"positive": np.ones(3)}) + result = df1.replace(columns_values_map) + expected = DataFrame({"positive": np.ones(3)}) + tm.assert_frame_equal(result, expected) + + def test_replace_bytes(self, frame_or_series): + # GH#38900 + obj = frame_or_series(["o"]).astype("|S") + expected = obj.copy() + obj = obj.replace({None: np.nan}) + tm.assert_equal(obj, expected) + + @pytest.mark.parametrize( + "data, to_replace, value, expected", + [ + ([1], [1.0], [0], [0]), + ([1], [1], [0], [0]), + ([1.0], [1.0], [0], [0.0]), + ([1.0], [1], [0], [0.0]), + ], + ) + @pytest.mark.parametrize("box", [list, tuple, np.array]) + def test_replace_list_with_mixed_type( + self, data, to_replace, value, expected, box, frame_or_series + ): + # GH#40371 + obj = frame_or_series(data) + expected = frame_or_series(expected) + result = obj.replace(box(to_replace), value) + tm.assert_equal(result, expected) + + @pytest.mark.parametrize("val", [2, np.nan, 2.0]) + def test_replace_value_none_dtype_numeric(self, val): + # GH#48231 + df = DataFrame({"a": [1, val]}) + result = df.replace(val, None) + expected = DataFrame({"a": [1, None]}, dtype=object) + tm.assert_frame_equal(result, expected) + + df = DataFrame({"a": [1, val]}) + result = df.replace({val: None}) + tm.assert_frame_equal(result, expected) + + def test_replace_with_nil_na(self): + # GH 32075 + ser = DataFrame({"a": ["nil", pd.NA]}) + expected = DataFrame({"a": ["anything else", pd.NA]}, index=[0, 1]) + result = ser.replace("nil", "anything else") + tm.assert_frame_equal(expected, result) + + +class TestDataFrameReplaceRegex: + @pytest.mark.parametrize( + "data", + [ + {"a": list("ab.."), "b": list("efgh")}, + {"a": list("ab.."), "b": list(range(4))}, + ], + ) + @pytest.mark.parametrize( + "to_replace,value", [(r"\s*\.\s*", np.nan), (r"\s*(\.)\s*", r"\1\1\1")] + ) + @pytest.mark.parametrize("compile_regex", [True, False]) + @pytest.mark.parametrize("regex_kwarg", [True, False]) + @pytest.mark.parametrize("inplace", [True, False]) + def test_regex_replace_scalar( + self, data, to_replace, value, compile_regex, regex_kwarg, inplace + ): + df = DataFrame(data) + expected = df.copy() + + if compile_regex: + to_replace = re.compile(to_replace) + + if regex_kwarg: + regex = to_replace + to_replace = None + else: + regex = True + + result = df.replace(to_replace, value, inplace=inplace, regex=regex) + + if inplace: + assert result is None + result = df + + if value is np.nan: + expected_replace_val = np.nan + else: + expected_replace_val = "..." + + expected.loc[expected["a"] == ".", "a"] = expected_replace_val + tm.assert_frame_equal(result, expected) + + @pytest.mark.xfail( + using_pyarrow_string_dtype(), reason="can't set float into string" + ) + @pytest.mark.parametrize("regex", [False, True]) + def test_replace_regex_dtype_frame(self, regex): + # GH-48644 + df1 = DataFrame({"A": ["0"], "B": ["0"]}) + expected_df1 = DataFrame({"A": [1], "B": [1]}) + msg = "Downcasting behavior in `replace`" + with tm.assert_produces_warning(FutureWarning, match=msg): + result_df1 = df1.replace(to_replace="0", value=1, regex=regex) + tm.assert_frame_equal(result_df1, expected_df1) + + df2 = DataFrame({"A": ["0"], "B": ["1"]}) + expected_df2 = DataFrame({"A": [1], "B": ["1"]}) + with tm.assert_produces_warning(FutureWarning, match=msg): + result_df2 = df2.replace(to_replace="0", value=1, regex=regex) + tm.assert_frame_equal(result_df2, expected_df2) + + def test_replace_with_value_also_being_replaced(self): + # GH46306 + df = DataFrame({"A": [0, 1, 2], "B": [1, 0, 2]}) + result = df.replace({0: 1, 1: np.nan}) + expected = DataFrame({"A": [1, np.nan, 2], "B": [np.nan, 1, 2]}) + tm.assert_frame_equal(result, expected) + + def test_replace_categorical_no_replacement(self): + # GH#46672 + df = DataFrame( + { + "a": ["one", "two", None, "three"], + "b": ["one", None, "two", "three"], + }, + dtype="category", + ) + expected = df.copy() + + result = df.replace(to_replace=[".", "def"], value=["_", None]) + tm.assert_frame_equal(result, expected) + + def test_replace_object_splitting(self, using_infer_string): + # GH#53977 + df = DataFrame({"a": ["a"], "b": "b"}) + if using_infer_string: + assert len(df._mgr.blocks) == 2 + else: + assert len(df._mgr.blocks) == 1 + df.replace(to_replace=r"^\s*$", value="", inplace=True, regex=True) + if using_infer_string: + assert len(df._mgr.blocks) == 2 + else: + assert len(df._mgr.blocks) == 1 diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/test_reset_index.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/test_reset_index.py new file mode 100644 index 0000000000000000000000000000000000000000..fbf36dbc4fb023364a1def844463679fa7174757 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/test_reset_index.py @@ -0,0 +1,782 @@ +from datetime import datetime +from itertools import product + +import numpy as np +import pytest + +from pandas.core.dtypes.common import ( + is_float_dtype, + is_integer_dtype, +) + +import pandas as pd +from pandas import ( + Categorical, + CategoricalIndex, + DataFrame, + Index, + Interval, + IntervalIndex, + MultiIndex, + RangeIndex, + Series, + Timestamp, + cut, + date_range, +) +import pandas._testing as tm + + +@pytest.fixture() +def multiindex_df(): + levels = [["A", ""], ["B", "b"]] + return DataFrame([[0, 2], [1, 3]], columns=MultiIndex.from_tuples(levels)) + + +class TestResetIndex: + def test_reset_index_empty_rangeindex(self): + # GH#45230 + df = DataFrame( + columns=["brand"], dtype=np.int64, index=RangeIndex(0, 0, 1, name="foo") + ) + + df2 = df.set_index([df.index, "brand"]) + + result = df2.reset_index([1], drop=True) + tm.assert_frame_equal(result, df[[]], check_index_type=True) + + def test_set_reset(self): + idx = Index([2**63, 2**63 + 5, 2**63 + 10], name="foo") + + # set/reset + df = DataFrame({"A": [0, 1, 2]}, index=idx) + result = df.reset_index() + assert result["foo"].dtype == np.dtype("uint64") + + df = result.set_index("foo") + tm.assert_index_equal(df.index, idx) + + def test_set_index_reset_index_dt64tz(self): + idx = Index(date_range("20130101", periods=3, tz="US/Eastern"), name="foo") + + # set/reset + df = DataFrame({"A": [0, 1, 2]}, index=idx) + result = df.reset_index() + assert result["foo"].dtype == "datetime64[ns, US/Eastern]" + + df = result.set_index("foo") + tm.assert_index_equal(df.index, idx) + + def test_reset_index_tz(self, tz_aware_fixture): + # GH 3950 + # reset_index with single level + tz = tz_aware_fixture + idx = date_range("1/1/2011", periods=5, freq="D", tz=tz, name="idx") + df = DataFrame({"a": range(5), "b": ["A", "B", "C", "D", "E"]}, index=idx) + + expected = DataFrame( + { + "idx": idx, + "a": range(5), + "b": ["A", "B", "C", "D", "E"], + }, + columns=["idx", "a", "b"], + ) + result = df.reset_index() + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize("tz", ["US/Eastern", "dateutil/US/Eastern"]) + def test_frame_reset_index_tzaware_index(self, tz): + dr = date_range("2012-06-02", periods=10, tz=tz) + df = DataFrame(np.random.default_rng(2).standard_normal(len(dr)), dr) + roundtripped = df.reset_index().set_index("index") + xp = df.index.tz + rs = roundtripped.index.tz + assert xp == rs + + def test_reset_index_with_intervals(self): + idx = IntervalIndex.from_breaks(np.arange(11), name="x") + original = DataFrame({"x": idx, "y": np.arange(10)})[["x", "y"]] + + result = original.set_index("x") + expected = DataFrame({"y": np.arange(10)}, index=idx) + tm.assert_frame_equal(result, expected) + + result2 = result.reset_index() + tm.assert_frame_equal(result2, original) + + def test_reset_index(self, float_frame): + stacked = float_frame.stack(future_stack=True)[::2] + stacked = DataFrame({"foo": stacked, "bar": stacked}) + + names = ["first", "second"] + stacked.index.names = names + deleveled = stacked.reset_index() + for i, (lev, level_codes) in enumerate( + zip(stacked.index.levels, stacked.index.codes) + ): + values = lev.take(level_codes) + name = names[i] + tm.assert_index_equal(values, Index(deleveled[name])) + + stacked.index.names = [None, None] + deleveled2 = stacked.reset_index() + tm.assert_series_equal( + deleveled["first"], deleveled2["level_0"], check_names=False + ) + tm.assert_series_equal( + deleveled["second"], deleveled2["level_1"], check_names=False + ) + + # default name assigned + rdf = float_frame.reset_index() + exp = Series(float_frame.index.values, name="index") + tm.assert_series_equal(rdf["index"], exp) + + # default name assigned, corner case + df = float_frame.copy() + df["index"] = "foo" + rdf = df.reset_index() + exp = Series(float_frame.index.values, name="level_0") + tm.assert_series_equal(rdf["level_0"], exp) + + # but this is ok + float_frame.index.name = "index" + deleveled = float_frame.reset_index() + tm.assert_series_equal(deleveled["index"], Series(float_frame.index)) + tm.assert_index_equal(deleveled.index, Index(range(len(deleveled))), exact=True) + + # preserve column names + float_frame.columns.name = "columns" + reset = float_frame.reset_index() + assert reset.columns.name == "columns" + + # only remove certain columns + df = float_frame.reset_index().set_index(["index", "A", "B"]) + rs = df.reset_index(["A", "B"]) + + tm.assert_frame_equal(rs, float_frame) + + rs = df.reset_index(["index", "A", "B"]) + tm.assert_frame_equal(rs, float_frame.reset_index()) + + rs = df.reset_index(["index", "A", "B"]) + tm.assert_frame_equal(rs, float_frame.reset_index()) + + rs = df.reset_index("A") + xp = float_frame.reset_index().set_index(["index", "B"]) + tm.assert_frame_equal(rs, xp) + + # test resetting in place + df = float_frame.copy() + reset = float_frame.reset_index() + return_value = df.reset_index(inplace=True) + assert return_value is None + tm.assert_frame_equal(df, reset) + + df = float_frame.reset_index().set_index(["index", "A", "B"]) + rs = df.reset_index("A", drop=True) + xp = float_frame.copy() + del xp["A"] + xp = xp.set_index(["B"], append=True) + tm.assert_frame_equal(rs, xp) + + def test_reset_index_name(self): + df = DataFrame( + [[1, 2, 3, 4], [5, 6, 7, 8]], + columns=["A", "B", "C", "D"], + index=Index(range(2), name="x"), + ) + assert df.reset_index().index.name is None + assert df.reset_index(drop=True).index.name is None + return_value = df.reset_index(inplace=True) + assert return_value is None + assert df.index.name is None + + @pytest.mark.parametrize("levels", [["A", "B"], [0, 1]]) + def test_reset_index_level(self, levels): + df = DataFrame([[1, 2, 3, 4], [5, 6, 7, 8]], columns=["A", "B", "C", "D"]) + + # With MultiIndex + result = df.set_index(["A", "B"]).reset_index(level=levels[0]) + tm.assert_frame_equal(result, df.set_index("B")) + + result = df.set_index(["A", "B"]).reset_index(level=levels[:1]) + tm.assert_frame_equal(result, df.set_index("B")) + + result = df.set_index(["A", "B"]).reset_index(level=levels) + tm.assert_frame_equal(result, df) + + result = df.set_index(["A", "B"]).reset_index(level=levels, drop=True) + tm.assert_frame_equal(result, df[["C", "D"]]) + + # With single-level Index (GH 16263) + result = df.set_index("A").reset_index(level=levels[0]) + tm.assert_frame_equal(result, df) + + result = df.set_index("A").reset_index(level=levels[:1]) + tm.assert_frame_equal(result, df) + + result = df.set_index(["A"]).reset_index(level=levels[0], drop=True) + tm.assert_frame_equal(result, df[["B", "C", "D"]]) + + @pytest.mark.parametrize("idx_lev", [["A", "B"], ["A"]]) + def test_reset_index_level_missing(self, idx_lev): + # Missing levels - for both MultiIndex and single-level Index: + df = DataFrame([[1, 2, 3, 4], [5, 6, 7, 8]], columns=["A", "B", "C", "D"]) + + with pytest.raises(KeyError, match=r"(L|l)evel \(?E\)?"): + df.set_index(idx_lev).reset_index(level=["A", "E"]) + with pytest.raises(IndexError, match="Too many levels"): + df.set_index(idx_lev).reset_index(level=[0, 1, 2]) + + def test_reset_index_right_dtype(self): + time = np.arange(0.0, 10, np.sqrt(2) / 2) + s1 = Series( + (9.81 * time**2) / 2, index=Index(time, name="time"), name="speed" + ) + df = DataFrame(s1) + + reset = s1.reset_index() + assert reset["time"].dtype == np.float64 + + reset = df.reset_index() + assert reset["time"].dtype == np.float64 + + def test_reset_index_multiindex_col(self): + vals = np.random.default_rng(2).standard_normal((3, 3)).astype(object) + idx = ["x", "y", "z"] + full = np.hstack(([[x] for x in idx], vals)) + df = DataFrame( + vals, + Index(idx, name="a"), + columns=[["b", "b", "c"], ["mean", "median", "mean"]], + ) + rs = df.reset_index() + xp = DataFrame( + full, columns=[["a", "b", "b", "c"], ["", "mean", "median", "mean"]] + ) + tm.assert_frame_equal(rs, xp) + + rs = df.reset_index(col_fill=None) + xp = DataFrame( + full, columns=[["a", "b", "b", "c"], ["a", "mean", "median", "mean"]] + ) + tm.assert_frame_equal(rs, xp) + + rs = df.reset_index(col_level=1, col_fill="blah") + xp = DataFrame( + full, columns=[["blah", "b", "b", "c"], ["a", "mean", "median", "mean"]] + ) + tm.assert_frame_equal(rs, xp) + + df = DataFrame( + vals, + MultiIndex.from_arrays([[0, 1, 2], ["x", "y", "z"]], names=["d", "a"]), + columns=[["b", "b", "c"], ["mean", "median", "mean"]], + ) + rs = df.reset_index("a") + xp = DataFrame( + full, + Index([0, 1, 2], name="d"), + columns=[["a", "b", "b", "c"], ["", "mean", "median", "mean"]], + ) + tm.assert_frame_equal(rs, xp) + + rs = df.reset_index("a", col_fill=None) + xp = DataFrame( + full, + Index(range(3), name="d"), + columns=[["a", "b", "b", "c"], ["a", "mean", "median", "mean"]], + ) + tm.assert_frame_equal(rs, xp) + + rs = df.reset_index("a", col_fill="blah", col_level=1) + xp = DataFrame( + full, + Index(range(3), name="d"), + columns=[["blah", "b", "b", "c"], ["a", "mean", "median", "mean"]], + ) + tm.assert_frame_equal(rs, xp) + + def test_reset_index_multiindex_nan(self): + # GH#6322, testing reset_index on MultiIndexes + # when we have a nan or all nan + df = DataFrame( + { + "A": ["a", "b", "c"], + "B": [0, 1, np.nan], + "C": np.random.default_rng(2).random(3), + } + ) + rs = df.set_index(["A", "B"]).reset_index() + tm.assert_frame_equal(rs, df) + + df = DataFrame( + { + "A": [np.nan, "b", "c"], + "B": [0, 1, 2], + "C": np.random.default_rng(2).random(3), + } + ) + rs = df.set_index(["A", "B"]).reset_index() + tm.assert_frame_equal(rs, df) + + df = DataFrame({"A": ["a", "b", "c"], "B": [0, 1, 2], "C": [np.nan, 1.1, 2.2]}) + rs = df.set_index(["A", "B"]).reset_index() + tm.assert_frame_equal(rs, df) + + df = DataFrame( + { + "A": ["a", "b", "c"], + "B": [np.nan, np.nan, np.nan], + "C": np.random.default_rng(2).random(3), + } + ) + rs = df.set_index(["A", "B"]).reset_index() + tm.assert_frame_equal(rs, df) + + @pytest.mark.parametrize( + "name", + [ + None, + "foo", + 2, + 3.0, + pd.Timedelta(6), + Timestamp("2012-12-30", tz="UTC"), + "2012-12-31", + ], + ) + def test_reset_index_with_datetimeindex_cols(self, name): + # GH#5818 + df = DataFrame( + [[1, 2], [3, 4]], + columns=date_range("1/1/2013", "1/2/2013"), + index=["A", "B"], + ) + df.index.name = name + + result = df.reset_index() + + item = name if name is not None else "index" + columns = Index([item, datetime(2013, 1, 1), datetime(2013, 1, 2)]) + if isinstance(item, str) and item == "2012-12-31": + columns = columns.astype("datetime64[ns]") + else: + assert columns.dtype == object + + expected = DataFrame( + [["A", 1, 2], ["B", 3, 4]], + columns=columns, + ) + tm.assert_frame_equal(result, expected) + + def test_reset_index_range(self): + # GH#12071 + df = DataFrame([[0, 0], [1, 1]], columns=["A", "B"], index=RangeIndex(stop=2)) + result = df.reset_index() + assert isinstance(result.index, RangeIndex) + expected = DataFrame( + [[0, 0, 0], [1, 1, 1]], + columns=["index", "A", "B"], + index=RangeIndex(stop=2), + ) + tm.assert_frame_equal(result, expected) + + def test_reset_index_multiindex_columns(self, multiindex_df): + result = multiindex_df[["B"]].rename_axis("A").reset_index() + tm.assert_frame_equal(result, multiindex_df) + + # GH#16120: already existing column + msg = r"cannot insert \('A', ''\), already exists" + with pytest.raises(ValueError, match=msg): + multiindex_df.rename_axis("A").reset_index() + + # GH#16164: multiindex (tuple) full key + result = multiindex_df.set_index([("A", "")]).reset_index() + tm.assert_frame_equal(result, multiindex_df) + + # with additional (unnamed) index level + idx_col = DataFrame( + [[0], [1]], columns=MultiIndex.from_tuples([("level_0", "")]) + ) + expected = pd.concat([idx_col, multiindex_df[[("B", "b"), ("A", "")]]], axis=1) + result = multiindex_df.set_index([("B", "b")], append=True).reset_index() + tm.assert_frame_equal(result, expected) + + # with index name which is a too long tuple... + msg = "Item must have length equal to number of levels." + with pytest.raises(ValueError, match=msg): + multiindex_df.rename_axis([("C", "c", "i")]).reset_index() + + # or too short... + levels = [["A", "a", ""], ["B", "b", "i"]] + df2 = DataFrame([[0, 2], [1, 3]], columns=MultiIndex.from_tuples(levels)) + idx_col = DataFrame( + [[0], [1]], columns=MultiIndex.from_tuples([("C", "c", "ii")]) + ) + expected = pd.concat([idx_col, df2], axis=1) + result = df2.rename_axis([("C", "c")]).reset_index(col_fill="ii") + tm.assert_frame_equal(result, expected) + + # ... which is incompatible with col_fill=None + with pytest.raises( + ValueError, + match=( + "col_fill=None is incompatible with " + r"incomplete column name \('C', 'c'\)" + ), + ): + df2.rename_axis([("C", "c")]).reset_index(col_fill=None) + + # with col_level != 0 + result = df2.rename_axis([("c", "ii")]).reset_index(col_level=1, col_fill="C") + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize("flag", [False, True]) + @pytest.mark.parametrize("allow_duplicates", [False, True]) + def test_reset_index_duplicate_columns_allow( + self, multiindex_df, flag, allow_duplicates + ): + # GH#44755 reset_index with duplicate column labels + df = multiindex_df.rename_axis("A") + df = df.set_flags(allows_duplicate_labels=flag) + + if flag and allow_duplicates: + result = df.reset_index(allow_duplicates=allow_duplicates) + levels = [["A", ""], ["A", ""], ["B", "b"]] + expected = DataFrame( + [[0, 0, 2], [1, 1, 3]], columns=MultiIndex.from_tuples(levels) + ) + tm.assert_frame_equal(result, expected) + else: + if not flag and allow_duplicates: + msg = ( + "Cannot specify 'allow_duplicates=True' when " + "'self.flags.allows_duplicate_labels' is False" + ) + else: + msg = r"cannot insert \('A', ''\), already exists" + with pytest.raises(ValueError, match=msg): + df.reset_index(allow_duplicates=allow_duplicates) + + @pytest.mark.parametrize("flag", [False, True]) + def test_reset_index_duplicate_columns_default(self, multiindex_df, flag): + df = multiindex_df.rename_axis("A") + df = df.set_flags(allows_duplicate_labels=flag) + + msg = r"cannot insert \('A', ''\), already exists" + with pytest.raises(ValueError, match=msg): + df.reset_index() + + @pytest.mark.parametrize("allow_duplicates", ["bad value"]) + def test_reset_index_allow_duplicates_check(self, multiindex_df, allow_duplicates): + with pytest.raises(ValueError, match="expected type bool"): + multiindex_df.reset_index(allow_duplicates=allow_duplicates) + + def test_reset_index_datetime(self, tz_naive_fixture): + # GH#3950 + tz = tz_naive_fixture + idx1 = date_range("1/1/2011", periods=5, freq="D", tz=tz, name="idx1") + idx2 = Index(range(5), name="idx2", dtype="int64") + idx = MultiIndex.from_arrays([idx1, idx2]) + df = DataFrame( + {"a": np.arange(5, dtype="int64"), "b": ["A", "B", "C", "D", "E"]}, + index=idx, + ) + + expected = DataFrame( + { + "idx1": idx1, + "idx2": np.arange(5, dtype="int64"), + "a": np.arange(5, dtype="int64"), + "b": ["A", "B", "C", "D", "E"], + }, + columns=["idx1", "idx2", "a", "b"], + ) + + tm.assert_frame_equal(df.reset_index(), expected) + + def test_reset_index_datetime2(self, tz_naive_fixture): + tz = tz_naive_fixture + idx1 = date_range("1/1/2011", periods=5, freq="D", tz=tz, name="idx1") + idx2 = Index(range(5), name="idx2", dtype="int64") + idx3 = date_range( + "1/1/2012", periods=5, freq="MS", tz="Europe/Paris", name="idx3" + ) + idx = MultiIndex.from_arrays([idx1, idx2, idx3]) + df = DataFrame( + {"a": np.arange(5, dtype="int64"), "b": ["A", "B", "C", "D", "E"]}, + index=idx, + ) + + expected = DataFrame( + { + "idx1": idx1, + "idx2": np.arange(5, dtype="int64"), + "idx3": idx3, + "a": np.arange(5, dtype="int64"), + "b": ["A", "B", "C", "D", "E"], + }, + columns=["idx1", "idx2", "idx3", "a", "b"], + ) + result = df.reset_index() + tm.assert_frame_equal(result, expected) + + def test_reset_index_datetime3(self, tz_naive_fixture): + # GH#7793 + tz = tz_naive_fixture + dti = date_range("20130101", periods=3, tz=tz) + idx = MultiIndex.from_product([["a", "b"], dti]) + df = DataFrame( + np.arange(6, dtype="int64").reshape(6, 1), columns=["a"], index=idx + ) + + expected = DataFrame( + { + "level_0": "a a a b b b".split(), + "level_1": dti.append(dti), + "a": np.arange(6, dtype="int64"), + }, + columns=["level_0", "level_1", "a"], + ) + result = df.reset_index() + tm.assert_frame_equal(result, expected) + + def test_reset_index_period(self): + # GH#7746 + idx = MultiIndex.from_product( + [pd.period_range("20130101", periods=3, freq="M"), list("abc")], + names=["month", "feature"], + ) + + df = DataFrame( + np.arange(9, dtype="int64").reshape(-1, 1), index=idx, columns=["a"] + ) + expected = DataFrame( + { + "month": ( + [pd.Period("2013-01", freq="M")] * 3 + + [pd.Period("2013-02", freq="M")] * 3 + + [pd.Period("2013-03", freq="M")] * 3 + ), + "feature": ["a", "b", "c"] * 3, + "a": np.arange(9, dtype="int64"), + }, + columns=["month", "feature", "a"], + ) + result = df.reset_index() + tm.assert_frame_equal(result, expected) + + def test_reset_index_delevel_infer_dtype(self): + tuples = list(product(["foo", "bar"], [10, 20], [1.0, 1.1])) + index = MultiIndex.from_tuples(tuples, names=["prm0", "prm1", "prm2"]) + df = DataFrame( + np.random.default_rng(2).standard_normal((8, 3)), + columns=["A", "B", "C"], + index=index, + ) + deleveled = df.reset_index() + assert is_integer_dtype(deleveled["prm1"]) + assert is_float_dtype(deleveled["prm2"]) + + def test_reset_index_with_drop( + self, multiindex_year_month_day_dataframe_random_data + ): + ymd = multiindex_year_month_day_dataframe_random_data + + deleveled = ymd.reset_index(drop=True) + assert len(deleveled.columns) == len(ymd.columns) + assert deleveled.index.name == ymd.index.name + + @pytest.mark.parametrize( + "ix_data, exp_data", + [ + ( + [(pd.NaT, 1), (pd.NaT, 2)], + {"a": [pd.NaT, pd.NaT], "b": [1, 2], "x": [11, 12]}, + ), + ( + [(pd.NaT, 1), (Timestamp("2020-01-01"), 2)], + {"a": [pd.NaT, Timestamp("2020-01-01")], "b": [1, 2], "x": [11, 12]}, + ), + ( + [(pd.NaT, 1), (pd.Timedelta(123, "d"), 2)], + {"a": [pd.NaT, pd.Timedelta(123, "d")], "b": [1, 2], "x": [11, 12]}, + ), + ], + ) + def test_reset_index_nat_multiindex(self, ix_data, exp_data): + # GH#36541: that reset_index() does not raise ValueError + ix = MultiIndex.from_tuples(ix_data, names=["a", "b"]) + result = DataFrame({"x": [11, 12]}, index=ix) + result = result.reset_index() + + expected = DataFrame(exp_data) + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize( + "codes", ([[0, 0, 1, 1], [0, 1, 0, 1]], [[0, 0, -1, 1], [0, 1, 0, 1]]) + ) + def test_rest_index_multiindex_categorical_with_missing_values(self, codes): + # GH#24206 + + index = MultiIndex( + [CategoricalIndex(["A", "B"]), CategoricalIndex(["a", "b"])], codes + ) + data = {"col": range(len(index))} + df = DataFrame(data=data, index=index) + + expected = DataFrame( + { + "level_0": Categorical.from_codes(codes[0], categories=["A", "B"]), + "level_1": Categorical.from_codes(codes[1], categories=["a", "b"]), + "col": range(4), + } + ) + + res = df.reset_index() + tm.assert_frame_equal(res, expected) + + # roundtrip + res = expected.set_index(["level_0", "level_1"]).reset_index() + tm.assert_frame_equal(res, expected) + + +@pytest.mark.parametrize( + "array, dtype", + [ + (["a", "b"], object), + ( + pd.period_range("12-1-2000", periods=2, freq="Q-DEC"), + pd.PeriodDtype(freq="Q-DEC"), + ), + ], +) +def test_reset_index_dtypes_on_empty_frame_with_multiindex( + array, dtype, using_infer_string +): + # GH 19602 - Preserve dtype on empty DataFrame with MultiIndex + idx = MultiIndex.from_product([[0, 1], [0.5, 1.0], array]) + result = DataFrame(index=idx)[:0].reset_index().dtypes + if using_infer_string and dtype == object: + dtype = "string" + expected = Series({"level_0": np.int64, "level_1": np.float64, "level_2": dtype}) + tm.assert_series_equal(result, expected) + + +def test_reset_index_empty_frame_with_datetime64_multiindex(): + # https://github.com/pandas-dev/pandas/issues/35606 + dti = pd.DatetimeIndex(["2020-07-20 00:00:00"], dtype="M8[ns]") + idx = MultiIndex.from_product([dti, [3, 4]], names=["a", "b"])[:0] + df = DataFrame(index=idx, columns=["c", "d"]) + result = df.reset_index() + expected = DataFrame( + columns=list("abcd"), index=RangeIndex(start=0, stop=0, step=1) + ) + expected["a"] = expected["a"].astype("datetime64[ns]") + expected["b"] = expected["b"].astype("int64") + tm.assert_frame_equal(result, expected) + + +def test_reset_index_empty_frame_with_datetime64_multiindex_from_groupby( + using_infer_string, +): + # https://github.com/pandas-dev/pandas/issues/35657 + dti = pd.DatetimeIndex(["2020-01-01"], dtype="M8[ns]") + df = DataFrame({"c1": [10.0], "c2": ["a"], "c3": dti}) + df = df.head(0).groupby(["c2", "c3"])[["c1"]].sum() + result = df.reset_index() + expected = DataFrame( + columns=["c2", "c3", "c1"], index=RangeIndex(start=0, stop=0, step=1) + ) + expected["c3"] = expected["c3"].astype("datetime64[ns]") + expected["c1"] = expected["c1"].astype("float64") + if using_infer_string: + expected["c2"] = expected["c2"].astype("string[pyarrow_numpy]") + tm.assert_frame_equal(result, expected) + + +def test_reset_index_multiindex_nat(): + # GH 11479 + idx = range(3) + tstamp = date_range("2015-07-01", freq="D", periods=3) + df = DataFrame({"id": idx, "tstamp": tstamp, "a": list("abc")}) + df.loc[2, "tstamp"] = pd.NaT + result = df.set_index(["id", "tstamp"]).reset_index("id") + exp_dti = pd.DatetimeIndex( + ["2015-07-01", "2015-07-02", "NaT"], dtype="M8[ns]", name="tstamp" + ) + expected = DataFrame( + {"id": range(3), "a": list("abc")}, + index=exp_dti, + ) + tm.assert_frame_equal(result, expected) + + +def test_reset_index_interval_columns_object_cast(): + # GH 19136 + df = DataFrame( + np.eye(2), index=Index([1, 2], name="Year"), columns=cut([1, 2], [0, 1, 2]) + ) + result = df.reset_index() + expected = DataFrame( + [[1, 1.0, 0.0], [2, 0.0, 1.0]], + columns=Index(["Year", Interval(0, 1), Interval(1, 2)]), + ) + tm.assert_frame_equal(result, expected) + + +def test_reset_index_rename(float_frame): + # GH 6878 + result = float_frame.reset_index(names="new_name") + expected = Series(float_frame.index.values, name="new_name") + tm.assert_series_equal(result["new_name"], expected) + + result = float_frame.reset_index(names=123) + expected = Series(float_frame.index.values, name=123) + tm.assert_series_equal(result[123], expected) + + +def test_reset_index_rename_multiindex(float_frame): + # GH 6878 + stacked_df = float_frame.stack(future_stack=True)[::2] + stacked_df = DataFrame({"foo": stacked_df, "bar": stacked_df}) + + names = ["first", "second"] + stacked_df.index.names = names + + result = stacked_df.reset_index() + expected = stacked_df.reset_index(names=["new_first", "new_second"]) + tm.assert_series_equal(result["first"], expected["new_first"], check_names=False) + tm.assert_series_equal(result["second"], expected["new_second"], check_names=False) + + +def test_errorreset_index_rename(float_frame): + # GH 6878 + stacked_df = float_frame.stack(future_stack=True)[::2] + stacked_df = DataFrame({"first": stacked_df, "second": stacked_df}) + + with pytest.raises( + ValueError, match="Index names must be str or 1-dimensional list" + ): + stacked_df.reset_index(names={"first": "new_first", "second": "new_second"}) + + with pytest.raises(IndexError, match="list index out of range"): + stacked_df.reset_index(names=["new_first"]) + + +def test_reset_index_false_index_name(): + result_series = Series(data=range(5, 10), index=range(5)) + result_series.index.name = False + result_series.reset_index() + expected_series = Series(range(5, 10), RangeIndex(range(5), name=False)) + tm.assert_series_equal(result_series, expected_series) + + # GH 38147 + result_frame = DataFrame(data=range(5, 10), index=range(5)) + result_frame.index.name = False + result_frame.reset_index() + expected_frame = DataFrame(range(5, 10), RangeIndex(range(5), name=False)) + tm.assert_frame_equal(result_frame, expected_frame) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/test_round.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/test_round.py new file mode 100644 index 0000000000000000000000000000000000000000..a96df27b48d7d8dc0b2f26cc25f0dca16ea3b462 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/test_round.py @@ -0,0 +1,225 @@ +import numpy as np +import pytest + +import pandas as pd +from pandas import ( + DataFrame, + Series, + date_range, +) +import pandas._testing as tm + + +class TestDataFrameRound: + def test_round(self): + # GH#2665 + + # Test that rounding an empty DataFrame does nothing + df = DataFrame() + tm.assert_frame_equal(df, df.round()) + + # Here's the test frame we'll be working with + df = DataFrame({"col1": [1.123, 2.123, 3.123], "col2": [1.234, 2.234, 3.234]}) + + # Default round to integer (i.e. decimals=0) + expected_rounded = DataFrame({"col1": [1.0, 2.0, 3.0], "col2": [1.0, 2.0, 3.0]}) + tm.assert_frame_equal(df.round(), expected_rounded) + + # Round with an integer + decimals = 2 + expected_rounded = DataFrame( + {"col1": [1.12, 2.12, 3.12], "col2": [1.23, 2.23, 3.23]} + ) + tm.assert_frame_equal(df.round(decimals), expected_rounded) + + # This should also work with np.round (since np.round dispatches to + # df.round) + tm.assert_frame_equal(np.round(df, decimals), expected_rounded) + + # Round with a list + round_list = [1, 2] + msg = "decimals must be an integer, a dict-like or a Series" + with pytest.raises(TypeError, match=msg): + df.round(round_list) + + # Round with a dictionary + expected_rounded = DataFrame( + {"col1": [1.1, 2.1, 3.1], "col2": [1.23, 2.23, 3.23]} + ) + round_dict = {"col1": 1, "col2": 2} + tm.assert_frame_equal(df.round(round_dict), expected_rounded) + + # Incomplete dict + expected_partially_rounded = DataFrame( + {"col1": [1.123, 2.123, 3.123], "col2": [1.2, 2.2, 3.2]} + ) + partial_round_dict = {"col2": 1} + tm.assert_frame_equal(df.round(partial_round_dict), expected_partially_rounded) + + # Dict with unknown elements + wrong_round_dict = {"col3": 2, "col2": 1} + tm.assert_frame_equal(df.round(wrong_round_dict), expected_partially_rounded) + + # float input to `decimals` + non_int_round_dict = {"col1": 1, "col2": 0.5} + msg = "Values in decimals must be integers" + with pytest.raises(TypeError, match=msg): + df.round(non_int_round_dict) + + # String input + non_int_round_dict = {"col1": 1, "col2": "foo"} + with pytest.raises(TypeError, match=msg): + df.round(non_int_round_dict) + + non_int_round_Series = Series(non_int_round_dict) + with pytest.raises(TypeError, match=msg): + df.round(non_int_round_Series) + + # List input + non_int_round_dict = {"col1": 1, "col2": [1, 2]} + with pytest.raises(TypeError, match=msg): + df.round(non_int_round_dict) + + non_int_round_Series = Series(non_int_round_dict) + with pytest.raises(TypeError, match=msg): + df.round(non_int_round_Series) + + # Non integer Series inputs + non_int_round_Series = Series(non_int_round_dict) + with pytest.raises(TypeError, match=msg): + df.round(non_int_round_Series) + + non_int_round_Series = Series(non_int_round_dict) + with pytest.raises(TypeError, match=msg): + df.round(non_int_round_Series) + + # Negative numbers + negative_round_dict = {"col1": -1, "col2": -2} + big_df = df * 100 + expected_neg_rounded = DataFrame( + {"col1": [110.0, 210, 310], "col2": [100.0, 200, 300]} + ) + tm.assert_frame_equal(big_df.round(negative_round_dict), expected_neg_rounded) + + # nan in Series round + nan_round_Series = Series({"col1": np.nan, "col2": 1}) + + with pytest.raises(TypeError, match=msg): + df.round(nan_round_Series) + + # Make sure this doesn't break existing Series.round + tm.assert_series_equal(df["col1"].round(1), expected_rounded["col1"]) + + # named columns + # GH#11986 + decimals = 2 + expected_rounded = DataFrame( + {"col1": [1.12, 2.12, 3.12], "col2": [1.23, 2.23, 3.23]} + ) + df.columns.name = "cols" + expected_rounded.columns.name = "cols" + tm.assert_frame_equal(df.round(decimals), expected_rounded) + + # interaction of named columns & series + tm.assert_series_equal(df["col1"].round(decimals), expected_rounded["col1"]) + tm.assert_series_equal(df.round(decimals)["col1"], expected_rounded["col1"]) + + def test_round_numpy(self): + # GH#12600 + df = DataFrame([[1.53, 1.36], [0.06, 7.01]]) + out = np.round(df, decimals=0) + expected = DataFrame([[2.0, 1.0], [0.0, 7.0]]) + tm.assert_frame_equal(out, expected) + + msg = "the 'out' parameter is not supported" + with pytest.raises(ValueError, match=msg): + np.round(df, decimals=0, out=df) + + def test_round_numpy_with_nan(self): + # See GH#14197 + df = Series([1.53, np.nan, 0.06]).to_frame() + with tm.assert_produces_warning(None): + result = df.round() + expected = Series([2.0, np.nan, 0.0]).to_frame() + tm.assert_frame_equal(result, expected) + + def test_round_mixed_type(self): + # GH#11885 + df = DataFrame( + { + "col1": [1.1, 2.2, 3.3, 4.4], + "col2": ["1", "a", "c", "f"], + "col3": date_range("20111111", periods=4), + } + ) + round_0 = DataFrame( + { + "col1": [1.0, 2.0, 3.0, 4.0], + "col2": ["1", "a", "c", "f"], + "col3": date_range("20111111", periods=4), + } + ) + tm.assert_frame_equal(df.round(), round_0) + tm.assert_frame_equal(df.round(1), df) + tm.assert_frame_equal(df.round({"col1": 1}), df) + tm.assert_frame_equal(df.round({"col1": 0}), round_0) + tm.assert_frame_equal(df.round({"col1": 0, "col2": 1}), round_0) + tm.assert_frame_equal(df.round({"col3": 1}), df) + + def test_round_with_duplicate_columns(self): + # GH#11611 + + df = DataFrame( + np.random.default_rng(2).random([3, 3]), + columns=["A", "B", "C"], + index=["first", "second", "third"], + ) + + dfs = pd.concat((df, df), axis=1) + rounded = dfs.round() + tm.assert_index_equal(rounded.index, dfs.index) + + decimals = Series([1, 0, 2], index=["A", "B", "A"]) + msg = "Index of decimals must be unique" + with pytest.raises(ValueError, match=msg): + df.round(decimals) + + def test_round_builtin(self): + # GH#11763 + # Here's the test frame we'll be working with + df = DataFrame({"col1": [1.123, 2.123, 3.123], "col2": [1.234, 2.234, 3.234]}) + + # Default round to integer (i.e. decimals=0) + expected_rounded = DataFrame({"col1": [1.0, 2.0, 3.0], "col2": [1.0, 2.0, 3.0]}) + tm.assert_frame_equal(round(df), expected_rounded) + + def test_round_nonunique_categorical(self): + # See GH#21809 + idx = pd.CategoricalIndex(["low"] * 3 + ["hi"] * 3) + df = DataFrame(np.random.default_rng(2).random((6, 3)), columns=list("abc")) + + expected = df.round(3) + expected.index = idx + + df_categorical = df.copy().set_index(idx) + assert df_categorical.shape == (6, 3) + result = df_categorical.round(3) + assert result.shape == (6, 3) + + tm.assert_frame_equal(result, expected) + + def test_round_interval_category_columns(self): + # GH#30063 + columns = pd.CategoricalIndex(pd.interval_range(0, 2)) + df = DataFrame([[0.66, 1.1], [0.3, 0.25]], columns=columns) + + result = df.round() + expected = DataFrame([[1.0, 1.0], [0.0, 0.0]], columns=columns) + tm.assert_frame_equal(result, expected) + + def test_round_empty_not_input(self): + # GH#51032 + df = DataFrame() + result = df.round() + tm.assert_frame_equal(df, result) + assert df is not result diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/test_sample.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/test_sample.py new file mode 100644 index 0000000000000000000000000000000000000000..6b3459fbdc0359c4d234dd9d1f3c2bbfbcb5c260 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/test_sample.py @@ -0,0 +1,372 @@ +import numpy as np +import pytest + +from pandas import ( + DataFrame, + Index, + Series, +) +import pandas._testing as tm +import pandas.core.common as com + + +class TestSample: + @pytest.fixture + def obj(self, frame_or_series): + if frame_or_series is Series: + arr = np.random.default_rng(2).standard_normal(10) + else: + arr = np.random.default_rng(2).standard_normal((10, 10)) + return frame_or_series(arr, dtype=None) + + @pytest.mark.parametrize("test", list(range(10))) + def test_sample(self, test, obj): + # Fixes issue: 2419 + # Check behavior of random_state argument + # Check for stability when receives seed or random state -- run 10 + # times. + + seed = np.random.default_rng(2).integers(0, 100) + tm.assert_equal( + obj.sample(n=4, random_state=seed), obj.sample(n=4, random_state=seed) + ) + + tm.assert_equal( + obj.sample(frac=0.7, random_state=seed), + obj.sample(frac=0.7, random_state=seed), + ) + + tm.assert_equal( + obj.sample(n=4, random_state=np.random.default_rng(test)), + obj.sample(n=4, random_state=np.random.default_rng(test)), + ) + + tm.assert_equal( + obj.sample(frac=0.7, random_state=np.random.default_rng(test)), + obj.sample(frac=0.7, random_state=np.random.default_rng(test)), + ) + + tm.assert_equal( + obj.sample( + frac=2, + replace=True, + random_state=np.random.default_rng(test), + ), + obj.sample( + frac=2, + replace=True, + random_state=np.random.default_rng(test), + ), + ) + + os1, os2 = [], [] + for _ in range(2): + os1.append(obj.sample(n=4, random_state=test)) + os2.append(obj.sample(frac=0.7, random_state=test)) + tm.assert_equal(*os1) + tm.assert_equal(*os2) + + def test_sample_lengths(self, obj): + # Check lengths are right + assert len(obj.sample(n=4) == 4) + assert len(obj.sample(frac=0.34) == 3) + assert len(obj.sample(frac=0.36) == 4) + + def test_sample_invalid_random_state(self, obj): + # Check for error when random_state argument invalid. + msg = ( + "random_state must be an integer, array-like, a BitGenerator, Generator, " + "a numpy RandomState, or None" + ) + with pytest.raises(ValueError, match=msg): + obj.sample(random_state="a_string") + + def test_sample_wont_accept_n_and_frac(self, obj): + # Giving both frac and N throws error + msg = "Please enter a value for `frac` OR `n`, not both" + with pytest.raises(ValueError, match=msg): + obj.sample(n=3, frac=0.3) + + def test_sample_requires_positive_n_frac(self, obj): + with pytest.raises( + ValueError, + match="A negative number of rows requested. Please provide `n` >= 0", + ): + obj.sample(n=-3) + with pytest.raises( + ValueError, + match="A negative number of rows requested. Please provide `frac` >= 0", + ): + obj.sample(frac=-0.3) + + def test_sample_requires_integer_n(self, obj): + # Make sure float values of `n` give error + with pytest.raises(ValueError, match="Only integers accepted as `n` values"): + obj.sample(n=3.2) + + def test_sample_invalid_weight_lengths(self, obj): + # Weight length must be right + msg = "Weights and axis to be sampled must be of same length" + with pytest.raises(ValueError, match=msg): + obj.sample(n=3, weights=[0, 1]) + + with pytest.raises(ValueError, match=msg): + bad_weights = [0.5] * 11 + obj.sample(n=3, weights=bad_weights) + + with pytest.raises(ValueError, match="Fewer non-zero entries in p than size"): + bad_weight_series = Series([0, 0, 0.2]) + obj.sample(n=4, weights=bad_weight_series) + + def test_sample_negative_weights(self, obj): + # Check won't accept negative weights + bad_weights = [-0.1] * 10 + msg = "weight vector many not include negative values" + with pytest.raises(ValueError, match=msg): + obj.sample(n=3, weights=bad_weights) + + def test_sample_inf_weights(self, obj): + # Check inf and -inf throw errors: + + weights_with_inf = [0.1] * 10 + weights_with_inf[0] = np.inf + msg = "weight vector may not include `inf` values" + with pytest.raises(ValueError, match=msg): + obj.sample(n=3, weights=weights_with_inf) + + weights_with_ninf = [0.1] * 10 + weights_with_ninf[0] = -np.inf + with pytest.raises(ValueError, match=msg): + obj.sample(n=3, weights=weights_with_ninf) + + def test_sample_zero_weights(self, obj): + # All zeros raises errors + + zero_weights = [0] * 10 + with pytest.raises(ValueError, match="Invalid weights: weights sum to zero"): + obj.sample(n=3, weights=zero_weights) + + def test_sample_missing_weights(self, obj): + # All missing weights + + nan_weights = [np.nan] * 10 + with pytest.raises(ValueError, match="Invalid weights: weights sum to zero"): + obj.sample(n=3, weights=nan_weights) + + def test_sample_none_weights(self, obj): + # Check None are also replaced by zeros. + weights_with_None = [None] * 10 + weights_with_None[5] = 0.5 + tm.assert_equal( + obj.sample(n=1, axis=0, weights=weights_with_None), obj.iloc[5:6] + ) + + @pytest.mark.parametrize( + "func_str,arg", + [ + ("np.array", [2, 3, 1, 0]), + ("np.random.MT19937", 3), + ("np.random.PCG64", 11), + ], + ) + def test_sample_random_state(self, func_str, arg, frame_or_series): + # GH#32503 + obj = DataFrame({"col1": range(10, 20), "col2": range(20, 30)}) + obj = tm.get_obj(obj, frame_or_series) + result = obj.sample(n=3, random_state=eval(func_str)(arg)) + expected = obj.sample(n=3, random_state=com.random_state(eval(func_str)(arg))) + tm.assert_equal(result, expected) + + def test_sample_generator(self, frame_or_series): + # GH#38100 + obj = frame_or_series(np.arange(100)) + rng = np.random.default_rng(2) + + # Consecutive calls should advance the seed + result1 = obj.sample(n=50, random_state=rng) + result2 = obj.sample(n=50, random_state=rng) + assert not (result1.index.values == result2.index.values).all() + + # Matching generator initialization must give same result + # Consecutive calls should advance the seed + result1 = obj.sample(n=50, random_state=np.random.default_rng(11)) + result2 = obj.sample(n=50, random_state=np.random.default_rng(11)) + tm.assert_equal(result1, result2) + + def test_sample_upsampling_without_replacement(self, frame_or_series): + # GH#27451 + + obj = DataFrame({"A": list("abc")}) + obj = tm.get_obj(obj, frame_or_series) + + msg = ( + "Replace has to be set to `True` when " + "upsampling the population `frac` > 1." + ) + with pytest.raises(ValueError, match=msg): + obj.sample(frac=2, replace=False) + + +class TestSampleDataFrame: + # Tests which are relevant only for DataFrame, so these are + # as fully parametrized as they can get. + + def test_sample(self): + # GH#2419 + # additional specific object based tests + + # A few dataframe test with degenerate weights. + easy_weight_list = [0] * 10 + easy_weight_list[5] = 1 + + df = DataFrame( + { + "col1": range(10, 20), + "col2": range(20, 30), + "colString": ["a"] * 10, + "easyweights": easy_weight_list, + } + ) + sample1 = df.sample(n=1, weights="easyweights") + tm.assert_frame_equal(sample1, df.iloc[5:6]) + + # Ensure proper error if string given as weight for Series or + # DataFrame with axis = 1. + ser = Series(range(10)) + msg = "Strings cannot be passed as weights when sampling from a Series." + with pytest.raises(ValueError, match=msg): + ser.sample(n=3, weights="weight_column") + + msg = ( + "Strings can only be passed to weights when sampling from rows on a " + "DataFrame" + ) + with pytest.raises(ValueError, match=msg): + df.sample(n=1, weights="weight_column", axis=1) + + # Check weighting key error + with pytest.raises( + KeyError, match="'String passed to weights not a valid column'" + ): + df.sample(n=3, weights="not_a_real_column_name") + + # Check that re-normalizes weights that don't sum to one. + weights_less_than_1 = [0] * 10 + weights_less_than_1[0] = 0.5 + tm.assert_frame_equal(df.sample(n=1, weights=weights_less_than_1), df.iloc[:1]) + + ### + # Test axis argument + ### + + # Test axis argument + df = DataFrame({"col1": range(10), "col2": ["a"] * 10}) + second_column_weight = [0, 1] + tm.assert_frame_equal( + df.sample(n=1, axis=1, weights=second_column_weight), df[["col2"]] + ) + + # Different axis arg types + tm.assert_frame_equal( + df.sample(n=1, axis="columns", weights=second_column_weight), df[["col2"]] + ) + + weight = [0] * 10 + weight[5] = 0.5 + tm.assert_frame_equal(df.sample(n=1, axis="rows", weights=weight), df.iloc[5:6]) + tm.assert_frame_equal( + df.sample(n=1, axis="index", weights=weight), df.iloc[5:6] + ) + + # Check out of range axis values + msg = "No axis named 2 for object type DataFrame" + with pytest.raises(ValueError, match=msg): + df.sample(n=1, axis=2) + + msg = "No axis named not_a_name for object type DataFrame" + with pytest.raises(ValueError, match=msg): + df.sample(n=1, axis="not_a_name") + + ser = Series(range(10)) + with pytest.raises(ValueError, match="No axis named 1 for object type Series"): + ser.sample(n=1, axis=1) + + # Test weight length compared to correct axis + msg = "Weights and axis to be sampled must be of same length" + with pytest.raises(ValueError, match=msg): + df.sample(n=1, axis=1, weights=[0.5] * 10) + + def test_sample_axis1(self): + # Check weights with axis = 1 + easy_weight_list = [0] * 3 + easy_weight_list[2] = 1 + + df = DataFrame( + {"col1": range(10, 20), "col2": range(20, 30), "colString": ["a"] * 10} + ) + sample1 = df.sample(n=1, axis=1, weights=easy_weight_list) + tm.assert_frame_equal(sample1, df[["colString"]]) + + # Test default axes + tm.assert_frame_equal( + df.sample(n=3, random_state=42), df.sample(n=3, axis=0, random_state=42) + ) + + def test_sample_aligns_weights_with_frame(self): + # Test that function aligns weights with frame + df = DataFrame({"col1": [5, 6, 7], "col2": ["a", "b", "c"]}, index=[9, 5, 3]) + ser = Series([1, 0, 0], index=[3, 5, 9]) + tm.assert_frame_equal(df.loc[[3]], df.sample(1, weights=ser)) + + # Weights have index values to be dropped because not in + # sampled DataFrame + ser2 = Series([0.001, 0, 10000], index=[3, 5, 10]) + tm.assert_frame_equal(df.loc[[3]], df.sample(1, weights=ser2)) + + # Weights have empty values to be filed with zeros + ser3 = Series([0.01, 0], index=[3, 5]) + tm.assert_frame_equal(df.loc[[3]], df.sample(1, weights=ser3)) + + # No overlap in weight and sampled DataFrame indices + ser4 = Series([1, 0], index=[1, 2]) + + with pytest.raises(ValueError, match="Invalid weights: weights sum to zero"): + df.sample(1, weights=ser4) + + def test_sample_is_copy(self): + # GH#27357, GH#30784: ensure the result of sample is an actual copy and + # doesn't track the parent dataframe / doesn't give SettingWithCopy warnings + df = DataFrame( + np.random.default_rng(2).standard_normal((10, 3)), columns=["a", "b", "c"] + ) + df2 = df.sample(3) + + with tm.assert_produces_warning(None): + df2["d"] = 1 + + def test_sample_does_not_modify_weights(self): + # GH-42843 + result = np.array([np.nan, 1, np.nan]) + expected = result.copy() + ser = Series([1, 2, 3]) + + # Test numpy array weights won't be modified in place + ser.sample(weights=result) + tm.assert_numpy_array_equal(result, expected) + + # Test DataFrame column won't be modified in place + df = DataFrame({"values": [1, 1, 1], "weights": [1, np.nan, np.nan]}) + expected = df["weights"].copy() + + df.sample(frac=1.0, replace=True, weights="weights") + result = df["weights"] + tm.assert_series_equal(result, expected) + + def test_sample_ignore_index(self): + # GH 38581 + df = DataFrame( + {"col1": range(10, 20), "col2": range(20, 30), "colString": ["a"] * 10} + ) + result = df.sample(3, ignore_index=True) + expected_index = Index(range(3)) + tm.assert_index_equal(result.index, expected_index, exact=True) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/test_select_dtypes.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/test_select_dtypes.py new file mode 100644 index 0000000000000000000000000000000000000000..d1bee6a3de613918e79e0ad23b2b4345532630e1 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/test_select_dtypes.py @@ -0,0 +1,469 @@ +import numpy as np +import pytest + +from pandas.core.dtypes.dtypes import ExtensionDtype + +import pandas as pd +from pandas import ( + DataFrame, + Timestamp, +) +import pandas._testing as tm +from pandas.core.arrays import ExtensionArray + + +class DummyDtype(ExtensionDtype): + type = int + + def __init__(self, numeric) -> None: + self._numeric = numeric + + @property + def name(self): + return "Dummy" + + @property + def _is_numeric(self): + return self._numeric + + +class DummyArray(ExtensionArray): + def __init__(self, data, dtype) -> None: + self.data = data + self._dtype = dtype + + def __array__(self, dtype=None, copy=None): + return self.data + + @property + def dtype(self): + return self._dtype + + def __len__(self) -> int: + return len(self.data) + + def __getitem__(self, item): + pass + + def copy(self): + return self + + +class TestSelectDtypes: + def test_select_dtypes_include_using_list_like(self): + df = DataFrame( + { + "a": list("abc"), + "b": list(range(1, 4)), + "c": np.arange(3, 6).astype("u1"), + "d": np.arange(4.0, 7.0, dtype="float64"), + "e": [True, False, True], + "f": pd.Categorical(list("abc")), + "g": pd.date_range("20130101", periods=3), + "h": pd.date_range("20130101", periods=3, tz="US/Eastern"), + "i": pd.date_range("20130101", periods=3, tz="CET"), + "j": pd.period_range("2013-01", periods=3, freq="M"), + "k": pd.timedelta_range("1 day", periods=3), + } + ) + + ri = df.select_dtypes(include=[np.number]) + ei = df[["b", "c", "d", "k"]] + tm.assert_frame_equal(ri, ei) + + ri = df.select_dtypes(include=[np.number], exclude=["timedelta"]) + ei = df[["b", "c", "d"]] + tm.assert_frame_equal(ri, ei) + + ri = df.select_dtypes(include=[np.number, "category"], exclude=["timedelta"]) + ei = df[["b", "c", "d", "f"]] + tm.assert_frame_equal(ri, ei) + + ri = df.select_dtypes(include=["datetime"]) + ei = df[["g"]] + tm.assert_frame_equal(ri, ei) + + ri = df.select_dtypes(include=["datetime64"]) + ei = df[["g"]] + tm.assert_frame_equal(ri, ei) + + ri = df.select_dtypes(include=["datetimetz"]) + ei = df[["h", "i"]] + tm.assert_frame_equal(ri, ei) + + with pytest.raises(NotImplementedError, match=r"^$"): + df.select_dtypes(include=["period"]) + + def test_select_dtypes_exclude_using_list_like(self): + df = DataFrame( + { + "a": list("abc"), + "b": list(range(1, 4)), + "c": np.arange(3, 6).astype("u1"), + "d": np.arange(4.0, 7.0, dtype="float64"), + "e": [True, False, True], + } + ) + re = df.select_dtypes(exclude=[np.number]) + ee = df[["a", "e"]] + tm.assert_frame_equal(re, ee) + + def test_select_dtypes_exclude_include_using_list_like(self): + df = DataFrame( + { + "a": list("abc"), + "b": list(range(1, 4)), + "c": np.arange(3, 6, dtype="u1"), + "d": np.arange(4.0, 7.0, dtype="float64"), + "e": [True, False, True], + "f": pd.date_range("now", periods=3).values, + } + ) + exclude = (np.datetime64,) + include = np.bool_, "integer" + r = df.select_dtypes(include=include, exclude=exclude) + e = df[["b", "c", "e"]] + tm.assert_frame_equal(r, e) + + exclude = ("datetime",) + include = "bool", "int64", "int32" + r = df.select_dtypes(include=include, exclude=exclude) + e = df[["b", "e"]] + tm.assert_frame_equal(r, e) + + @pytest.mark.parametrize( + "include", [(np.bool_, "int"), (np.bool_, "integer"), ("bool", int)] + ) + def test_select_dtypes_exclude_include_int(self, include): + # Fix select_dtypes(include='int') for Windows, FYI #36596 + df = DataFrame( + { + "a": list("abc"), + "b": list(range(1, 4)), + "c": np.arange(3, 6, dtype="int32"), + "d": np.arange(4.0, 7.0, dtype="float64"), + "e": [True, False, True], + "f": pd.date_range("now", periods=3).values, + } + ) + exclude = (np.datetime64,) + result = df.select_dtypes(include=include, exclude=exclude) + expected = df[["b", "c", "e"]] + tm.assert_frame_equal(result, expected) + + def test_select_dtypes_include_using_scalars(self): + df = DataFrame( + { + "a": list("abc"), + "b": list(range(1, 4)), + "c": np.arange(3, 6).astype("u1"), + "d": np.arange(4.0, 7.0, dtype="float64"), + "e": [True, False, True], + "f": pd.Categorical(list("abc")), + "g": pd.date_range("20130101", periods=3), + "h": pd.date_range("20130101", periods=3, tz="US/Eastern"), + "i": pd.date_range("20130101", periods=3, tz="CET"), + "j": pd.period_range("2013-01", periods=3, freq="M"), + "k": pd.timedelta_range("1 day", periods=3), + } + ) + + ri = df.select_dtypes(include=np.number) + ei = df[["b", "c", "d", "k"]] + tm.assert_frame_equal(ri, ei) + + ri = df.select_dtypes(include="datetime") + ei = df[["g"]] + tm.assert_frame_equal(ri, ei) + + ri = df.select_dtypes(include="datetime64") + ei = df[["g"]] + tm.assert_frame_equal(ri, ei) + + ri = df.select_dtypes(include="category") + ei = df[["f"]] + tm.assert_frame_equal(ri, ei) + + with pytest.raises(NotImplementedError, match=r"^$"): + df.select_dtypes(include="period") + + def test_select_dtypes_exclude_using_scalars(self): + df = DataFrame( + { + "a": list("abc"), + "b": list(range(1, 4)), + "c": np.arange(3, 6).astype("u1"), + "d": np.arange(4.0, 7.0, dtype="float64"), + "e": [True, False, True], + "f": pd.Categorical(list("abc")), + "g": pd.date_range("20130101", periods=3), + "h": pd.date_range("20130101", periods=3, tz="US/Eastern"), + "i": pd.date_range("20130101", periods=3, tz="CET"), + "j": pd.period_range("2013-01", periods=3, freq="M"), + "k": pd.timedelta_range("1 day", periods=3), + } + ) + + ri = df.select_dtypes(exclude=np.number) + ei = df[["a", "e", "f", "g", "h", "i", "j"]] + tm.assert_frame_equal(ri, ei) + + ri = df.select_dtypes(exclude="category") + ei = df[["a", "b", "c", "d", "e", "g", "h", "i", "j", "k"]] + tm.assert_frame_equal(ri, ei) + + with pytest.raises(NotImplementedError, match=r"^$"): + df.select_dtypes(exclude="period") + + def test_select_dtypes_include_exclude_using_scalars(self): + df = DataFrame( + { + "a": list("abc"), + "b": list(range(1, 4)), + "c": np.arange(3, 6).astype("u1"), + "d": np.arange(4.0, 7.0, dtype="float64"), + "e": [True, False, True], + "f": pd.Categorical(list("abc")), + "g": pd.date_range("20130101", periods=3), + "h": pd.date_range("20130101", periods=3, tz="US/Eastern"), + "i": pd.date_range("20130101", periods=3, tz="CET"), + "j": pd.period_range("2013-01", periods=3, freq="M"), + "k": pd.timedelta_range("1 day", periods=3), + } + ) + + ri = df.select_dtypes(include=np.number, exclude="floating") + ei = df[["b", "c", "k"]] + tm.assert_frame_equal(ri, ei) + + def test_select_dtypes_include_exclude_mixed_scalars_lists(self): + df = DataFrame( + { + "a": list("abc"), + "b": list(range(1, 4)), + "c": np.arange(3, 6).astype("u1"), + "d": np.arange(4.0, 7.0, dtype="float64"), + "e": [True, False, True], + "f": pd.Categorical(list("abc")), + "g": pd.date_range("20130101", periods=3), + "h": pd.date_range("20130101", periods=3, tz="US/Eastern"), + "i": pd.date_range("20130101", periods=3, tz="CET"), + "j": pd.period_range("2013-01", periods=3, freq="M"), + "k": pd.timedelta_range("1 day", periods=3), + } + ) + + ri = df.select_dtypes(include=np.number, exclude=["floating", "timedelta"]) + ei = df[["b", "c"]] + tm.assert_frame_equal(ri, ei) + + ri = df.select_dtypes(include=[np.number, "category"], exclude="floating") + ei = df[["b", "c", "f", "k"]] + tm.assert_frame_equal(ri, ei) + + def test_select_dtypes_duplicate_columns(self): + # GH20839 + df = DataFrame( + { + "a": ["a", "b", "c"], + "b": [1, 2, 3], + "c": np.arange(3, 6).astype("u1"), + "d": np.arange(4.0, 7.0, dtype="float64"), + "e": [True, False, True], + "f": pd.date_range("now", periods=3).values, + } + ) + df.columns = ["a", "a", "b", "b", "b", "c"] + + expected = DataFrame( + {"a": list(range(1, 4)), "b": np.arange(3, 6).astype("u1")} + ) + + result = df.select_dtypes(include=[np.number], exclude=["floating"]) + tm.assert_frame_equal(result, expected) + + def test_select_dtypes_not_an_attr_but_still_valid_dtype(self, using_infer_string): + df = DataFrame( + { + "a": list("abc"), + "b": list(range(1, 4)), + "c": np.arange(3, 6).astype("u1"), + "d": np.arange(4.0, 7.0, dtype="float64"), + "e": [True, False, True], + "f": pd.date_range("now", periods=3).values, + } + ) + df["g"] = df.f.diff() + assert not hasattr(np, "u8") + r = df.select_dtypes(include=["i8", "O"], exclude=["timedelta"]) + if using_infer_string: + e = df[["b"]] + else: + e = df[["a", "b"]] + tm.assert_frame_equal(r, e) + + r = df.select_dtypes(include=["i8", "O", "timedelta64[ns]"]) + if using_infer_string: + e = df[["b", "g"]] + else: + e = df[["a", "b", "g"]] + tm.assert_frame_equal(r, e) + + def test_select_dtypes_empty(self): + df = DataFrame({"a": list("abc"), "b": list(range(1, 4))}) + msg = "at least one of include or exclude must be nonempty" + with pytest.raises(ValueError, match=msg): + df.select_dtypes() + + def test_select_dtypes_bad_datetime64(self): + df = DataFrame( + { + "a": list("abc"), + "b": list(range(1, 4)), + "c": np.arange(3, 6).astype("u1"), + "d": np.arange(4.0, 7.0, dtype="float64"), + "e": [True, False, True], + "f": pd.date_range("now", periods=3).values, + } + ) + with pytest.raises(ValueError, match=".+ is too specific"): + df.select_dtypes(include=["datetime64[D]"]) + + with pytest.raises(ValueError, match=".+ is too specific"): + df.select_dtypes(exclude=["datetime64[as]"]) + + def test_select_dtypes_datetime_with_tz(self): + df2 = DataFrame( + { + "A": Timestamp("20130102", tz="US/Eastern"), + "B": Timestamp("20130603", tz="CET"), + }, + index=range(5), + ) + df3 = pd.concat([df2.A.to_frame(), df2.B.to_frame()], axis=1) + result = df3.select_dtypes(include=["datetime64[ns]"]) + expected = df3.reindex(columns=[]) + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize("dtype", [str, "str", np.bytes_, "S1", np.str_, "U1"]) + @pytest.mark.parametrize("arg", ["include", "exclude"]) + def test_select_dtypes_str_raises(self, dtype, arg): + df = DataFrame( + { + "a": list("abc"), + "g": list("abc"), + "b": list(range(1, 4)), + "c": np.arange(3, 6).astype("u1"), + "d": np.arange(4.0, 7.0, dtype="float64"), + "e": [True, False, True], + "f": pd.date_range("now", periods=3).values, + } + ) + msg = "string dtypes are not allowed" + kwargs = {arg: [dtype]} + + with pytest.raises(TypeError, match=msg): + df.select_dtypes(**kwargs) + + def test_select_dtypes_bad_arg_raises(self): + df = DataFrame( + { + "a": list("abc"), + "g": list("abc"), + "b": list(range(1, 4)), + "c": np.arange(3, 6).astype("u1"), + "d": np.arange(4.0, 7.0, dtype="float64"), + "e": [True, False, True], + "f": pd.date_range("now", periods=3).values, + } + ) + + msg = "data type.*not understood" + with pytest.raises(TypeError, match=msg): + df.select_dtypes(["blargy, blarg, blarg"]) + + def test_select_dtypes_typecodes(self): + # GH 11990 + df = DataFrame(np.random.default_rng(2).random((5, 3))) + FLOAT_TYPES = list(np.typecodes["AllFloat"]) + tm.assert_frame_equal(df.select_dtypes(FLOAT_TYPES), df) + + @pytest.mark.parametrize( + "arr,expected", + ( + (np.array([1, 2], dtype=np.int32), True), + (pd.array([1, 2], dtype="Int32"), True), + (DummyArray([1, 2], dtype=DummyDtype(numeric=True)), True), + (DummyArray([1, 2], dtype=DummyDtype(numeric=False)), False), + ), + ) + def test_select_dtypes_numeric(self, arr, expected): + # GH 35340 + + df = DataFrame(arr) + is_selected = df.select_dtypes(np.number).shape == df.shape + assert is_selected == expected + + def test_select_dtypes_numeric_nullable_string(self, nullable_string_dtype): + arr = pd.array(["a", "b"], dtype=nullable_string_dtype) + df = DataFrame(arr) + is_selected = df.select_dtypes(np.number).shape == df.shape + assert not is_selected + + @pytest.mark.parametrize( + "expected, float_dtypes", + [ + [ + DataFrame( + {"A": range(3), "B": range(5, 8), "C": range(10, 7, -1)} + ).astype(dtype={"A": float, "B": np.float64, "C": np.float32}), + float, + ], + [ + DataFrame( + {"A": range(3), "B": range(5, 8), "C": range(10, 7, -1)} + ).astype(dtype={"A": float, "B": np.float64, "C": np.float32}), + "float", + ], + [DataFrame({"C": range(10, 7, -1)}, dtype=np.float32), np.float32], + [ + DataFrame({"A": range(3), "B": range(5, 8)}).astype( + dtype={"A": float, "B": np.float64} + ), + np.float64, + ], + ], + ) + def test_select_dtypes_float_dtype(self, expected, float_dtypes): + # GH#42452 + dtype_dict = {"A": float, "B": np.float64, "C": np.float32} + df = DataFrame( + {"A": range(3), "B": range(5, 8), "C": range(10, 7, -1)}, + ) + df = df.astype(dtype_dict) + result = df.select_dtypes(include=float_dtypes) + tm.assert_frame_equal(result, expected) + + def test_np_bool_ea_boolean_include_number(self): + # GH 46870 + df = DataFrame( + { + "a": [1, 2, 3], + "b": pd.Series([True, False, True], dtype="boolean"), + "c": np.array([True, False, True]), + "d": pd.Categorical([True, False, True]), + "e": pd.arrays.SparseArray([True, False, True]), + } + ) + result = df.select_dtypes(include="number") + expected = DataFrame({"a": [1, 2, 3]}) + tm.assert_frame_equal(result, expected) + + def test_select_dtypes_no_view(self): + # https://github.com/pandas-dev/pandas/issues/48090 + # result of this method is not a view on the original dataframe + df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) + df_orig = df.copy() + result = df.select_dtypes(include=["number"]) + result.iloc[0, 0] = 0 + tm.assert_frame_equal(df, df_orig) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/test_set_axis.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/test_set_axis.py new file mode 100644 index 0000000000000000000000000000000000000000..8d249bc7b7fa471db401ed44e50fdb514cf85a51 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/test_set_axis.py @@ -0,0 +1,143 @@ +import numpy as np +import pytest + +from pandas import ( + DataFrame, + Series, +) +import pandas._testing as tm + + +class SharedSetAxisTests: + @pytest.fixture + def obj(self): + raise NotImplementedError("Implemented by subclasses") + + def test_set_axis(self, obj): + # GH14636; this tests setting index for both Series and DataFrame + new_index = list("abcd")[: len(obj)] + expected = obj.copy() + expected.index = new_index + result = obj.set_axis(new_index, axis=0) + tm.assert_equal(expected, result) + + def test_set_axis_copy(self, obj, using_copy_on_write): + # Test copy keyword GH#47932 + new_index = list("abcd")[: len(obj)] + + orig = obj.iloc[:] + expected = obj.copy() + expected.index = new_index + + result = obj.set_axis(new_index, axis=0, copy=True) + tm.assert_equal(expected, result) + assert result is not obj + # check we DID make a copy + if not using_copy_on_write: + if obj.ndim == 1: + assert not tm.shares_memory(result, obj) + else: + assert not any( + tm.shares_memory(result.iloc[:, i], obj.iloc[:, i]) + for i in range(obj.shape[1]) + ) + + result = obj.set_axis(new_index, axis=0, copy=False) + tm.assert_equal(expected, result) + assert result is not obj + # check we did NOT make a copy + if obj.ndim == 1: + assert tm.shares_memory(result, obj) + else: + assert all( + tm.shares_memory(result.iloc[:, i], obj.iloc[:, i]) + for i in range(obj.shape[1]) + ) + + # copy defaults to True + result = obj.set_axis(new_index, axis=0) + tm.assert_equal(expected, result) + assert result is not obj + if using_copy_on_write: + # check we DID NOT make a copy + if obj.ndim == 1: + assert tm.shares_memory(result, obj) + else: + assert any( + tm.shares_memory(result.iloc[:, i], obj.iloc[:, i]) + for i in range(obj.shape[1]) + ) + # check we DID make a copy + elif obj.ndim == 1: + assert not tm.shares_memory(result, obj) + else: + assert not any( + tm.shares_memory(result.iloc[:, i], obj.iloc[:, i]) + for i in range(obj.shape[1]) + ) + + res = obj.set_axis(new_index, copy=False) + tm.assert_equal(expected, res) + # check we did NOT make a copy + if res.ndim == 1: + assert tm.shares_memory(res, orig) + else: + assert all( + tm.shares_memory(res.iloc[:, i], orig.iloc[:, i]) + for i in range(res.shape[1]) + ) + + def test_set_axis_unnamed_kwarg_warns(self, obj): + # omitting the "axis" parameter + new_index = list("abcd")[: len(obj)] + + expected = obj.copy() + expected.index = new_index + + result = obj.set_axis(new_index) + tm.assert_equal(result, expected) + + @pytest.mark.parametrize("axis", [3, "foo"]) + def test_set_axis_invalid_axis_name(self, axis, obj): + # wrong values for the "axis" parameter + with pytest.raises(ValueError, match="No axis named"): + obj.set_axis(list("abc"), axis=axis) + + def test_set_axis_setattr_index_not_collection(self, obj): + # wrong type + msg = ( + r"Index\(\.\.\.\) must be called with a collection of some " + r"kind, None was passed" + ) + with pytest.raises(TypeError, match=msg): + obj.index = None + + def test_set_axis_setattr_index_wrong_length(self, obj): + # wrong length + msg = ( + f"Length mismatch: Expected axis has {len(obj)} elements, " + f"new values have {len(obj)-1} elements" + ) + with pytest.raises(ValueError, match=msg): + obj.index = np.arange(len(obj) - 1) + + if obj.ndim == 2: + with pytest.raises(ValueError, match="Length mismatch"): + obj.columns = obj.columns[::2] + + +class TestDataFrameSetAxis(SharedSetAxisTests): + @pytest.fixture + def obj(self): + df = DataFrame( + {"A": [1.1, 2.2, 3.3], "B": [5.0, 6.1, 7.2], "C": [4.4, 5.5, 6.6]}, + index=[2010, 2011, 2012], + ) + return df + + +class TestSeriesSetAxis(SharedSetAxisTests): + @pytest.fixture + def obj(self): + ser = Series(np.arange(4), index=[1, 3, 5, 7], dtype="int64") + return ser diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/test_set_index.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/test_set_index.py new file mode 100644 index 0000000000000000000000000000000000000000..5724f79b82578c9842f510781081151d5ec20940 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/test_set_index.py @@ -0,0 +1,734 @@ +""" +See also: test_reindex.py:TestReindexSetIndex +""" + +from datetime import ( + datetime, + timedelta, +) + +import numpy as np +import pytest + +from pandas import ( + Categorical, + CategoricalIndex, + DataFrame, + DatetimeIndex, + Index, + MultiIndex, + Series, + date_range, + period_range, + to_datetime, +) +import pandas._testing as tm + + +@pytest.fixture +def frame_of_index_cols(): + """ + Fixture for DataFrame of columns that can be used for indexing + + Columns are ['A', 'B', 'C', 'D', 'E', ('tuple', 'as', 'label')]; + 'A' & 'B' contain duplicates (but are jointly unique), the rest are unique. + + A B C D E (tuple, as, label) + 0 foo one a 0.608477 -0.012500 -1.664297 + 1 foo two b -0.633460 0.249614 -0.364411 + 2 foo three c 0.615256 2.154968 -0.834666 + 3 bar one d 0.234246 1.085675 0.718445 + 4 bar two e 0.533841 -0.005702 -3.533912 + """ + df = DataFrame( + { + "A": ["foo", "foo", "foo", "bar", "bar"], + "B": ["one", "two", "three", "one", "two"], + "C": ["a", "b", "c", "d", "e"], + "D": np.random.default_rng(2).standard_normal(5), + "E": np.random.default_rng(2).standard_normal(5), + ("tuple", "as", "label"): np.random.default_rng(2).standard_normal(5), + } + ) + return df + + +class TestSetIndex: + def test_set_index_multiindex(self): + # segfault in GH#3308 + d = {"t1": [2, 2.5, 3], "t2": [4, 5, 6]} + df = DataFrame(d) + tuples = [(0, 1), (0, 2), (1, 2)] + df["tuples"] = tuples + + index = MultiIndex.from_tuples(df["tuples"]) + # it works! + df.set_index(index) + + def test_set_index_empty_column(self): + # GH#1971 + df = DataFrame( + [ + {"a": 1, "p": 0}, + {"a": 2, "m": 10}, + {"a": 3, "m": 11, "p": 20}, + {"a": 4, "m": 12, "p": 21}, + ], + columns=["a", "m", "p", "x"], + ) + + result = df.set_index(["a", "x"]) + + expected = df[["m", "p"]] + expected.index = MultiIndex.from_arrays([df["a"], df["x"]], names=["a", "x"]) + tm.assert_frame_equal(result, expected) + + def test_set_index_empty_dataframe(self): + # GH#38419 + df1 = DataFrame( + {"a": Series(dtype="datetime64[ns]"), "b": Series(dtype="int64"), "c": []} + ) + + df2 = df1.set_index(["a", "b"]) + result = df2.index.to_frame().dtypes + expected = df1[["a", "b"]].dtypes + tm.assert_series_equal(result, expected) + + def test_set_index_multiindexcolumns(self): + columns = MultiIndex.from_tuples([("foo", 1), ("foo", 2), ("bar", 1)]) + df = DataFrame( + np.random.default_rng(2).standard_normal((3, 3)), columns=columns + ) + + result = df.set_index(df.columns[0]) + + expected = df.iloc[:, 1:] + expected.index = df.iloc[:, 0].values + expected.index.names = [df.columns[0]] + tm.assert_frame_equal(result, expected) + + def test_set_index_timezone(self): + # GH#12358 + # tz-aware Series should retain the tz + idx = DatetimeIndex(["2014-01-01 10:10:10"], tz="UTC").tz_convert("Europe/Rome") + df = DataFrame({"A": idx}) + assert df.set_index(idx).index[0].hour == 11 + assert DatetimeIndex(Series(df.A))[0].hour == 11 + assert df.set_index(df.A).index[0].hour == 11 + + def test_set_index_cast_datetimeindex(self): + df = DataFrame( + { + "A": [datetime(2000, 1, 1) + timedelta(i) for i in range(1000)], + "B": np.random.default_rng(2).standard_normal(1000), + } + ) + + idf = df.set_index("A") + assert isinstance(idf.index, DatetimeIndex) + + def test_set_index_dst(self): + di = date_range("2006-10-29 00:00:00", periods=3, freq="h", tz="US/Pacific") + + df = DataFrame(data={"a": [0, 1, 2], "b": [3, 4, 5]}, index=di).reset_index() + # single level + res = df.set_index("index") + exp = DataFrame( + data={"a": [0, 1, 2], "b": [3, 4, 5]}, + index=Index(di, name="index"), + ) + exp.index = exp.index._with_freq(None) + tm.assert_frame_equal(res, exp) + + # GH#12920 + res = df.set_index(["index", "a"]) + exp_index = MultiIndex.from_arrays([di, [0, 1, 2]], names=["index", "a"]) + exp = DataFrame({"b": [3, 4, 5]}, index=exp_index) + tm.assert_frame_equal(res, exp) + + def test_set_index(self, float_string_frame): + df = float_string_frame + idx = Index(np.arange(len(df))[::-1]) + + df = df.set_index(idx) + tm.assert_index_equal(df.index, idx) + with pytest.raises(ValueError, match="Length mismatch"): + df.set_index(idx[::2]) + + def test_set_index_names(self): + df = DataFrame( + np.ones((10, 4)), + columns=Index(list("ABCD"), dtype=object), + index=Index([f"i-{i}" for i in range(10)], dtype=object), + ) + df.index.name = "name" + + assert df.set_index(df.index).index.names == ["name"] + + mi = MultiIndex.from_arrays(df[["A", "B"]].T.values, names=["A", "B"]) + mi2 = MultiIndex.from_arrays( + df[["A", "B", "A", "B"]].T.values, names=["A", "B", "C", "D"] + ) + + df = df.set_index(["A", "B"]) + + assert df.set_index(df.index).index.names == ["A", "B"] + + # Check that set_index isn't converting a MultiIndex into an Index + assert isinstance(df.set_index(df.index).index, MultiIndex) + + # Check actual equality + tm.assert_index_equal(df.set_index(df.index).index, mi) + + idx2 = df.index.rename(["C", "D"]) + + # Check that [MultiIndex, MultiIndex] yields a MultiIndex rather + # than a pair of tuples + assert isinstance(df.set_index([df.index, idx2]).index, MultiIndex) + + # Check equality + tm.assert_index_equal(df.set_index([df.index, idx2]).index, mi2) + + # A has duplicate values, C does not + @pytest.mark.parametrize("keys", ["A", "C", ["A", "B"], ("tuple", "as", "label")]) + @pytest.mark.parametrize("inplace", [True, False]) + @pytest.mark.parametrize("drop", [True, False]) + def test_set_index_drop_inplace(self, frame_of_index_cols, drop, inplace, keys): + df = frame_of_index_cols + + if isinstance(keys, list): + idx = MultiIndex.from_arrays([df[x] for x in keys], names=keys) + else: + idx = Index(df[keys], name=keys) + expected = df.drop(keys, axis=1) if drop else df + expected.index = idx + + if inplace: + result = df.copy() + return_value = result.set_index(keys, drop=drop, inplace=True) + assert return_value is None + else: + result = df.set_index(keys, drop=drop) + + tm.assert_frame_equal(result, expected) + + # A has duplicate values, C does not + @pytest.mark.parametrize("keys", ["A", "C", ["A", "B"], ("tuple", "as", "label")]) + @pytest.mark.parametrize("drop", [True, False]) + def test_set_index_append(self, frame_of_index_cols, drop, keys): + df = frame_of_index_cols + + keys = keys if isinstance(keys, list) else [keys] + idx = MultiIndex.from_arrays( + [df.index] + [df[x] for x in keys], names=[None] + keys + ) + expected = df.drop(keys, axis=1) if drop else df.copy() + expected.index = idx + + result = df.set_index(keys, drop=drop, append=True) + + tm.assert_frame_equal(result, expected) + + # A has duplicate values, C does not + @pytest.mark.parametrize("keys", ["A", "C", ["A", "B"], ("tuple", "as", "label")]) + @pytest.mark.parametrize("drop", [True, False]) + def test_set_index_append_to_multiindex(self, frame_of_index_cols, drop, keys): + # append to existing multiindex + df = frame_of_index_cols.set_index(["D"], drop=drop, append=True) + + keys = keys if isinstance(keys, list) else [keys] + expected = frame_of_index_cols.set_index(["D"] + keys, drop=drop, append=True) + + result = df.set_index(keys, drop=drop, append=True) + + tm.assert_frame_equal(result, expected) + + def test_set_index_after_mutation(self): + # GH#1590 + df = DataFrame({"val": [0, 1, 2], "key": ["a", "b", "c"]}) + expected = DataFrame({"val": [1, 2]}, Index(["b", "c"], name="key")) + + df2 = df.loc[df.index.map(lambda indx: indx >= 1)] + result = df2.set_index("key") + tm.assert_frame_equal(result, expected) + + # MultiIndex constructor does not work directly on Series -> lambda + # Add list-of-list constructor because list is ambiguous -> lambda + # also test index name if append=True (name is duplicate here for B) + @pytest.mark.parametrize( + "box", + [ + Series, + Index, + np.array, + list, + lambda x: [list(x)], + lambda x: MultiIndex.from_arrays([x]), + ], + ) + @pytest.mark.parametrize( + "append, index_name", [(True, None), (True, "B"), (True, "test"), (False, None)] + ) + @pytest.mark.parametrize("drop", [True, False]) + def test_set_index_pass_single_array( + self, frame_of_index_cols, drop, append, index_name, box + ): + df = frame_of_index_cols + df.index.name = index_name + + key = box(df["B"]) + if box == list: + # list of strings gets interpreted as list of keys + msg = "['one', 'two', 'three', 'one', 'two']" + with pytest.raises(KeyError, match=msg): + df.set_index(key, drop=drop, append=append) + else: + # np.array/list-of-list "forget" the name of B + name_mi = getattr(key, "names", None) + name = [getattr(key, "name", None)] if name_mi is None else name_mi + + result = df.set_index(key, drop=drop, append=append) + + # only valid column keys are dropped + # since B is always passed as array above, nothing is dropped + expected = df.set_index(["B"], drop=False, append=append) + expected.index.names = [index_name] + name if append else name + + tm.assert_frame_equal(result, expected) + + # MultiIndex constructor does not work directly on Series -> lambda + # also test index name if append=True (name is duplicate here for A & B) + @pytest.mark.parametrize( + "box", [Series, Index, np.array, list, lambda x: MultiIndex.from_arrays([x])] + ) + @pytest.mark.parametrize( + "append, index_name", + [(True, None), (True, "A"), (True, "B"), (True, "test"), (False, None)], + ) + @pytest.mark.parametrize("drop", [True, False]) + def test_set_index_pass_arrays( + self, frame_of_index_cols, drop, append, index_name, box + ): + df = frame_of_index_cols + df.index.name = index_name + + keys = ["A", box(df["B"])] + # np.array/list "forget" the name of B + names = ["A", None if box in [np.array, list, tuple, iter] else "B"] + + result = df.set_index(keys, drop=drop, append=append) + + # only valid column keys are dropped + # since B is always passed as array above, only A is dropped, if at all + expected = df.set_index(["A", "B"], drop=False, append=append) + expected = expected.drop("A", axis=1) if drop else expected + expected.index.names = [index_name] + names if append else names + + tm.assert_frame_equal(result, expected) + + # MultiIndex constructor does not work directly on Series -> lambda + # We also emulate a "constructor" for the label -> lambda + # also test index name if append=True (name is duplicate here for A) + @pytest.mark.parametrize( + "box2", + [ + Series, + Index, + np.array, + list, + iter, + lambda x: MultiIndex.from_arrays([x]), + lambda x: x.name, + ], + ) + @pytest.mark.parametrize( + "box1", + [ + Series, + Index, + np.array, + list, + iter, + lambda x: MultiIndex.from_arrays([x]), + lambda x: x.name, + ], + ) + @pytest.mark.parametrize( + "append, index_name", [(True, None), (True, "A"), (True, "test"), (False, None)] + ) + @pytest.mark.parametrize("drop", [True, False]) + def test_set_index_pass_arrays_duplicate( + self, frame_of_index_cols, drop, append, index_name, box1, box2 + ): + df = frame_of_index_cols + df.index.name = index_name + + keys = [box1(df["A"]), box2(df["A"])] + result = df.set_index(keys, drop=drop, append=append) + + # if either box is iter, it has been consumed; re-read + keys = [box1(df["A"]), box2(df["A"])] + + # need to adapt first drop for case that both keys are 'A' -- + # cannot drop the same column twice; + # plain == would give ambiguous Boolean error for containers + first_drop = ( + False + if ( + isinstance(keys[0], str) + and keys[0] == "A" + and isinstance(keys[1], str) + and keys[1] == "A" + ) + else drop + ) + # to test against already-tested behaviour, we add sequentially, + # hence second append always True; must wrap keys in list, otherwise + # box = list would be interpreted as keys + expected = df.set_index([keys[0]], drop=first_drop, append=append) + expected = expected.set_index([keys[1]], drop=drop, append=True) + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize("append", [True, False]) + @pytest.mark.parametrize("drop", [True, False]) + def test_set_index_pass_multiindex(self, frame_of_index_cols, drop, append): + df = frame_of_index_cols + keys = MultiIndex.from_arrays([df["A"], df["B"]], names=["A", "B"]) + + result = df.set_index(keys, drop=drop, append=append) + + # setting with a MultiIndex will never drop columns + expected = df.set_index(["A", "B"], drop=False, append=append) + + tm.assert_frame_equal(result, expected) + + def test_construction_with_categorical_index(self): + ci = CategoricalIndex(list("ab") * 5, name="B") + + # with Categorical + df = DataFrame( + {"A": np.random.default_rng(2).standard_normal(10), "B": ci.values} + ) + idf = df.set_index("B") + tm.assert_index_equal(idf.index, ci) + + # from a CategoricalIndex + df = DataFrame({"A": np.random.default_rng(2).standard_normal(10), "B": ci}) + idf = df.set_index("B") + tm.assert_index_equal(idf.index, ci) + + # round-trip + idf = idf.reset_index().set_index("B") + tm.assert_index_equal(idf.index, ci) + + def test_set_index_preserve_categorical_dtype(self): + # GH#13743, GH#13854 + df = DataFrame( + { + "A": [1, 2, 1, 1, 2], + "B": [10, 16, 22, 28, 34], + "C1": Categorical(list("abaab"), categories=list("bac"), ordered=False), + "C2": Categorical(list("abaab"), categories=list("bac"), ordered=True), + } + ) + for cols in ["C1", "C2", ["A", "C1"], ["A", "C2"], ["C1", "C2"]]: + result = df.set_index(cols).reset_index() + result = result.reindex(columns=df.columns) + tm.assert_frame_equal(result, df) + + def test_set_index_datetime(self): + # GH#3950 + df = DataFrame( + { + "label": ["a", "a", "a", "b", "b", "b"], + "datetime": [ + "2011-07-19 07:00:00", + "2011-07-19 08:00:00", + "2011-07-19 09:00:00", + "2011-07-19 07:00:00", + "2011-07-19 08:00:00", + "2011-07-19 09:00:00", + ], + "value": range(6), + } + ) + df.index = to_datetime(df.pop("datetime"), utc=True) + df.index = df.index.tz_convert("US/Pacific") + + expected = DatetimeIndex( + ["2011-07-19 07:00:00", "2011-07-19 08:00:00", "2011-07-19 09:00:00"], + name="datetime", + ) + expected = expected.tz_localize("UTC").tz_convert("US/Pacific") + + df = df.set_index("label", append=True) + tm.assert_index_equal(df.index.levels[0], expected) + tm.assert_index_equal(df.index.levels[1], Index(["a", "b"], name="label")) + assert df.index.names == ["datetime", "label"] + + df = df.swaplevel(0, 1) + tm.assert_index_equal(df.index.levels[0], Index(["a", "b"], name="label")) + tm.assert_index_equal(df.index.levels[1], expected) + assert df.index.names == ["label", "datetime"] + + df = DataFrame(np.random.default_rng(2).random(6)) + idx1 = DatetimeIndex( + [ + "2011-07-19 07:00:00", + "2011-07-19 08:00:00", + "2011-07-19 09:00:00", + "2011-07-19 07:00:00", + "2011-07-19 08:00:00", + "2011-07-19 09:00:00", + ], + tz="US/Eastern", + ) + idx2 = DatetimeIndex( + [ + "2012-04-01 09:00", + "2012-04-01 09:00", + "2012-04-01 09:00", + "2012-04-02 09:00", + "2012-04-02 09:00", + "2012-04-02 09:00", + ], + tz="US/Eastern", + ) + idx3 = date_range("2011-01-01 09:00", periods=6, tz="Asia/Tokyo") + idx3 = idx3._with_freq(None) + + df = df.set_index(idx1) + df = df.set_index(idx2, append=True) + df = df.set_index(idx3, append=True) + + expected1 = DatetimeIndex( + ["2011-07-19 07:00:00", "2011-07-19 08:00:00", "2011-07-19 09:00:00"], + tz="US/Eastern", + ) + expected2 = DatetimeIndex( + ["2012-04-01 09:00", "2012-04-02 09:00"], tz="US/Eastern" + ) + + tm.assert_index_equal(df.index.levels[0], expected1) + tm.assert_index_equal(df.index.levels[1], expected2) + tm.assert_index_equal(df.index.levels[2], idx3) + + # GH#7092 + tm.assert_index_equal(df.index.get_level_values(0), idx1) + tm.assert_index_equal(df.index.get_level_values(1), idx2) + tm.assert_index_equal(df.index.get_level_values(2), idx3) + + def test_set_index_period(self): + # GH#6631 + df = DataFrame(np.random.default_rng(2).random(6)) + idx1 = period_range("2011-01-01", periods=3, freq="M") + idx1 = idx1.append(idx1) + idx2 = period_range("2013-01-01 09:00", periods=2, freq="h") + idx2 = idx2.append(idx2).append(idx2) + idx3 = period_range("2005", periods=6, freq="Y") + + df = df.set_index(idx1) + df = df.set_index(idx2, append=True) + df = df.set_index(idx3, append=True) + + expected1 = period_range("2011-01-01", periods=3, freq="M") + expected2 = period_range("2013-01-01 09:00", periods=2, freq="h") + + tm.assert_index_equal(df.index.levels[0], expected1) + tm.assert_index_equal(df.index.levels[1], expected2) + tm.assert_index_equal(df.index.levels[2], idx3) + + tm.assert_index_equal(df.index.get_level_values(0), idx1) + tm.assert_index_equal(df.index.get_level_values(1), idx2) + tm.assert_index_equal(df.index.get_level_values(2), idx3) + + +class TestSetIndexInvalid: + def test_set_index_verify_integrity(self, frame_of_index_cols): + df = frame_of_index_cols + + with pytest.raises(ValueError, match="Index has duplicate keys"): + df.set_index("A", verify_integrity=True) + # with MultiIndex + with pytest.raises(ValueError, match="Index has duplicate keys"): + df.set_index([df["A"], df["A"]], verify_integrity=True) + + @pytest.mark.parametrize("append", [True, False]) + @pytest.mark.parametrize("drop", [True, False]) + def test_set_index_raise_keys(self, frame_of_index_cols, drop, append): + df = frame_of_index_cols + + with pytest.raises(KeyError, match="['foo', 'bar', 'baz']"): + # column names are A-E, as well as one tuple + df.set_index(["foo", "bar", "baz"], drop=drop, append=append) + + # non-existent key in list with arrays + with pytest.raises(KeyError, match="X"): + df.set_index([df["A"], df["B"], "X"], drop=drop, append=append) + + msg = "[('foo', 'foo', 'foo', 'bar', 'bar')]" + # tuples always raise KeyError + with pytest.raises(KeyError, match=msg): + df.set_index(tuple(df["A"]), drop=drop, append=append) + + # also within a list + with pytest.raises(KeyError, match=msg): + df.set_index(["A", df["A"], tuple(df["A"])], drop=drop, append=append) + + @pytest.mark.parametrize("append", [True, False]) + @pytest.mark.parametrize("drop", [True, False]) + @pytest.mark.parametrize("box", [set], ids=["set"]) + def test_set_index_raise_on_type(self, frame_of_index_cols, box, drop, append): + df = frame_of_index_cols + + msg = 'The parameter "keys" may be a column key, .*' + # forbidden type, e.g. set + with pytest.raises(TypeError, match=msg): + df.set_index(box(df["A"]), drop=drop, append=append) + + # forbidden type in list, e.g. set + with pytest.raises(TypeError, match=msg): + df.set_index(["A", df["A"], box(df["A"])], drop=drop, append=append) + + # MultiIndex constructor does not work directly on Series -> lambda + @pytest.mark.parametrize( + "box", + [Series, Index, np.array, iter, lambda x: MultiIndex.from_arrays([x])], + ids=["Series", "Index", "np.array", "iter", "MultiIndex"], + ) + @pytest.mark.parametrize("length", [4, 6], ids=["too_short", "too_long"]) + @pytest.mark.parametrize("append", [True, False]) + @pytest.mark.parametrize("drop", [True, False]) + def test_set_index_raise_on_len( + self, frame_of_index_cols, box, length, drop, append + ): + # GH 24984 + df = frame_of_index_cols # has length 5 + + values = np.random.default_rng(2).integers(0, 10, (length,)) + + msg = "Length mismatch: Expected 5 rows, received array of length.*" + + # wrong length directly + with pytest.raises(ValueError, match=msg): + df.set_index(box(values), drop=drop, append=append) + + # wrong length in list + with pytest.raises(ValueError, match=msg): + df.set_index(["A", df.A, box(values)], drop=drop, append=append) + + +class TestSetIndexCustomLabelType: + def test_set_index_custom_label_type(self): + # GH#24969 + + class Thing: + def __init__(self, name, color) -> None: + self.name = name + self.color = color + + def __str__(self) -> str: + return f"" + + # necessary for pretty KeyError + __repr__ = __str__ + + thing1 = Thing("One", "red") + thing2 = Thing("Two", "blue") + df = DataFrame({thing1: [0, 1], thing2: [2, 3]}) + expected = DataFrame({thing1: [0, 1]}, index=Index([2, 3], name=thing2)) + + # use custom label directly + result = df.set_index(thing2) + tm.assert_frame_equal(result, expected) + + # custom label wrapped in list + result = df.set_index([thing2]) + tm.assert_frame_equal(result, expected) + + # missing key + thing3 = Thing("Three", "pink") + msg = "" + with pytest.raises(KeyError, match=msg): + # missing label directly + df.set_index(thing3) + + with pytest.raises(KeyError, match=msg): + # missing label in list + df.set_index([thing3]) + + def test_set_index_custom_label_hashable_iterable(self): + # GH#24969 + + # actual example discussed in GH 24984 was e.g. for shapely.geometry + # objects (e.g. a collection of Points) that can be both hashable and + # iterable; using frozenset as a stand-in for testing here + + class Thing(frozenset): + # need to stabilize repr for KeyError (due to random order in sets) + def __repr__(self) -> str: + tmp = sorted(self) + joined_reprs = ", ".join(map(repr, tmp)) + # double curly brace prints one brace in format string + return f"frozenset({{{joined_reprs}}})" + + thing1 = Thing(["One", "red"]) + thing2 = Thing(["Two", "blue"]) + df = DataFrame({thing1: [0, 1], thing2: [2, 3]}) + expected = DataFrame({thing1: [0, 1]}, index=Index([2, 3], name=thing2)) + + # use custom label directly + result = df.set_index(thing2) + tm.assert_frame_equal(result, expected) + + # custom label wrapped in list + result = df.set_index([thing2]) + tm.assert_frame_equal(result, expected) + + # missing key + thing3 = Thing(["Three", "pink"]) + msg = r"frozenset\(\{'Three', 'pink'\}\)" + with pytest.raises(KeyError, match=msg): + # missing label directly + df.set_index(thing3) + + with pytest.raises(KeyError, match=msg): + # missing label in list + df.set_index([thing3]) + + def test_set_index_custom_label_type_raises(self): + # GH#24969 + + # purposefully inherit from something unhashable + class Thing(set): + def __init__(self, name, color) -> None: + self.name = name + self.color = color + + def __str__(self) -> str: + return f"" + + thing1 = Thing("One", "red") + thing2 = Thing("Two", "blue") + df = DataFrame([[0, 2], [1, 3]], columns=[thing1, thing2]) + + msg = 'The parameter "keys" may be a column key, .*' + + with pytest.raises(TypeError, match=msg): + # use custom label directly + df.set_index(thing2) + + with pytest.raises(TypeError, match=msg): + # custom label wrapped in list + df.set_index([thing2]) + + def test_set_index_periodindex(self): + # GH#6631 + df = DataFrame(np.random.default_rng(2).random(6)) + idx1 = period_range("2011/01/01", periods=6, freq="M") + idx2 = period_range("2013", periods=6, freq="Y") + + df = df.set_index(idx1) + tm.assert_index_equal(df.index, idx1) + df = df.set_index(idx2) + tm.assert_index_equal(df.index, idx2) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/test_shift.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/test_shift.py new file mode 100644 index 0000000000000000000000000000000000000000..abb30595fdcb8466f1873642c2c355c92a61cd49 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/test_shift.py @@ -0,0 +1,764 @@ +import numpy as np +import pytest + +import pandas.util._test_decorators as td + +import pandas as pd +from pandas import ( + CategoricalIndex, + DataFrame, + Index, + NaT, + Series, + date_range, + offsets, +) +import pandas._testing as tm + + +class TestDataFrameShift: + def test_shift_axis1_with_valid_fill_value_one_array(self): + # Case with axis=1 that does not go through the "len(arrays)>1" path + # in DataFrame.shift + data = np.random.default_rng(2).standard_normal((5, 3)) + df = DataFrame(data) + res = df.shift(axis=1, periods=1, fill_value=12345) + expected = df.T.shift(periods=1, fill_value=12345).T + tm.assert_frame_equal(res, expected) + + # same but with an 1D ExtensionArray backing it + df2 = df[[0]].astype("Float64") + res2 = df2.shift(axis=1, periods=1, fill_value=12345) + expected2 = DataFrame([12345] * 5, dtype="Float64") + tm.assert_frame_equal(res2, expected2) + + def test_shift_deprecate_freq_and_fill_value(self, frame_or_series): + # Can't pass both! + obj = frame_or_series( + np.random.default_rng(2).standard_normal(5), + index=date_range("1/1/2000", periods=5, freq="h"), + ) + + msg = ( + "Passing a 'freq' together with a 'fill_value' silently ignores the " + "fill_value" + ) + with tm.assert_produces_warning(FutureWarning, match=msg): + obj.shift(1, fill_value=1, freq="h") + + if frame_or_series is DataFrame: + obj.columns = date_range("1/1/2000", periods=1, freq="h") + with tm.assert_produces_warning(FutureWarning, match=msg): + obj.shift(1, axis=1, fill_value=1, freq="h") + + @pytest.mark.parametrize( + "input_data, output_data", + [(np.empty(shape=(0,)), []), (np.ones(shape=(2,)), [np.nan, 1.0])], + ) + def test_shift_non_writable_array(self, input_data, output_data, frame_or_series): + # GH21049 Verify whether non writable numpy array is shiftable + input_data.setflags(write=False) + + result = frame_or_series(input_data).shift(1) + if frame_or_series is not Series: + # need to explicitly specify columns in the empty case + expected = frame_or_series( + output_data, + index=range(len(output_data)), + columns=range(1), + dtype="float64", + ) + else: + expected = frame_or_series(output_data, dtype="float64") + + tm.assert_equal(result, expected) + + def test_shift_mismatched_freq(self, frame_or_series): + ts = frame_or_series( + np.random.default_rng(2).standard_normal(5), + index=date_range("1/1/2000", periods=5, freq="h"), + ) + + result = ts.shift(1, freq="5min") + exp_index = ts.index.shift(1, freq="5min") + tm.assert_index_equal(result.index, exp_index) + + # GH#1063, multiple of same base + result = ts.shift(1, freq="4h") + exp_index = ts.index + offsets.Hour(4) + tm.assert_index_equal(result.index, exp_index) + + @pytest.mark.parametrize( + "obj", + [ + Series([np.arange(5)]), + date_range("1/1/2011", periods=24, freq="h"), + Series(range(5), index=date_range("2017", periods=5)), + ], + ) + @pytest.mark.parametrize("shift_size", [0, 1, 2]) + def test_shift_always_copy(self, obj, shift_size, frame_or_series): + # GH#22397 + if frame_or_series is not Series: + obj = obj.to_frame() + assert obj.shift(shift_size) is not obj + + def test_shift_object_non_scalar_fill(self): + # shift requires scalar fill_value except for object dtype + ser = Series(range(3)) + with pytest.raises(ValueError, match="fill_value must be a scalar"): + ser.shift(1, fill_value=[]) + + df = ser.to_frame() + with pytest.raises(ValueError, match="fill_value must be a scalar"): + df.shift(1, fill_value=np.arange(3)) + + obj_ser = ser.astype(object) + result = obj_ser.shift(1, fill_value={}) + assert result[0] == {} + + obj_df = obj_ser.to_frame() + result = obj_df.shift(1, fill_value={}) + assert result.iloc[0, 0] == {} + + def test_shift_int(self, datetime_frame, frame_or_series): + ts = tm.get_obj(datetime_frame, frame_or_series).astype(int) + shifted = ts.shift(1) + expected = ts.astype(float).shift(1) + tm.assert_equal(shifted, expected) + + @pytest.mark.parametrize("dtype", ["int32", "int64"]) + def test_shift_32bit_take(self, frame_or_series, dtype): + # 32-bit taking + # GH#8129 + index = date_range("2000-01-01", periods=5) + arr = np.arange(5, dtype=dtype) + s1 = frame_or_series(arr, index=index) + p = arr[1] + result = s1.shift(periods=p) + expected = frame_or_series([np.nan, 0, 1, 2, 3], index=index) + tm.assert_equal(result, expected) + + @pytest.mark.parametrize("periods", [1, 2, 3, 4]) + def test_shift_preserve_freqstr(self, periods, frame_or_series): + # GH#21275 + obj = frame_or_series( + range(periods), + index=date_range("2016-1-1 00:00:00", periods=periods, freq="h"), + ) + + result = obj.shift(1, "2h") + + expected = frame_or_series( + range(periods), + index=date_range("2016-1-1 02:00:00", periods=periods, freq="h"), + ) + tm.assert_equal(result, expected) + + def test_shift_dst(self, frame_or_series): + # GH#13926 + dates = date_range("2016-11-06", freq="h", periods=10, tz="US/Eastern") + obj = frame_or_series(dates) + + res = obj.shift(0) + tm.assert_equal(res, obj) + assert tm.get_dtype(res) == "datetime64[ns, US/Eastern]" + + res = obj.shift(1) + exp_vals = [NaT] + dates.astype(object).values.tolist()[:9] + exp = frame_or_series(exp_vals) + tm.assert_equal(res, exp) + assert tm.get_dtype(res) == "datetime64[ns, US/Eastern]" + + res = obj.shift(-2) + exp_vals = dates.astype(object).values.tolist()[2:] + [NaT, NaT] + exp = frame_or_series(exp_vals) + tm.assert_equal(res, exp) + assert tm.get_dtype(res) == "datetime64[ns, US/Eastern]" + + @pytest.mark.parametrize("ex", [10, -10, 20, -20]) + def test_shift_dst_beyond(self, frame_or_series, ex): + # GH#13926 + dates = date_range("2016-11-06", freq="h", periods=10, tz="US/Eastern") + obj = frame_or_series(dates) + res = obj.shift(ex) + exp = frame_or_series([NaT] * 10, dtype="datetime64[ns, US/Eastern]") + tm.assert_equal(res, exp) + assert tm.get_dtype(res) == "datetime64[ns, US/Eastern]" + + def test_shift_by_zero(self, datetime_frame, frame_or_series): + # shift by 0 + obj = tm.get_obj(datetime_frame, frame_or_series) + unshifted = obj.shift(0) + tm.assert_equal(unshifted, obj) + + def test_shift(self, datetime_frame): + # naive shift + ser = datetime_frame["A"] + + shifted = datetime_frame.shift(5) + tm.assert_index_equal(shifted.index, datetime_frame.index) + + shifted_ser = ser.shift(5) + tm.assert_series_equal(shifted["A"], shifted_ser) + + shifted = datetime_frame.shift(-5) + tm.assert_index_equal(shifted.index, datetime_frame.index) + + shifted_ser = ser.shift(-5) + tm.assert_series_equal(shifted["A"], shifted_ser) + + unshifted = datetime_frame.shift(5).shift(-5) + tm.assert_numpy_array_equal( + unshifted.dropna().values, datetime_frame.values[:-5] + ) + + unshifted_ser = ser.shift(5).shift(-5) + tm.assert_numpy_array_equal(unshifted_ser.dropna().values, ser.values[:-5]) + + def test_shift_by_offset(self, datetime_frame, frame_or_series): + # shift by DateOffset + obj = tm.get_obj(datetime_frame, frame_or_series) + offset = offsets.BDay() + + shifted = obj.shift(5, freq=offset) + assert len(shifted) == len(obj) + unshifted = shifted.shift(-5, freq=offset) + tm.assert_equal(unshifted, obj) + + shifted2 = obj.shift(5, freq="B") + tm.assert_equal(shifted, shifted2) + + unshifted = obj.shift(0, freq=offset) + tm.assert_equal(unshifted, obj) + + d = obj.index[0] + shifted_d = d + offset * 5 + if frame_or_series is DataFrame: + tm.assert_series_equal(obj.xs(d), shifted.xs(shifted_d), check_names=False) + else: + tm.assert_almost_equal(obj.at[d], shifted.at[shifted_d]) + + def test_shift_with_periodindex(self, frame_or_series): + # Shifting with PeriodIndex + ps = DataFrame( + np.arange(4, dtype=float), index=pd.period_range("2020-01-01", periods=4) + ) + ps = tm.get_obj(ps, frame_or_series) + + shifted = ps.shift(1) + unshifted = shifted.shift(-1) + tm.assert_index_equal(shifted.index, ps.index) + tm.assert_index_equal(unshifted.index, ps.index) + if frame_or_series is DataFrame: + tm.assert_numpy_array_equal( + unshifted.iloc[:, 0].dropna().values, ps.iloc[:-1, 0].values + ) + else: + tm.assert_numpy_array_equal(unshifted.dropna().values, ps.values[:-1]) + + shifted2 = ps.shift(1, "D") + shifted3 = ps.shift(1, offsets.Day()) + tm.assert_equal(shifted2, shifted3) + tm.assert_equal(ps, shifted2.shift(-1, "D")) + + msg = "does not match PeriodIndex freq" + with pytest.raises(ValueError, match=msg): + ps.shift(freq="W") + + # legacy support + shifted4 = ps.shift(1, freq="D") + tm.assert_equal(shifted2, shifted4) + + shifted5 = ps.shift(1, freq=offsets.Day()) + tm.assert_equal(shifted5, shifted4) + + def test_shift_other_axis(self): + # shift other axis + # GH#6371 + df = DataFrame(np.random.default_rng(2).random((10, 5))) + expected = pd.concat( + [DataFrame(np.nan, index=df.index, columns=[0]), df.iloc[:, 0:-1]], + ignore_index=True, + axis=1, + ) + result = df.shift(1, axis=1) + tm.assert_frame_equal(result, expected) + + def test_shift_named_axis(self): + # shift named axis + df = DataFrame(np.random.default_rng(2).random((10, 5))) + expected = pd.concat( + [DataFrame(np.nan, index=df.index, columns=[0]), df.iloc[:, 0:-1]], + ignore_index=True, + axis=1, + ) + result = df.shift(1, axis="columns") + tm.assert_frame_equal(result, expected) + + def test_shift_other_axis_with_freq(self, datetime_frame): + obj = datetime_frame.T + offset = offsets.BDay() + + # GH#47039 + shifted = obj.shift(5, freq=offset, axis=1) + assert len(shifted) == len(obj) + unshifted = shifted.shift(-5, freq=offset, axis=1) + tm.assert_equal(unshifted, obj) + + def test_shift_bool(self): + df = DataFrame({"high": [True, False], "low": [False, False]}) + rs = df.shift(1) + xp = DataFrame( + np.array([[np.nan, np.nan], [True, False]], dtype=object), + columns=["high", "low"], + ) + tm.assert_frame_equal(rs, xp) + + def test_shift_categorical1(self, frame_or_series): + # GH#9416 + obj = frame_or_series(["a", "b", "c", "d"], dtype="category") + + rt = obj.shift(1).shift(-1) + tm.assert_equal(obj.iloc[:-1], rt.dropna()) + + def get_cat_values(ndframe): + # For Series we could just do ._values; for DataFrame + # we may be able to do this if we ever have 2D Categoricals + return ndframe._mgr.arrays[0] + + cat = get_cat_values(obj) + + sp1 = obj.shift(1) + tm.assert_index_equal(obj.index, sp1.index) + assert np.all(get_cat_values(sp1).codes[:1] == -1) + assert np.all(cat.codes[:-1] == get_cat_values(sp1).codes[1:]) + + sn2 = obj.shift(-2) + tm.assert_index_equal(obj.index, sn2.index) + assert np.all(get_cat_values(sn2).codes[-2:] == -1) + assert np.all(cat.codes[2:] == get_cat_values(sn2).codes[:-2]) + + tm.assert_index_equal(cat.categories, get_cat_values(sp1).categories) + tm.assert_index_equal(cat.categories, get_cat_values(sn2).categories) + + def test_shift_categorical(self): + # GH#9416 + s1 = Series(["a", "b", "c"], dtype="category") + s2 = Series(["A", "B", "C"], dtype="category") + df = DataFrame({"one": s1, "two": s2}) + rs = df.shift(1) + xp = DataFrame({"one": s1.shift(1), "two": s2.shift(1)}) + tm.assert_frame_equal(rs, xp) + + def test_shift_categorical_fill_value(self, frame_or_series): + ts = frame_or_series(["a", "b", "c", "d"], dtype="category") + res = ts.shift(1, fill_value="a") + expected = frame_or_series( + pd.Categorical( + ["a", "a", "b", "c"], categories=["a", "b", "c", "d"], ordered=False + ) + ) + tm.assert_equal(res, expected) + + # check for incorrect fill_value + msg = r"Cannot setitem on a Categorical with a new category \(f\)" + with pytest.raises(TypeError, match=msg): + ts.shift(1, fill_value="f") + + def test_shift_fill_value(self, frame_or_series): + # GH#24128 + dti = date_range("1/1/2000", periods=5, freq="h") + + ts = frame_or_series([1.0, 2.0, 3.0, 4.0, 5.0], index=dti) + exp = frame_or_series([0.0, 1.0, 2.0, 3.0, 4.0], index=dti) + # check that fill value works + result = ts.shift(1, fill_value=0.0) + tm.assert_equal(result, exp) + + exp = frame_or_series([0.0, 0.0, 1.0, 2.0, 3.0], index=dti) + result = ts.shift(2, fill_value=0.0) + tm.assert_equal(result, exp) + + ts = frame_or_series([1, 2, 3]) + res = ts.shift(2, fill_value=0) + assert tm.get_dtype(res) == tm.get_dtype(ts) + + # retain integer dtype + obj = frame_or_series([1, 2, 3, 4, 5], index=dti) + exp = frame_or_series([0, 1, 2, 3, 4], index=dti) + result = obj.shift(1, fill_value=0) + tm.assert_equal(result, exp) + + exp = frame_or_series([0, 0, 1, 2, 3], index=dti) + result = obj.shift(2, fill_value=0) + tm.assert_equal(result, exp) + + def test_shift_empty(self): + # Regression test for GH#8019 + df = DataFrame({"foo": []}) + rs = df.shift(-1) + + tm.assert_frame_equal(df, rs) + + def test_shift_duplicate_columns(self): + # GH#9092; verify that position-based shifting works + # in the presence of duplicate columns + column_lists = [list(range(5)), [1] * 5, [1, 1, 2, 2, 1]] + data = np.random.default_rng(2).standard_normal((20, 5)) + + shifted = [] + for columns in column_lists: + df = DataFrame(data.copy(), columns=columns) + for s in range(5): + df.iloc[:, s] = df.iloc[:, s].shift(s + 1) + df.columns = range(5) + shifted.append(df) + + # sanity check the base case + nulls = shifted[0].isna().sum() + tm.assert_series_equal(nulls, Series(range(1, 6), dtype="int64")) + + # check all answers are the same + tm.assert_frame_equal(shifted[0], shifted[1]) + tm.assert_frame_equal(shifted[0], shifted[2]) + + def test_shift_axis1_multiple_blocks(self, using_array_manager): + # GH#35488 + df1 = DataFrame(np.random.default_rng(2).integers(1000, size=(5, 3))) + df2 = DataFrame(np.random.default_rng(2).integers(1000, size=(5, 2))) + df3 = pd.concat([df1, df2], axis=1) + if not using_array_manager: + assert len(df3._mgr.blocks) == 2 + + result = df3.shift(2, axis=1) + + expected = df3.take([-1, -1, 0, 1, 2], axis=1) + # Explicit cast to float to avoid implicit cast when setting nan. + # Column names aren't unique, so directly calling `expected.astype` won't work. + expected = expected.pipe( + lambda df: df.set_axis(range(df.shape[1]), axis=1) + .astype({0: "float", 1: "float"}) + .set_axis(df.columns, axis=1) + ) + expected.iloc[:, :2] = np.nan + expected.columns = df3.columns + + tm.assert_frame_equal(result, expected) + + # Case with periods < 0 + # rebuild df3 because `take` call above consolidated + df3 = pd.concat([df1, df2], axis=1) + if not using_array_manager: + assert len(df3._mgr.blocks) == 2 + result = df3.shift(-2, axis=1) + + expected = df3.take([2, 3, 4, -1, -1], axis=1) + # Explicit cast to float to avoid implicit cast when setting nan. + # Column names aren't unique, so directly calling `expected.astype` won't work. + expected = expected.pipe( + lambda df: df.set_axis(range(df.shape[1]), axis=1) + .astype({3: "float", 4: "float"}) + .set_axis(df.columns, axis=1) + ) + expected.iloc[:, -2:] = np.nan + expected.columns = df3.columns + + tm.assert_frame_equal(result, expected) + + @td.skip_array_manager_not_yet_implemented # TODO(ArrayManager) axis=1 support + def test_shift_axis1_multiple_blocks_with_int_fill(self): + # GH#42719 + rng = np.random.default_rng(2) + df1 = DataFrame(rng.integers(1000, size=(5, 3), dtype=int)) + df2 = DataFrame(rng.integers(1000, size=(5, 2), dtype=int)) + df3 = pd.concat([df1.iloc[:4, 1:3], df2.iloc[:4, :]], axis=1) + result = df3.shift(2, axis=1, fill_value=np.int_(0)) + assert len(df3._mgr.blocks) == 2 + + expected = df3.take([-1, -1, 0, 1], axis=1) + expected.iloc[:, :2] = np.int_(0) + expected.columns = df3.columns + + tm.assert_frame_equal(result, expected) + + # Case with periods < 0 + df3 = pd.concat([df1.iloc[:4, 1:3], df2.iloc[:4, :]], axis=1) + result = df3.shift(-2, axis=1, fill_value=np.int_(0)) + assert len(df3._mgr.blocks) == 2 + + expected = df3.take([2, 3, -1, -1], axis=1) + expected.iloc[:, -2:] = np.int_(0) + expected.columns = df3.columns + + tm.assert_frame_equal(result, expected) + + def test_period_index_frame_shift_with_freq(self, frame_or_series): + ps = DataFrame(range(4), index=pd.period_range("2020-01-01", periods=4)) + ps = tm.get_obj(ps, frame_or_series) + + shifted = ps.shift(1, freq="infer") + unshifted = shifted.shift(-1, freq="infer") + tm.assert_equal(unshifted, ps) + + shifted2 = ps.shift(freq="D") + tm.assert_equal(shifted, shifted2) + + shifted3 = ps.shift(freq=offsets.Day()) + tm.assert_equal(shifted, shifted3) + + def test_datetime_frame_shift_with_freq(self, datetime_frame, frame_or_series): + dtobj = tm.get_obj(datetime_frame, frame_or_series) + shifted = dtobj.shift(1, freq="infer") + unshifted = shifted.shift(-1, freq="infer") + tm.assert_equal(dtobj, unshifted) + + shifted2 = dtobj.shift(freq=dtobj.index.freq) + tm.assert_equal(shifted, shifted2) + + inferred_ts = DataFrame( + datetime_frame.values, + Index(np.asarray(datetime_frame.index)), + columns=datetime_frame.columns, + ) + inferred_ts = tm.get_obj(inferred_ts, frame_or_series) + shifted = inferred_ts.shift(1, freq="infer") + expected = dtobj.shift(1, freq="infer") + expected.index = expected.index._with_freq(None) + tm.assert_equal(shifted, expected) + + unshifted = shifted.shift(-1, freq="infer") + tm.assert_equal(unshifted, inferred_ts) + + def test_period_index_frame_shift_with_freq_error(self, frame_or_series): + ps = DataFrame(range(4), index=pd.period_range("2020-01-01", periods=4)) + ps = tm.get_obj(ps, frame_or_series) + msg = "Given freq M does not match PeriodIndex freq D" + with pytest.raises(ValueError, match=msg): + ps.shift(freq="M") + + def test_datetime_frame_shift_with_freq_error( + self, datetime_frame, frame_or_series + ): + dtobj = tm.get_obj(datetime_frame, frame_or_series) + no_freq = dtobj.iloc[[0, 5, 7]] + msg = "Freq was not set in the index hence cannot be inferred" + with pytest.raises(ValueError, match=msg): + no_freq.shift(freq="infer") + + def test_shift_dt64values_int_fill_deprecated(self): + # GH#31971 + ser = Series([pd.Timestamp("2020-01-01"), pd.Timestamp("2020-01-02")]) + + with pytest.raises(TypeError, match="value should be a"): + ser.shift(1, fill_value=0) + + df = ser.to_frame() + with pytest.raises(TypeError, match="value should be a"): + df.shift(1, fill_value=0) + + # axis = 1 + df2 = DataFrame({"A": ser, "B": ser}) + df2._consolidate_inplace() + + result = df2.shift(1, axis=1, fill_value=0) + expected = DataFrame({"A": [0, 0], "B": df2["A"]}) + tm.assert_frame_equal(result, expected) + + # same thing but not consolidated; pre-2.0 we got different behavior + df3 = DataFrame({"A": ser}) + df3["B"] = ser + assert len(df3._mgr.arrays) == 2 + result = df3.shift(1, axis=1, fill_value=0) + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize( + "as_cat", + [ + pytest.param( + True, + marks=pytest.mark.xfail( + reason="_can_hold_element incorrectly always returns True" + ), + ), + False, + ], + ) + @pytest.mark.parametrize( + "vals", + [ + date_range("2020-01-01", periods=2), + date_range("2020-01-01", periods=2, tz="US/Pacific"), + pd.period_range("2020-01-01", periods=2, freq="D"), + pd.timedelta_range("2020 Days", periods=2, freq="D"), + pd.interval_range(0, 3, periods=2), + pytest.param( + pd.array([1, 2], dtype="Int64"), + marks=pytest.mark.xfail( + reason="_can_hold_element incorrectly always returns True" + ), + ), + pytest.param( + pd.array([1, 2], dtype="Float32"), + marks=pytest.mark.xfail( + reason="_can_hold_element incorrectly always returns True" + ), + ), + ], + ids=lambda x: str(x.dtype), + ) + def test_shift_dt64values_axis1_invalid_fill(self, vals, as_cat): + # GH#44564 + ser = Series(vals) + if as_cat: + ser = ser.astype("category") + + df = DataFrame({"A": ser}) + result = df.shift(-1, axis=1, fill_value="foo") + expected = DataFrame({"A": ["foo", "foo"]}) + tm.assert_frame_equal(result, expected) + + # same thing but multiple blocks + df2 = DataFrame({"A": ser, "B": ser}) + df2._consolidate_inplace() + + result = df2.shift(-1, axis=1, fill_value="foo") + expected = DataFrame({"A": df2["B"], "B": ["foo", "foo"]}) + tm.assert_frame_equal(result, expected) + + # same thing but not consolidated + df3 = DataFrame({"A": ser}) + df3["B"] = ser + assert len(df3._mgr.arrays) == 2 + result = df3.shift(-1, axis=1, fill_value="foo") + tm.assert_frame_equal(result, expected) + + def test_shift_axis1_categorical_columns(self): + # GH#38434 + ci = CategoricalIndex(["a", "b", "c"]) + df = DataFrame( + {"a": [1, 3], "b": [2, 4], "c": [5, 6]}, index=ci[:-1], columns=ci + ) + result = df.shift(axis=1) + + expected = DataFrame( + {"a": [np.nan, np.nan], "b": [1, 3], "c": [2, 4]}, index=ci[:-1], columns=ci + ) + tm.assert_frame_equal(result, expected) + + # periods != 1 + result = df.shift(2, axis=1) + expected = DataFrame( + {"a": [np.nan, np.nan], "b": [np.nan, np.nan], "c": [1, 3]}, + index=ci[:-1], + columns=ci, + ) + tm.assert_frame_equal(result, expected) + + def test_shift_axis1_many_periods(self): + # GH#44978 periods > len(columns) + df = DataFrame(np.random.default_rng(2).random((5, 3))) + shifted = df.shift(6, axis=1, fill_value=None) + + expected = df * np.nan + tm.assert_frame_equal(shifted, expected) + + shifted2 = df.shift(-6, axis=1, fill_value=None) + tm.assert_frame_equal(shifted2, expected) + + def test_shift_with_offsets_freq(self): + df = DataFrame({"x": [1, 2, 3]}, index=date_range("2000", periods=3)) + shifted = df.shift(freq="1MS") + expected = DataFrame( + {"x": [1, 2, 3]}, + index=date_range(start="02/01/2000", end="02/01/2000", periods=3), + ) + tm.assert_frame_equal(shifted, expected) + + def test_shift_with_iterable_basic_functionality(self): + # GH#44424 + data = {"a": [1, 2, 3], "b": [4, 5, 6]} + shifts = [0, 1, 2] + + df = DataFrame(data) + shifted = df.shift(shifts) + + expected = DataFrame( + { + "a_0": [1, 2, 3], + "b_0": [4, 5, 6], + "a_1": [np.nan, 1.0, 2.0], + "b_1": [np.nan, 4.0, 5.0], + "a_2": [np.nan, np.nan, 1.0], + "b_2": [np.nan, np.nan, 4.0], + } + ) + tm.assert_frame_equal(expected, shifted) + + def test_shift_with_iterable_series(self): + # GH#44424 + data = {"a": [1, 2, 3]} + shifts = [0, 1, 2] + + df = DataFrame(data) + s = df["a"] + tm.assert_frame_equal(s.shift(shifts), df.shift(shifts)) + + def test_shift_with_iterable_freq_and_fill_value(self): + # GH#44424 + df = DataFrame( + np.random.default_rng(2).standard_normal(5), + index=date_range("1/1/2000", periods=5, freq="h"), + ) + + tm.assert_frame_equal( + # rename because shift with an iterable leads to str column names + df.shift([1], fill_value=1).rename(columns=lambda x: int(x[0])), + df.shift(1, fill_value=1), + ) + + tm.assert_frame_equal( + df.shift([1], freq="h").rename(columns=lambda x: int(x[0])), + df.shift(1, freq="h"), + ) + + msg = ( + "Passing a 'freq' together with a 'fill_value' silently ignores the " + "fill_value" + ) + with tm.assert_produces_warning(FutureWarning, match=msg): + df.shift([1, 2], fill_value=1, freq="h") + + def test_shift_with_iterable_check_other_arguments(self): + # GH#44424 + data = {"a": [1, 2], "b": [4, 5]} + shifts = [0, 1] + df = DataFrame(data) + + # test suffix + shifted = df[["a"]].shift(shifts, suffix="_suffix") + expected = DataFrame({"a_suffix_0": [1, 2], "a_suffix_1": [np.nan, 1.0]}) + tm.assert_frame_equal(shifted, expected) + + # check bad inputs when doing multiple shifts + msg = "If `periods` contains multiple shifts, `axis` cannot be 1." + with pytest.raises(ValueError, match=msg): + df.shift(shifts, axis=1) + + msg = "Periods must be integer, but s is ." + with pytest.raises(TypeError, match=msg): + df.shift(["s"]) + + msg = "If `periods` is an iterable, it cannot be empty." + with pytest.raises(ValueError, match=msg): + df.shift([]) + + msg = "Cannot specify `suffix` if `periods` is an int." + with pytest.raises(ValueError, match=msg): + df.shift(1, suffix="fails") + + def test_shift_axis_one_empty(self): + # GH#57301 + df = DataFrame() + result = df.shift(1, axis=1) + tm.assert_frame_equal(result, df) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/test_size.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/test_size.py new file mode 100644 index 0000000000000000000000000000000000000000..0c8b6473c85ea8e4a9749e79c8b4459afe6637d8 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/test_size.py @@ -0,0 +1,21 @@ +import numpy as np +import pytest + +from pandas import DataFrame + + +@pytest.mark.parametrize( + "data, index, expected", + [ + ({"col1": [1], "col2": [3]}, None, 2), + ({}, None, 0), + ({"col1": [1, np.nan], "col2": [3, 4]}, None, 4), + ({"col1": [1, 2], "col2": [3, 4]}, [["a", "b"], [1, 2]], 4), + ({"col1": [1, 2, 3, 4], "col2": [3, 4, 5, 6]}, ["x", "y", "a", "b"], 8), + ], +) +def test_size(data, index, expected): + # GH#52897 + df = DataFrame(data, index=index) + assert df.size == expected + assert isinstance(df.size, int) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/test_sort_index.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/test_sort_index.py new file mode 100644 index 0000000000000000000000000000000000000000..830561a1349ee73b68f1f95c31b0e3b8dcccb48b --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/test_sort_index.py @@ -0,0 +1,1028 @@ +import numpy as np +import pytest + +import pandas as pd +from pandas import ( + CategoricalDtype, + CategoricalIndex, + DataFrame, + IntervalIndex, + MultiIndex, + RangeIndex, + Series, + Timestamp, +) +import pandas._testing as tm + + +class TestDataFrameSortIndex: + def test_sort_index_and_reconstruction_doc_example(self): + # doc example + df = DataFrame( + {"value": [1, 2, 3, 4]}, + index=MultiIndex( + levels=[["a", "b"], ["bb", "aa"]], codes=[[0, 0, 1, 1], [0, 1, 0, 1]] + ), + ) + assert df.index._is_lexsorted() + assert not df.index.is_monotonic_increasing + + # sort it + expected = DataFrame( + {"value": [2, 1, 4, 3]}, + index=MultiIndex( + levels=[["a", "b"], ["aa", "bb"]], codes=[[0, 0, 1, 1], [0, 1, 0, 1]] + ), + ) + result = df.sort_index() + assert result.index.is_monotonic_increasing + tm.assert_frame_equal(result, expected) + + # reconstruct + result = df.sort_index().copy() + result.index = result.index._sort_levels_monotonic() + assert result.index.is_monotonic_increasing + tm.assert_frame_equal(result, expected) + + def test_sort_index_non_existent_label_multiindex(self): + # GH#12261 + df = DataFrame(0, columns=[], index=MultiIndex.from_product([[], []])) + with tm.assert_produces_warning(None): + df.loc["b", "2"] = 1 + df.loc["a", "3"] = 1 + result = df.sort_index().index.is_monotonic_increasing + assert result is True + + def test_sort_index_reorder_on_ops(self): + # GH#15687 + df = DataFrame( + np.random.default_rng(2).standard_normal((8, 2)), + index=MultiIndex.from_product( + [["a", "b"], ["big", "small"], ["red", "blu"]], + names=["letter", "size", "color"], + ), + columns=["near", "far"], + ) + df = df.sort_index() + + def my_func(group): + group.index = ["newz", "newa"] + return group + + result = df.groupby(level=["letter", "size"]).apply(my_func).sort_index() + expected = MultiIndex.from_product( + [["a", "b"], ["big", "small"], ["newa", "newz"]], + names=["letter", "size", None], + ) + + tm.assert_index_equal(result.index, expected) + + def test_sort_index_nan_multiindex(self): + # GH#14784 + # incorrect sorting w.r.t. nans + tuples = [[12, 13], [np.nan, np.nan], [np.nan, 3], [1, 2]] + mi = MultiIndex.from_tuples(tuples) + + df = DataFrame(np.arange(16).reshape(4, 4), index=mi, columns=list("ABCD")) + s = Series(np.arange(4), index=mi) + + df2 = DataFrame( + { + "date": pd.DatetimeIndex( + [ + "20121002", + "20121007", + "20130130", + "20130202", + "20130305", + "20121002", + "20121207", + "20130130", + "20130202", + "20130305", + "20130202", + "20130305", + ] + ), + "user_id": [1, 1, 1, 1, 1, 3, 3, 3, 5, 5, 5, 5], + "whole_cost": [ + 1790, + np.nan, + 280, + 259, + np.nan, + 623, + 90, + 312, + np.nan, + 301, + 359, + 801, + ], + "cost": [12, 15, 10, 24, 39, 1, 0, np.nan, 45, 34, 1, 12], + } + ).set_index(["date", "user_id"]) + + # sorting frame, default nan position is last + result = df.sort_index() + expected = df.iloc[[3, 0, 2, 1], :] + tm.assert_frame_equal(result, expected) + + # sorting frame, nan position last + result = df.sort_index(na_position="last") + expected = df.iloc[[3, 0, 2, 1], :] + tm.assert_frame_equal(result, expected) + + # sorting frame, nan position first + result = df.sort_index(na_position="first") + expected = df.iloc[[1, 2, 3, 0], :] + tm.assert_frame_equal(result, expected) + + # sorting frame with removed rows + result = df2.dropna().sort_index() + expected = df2.sort_index().dropna() + tm.assert_frame_equal(result, expected) + + # sorting series, default nan position is last + result = s.sort_index() + expected = s.iloc[[3, 0, 2, 1]] + tm.assert_series_equal(result, expected) + + # sorting series, nan position last + result = s.sort_index(na_position="last") + expected = s.iloc[[3, 0, 2, 1]] + tm.assert_series_equal(result, expected) + + # sorting series, nan position first + result = s.sort_index(na_position="first") + expected = s.iloc[[1, 2, 3, 0]] + tm.assert_series_equal(result, expected) + + def test_sort_index_nan(self): + # GH#3917 + + # Test DataFrame with nan label + df = DataFrame( + {"A": [1, 2, np.nan, 1, 6, 8, 4], "B": [9, np.nan, 5, 2, 5, 4, 5]}, + index=[1, 2, 3, 4, 5, 6, np.nan], + ) + + # NaN label, ascending=True, na_position='last' + sorted_df = df.sort_index(kind="quicksort", ascending=True, na_position="last") + expected = DataFrame( + {"A": [1, 2, np.nan, 1, 6, 8, 4], "B": [9, np.nan, 5, 2, 5, 4, 5]}, + index=[1, 2, 3, 4, 5, 6, np.nan], + ) + tm.assert_frame_equal(sorted_df, expected) + + # NaN label, ascending=True, na_position='first' + sorted_df = df.sort_index(na_position="first") + expected = DataFrame( + {"A": [4, 1, 2, np.nan, 1, 6, 8], "B": [5, 9, np.nan, 5, 2, 5, 4]}, + index=[np.nan, 1, 2, 3, 4, 5, 6], + ) + tm.assert_frame_equal(sorted_df, expected) + + # NaN label, ascending=False, na_position='last' + sorted_df = df.sort_index(kind="quicksort", ascending=False) + expected = DataFrame( + {"A": [8, 6, 1, np.nan, 2, 1, 4], "B": [4, 5, 2, 5, np.nan, 9, 5]}, + index=[6, 5, 4, 3, 2, 1, np.nan], + ) + tm.assert_frame_equal(sorted_df, expected) + + # NaN label, ascending=False, na_position='first' + sorted_df = df.sort_index( + kind="quicksort", ascending=False, na_position="first" + ) + expected = DataFrame( + {"A": [4, 8, 6, 1, np.nan, 2, 1], "B": [5, 4, 5, 2, 5, np.nan, 9]}, + index=[np.nan, 6, 5, 4, 3, 2, 1], + ) + tm.assert_frame_equal(sorted_df, expected) + + def test_sort_index_multi_index(self): + # GH#25775, testing that sorting by index works with a multi-index. + df = DataFrame( + {"a": [3, 1, 2], "b": [0, 0, 0], "c": [0, 1, 2], "d": list("abc")} + ) + result = df.set_index(list("abc")).sort_index(level=list("ba")) + + expected = DataFrame( + {"a": [1, 2, 3], "b": [0, 0, 0], "c": [1, 2, 0], "d": list("bca")} + ) + expected = expected.set_index(list("abc")) + + tm.assert_frame_equal(result, expected) + + def test_sort_index_inplace(self): + frame = DataFrame( + np.random.default_rng(2).standard_normal((4, 4)), + index=[1, 2, 3, 4], + columns=["A", "B", "C", "D"], + ) + + # axis=0 + unordered = frame.loc[[3, 2, 4, 1]] + a_values = unordered["A"] + df = unordered.copy() + return_value = df.sort_index(inplace=True) + assert return_value is None + expected = frame + tm.assert_frame_equal(df, expected) + # GH 44153 related + # Used to be a_id != id(df["A"]), but flaky in the CI + assert a_values is not df["A"] + + df = unordered.copy() + return_value = df.sort_index(ascending=False, inplace=True) + assert return_value is None + expected = frame[::-1] + tm.assert_frame_equal(df, expected) + + # axis=1 + unordered = frame.loc[:, ["D", "B", "C", "A"]] + df = unordered.copy() + return_value = df.sort_index(axis=1, inplace=True) + assert return_value is None + expected = frame + tm.assert_frame_equal(df, expected) + + df = unordered.copy() + return_value = df.sort_index(axis=1, ascending=False, inplace=True) + assert return_value is None + expected = frame.iloc[:, ::-1] + tm.assert_frame_equal(df, expected) + + def test_sort_index_different_sortorder(self): + A = np.arange(20).repeat(5) + B = np.tile(np.arange(5), 20) + + indexer = np.random.default_rng(2).permutation(100) + A = A.take(indexer) + B = B.take(indexer) + + df = DataFrame( + {"A": A, "B": B, "C": np.random.default_rng(2).standard_normal(100)} + ) + + ex_indexer = np.lexsort((df.B.max() - df.B, df.A)) + expected = df.take(ex_indexer) + + # test with multiindex, too + idf = df.set_index(["A", "B"]) + + result = idf.sort_index(ascending=[1, 0]) + expected = idf.take(ex_indexer) + tm.assert_frame_equal(result, expected) + + # also, Series! + result = idf["C"].sort_index(ascending=[1, 0]) + tm.assert_series_equal(result, expected["C"]) + + def test_sort_index_level(self): + mi = MultiIndex.from_tuples([[1, 1, 3], [1, 1, 1]], names=list("ABC")) + df = DataFrame([[1, 2], [3, 4]], mi) + + result = df.sort_index(level="A", sort_remaining=False) + expected = df + tm.assert_frame_equal(result, expected) + + result = df.sort_index(level=["A", "B"], sort_remaining=False) + expected = df + tm.assert_frame_equal(result, expected) + + # Error thrown by sort_index when + # first index is sorted last (GH#26053) + result = df.sort_index(level=["C", "B", "A"]) + expected = df.iloc[[1, 0]] + tm.assert_frame_equal(result, expected) + + result = df.sort_index(level=["B", "C", "A"]) + expected = df.iloc[[1, 0]] + tm.assert_frame_equal(result, expected) + + result = df.sort_index(level=["C", "A"]) + expected = df.iloc[[1, 0]] + tm.assert_frame_equal(result, expected) + + def test_sort_index_categorical_index(self): + df = DataFrame( + { + "A": np.arange(6, dtype="int64"), + "B": Series(list("aabbca")).astype(CategoricalDtype(list("cab"))), + } + ).set_index("B") + + result = df.sort_index() + expected = df.iloc[[4, 0, 1, 5, 2, 3]] + tm.assert_frame_equal(result, expected) + + result = df.sort_index(ascending=False) + expected = df.iloc[[2, 3, 0, 1, 5, 4]] + tm.assert_frame_equal(result, expected) + + def test_sort_index(self): + # GH#13496 + + frame = DataFrame( + np.arange(16).reshape(4, 4), + index=[1, 2, 3, 4], + columns=["A", "B", "C", "D"], + ) + + # axis=0 : sort rows by index labels + unordered = frame.loc[[3, 2, 4, 1]] + result = unordered.sort_index(axis=0) + expected = frame + tm.assert_frame_equal(result, expected) + + result = unordered.sort_index(ascending=False) + expected = frame[::-1] + tm.assert_frame_equal(result, expected) + + # axis=1 : sort columns by column names + unordered = frame.iloc[:, [2, 1, 3, 0]] + result = unordered.sort_index(axis=1) + tm.assert_frame_equal(result, frame) + + result = unordered.sort_index(axis=1, ascending=False) + expected = frame.iloc[:, ::-1] + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize("level", ["A", 0]) # GH#21052 + def test_sort_index_multiindex(self, level): + # GH#13496 + + # sort rows by specified level of multi-index + mi = MultiIndex.from_tuples( + [[2, 1, 3], [2, 1, 2], [1, 1, 1]], names=list("ABC") + ) + df = DataFrame([[1, 2], [3, 4], [5, 6]], index=mi) + + expected_mi = MultiIndex.from_tuples( + [[1, 1, 1], [2, 1, 2], [2, 1, 3]], names=list("ABC") + ) + expected = DataFrame([[5, 6], [3, 4], [1, 2]], index=expected_mi) + result = df.sort_index(level=level) + tm.assert_frame_equal(result, expected) + + # sort_remaining=False + expected_mi = MultiIndex.from_tuples( + [[1, 1, 1], [2, 1, 3], [2, 1, 2]], names=list("ABC") + ) + expected = DataFrame([[5, 6], [1, 2], [3, 4]], index=expected_mi) + result = df.sort_index(level=level, sort_remaining=False) + tm.assert_frame_equal(result, expected) + + def test_sort_index_intervalindex(self): + # this is a de-facto sort via unstack + # confirming that we sort in the order of the bins + y = Series(np.random.default_rng(2).standard_normal(100)) + x1 = Series(np.sign(np.random.default_rng(2).standard_normal(100))) + x2 = pd.cut( + Series(np.random.default_rng(2).standard_normal(100)), + bins=[-3, -0.5, 0, 0.5, 3], + ) + model = pd.concat([y, x1, x2], axis=1, keys=["Y", "X1", "X2"]) + + result = model.groupby(["X1", "X2"], observed=True).mean().unstack() + expected = IntervalIndex.from_tuples( + [(-3.0, -0.5), (-0.5, 0.0), (0.0, 0.5), (0.5, 3.0)], closed="right" + ) + result = result.columns.levels[1].categories + tm.assert_index_equal(result, expected) + + @pytest.mark.parametrize("inplace", [True, False]) + @pytest.mark.parametrize( + "original_dict, sorted_dict, ascending, ignore_index, output_index", + [ + ({"A": [1, 2, 3]}, {"A": [2, 3, 1]}, False, True, [0, 1, 2]), + ({"A": [1, 2, 3]}, {"A": [1, 3, 2]}, True, True, [0, 1, 2]), + ({"A": [1, 2, 3]}, {"A": [2, 3, 1]}, False, False, [5, 3, 2]), + ({"A": [1, 2, 3]}, {"A": [1, 3, 2]}, True, False, [2, 3, 5]), + ], + ) + def test_sort_index_ignore_index( + self, inplace, original_dict, sorted_dict, ascending, ignore_index, output_index + ): + # GH 30114 + original_index = [2, 5, 3] + df = DataFrame(original_dict, index=original_index) + expected_df = DataFrame(sorted_dict, index=output_index) + kwargs = { + "ascending": ascending, + "ignore_index": ignore_index, + "inplace": inplace, + } + + if inplace: + result_df = df.copy() + result_df.sort_index(**kwargs) + else: + result_df = df.sort_index(**kwargs) + + tm.assert_frame_equal(result_df, expected_df) + tm.assert_frame_equal(df, DataFrame(original_dict, index=original_index)) + + @pytest.mark.parametrize("inplace", [True, False]) + @pytest.mark.parametrize("ignore_index", [True, False]) + def test_respect_ignore_index(self, inplace, ignore_index): + # GH 43591 + df = DataFrame({"a": [1, 2, 3]}, index=RangeIndex(4, -1, -2)) + result = df.sort_index( + ascending=False, ignore_index=ignore_index, inplace=inplace + ) + + if inplace: + result = df + if ignore_index: + expected = DataFrame({"a": [1, 2, 3]}) + else: + expected = DataFrame({"a": [1, 2, 3]}, index=RangeIndex(4, -1, -2)) + + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize("inplace", [True, False]) + @pytest.mark.parametrize( + "original_dict, sorted_dict, ascending, ignore_index, output_index", + [ + ( + {"M1": [1, 2], "M2": [3, 4]}, + {"M1": [1, 2], "M2": [3, 4]}, + True, + True, + [0, 1], + ), + ( + {"M1": [1, 2], "M2": [3, 4]}, + {"M1": [2, 1], "M2": [4, 3]}, + False, + True, + [0, 1], + ), + ( + {"M1": [1, 2], "M2": [3, 4]}, + {"M1": [1, 2], "M2": [3, 4]}, + True, + False, + MultiIndex.from_tuples([(2, 1), (3, 4)], names=list("AB")), + ), + ( + {"M1": [1, 2], "M2": [3, 4]}, + {"M1": [2, 1], "M2": [4, 3]}, + False, + False, + MultiIndex.from_tuples([(3, 4), (2, 1)], names=list("AB")), + ), + ], + ) + def test_sort_index_ignore_index_multi_index( + self, inplace, original_dict, sorted_dict, ascending, ignore_index, output_index + ): + # GH 30114, this is to test ignore_index on MultiIndex of index + mi = MultiIndex.from_tuples([(2, 1), (3, 4)], names=list("AB")) + df = DataFrame(original_dict, index=mi) + expected_df = DataFrame(sorted_dict, index=output_index) + + kwargs = { + "ascending": ascending, + "ignore_index": ignore_index, + "inplace": inplace, + } + + if inplace: + result_df = df.copy() + result_df.sort_index(**kwargs) + else: + result_df = df.sort_index(**kwargs) + + tm.assert_frame_equal(result_df, expected_df) + tm.assert_frame_equal(df, DataFrame(original_dict, index=mi)) + + def test_sort_index_categorical_multiindex(self): + # GH#15058 + df = DataFrame( + { + "a": range(6), + "l1": pd.Categorical( + ["a", "a", "b", "b", "c", "c"], + categories=["c", "a", "b"], + ordered=True, + ), + "l2": [0, 1, 0, 1, 0, 1], + } + ) + result = df.set_index(["l1", "l2"]).sort_index() + expected = DataFrame( + [4, 5, 0, 1, 2, 3], + columns=["a"], + index=MultiIndex( + levels=[ + CategoricalIndex( + ["c", "a", "b"], + categories=["c", "a", "b"], + ordered=True, + name="l1", + dtype="category", + ), + [0, 1], + ], + codes=[[0, 0, 1, 1, 2, 2], [0, 1, 0, 1, 0, 1]], + names=["l1", "l2"], + ), + ) + tm.assert_frame_equal(result, expected) + + def test_sort_index_and_reconstruction(self): + # GH#15622 + # lexsortedness should be identical + # across MultiIndex construction methods + + df = DataFrame([[1, 1], [2, 2]], index=list("ab")) + expected = DataFrame( + [[1, 1], [2, 2], [1, 1], [2, 2]], + index=MultiIndex.from_tuples( + [(0.5, "a"), (0.5, "b"), (0.8, "a"), (0.8, "b")] + ), + ) + assert expected.index._is_lexsorted() + + result = DataFrame( + [[1, 1], [2, 2], [1, 1], [2, 2]], + index=MultiIndex.from_product([[0.5, 0.8], list("ab")]), + ) + result = result.sort_index() + assert result.index.is_monotonic_increasing + + tm.assert_frame_equal(result, expected) + + result = DataFrame( + [[1, 1], [2, 2], [1, 1], [2, 2]], + index=MultiIndex( + levels=[[0.5, 0.8], ["a", "b"]], codes=[[0, 0, 1, 1], [0, 1, 0, 1]] + ), + ) + result = result.sort_index() + assert result.index._is_lexsorted() + + tm.assert_frame_equal(result, expected) + + concatted = pd.concat([df, df], keys=[0.8, 0.5]) + result = concatted.sort_index() + + assert result.index.is_monotonic_increasing + + tm.assert_frame_equal(result, expected) + + # GH#14015 + df = DataFrame( + [[1, 2], [6, 7]], + columns=MultiIndex.from_tuples( + [(0, "20160811 12:00:00"), (0, "20160809 12:00:00")], + names=["l1", "Date"], + ), + ) + + df.columns = df.columns.set_levels( + pd.to_datetime(df.columns.levels[1]), level=1 + ) + assert not df.columns.is_monotonic_increasing + result = df.sort_index(axis=1) + assert result.columns.is_monotonic_increasing + result = df.sort_index(axis=1, level=1) + assert result.columns.is_monotonic_increasing + + # TODO: better name, de-duplicate with test_sort_index_level above + def test_sort_index_level2(self, multiindex_dataframe_random_data): + frame = multiindex_dataframe_random_data + + df = frame.copy() + df.index = np.arange(len(df)) + + # axis=1 + + # series + a_sorted = frame["A"].sort_index(level=0) + + # preserve names + assert a_sorted.index.names == frame.index.names + + # inplace + rs = frame.copy() + return_value = rs.sort_index(level=0, inplace=True) + assert return_value is None + tm.assert_frame_equal(rs, frame.sort_index(level=0)) + + def test_sort_index_level_large_cardinality(self): + # GH#2684 (int64) + index = MultiIndex.from_arrays([np.arange(4000)] * 3) + df = DataFrame( + np.random.default_rng(2).standard_normal(4000).astype("int64"), index=index + ) + + # it works! + result = df.sort_index(level=0) + assert result.index._lexsort_depth == 3 + + # GH#2684 (int32) + index = MultiIndex.from_arrays([np.arange(4000)] * 3) + df = DataFrame( + np.random.default_rng(2).standard_normal(4000).astype("int32"), index=index + ) + + # it works! + result = df.sort_index(level=0) + assert (result.dtypes.values == df.dtypes.values).all() + assert result.index._lexsort_depth == 3 + + def test_sort_index_level_by_name(self, multiindex_dataframe_random_data): + frame = multiindex_dataframe_random_data + + frame.index.names = ["first", "second"] + result = frame.sort_index(level="second") + expected = frame.sort_index(level=1) + tm.assert_frame_equal(result, expected) + + def test_sort_index_level_mixed(self, multiindex_dataframe_random_data): + frame = multiindex_dataframe_random_data + + sorted_before = frame.sort_index(level=1) + + df = frame.copy() + df["foo"] = "bar" + sorted_after = df.sort_index(level=1) + tm.assert_frame_equal(sorted_before, sorted_after.drop(["foo"], axis=1)) + + dft = frame.T + sorted_before = dft.sort_index(level=1, axis=1) + dft["foo", "three"] = "bar" + + sorted_after = dft.sort_index(level=1, axis=1) + tm.assert_frame_equal( + sorted_before.drop([("foo", "three")], axis=1), + sorted_after.drop([("foo", "three")], axis=1), + ) + + def test_sort_index_preserve_levels(self, multiindex_dataframe_random_data): + frame = multiindex_dataframe_random_data + + result = frame.sort_index() + assert result.index.names == frame.index.names + + @pytest.mark.parametrize( + "gen,extra", + [ + ([1.0, 3.0, 2.0, 5.0], 4.0), + ([1, 3, 2, 5], 4), + ( + [ + Timestamp("20130101"), + Timestamp("20130103"), + Timestamp("20130102"), + Timestamp("20130105"), + ], + Timestamp("20130104"), + ), + (["1one", "3one", "2one", "5one"], "4one"), + ], + ) + def test_sort_index_multilevel_repr_8017(self, gen, extra): + data = np.random.default_rng(2).standard_normal((3, 4)) + + columns = MultiIndex.from_tuples([("red", i) for i in gen]) + df = DataFrame(data, index=list("def"), columns=columns) + df2 = pd.concat( + [ + df, + DataFrame( + "world", + index=list("def"), + columns=MultiIndex.from_tuples([("red", extra)]), + ), + ], + axis=1, + ) + + # check that the repr is good + # make sure that we have a correct sparsified repr + # e.g. only 1 header of read + assert str(df2).splitlines()[0].split() == ["red"] + + # GH 8017 + # sorting fails after columns added + + # construct single-dtype then sort + result = df.copy().sort_index(axis=1) + expected = df.iloc[:, [0, 2, 1, 3]] + tm.assert_frame_equal(result, expected) + + result = df2.sort_index(axis=1) + expected = df2.iloc[:, [0, 2, 1, 4, 3]] + tm.assert_frame_equal(result, expected) + + # setitem then sort + result = df.copy() + result[("red", extra)] = "world" + + result = result.sort_index(axis=1) + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize( + "categories", + [ + pytest.param(["a", "b", "c"], id="str"), + pytest.param( + [pd.Interval(0, 1), pd.Interval(1, 2), pd.Interval(2, 3)], + id="pd.Interval", + ), + ], + ) + def test_sort_index_with_categories(self, categories): + # GH#23452 + df = DataFrame( + {"foo": range(len(categories))}, + index=CategoricalIndex( + data=categories, categories=categories, ordered=True + ), + ) + df.index = df.index.reorder_categories(df.index.categories[::-1]) + result = df.sort_index() + expected = DataFrame( + {"foo": reversed(range(len(categories)))}, + index=CategoricalIndex( + data=categories[::-1], categories=categories[::-1], ordered=True + ), + ) + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize( + "ascending", + [ + None, + [True, None], + [False, "True"], + ], + ) + def test_sort_index_ascending_bad_value_raises(self, ascending): + # GH 39434 + df = DataFrame(np.arange(64)) + length = len(df.index) + df.index = [(i - length / 2) % length for i in range(length)] + match = 'For argument "ascending" expected type bool' + with pytest.raises(ValueError, match=match): + df.sort_index(axis=0, ascending=ascending, na_position="first") + + def test_sort_index_use_inf_as_na(self): + # GH 29687 + expected = DataFrame( + {"col1": [1, 2, 3], "col2": [3, 4, 5]}, + index=pd.date_range("2020", periods=3), + ) + msg = "use_inf_as_na option is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + with pd.option_context("mode.use_inf_as_na", True): + result = expected.sort_index() + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize( + "ascending", + [(True, False), [True, False]], + ) + def test_sort_index_ascending_tuple(self, ascending): + df = DataFrame( + { + "legs": [4, 2, 4, 2, 2], + }, + index=MultiIndex.from_tuples( + [ + ("mammal", "dog"), + ("bird", "duck"), + ("mammal", "horse"), + ("bird", "penguin"), + ("mammal", "kangaroo"), + ], + names=["class", "animal"], + ), + ) + + # parameter `ascending`` is a tuple + result = df.sort_index(level=(0, 1), ascending=ascending) + + expected = DataFrame( + { + "legs": [2, 2, 2, 4, 4], + }, + index=MultiIndex.from_tuples( + [ + ("bird", "penguin"), + ("bird", "duck"), + ("mammal", "kangaroo"), + ("mammal", "horse"), + ("mammal", "dog"), + ], + names=["class", "animal"], + ), + ) + + tm.assert_frame_equal(result, expected) + + +class TestDataFrameSortIndexKey: + def test_sort_multi_index_key(self): + # GH 25775, testing that sorting by index works with a multi-index. + df = DataFrame( + {"a": [3, 1, 2], "b": [0, 0, 0], "c": [0, 1, 2], "d": list("abc")} + ).set_index(list("abc")) + + result = df.sort_index(level=list("ac"), key=lambda x: x) + + expected = DataFrame( + {"a": [1, 2, 3], "b": [0, 0, 0], "c": [1, 2, 0], "d": list("bca")} + ).set_index(list("abc")) + tm.assert_frame_equal(result, expected) + + result = df.sort_index(level=list("ac"), key=lambda x: -x) + expected = DataFrame( + {"a": [3, 2, 1], "b": [0, 0, 0], "c": [0, 2, 1], "d": list("acb")} + ).set_index(list("abc")) + + tm.assert_frame_equal(result, expected) + + def test_sort_index_key(self): # issue 27237 + df = DataFrame(np.arange(6, dtype="int64"), index=list("aaBBca")) + + result = df.sort_index() + expected = df.iloc[[2, 3, 0, 1, 5, 4]] + tm.assert_frame_equal(result, expected) + + result = df.sort_index(key=lambda x: x.str.lower()) + expected = df.iloc[[0, 1, 5, 2, 3, 4]] + tm.assert_frame_equal(result, expected) + + result = df.sort_index(key=lambda x: x.str.lower(), ascending=False) + expected = df.iloc[[4, 2, 3, 0, 1, 5]] + tm.assert_frame_equal(result, expected) + + def test_sort_index_key_int(self): + df = DataFrame(np.arange(6, dtype="int64"), index=np.arange(6, dtype="int64")) + + result = df.sort_index() + tm.assert_frame_equal(result, df) + + result = df.sort_index(key=lambda x: -x) + expected = df.sort_index(ascending=False) + tm.assert_frame_equal(result, expected) + + result = df.sort_index(key=lambda x: 2 * x) + tm.assert_frame_equal(result, df) + + def test_sort_multi_index_key_str(self): + # GH 25775, testing that sorting by index works with a multi-index. + df = DataFrame( + {"a": ["B", "a", "C"], "b": [0, 1, 0], "c": list("abc"), "d": [0, 1, 2]} + ).set_index(list("abc")) + + result = df.sort_index(level="a", key=lambda x: x.str.lower()) + + expected = DataFrame( + {"a": ["a", "B", "C"], "b": [1, 0, 0], "c": list("bac"), "d": [1, 0, 2]} + ).set_index(list("abc")) + tm.assert_frame_equal(result, expected) + + result = df.sort_index( + level=list("abc"), # can refer to names + key=lambda x: x.str.lower() if x.name in ["a", "c"] else -x, + ) + + expected = DataFrame( + {"a": ["a", "B", "C"], "b": [1, 0, 0], "c": list("bac"), "d": [1, 0, 2]} + ).set_index(list("abc")) + tm.assert_frame_equal(result, expected) + + def test_changes_length_raises(self): + df = DataFrame({"A": [1, 2, 3]}) + with pytest.raises(ValueError, match="change the shape"): + df.sort_index(key=lambda x: x[:1]) + + def test_sort_index_multiindex_sparse_column(self): + # GH 29735, testing that sort_index on a multiindexed frame with sparse + # columns fills with 0. + expected = DataFrame( + { + i: pd.array([0.0, 0.0, 0.0, 0.0], dtype=pd.SparseDtype("float64", 0.0)) + for i in range(4) + }, + index=MultiIndex.from_product([[1, 2], [1, 2]]), + ) + + result = expected.sort_index(level=0) + + tm.assert_frame_equal(result, expected) + + def test_sort_index_na_position(self): + # GH#51612 + df = DataFrame([1, 2], index=MultiIndex.from_tuples([(1, 1), (1, pd.NA)])) + expected = df.copy() + result = df.sort_index(level=[0, 1], na_position="last") + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize("ascending", [True, False]) + def test_sort_index_multiindex_sort_remaining(self, ascending): + # GH #24247 + df = DataFrame( + {"A": [1, 2, 3, 4, 5], "B": [10, 20, 30, 40, 50]}, + index=MultiIndex.from_tuples( + [("a", "x"), ("a", "y"), ("b", "x"), ("b", "y"), ("c", "x")] + ), + ) + + result = df.sort_index(level=1, sort_remaining=False, ascending=ascending) + + if ascending: + expected = DataFrame( + {"A": [1, 3, 5, 2, 4], "B": [10, 30, 50, 20, 40]}, + index=MultiIndex.from_tuples( + [("a", "x"), ("b", "x"), ("c", "x"), ("a", "y"), ("b", "y")] + ), + ) + else: + expected = DataFrame( + {"A": [2, 4, 1, 3, 5], "B": [20, 40, 10, 30, 50]}, + index=MultiIndex.from_tuples( + [("a", "y"), ("b", "y"), ("a", "x"), ("b", "x"), ("c", "x")] + ), + ) + + tm.assert_frame_equal(result, expected) + + +def test_sort_index_with_sliced_multiindex(): + # GH 55379 + mi = MultiIndex.from_tuples( + [ + ("a", "10"), + ("a", "18"), + ("a", "25"), + ("b", "16"), + ("b", "26"), + ("a", "45"), + ("b", "28"), + ("a", "5"), + ("a", "50"), + ("a", "51"), + ("b", "4"), + ], + names=["group", "str"], + ) + + df = DataFrame({"x": range(len(mi))}, index=mi) + result = df.iloc[0:6].sort_index() + + expected = DataFrame( + {"x": [0, 1, 2, 5, 3, 4]}, + index=MultiIndex.from_tuples( + [ + ("a", "10"), + ("a", "18"), + ("a", "25"), + ("a", "45"), + ("b", "16"), + ("b", "26"), + ], + names=["group", "str"], + ), + ) + tm.assert_frame_equal(result, expected) + + +def test_axis_columns_ignore_index(): + # GH 56478 + df = DataFrame([[1, 2]], columns=["d", "c"]) + result = df.sort_index(axis="columns", ignore_index=True) + expected = DataFrame([[2, 1]]) + tm.assert_frame_equal(result, expected) + + +def test_sort_index_stable_sort(): + # GH 57151 + df = DataFrame( + data=[ + (Timestamp("2024-01-30 13:00:00"), 13.0), + (Timestamp("2024-01-30 13:00:00"), 13.1), + (Timestamp("2024-01-30 12:00:00"), 12.0), + (Timestamp("2024-01-30 12:00:00"), 12.1), + ], + columns=["dt", "value"], + ).set_index(["dt"]) + result = df.sort_index(level="dt", kind="stable") + expected = DataFrame( + data=[ + (Timestamp("2024-01-30 12:00:00"), 12.0), + (Timestamp("2024-01-30 12:00:00"), 12.1), + (Timestamp("2024-01-30 13:00:00"), 13.0), + (Timestamp("2024-01-30 13:00:00"), 13.1), + ], + columns=["dt", "value"], + ).set_index(["dt"]) + tm.assert_frame_equal(result, expected) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/test_sort_values.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/test_sort_values.py new file mode 100644 index 0000000000000000000000000000000000000000..f2f02058a534e782a1fe1bd302512897218c1a1d --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/test_sort_values.py @@ -0,0 +1,940 @@ +import numpy as np +import pytest + +import pandas as pd +from pandas import ( + Categorical, + DataFrame, + NaT, + Timestamp, + date_range, +) +import pandas._testing as tm +from pandas.util.version import Version + + +class TestDataFrameSortValues: + @pytest.mark.parametrize("dtype", [np.uint8, bool]) + def test_sort_values_sparse_no_warning(self, dtype): + # GH#45618 + ser = pd.Series(Categorical(["a", "b", "a"], categories=["a", "b", "c"])) + df = pd.get_dummies(ser, dtype=dtype, sparse=True) + + with tm.assert_produces_warning(None): + # No warnings about constructing Index from SparseArray + df.sort_values(by=df.columns.tolist()) + + def test_sort_values(self): + frame = DataFrame( + [[1, 1, 2], [3, 1, 0], [4, 5, 6]], index=[1, 2, 3], columns=list("ABC") + ) + + # by column (axis=0) + sorted_df = frame.sort_values(by="A") + indexer = frame["A"].argsort().values + expected = frame.loc[frame.index[indexer]] + tm.assert_frame_equal(sorted_df, expected) + + sorted_df = frame.sort_values(by="A", ascending=False) + indexer = indexer[::-1] + expected = frame.loc[frame.index[indexer]] + tm.assert_frame_equal(sorted_df, expected) + + sorted_df = frame.sort_values(by="A", ascending=False) + tm.assert_frame_equal(sorted_df, expected) + + # GH4839 + sorted_df = frame.sort_values(by=["A"], ascending=[False]) + tm.assert_frame_equal(sorted_df, expected) + + # multiple bys + sorted_df = frame.sort_values(by=["B", "C"]) + expected = frame.loc[[2, 1, 3]] + tm.assert_frame_equal(sorted_df, expected) + + sorted_df = frame.sort_values(by=["B", "C"], ascending=False) + tm.assert_frame_equal(sorted_df, expected[::-1]) + + sorted_df = frame.sort_values(by=["B", "A"], ascending=[True, False]) + tm.assert_frame_equal(sorted_df, expected) + + msg = "No axis named 2 for object type DataFrame" + with pytest.raises(ValueError, match=msg): + frame.sort_values(by=["A", "B"], axis=2, inplace=True) + + # by row (axis=1): GH#10806 + sorted_df = frame.sort_values(by=3, axis=1) + expected = frame + tm.assert_frame_equal(sorted_df, expected) + + sorted_df = frame.sort_values(by=3, axis=1, ascending=False) + expected = frame.reindex(columns=["C", "B", "A"]) + tm.assert_frame_equal(sorted_df, expected) + + sorted_df = frame.sort_values(by=[1, 2], axis="columns") + expected = frame.reindex(columns=["B", "A", "C"]) + tm.assert_frame_equal(sorted_df, expected) + + sorted_df = frame.sort_values(by=[1, 3], axis=1, ascending=[True, False]) + tm.assert_frame_equal(sorted_df, expected) + + sorted_df = frame.sort_values(by=[1, 3], axis=1, ascending=False) + expected = frame.reindex(columns=["C", "B", "A"]) + tm.assert_frame_equal(sorted_df, expected) + + msg = r"Length of ascending \(5\) != length of by \(2\)" + with pytest.raises(ValueError, match=msg): + frame.sort_values(by=["A", "B"], axis=0, ascending=[True] * 5) + + def test_sort_values_by_empty_list(self): + # https://github.com/pandas-dev/pandas/issues/40258 + expected = DataFrame({"a": [1, 4, 2, 5, 3, 6]}) + result = expected.sort_values(by=[]) + tm.assert_frame_equal(result, expected) + assert result is not expected + + def test_sort_values_inplace(self): + frame = DataFrame( + np.random.default_rng(2).standard_normal((4, 4)), + index=[1, 2, 3, 4], + columns=["A", "B", "C", "D"], + ) + + sorted_df = frame.copy() + return_value = sorted_df.sort_values(by="A", inplace=True) + assert return_value is None + expected = frame.sort_values(by="A") + tm.assert_frame_equal(sorted_df, expected) + + sorted_df = frame.copy() + return_value = sorted_df.sort_values(by=1, axis=1, inplace=True) + assert return_value is None + expected = frame.sort_values(by=1, axis=1) + tm.assert_frame_equal(sorted_df, expected) + + sorted_df = frame.copy() + return_value = sorted_df.sort_values(by="A", ascending=False, inplace=True) + assert return_value is None + expected = frame.sort_values(by="A", ascending=False) + tm.assert_frame_equal(sorted_df, expected) + + sorted_df = frame.copy() + return_value = sorted_df.sort_values( + by=["A", "B"], ascending=False, inplace=True + ) + assert return_value is None + expected = frame.sort_values(by=["A", "B"], ascending=False) + tm.assert_frame_equal(sorted_df, expected) + + def test_sort_values_multicolumn(self): + A = np.arange(5).repeat(20) + B = np.tile(np.arange(5), 20) + np.random.default_rng(2).shuffle(A) + np.random.default_rng(2).shuffle(B) + frame = DataFrame( + {"A": A, "B": B, "C": np.random.default_rng(2).standard_normal(100)} + ) + + result = frame.sort_values(by=["A", "B"]) + indexer = np.lexsort((frame["B"], frame["A"])) + expected = frame.take(indexer) + tm.assert_frame_equal(result, expected) + + result = frame.sort_values(by=["A", "B"], ascending=False) + indexer = np.lexsort( + (frame["B"].rank(ascending=False), frame["A"].rank(ascending=False)) + ) + expected = frame.take(indexer) + tm.assert_frame_equal(result, expected) + + result = frame.sort_values(by=["B", "A"]) + indexer = np.lexsort((frame["A"], frame["B"])) + expected = frame.take(indexer) + tm.assert_frame_equal(result, expected) + + def test_sort_values_multicolumn_uint64(self): + # GH#9918 + # uint64 multicolumn sort + + df = DataFrame( + { + "a": pd.Series([18446637057563306014, 1162265347240853609]), + "b": pd.Series([1, 2]), + } + ) + df["a"] = df["a"].astype(np.uint64) + result = df.sort_values(["a", "b"]) + + expected = DataFrame( + { + "a": pd.Series([18446637057563306014, 1162265347240853609]), + "b": pd.Series([1, 2]), + }, + index=pd.Index([1, 0]), + ) + + tm.assert_frame_equal(result, expected) + + def test_sort_values_nan(self): + # GH#3917 + df = DataFrame( + {"A": [1, 2, np.nan, 1, 6, 8, 4], "B": [9, np.nan, 5, 2, 5, 4, 5]} + ) + + # sort one column only + expected = DataFrame( + {"A": [np.nan, 1, 1, 2, 4, 6, 8], "B": [5, 9, 2, np.nan, 5, 5, 4]}, + index=[2, 0, 3, 1, 6, 4, 5], + ) + sorted_df = df.sort_values(["A"], na_position="first") + tm.assert_frame_equal(sorted_df, expected) + + expected = DataFrame( + {"A": [np.nan, 8, 6, 4, 2, 1, 1], "B": [5, 4, 5, 5, np.nan, 9, 2]}, + index=[2, 5, 4, 6, 1, 0, 3], + ) + sorted_df = df.sort_values(["A"], na_position="first", ascending=False) + tm.assert_frame_equal(sorted_df, expected) + + expected = df.reindex(columns=["B", "A"]) + sorted_df = df.sort_values(by=1, axis=1, na_position="first") + tm.assert_frame_equal(sorted_df, expected) + + # na_position='last', order + expected = DataFrame( + {"A": [1, 1, 2, 4, 6, 8, np.nan], "B": [2, 9, np.nan, 5, 5, 4, 5]}, + index=[3, 0, 1, 6, 4, 5, 2], + ) + sorted_df = df.sort_values(["A", "B"]) + tm.assert_frame_equal(sorted_df, expected) + + # na_position='first', order + expected = DataFrame( + {"A": [np.nan, 1, 1, 2, 4, 6, 8], "B": [5, 2, 9, np.nan, 5, 5, 4]}, + index=[2, 3, 0, 1, 6, 4, 5], + ) + sorted_df = df.sort_values(["A", "B"], na_position="first") + tm.assert_frame_equal(sorted_df, expected) + + # na_position='first', not order + expected = DataFrame( + {"A": [np.nan, 1, 1, 2, 4, 6, 8], "B": [5, 9, 2, np.nan, 5, 5, 4]}, + index=[2, 0, 3, 1, 6, 4, 5], + ) + sorted_df = df.sort_values(["A", "B"], ascending=[1, 0], na_position="first") + tm.assert_frame_equal(sorted_df, expected) + + # na_position='last', not order + expected = DataFrame( + {"A": [8, 6, 4, 2, 1, 1, np.nan], "B": [4, 5, 5, np.nan, 2, 9, 5]}, + index=[5, 4, 6, 1, 3, 0, 2], + ) + sorted_df = df.sort_values(["A", "B"], ascending=[0, 1], na_position="last") + tm.assert_frame_equal(sorted_df, expected) + + def test_sort_values_stable_descending_sort(self): + # GH#6399 + df = DataFrame( + [[2, "first"], [2, "second"], [1, "a"], [1, "b"]], + columns=["sort_col", "order"], + ) + sorted_df = df.sort_values(by="sort_col", kind="mergesort", ascending=False) + tm.assert_frame_equal(df, sorted_df) + + @pytest.mark.parametrize( + "expected_idx_non_na, ascending", + [ + [ + [3, 4, 5, 0, 1, 8, 6, 9, 7, 10, 13, 14], + [True, True], + ], + [ + [0, 3, 4, 5, 1, 8, 6, 7, 10, 13, 14, 9], + [True, False], + ], + [ + [9, 7, 10, 13, 14, 6, 8, 1, 3, 4, 5, 0], + [False, True], + ], + [ + [7, 10, 13, 14, 9, 6, 8, 1, 0, 3, 4, 5], + [False, False], + ], + ], + ) + @pytest.mark.parametrize("na_position", ["first", "last"]) + def test_sort_values_stable_multicolumn_sort( + self, expected_idx_non_na, ascending, na_position + ): + # GH#38426 Clarify sort_values with mult. columns / labels is stable + df = DataFrame( + { + "A": [1, 2, np.nan, 1, 1, 1, 6, 8, 4, 8, 8, np.nan, np.nan, 8, 8], + "B": [9, np.nan, 5, 2, 2, 2, 5, 4, 5, 3, 4, np.nan, np.nan, 4, 4], + } + ) + # All rows with NaN in col "B" only have unique values in "A", therefore, + # only the rows with NaNs in "A" have to be treated individually: + expected_idx = ( + [11, 12, 2] + expected_idx_non_na + if na_position == "first" + else expected_idx_non_na + [2, 11, 12] + ) + expected = df.take(expected_idx) + sorted_df = df.sort_values( + ["A", "B"], ascending=ascending, na_position=na_position + ) + tm.assert_frame_equal(sorted_df, expected) + + def test_sort_values_stable_categorial(self): + # GH#16793 + df = DataFrame({"x": Categorical(np.repeat([1, 2, 3, 4], 5), ordered=True)}) + expected = df.copy() + sorted_df = df.sort_values("x", kind="mergesort") + tm.assert_frame_equal(sorted_df, expected) + + def test_sort_values_datetimes(self): + # GH#3461, argsort / lexsort differences for a datetime column + df = DataFrame( + ["a", "a", "a", "b", "c", "d", "e", "f", "g"], + columns=["A"], + index=date_range("20130101", periods=9), + ) + dts = [ + Timestamp(x) + for x in [ + "2004-02-11", + "2004-01-21", + "2004-01-26", + "2005-09-20", + "2010-10-04", + "2009-05-12", + "2008-11-12", + "2010-09-28", + "2010-09-28", + ] + ] + df["B"] = dts[::2] + dts[1::2] + df["C"] = 2.0 + df["A1"] = 3.0 + + df1 = df.sort_values(by="A") + df2 = df.sort_values(by=["A"]) + tm.assert_frame_equal(df1, df2) + + df1 = df.sort_values(by="B") + df2 = df.sort_values(by=["B"]) + tm.assert_frame_equal(df1, df2) + + df1 = df.sort_values(by="B") + + df2 = df.sort_values(by=["C", "B"]) + tm.assert_frame_equal(df1, df2) + + def test_sort_values_frame_column_inplace_sort_exception( + self, float_frame, using_copy_on_write + ): + s = float_frame["A"] + float_frame_orig = float_frame.copy() + if using_copy_on_write: + # INFO(CoW) Series is a new object, so can be changed inplace + # without modifying original datafame + s.sort_values(inplace=True) + tm.assert_series_equal(s, float_frame_orig["A"].sort_values()) + # column in dataframe is not changed + tm.assert_frame_equal(float_frame, float_frame_orig) + else: + with pytest.raises(ValueError, match="This Series is a view"): + s.sort_values(inplace=True) + + cp = s.copy() + cp.sort_values() # it works! + + def test_sort_values_nat_values_in_int_column(self): + # GH#14922: "sorting with large float and multiple columns incorrect" + + # cause was that the int64 value NaT was considered as "na". Which is + # only correct for datetime64 columns. + + int_values = (2, int(NaT._value)) + float_values = (2.0, -1.797693e308) + + df = DataFrame( + {"int": int_values, "float": float_values}, columns=["int", "float"] + ) + + df_reversed = DataFrame( + {"int": int_values[::-1], "float": float_values[::-1]}, + columns=["int", "float"], + index=[1, 0], + ) + + # NaT is not a "na" for int64 columns, so na_position must not + # influence the result: + df_sorted = df.sort_values(["int", "float"], na_position="last") + tm.assert_frame_equal(df_sorted, df_reversed) + + df_sorted = df.sort_values(["int", "float"], na_position="first") + tm.assert_frame_equal(df_sorted, df_reversed) + + # reverse sorting order + df_sorted = df.sort_values(["int", "float"], ascending=False) + tm.assert_frame_equal(df_sorted, df) + + # and now check if NaT is still considered as "na" for datetime64 + # columns: + df = DataFrame( + {"datetime": [Timestamp("2016-01-01"), NaT], "float": float_values}, + columns=["datetime", "float"], + ) + + df_reversed = DataFrame( + {"datetime": [NaT, Timestamp("2016-01-01")], "float": float_values[::-1]}, + columns=["datetime", "float"], + index=[1, 0], + ) + + df_sorted = df.sort_values(["datetime", "float"], na_position="first") + tm.assert_frame_equal(df_sorted, df_reversed) + + df_sorted = df.sort_values(["datetime", "float"], na_position="last") + tm.assert_frame_equal(df_sorted, df) + + # Ascending should not affect the results. + df_sorted = df.sort_values(["datetime", "float"], ascending=False) + tm.assert_frame_equal(df_sorted, df) + + def test_sort_nat(self): + # GH 16836 + + d1 = [Timestamp(x) for x in ["2016-01-01", "2015-01-01", np.nan, "2016-01-01"]] + d2 = [ + Timestamp(x) + for x in ["2017-01-01", "2014-01-01", "2016-01-01", "2015-01-01"] + ] + df = DataFrame({"a": d1, "b": d2}, index=[0, 1, 2, 3]) + + d3 = [Timestamp(x) for x in ["2015-01-01", "2016-01-01", "2016-01-01", np.nan]] + d4 = [ + Timestamp(x) + for x in ["2014-01-01", "2015-01-01", "2017-01-01", "2016-01-01"] + ] + expected = DataFrame({"a": d3, "b": d4}, index=[1, 3, 0, 2]) + sorted_df = df.sort_values(by=["a", "b"]) + tm.assert_frame_equal(sorted_df, expected) + + def test_sort_values_na_position_with_categories(self): + # GH#22556 + # Positioning missing value properly when column is Categorical. + categories = ["A", "B", "C"] + category_indices = [0, 2, 4] + list_of_nans = [np.nan, np.nan] + na_indices = [1, 3] + na_position_first = "first" + na_position_last = "last" + column_name = "c" + + reversed_categories = sorted(categories, reverse=True) + reversed_category_indices = sorted(category_indices, reverse=True) + reversed_na_indices = sorted(na_indices) + + df = DataFrame( + { + column_name: Categorical( + ["A", np.nan, "B", np.nan, "C"], categories=categories, ordered=True + ) + } + ) + # sort ascending with na first + result = df.sort_values( + by=column_name, ascending=True, na_position=na_position_first + ) + expected = DataFrame( + { + column_name: Categorical( + list_of_nans + categories, categories=categories, ordered=True + ) + }, + index=na_indices + category_indices, + ) + + tm.assert_frame_equal(result, expected) + + # sort ascending with na last + result = df.sort_values( + by=column_name, ascending=True, na_position=na_position_last + ) + expected = DataFrame( + { + column_name: Categorical( + categories + list_of_nans, categories=categories, ordered=True + ) + }, + index=category_indices + na_indices, + ) + + tm.assert_frame_equal(result, expected) + + # sort descending with na first + result = df.sort_values( + by=column_name, ascending=False, na_position=na_position_first + ) + expected = DataFrame( + { + column_name: Categorical( + list_of_nans + reversed_categories, + categories=categories, + ordered=True, + ) + }, + index=reversed_na_indices + reversed_category_indices, + ) + + tm.assert_frame_equal(result, expected) + + # sort descending with na last + result = df.sort_values( + by=column_name, ascending=False, na_position=na_position_last + ) + expected = DataFrame( + { + column_name: Categorical( + reversed_categories + list_of_nans, + categories=categories, + ordered=True, + ) + }, + index=reversed_category_indices + reversed_na_indices, + ) + + tm.assert_frame_equal(result, expected) + + def test_sort_values_nat(self): + # GH#16836 + + d1 = [Timestamp(x) for x in ["2016-01-01", "2015-01-01", np.nan, "2016-01-01"]] + d2 = [ + Timestamp(x) + for x in ["2017-01-01", "2014-01-01", "2016-01-01", "2015-01-01"] + ] + df = DataFrame({"a": d1, "b": d2}, index=[0, 1, 2, 3]) + + d3 = [Timestamp(x) for x in ["2015-01-01", "2016-01-01", "2016-01-01", np.nan]] + d4 = [ + Timestamp(x) + for x in ["2014-01-01", "2015-01-01", "2017-01-01", "2016-01-01"] + ] + expected = DataFrame({"a": d3, "b": d4}, index=[1, 3, 0, 2]) + sorted_df = df.sort_values(by=["a", "b"]) + tm.assert_frame_equal(sorted_df, expected) + + def test_sort_values_na_position_with_categories_raises(self): + df = DataFrame( + { + "c": Categorical( + ["A", np.nan, "B", np.nan, "C"], + categories=["A", "B", "C"], + ordered=True, + ) + } + ) + + with pytest.raises(ValueError, match="invalid na_position: bad_position"): + df.sort_values(by="c", ascending=False, na_position="bad_position") + + @pytest.mark.parametrize("inplace", [True, False]) + @pytest.mark.parametrize( + "original_dict, sorted_dict, ignore_index, output_index", + [ + ({"A": [1, 2, 3]}, {"A": [3, 2, 1]}, True, [0, 1, 2]), + ({"A": [1, 2, 3]}, {"A": [3, 2, 1]}, False, [2, 1, 0]), + ( + {"A": [1, 2, 3], "B": [2, 3, 4]}, + {"A": [3, 2, 1], "B": [4, 3, 2]}, + True, + [0, 1, 2], + ), + ( + {"A": [1, 2, 3], "B": [2, 3, 4]}, + {"A": [3, 2, 1], "B": [4, 3, 2]}, + False, + [2, 1, 0], + ), + ], + ) + def test_sort_values_ignore_index( + self, inplace, original_dict, sorted_dict, ignore_index, output_index + ): + # GH 30114 + df = DataFrame(original_dict) + expected = DataFrame(sorted_dict, index=output_index) + kwargs = {"ignore_index": ignore_index, "inplace": inplace} + + if inplace: + result_df = df.copy() + result_df.sort_values("A", ascending=False, **kwargs) + else: + result_df = df.sort_values("A", ascending=False, **kwargs) + + tm.assert_frame_equal(result_df, expected) + tm.assert_frame_equal(df, DataFrame(original_dict)) + + def test_sort_values_nat_na_position_default(self): + # GH 13230 + expected = DataFrame( + { + "A": [1, 2, 3, 4, 4], + "date": pd.DatetimeIndex( + [ + "2010-01-01 09:00:00", + "2010-01-01 09:00:01", + "2010-01-01 09:00:02", + "2010-01-01 09:00:03", + "NaT", + ] + ), + } + ) + result = expected.sort_values(["A", "date"]) + tm.assert_frame_equal(result, expected) + + def test_sort_values_item_cache(self, using_array_manager, using_copy_on_write): + # previous behavior incorrect retained an invalid _item_cache entry + df = DataFrame( + np.random.default_rng(2).standard_normal((4, 3)), columns=["A", "B", "C"] + ) + df["D"] = df["A"] * 2 + ser = df["A"] + if not using_array_manager: + assert len(df._mgr.blocks) == 2 + + df.sort_values(by="A") + + if using_copy_on_write: + ser.iloc[0] = 99 + assert df.iloc[0, 0] == df["A"][0] + assert df.iloc[0, 0] != 99 + else: + ser.values[0] = 99 + assert df.iloc[0, 0] == df["A"][0] + assert df.iloc[0, 0] == 99 + + def test_sort_values_reshaping(self): + # GH 39426 + values = list(range(21)) + expected = DataFrame([values], columns=values) + df = expected.sort_values(expected.index[0], axis=1, ignore_index=True) + + tm.assert_frame_equal(df, expected) + + def test_sort_values_no_by_inplace(self): + # GH#50643 + df = DataFrame({"a": [1, 2, 3]}) + expected = df.copy() + result = df.sort_values(by=[], inplace=True) + tm.assert_frame_equal(df, expected) + assert result is None + + def test_sort_values_no_op_reset_index(self): + # GH#52553 + df = DataFrame({"A": [10, 20], "B": [1, 5]}, index=[2, 3]) + result = df.sort_values(by="A", ignore_index=True) + expected = DataFrame({"A": [10, 20], "B": [1, 5]}) + tm.assert_frame_equal(result, expected) + + +class TestDataFrameSortKey: # test key sorting (issue 27237) + def test_sort_values_inplace_key(self, sort_by_key): + frame = DataFrame( + np.random.default_rng(2).standard_normal((4, 4)), + index=[1, 2, 3, 4], + columns=["A", "B", "C", "D"], + ) + + sorted_df = frame.copy() + return_value = sorted_df.sort_values(by="A", inplace=True, key=sort_by_key) + assert return_value is None + expected = frame.sort_values(by="A", key=sort_by_key) + tm.assert_frame_equal(sorted_df, expected) + + sorted_df = frame.copy() + return_value = sorted_df.sort_values( + by=1, axis=1, inplace=True, key=sort_by_key + ) + assert return_value is None + expected = frame.sort_values(by=1, axis=1, key=sort_by_key) + tm.assert_frame_equal(sorted_df, expected) + + sorted_df = frame.copy() + return_value = sorted_df.sort_values( + by="A", ascending=False, inplace=True, key=sort_by_key + ) + assert return_value is None + expected = frame.sort_values(by="A", ascending=False, key=sort_by_key) + tm.assert_frame_equal(sorted_df, expected) + + sorted_df = frame.copy() + sorted_df.sort_values( + by=["A", "B"], ascending=False, inplace=True, key=sort_by_key + ) + expected = frame.sort_values(by=["A", "B"], ascending=False, key=sort_by_key) + tm.assert_frame_equal(sorted_df, expected) + + def test_sort_values_key(self): + df = DataFrame(np.array([0, 5, np.nan, 3, 2, np.nan])) + + result = df.sort_values(0) + expected = df.iloc[[0, 4, 3, 1, 2, 5]] + tm.assert_frame_equal(result, expected) + + result = df.sort_values(0, key=lambda x: x + 5) + expected = df.iloc[[0, 4, 3, 1, 2, 5]] + tm.assert_frame_equal(result, expected) + + result = df.sort_values(0, key=lambda x: -x, ascending=False) + expected = df.iloc[[0, 4, 3, 1, 2, 5]] + tm.assert_frame_equal(result, expected) + + def test_sort_values_by_key(self): + df = DataFrame( + { + "a": np.array([0, 3, np.nan, 3, 2, np.nan]), + "b": np.array([0, 2, np.nan, 5, 2, np.nan]), + } + ) + + result = df.sort_values("a", key=lambda x: -x) + expected = df.iloc[[1, 3, 4, 0, 2, 5]] + tm.assert_frame_equal(result, expected) + + result = df.sort_values(by=["a", "b"], key=lambda x: -x) + expected = df.iloc[[3, 1, 4, 0, 2, 5]] + tm.assert_frame_equal(result, expected) + + result = df.sort_values(by=["a", "b"], key=lambda x: -x, ascending=False) + expected = df.iloc[[0, 4, 1, 3, 2, 5]] + tm.assert_frame_equal(result, expected) + + def test_sort_values_by_key_by_name(self): + df = DataFrame( + { + "a": np.array([0, 3, np.nan, 3, 2, np.nan]), + "b": np.array([0, 2, np.nan, 5, 2, np.nan]), + } + ) + + def key(col): + if col.name == "a": + return -col + else: + return col + + result = df.sort_values(by="a", key=key) + expected = df.iloc[[1, 3, 4, 0, 2, 5]] + tm.assert_frame_equal(result, expected) + + result = df.sort_values(by=["a"], key=key) + expected = df.iloc[[1, 3, 4, 0, 2, 5]] + tm.assert_frame_equal(result, expected) + + result = df.sort_values(by="b", key=key) + expected = df.iloc[[0, 1, 4, 3, 2, 5]] + tm.assert_frame_equal(result, expected) + + result = df.sort_values(by=["a", "b"], key=key) + expected = df.iloc[[1, 3, 4, 0, 2, 5]] + tm.assert_frame_equal(result, expected) + + def test_sort_values_key_string(self): + df = DataFrame(np.array([["hello", "goodbye"], ["hello", "Hello"]])) + + result = df.sort_values(1) + expected = df[::-1] + tm.assert_frame_equal(result, expected) + + result = df.sort_values([0, 1], key=lambda col: col.str.lower()) + tm.assert_frame_equal(result, df) + + result = df.sort_values( + [0, 1], key=lambda col: col.str.lower(), ascending=False + ) + expected = df.sort_values(1, key=lambda col: col.str.lower(), ascending=False) + tm.assert_frame_equal(result, expected) + + def test_sort_values_key_empty(self, sort_by_key): + df = DataFrame(np.array([])) + + df.sort_values(0, key=sort_by_key) + df.sort_index(key=sort_by_key) + + def test_changes_length_raises(self): + df = DataFrame({"A": [1, 2, 3]}) + with pytest.raises(ValueError, match="change the shape"): + df.sort_values("A", key=lambda x: x[:1]) + + def test_sort_values_key_axes(self): + df = DataFrame({0: ["Hello", "goodbye"], 1: [0, 1]}) + + result = df.sort_values(0, key=lambda col: col.str.lower()) + expected = df[::-1] + tm.assert_frame_equal(result, expected) + + result = df.sort_values(1, key=lambda col: -col) + expected = df[::-1] + tm.assert_frame_equal(result, expected) + + def test_sort_values_key_dict_axis(self): + df = DataFrame({0: ["Hello", 0], 1: ["goodbye", 1]}) + + result = df.sort_values(0, key=lambda col: col.str.lower(), axis=1) + expected = df.loc[:, ::-1] + tm.assert_frame_equal(result, expected) + + result = df.sort_values(1, key=lambda col: -col, axis=1) + expected = df.loc[:, ::-1] + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize("ordered", [True, False]) + def test_sort_values_key_casts_to_categorical(self, ordered): + # https://github.com/pandas-dev/pandas/issues/36383 + categories = ["c", "b", "a"] + df = DataFrame({"x": [1, 1, 1], "y": ["a", "b", "c"]}) + + def sorter(key): + if key.name == "y": + return pd.Series( + Categorical(key, categories=categories, ordered=ordered) + ) + return key + + result = df.sort_values(by=["x", "y"], key=sorter) + expected = DataFrame( + {"x": [1, 1, 1], "y": ["c", "b", "a"]}, index=pd.Index([2, 1, 0]) + ) + + tm.assert_frame_equal(result, expected) + + +@pytest.fixture +def df_none(): + return DataFrame( + { + "outer": ["a", "a", "a", "b", "b", "b"], + "inner": [1, 2, 2, 2, 1, 1], + "A": np.arange(6, 0, -1), + ("B", 5): ["one", "one", "two", "two", "one", "one"], + } + ) + + +@pytest.fixture(params=[["outer"], ["outer", "inner"]]) +def df_idx(request, df_none): + levels = request.param + return df_none.set_index(levels) + + +@pytest.fixture( + params=[ + "inner", # index level + ["outer"], # list of index level + "A", # column + [("B", 5)], # list of column + ["inner", "outer"], # two index levels + [("B", 5), "outer"], # index level and column + ["A", ("B", 5)], # Two columns + ["inner", "outer"], # two index levels and column + ] +) +def sort_names(request): + return request.param + + +@pytest.fixture(params=[True, False]) +def ascending(request): + return request.param + + +class TestSortValuesLevelAsStr: + def test_sort_index_level_and_column_label( + self, df_none, df_idx, sort_names, ascending, request + ): + # GH#14353 + if ( + Version(np.__version__) >= Version("1.25") + and request.node.callspec.id == "df_idx0-inner-True" + ): + request.applymarker( + pytest.mark.xfail( + reason=( + "pandas default unstable sorting of duplicates" + "issue with numpy>=1.25 with AVX instructions" + ), + strict=False, + ) + ) + + # Get index levels from df_idx + levels = df_idx.index.names + + # Compute expected by sorting on columns and the setting index + expected = df_none.sort_values( + by=sort_names, ascending=ascending, axis=0 + ).set_index(levels) + + # Compute result sorting on mix on columns and index levels + result = df_idx.sort_values(by=sort_names, ascending=ascending, axis=0) + + tm.assert_frame_equal(result, expected) + + def test_sort_column_level_and_index_label( + self, df_none, df_idx, sort_names, ascending, request + ): + # GH#14353 + + # Get levels from df_idx + levels = df_idx.index.names + + # Compute expected by sorting on axis=0, setting index levels, and then + # transposing. For some cases this will result in a frame with + # multiple column levels + expected = ( + df_none.sort_values(by=sort_names, ascending=ascending, axis=0) + .set_index(levels) + .T + ) + + # Compute result by transposing and sorting on axis=1. + result = df_idx.T.sort_values(by=sort_names, ascending=ascending, axis=1) + + if Version(np.__version__) >= Version("1.25"): + request.applymarker( + pytest.mark.xfail( + reason=( + "pandas default unstable sorting of duplicates" + "issue with numpy>=1.25 with AVX instructions" + ), + strict=False, + ) + ) + + tm.assert_frame_equal(result, expected) + + def test_sort_values_validate_ascending_for_value_error(self): + # GH41634 + df = DataFrame({"D": [23, 7, 21]}) + + msg = 'For argument "ascending" expected type bool, received type str.' + with pytest.raises(ValueError, match=msg): + df.sort_values(by="D", ascending="False") + + @pytest.mark.parametrize("ascending", [False, 0, 1, True]) + def test_sort_values_validate_ascending_functional(self, ascending): + df = DataFrame({"D": [23, 7, 21]}) + indexer = df["D"].argsort().values + + if not ascending: + indexer = indexer[::-1] + + expected = df.loc[df.index[indexer]] + result = df.sort_values(by="D", ascending=ascending) + tm.assert_frame_equal(result, expected) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/test_swapaxes.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/test_swapaxes.py new file mode 100644 index 0000000000000000000000000000000000000000..53a4691d48b1c7027e6e05c2050f4aa0eca4b3b4 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/test_swapaxes.py @@ -0,0 +1,37 @@ +import numpy as np +import pytest + +from pandas import DataFrame +import pandas._testing as tm + + +class TestSwapAxes: + def test_swapaxes(self): + df = DataFrame(np.random.default_rng(2).standard_normal((10, 5))) + msg = "'DataFrame.swapaxes' is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + tm.assert_frame_equal(df.T, df.swapaxes(0, 1)) + tm.assert_frame_equal(df.T, df.swapaxes(1, 0)) + + def test_swapaxes_noop(self): + df = DataFrame(np.random.default_rng(2).standard_normal((10, 5))) + msg = "'DataFrame.swapaxes' is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + tm.assert_frame_equal(df, df.swapaxes(0, 0)) + + def test_swapaxes_invalid_axis(self): + df = DataFrame(np.random.default_rng(2).standard_normal((10, 5))) + msg = "'DataFrame.swapaxes' is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + msg = "No axis named 2 for object type DataFrame" + with pytest.raises(ValueError, match=msg): + df.swapaxes(2, 5) + + def test_round_empty_not_input(self): + # GH#51032 + df = DataFrame({"a": [1, 2]}) + msg = "'DataFrame.swapaxes' is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = df.swapaxes("index", "index") + tm.assert_frame_equal(df, result) + assert df is not result diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/test_swaplevel.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/test_swaplevel.py new file mode 100644 index 0000000000000000000000000000000000000000..5511ac7d6b1b209ba00a7414671aa7e61d403898 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/test_swaplevel.py @@ -0,0 +1,36 @@ +import pytest + +from pandas import DataFrame +import pandas._testing as tm + + +class TestSwaplevel: + def test_swaplevel(self, multiindex_dataframe_random_data): + frame = multiindex_dataframe_random_data + + swapped = frame["A"].swaplevel() + swapped2 = frame["A"].swaplevel(0) + swapped3 = frame["A"].swaplevel(0, 1) + swapped4 = frame["A"].swaplevel("first", "second") + assert not swapped.index.equals(frame.index) + tm.assert_series_equal(swapped, swapped2) + tm.assert_series_equal(swapped, swapped3) + tm.assert_series_equal(swapped, swapped4) + + back = swapped.swaplevel() + back2 = swapped.swaplevel(0) + back3 = swapped.swaplevel(0, 1) + back4 = swapped.swaplevel("second", "first") + assert back.index.equals(frame.index) + tm.assert_series_equal(back, back2) + tm.assert_series_equal(back, back3) + tm.assert_series_equal(back, back4) + + ft = frame.T + swapped = ft.swaplevel("first", "second", axis=1) + exp = frame.swaplevel("first", "second").T + tm.assert_frame_equal(swapped, exp) + + msg = "Can only swap levels on a hierarchical axis." + with pytest.raises(TypeError, match=msg): + DataFrame(range(3)).swaplevel() diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/test_to_csv.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/test_to_csv.py new file mode 100644 index 0000000000000000000000000000000000000000..250567eafc670eae168e87008c0cf978877d3397 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/test_to_csv.py @@ -0,0 +1,1406 @@ +import csv +from io import StringIO +import os + +import numpy as np +import pytest + +from pandas.errors import ParserError + +import pandas as pd +from pandas import ( + DataFrame, + Index, + MultiIndex, + NaT, + Series, + Timestamp, + date_range, + period_range, + read_csv, + to_datetime, +) +import pandas._testing as tm +import pandas.core.common as com + +from pandas.io.common import get_handle + + +class TestDataFrameToCSV: + def read_csv(self, path, **kwargs): + params = {"index_col": 0} + params.update(**kwargs) + + return read_csv(path, **params) + + def test_to_csv_from_csv1(self, float_frame, datetime_frame): + with tm.ensure_clean("__tmp_to_csv_from_csv1__") as path: + float_frame.iloc[:5, float_frame.columns.get_loc("A")] = np.nan + + float_frame.to_csv(path) + float_frame.to_csv(path, columns=["A", "B"]) + float_frame.to_csv(path, header=False) + float_frame.to_csv(path, index=False) + + # test roundtrip + # freq does not roundtrip + datetime_frame.index = datetime_frame.index._with_freq(None) + datetime_frame.to_csv(path) + recons = self.read_csv(path, parse_dates=True) + tm.assert_frame_equal(datetime_frame, recons) + + datetime_frame.to_csv(path, index_label="index") + recons = self.read_csv(path, index_col=None, parse_dates=True) + + assert len(recons.columns) == len(datetime_frame.columns) + 1 + + # no index + datetime_frame.to_csv(path, index=False) + recons = self.read_csv(path, index_col=None, parse_dates=True) + tm.assert_almost_equal(datetime_frame.values, recons.values) + + # corner case + dm = DataFrame( + { + "s1": Series(range(3), index=np.arange(3, dtype=np.int64)), + "s2": Series(range(2), index=np.arange(2, dtype=np.int64)), + } + ) + dm.to_csv(path) + + recons = self.read_csv(path) + tm.assert_frame_equal(dm, recons) + + def test_to_csv_from_csv2(self, float_frame): + with tm.ensure_clean("__tmp_to_csv_from_csv2__") as path: + # duplicate index + df = DataFrame( + np.random.default_rng(2).standard_normal((3, 3)), + index=["a", "a", "b"], + columns=["x", "y", "z"], + ) + df.to_csv(path) + result = self.read_csv(path) + tm.assert_frame_equal(result, df) + + midx = MultiIndex.from_tuples([("A", 1, 2), ("A", 1, 2), ("B", 1, 2)]) + df = DataFrame( + np.random.default_rng(2).standard_normal((3, 3)), + index=midx, + columns=["x", "y", "z"], + ) + + df.to_csv(path) + result = self.read_csv(path, index_col=[0, 1, 2], parse_dates=False) + tm.assert_frame_equal(result, df, check_names=False) + + # column aliases + col_aliases = Index(["AA", "X", "Y", "Z"]) + float_frame.to_csv(path, header=col_aliases) + + rs = self.read_csv(path) + xp = float_frame.copy() + xp.columns = col_aliases + tm.assert_frame_equal(xp, rs) + + msg = "Writing 4 cols but got 2 aliases" + with pytest.raises(ValueError, match=msg): + float_frame.to_csv(path, header=["AA", "X"]) + + def test_to_csv_from_csv3(self): + with tm.ensure_clean("__tmp_to_csv_from_csv3__") as path: + df1 = DataFrame(np.random.default_rng(2).standard_normal((3, 1))) + df2 = DataFrame(np.random.default_rng(2).standard_normal((3, 1))) + + df1.to_csv(path) + df2.to_csv(path, mode="a", header=False) + xp = pd.concat([df1, df2]) + rs = read_csv(path, index_col=0) + rs.columns = [int(label) for label in rs.columns] + xp.columns = [int(label) for label in xp.columns] + tm.assert_frame_equal(xp, rs) + + def test_to_csv_from_csv4(self): + with tm.ensure_clean("__tmp_to_csv_from_csv4__") as path: + # GH 10833 (TimedeltaIndex formatting) + dt = pd.Timedelta(seconds=1) + df = DataFrame( + {"dt_data": [i * dt for i in range(3)]}, + index=Index([i * dt for i in range(3)], name="dt_index"), + ) + df.to_csv(path) + + result = read_csv(path, index_col="dt_index") + result.index = pd.to_timedelta(result.index) + result["dt_data"] = pd.to_timedelta(result["dt_data"]) + + tm.assert_frame_equal(df, result, check_index_type=True) + + def test_to_csv_from_csv5(self, timezone_frame): + # tz, 8260 + with tm.ensure_clean("__tmp_to_csv_from_csv5__") as path: + timezone_frame.to_csv(path) + result = read_csv(path, index_col=0, parse_dates=["A"]) + + converter = ( + lambda c: to_datetime(result[c]) + .dt.tz_convert("UTC") + .dt.tz_convert(timezone_frame[c].dt.tz) + ) + result["B"] = converter("B") + result["C"] = converter("C") + tm.assert_frame_equal(result, timezone_frame) + + def test_to_csv_cols_reordering(self): + # GH3454 + chunksize = 5 + N = int(chunksize * 2.5) + + df = DataFrame( + np.ones((N, 3)), + index=Index([f"i-{i}" for i in range(N)], name="a"), + columns=Index([f"i-{i}" for i in range(3)], name="a"), + ) + cs = df.columns + cols = [cs[2], cs[0]] + + with tm.ensure_clean() as path: + df.to_csv(path, columns=cols, chunksize=chunksize) + rs_c = read_csv(path, index_col=0) + + tm.assert_frame_equal(df[cols], rs_c, check_names=False) + + @pytest.mark.parametrize("cols", [None, ["b", "a"]]) + def test_to_csv_new_dupe_cols(self, cols): + chunksize = 5 + N = int(chunksize * 2.5) + + # dupe cols + df = DataFrame( + np.ones((N, 3)), + index=Index([f"i-{i}" for i in range(N)], name="a"), + columns=["a", "a", "b"], + ) + with tm.ensure_clean() as path: + df.to_csv(path, columns=cols, chunksize=chunksize) + rs_c = read_csv(path, index_col=0) + + # we wrote them in a different order + # so compare them in that order + if cols is not None: + if df.columns.is_unique: + rs_c.columns = cols + else: + indexer, missing = df.columns.get_indexer_non_unique(cols) + rs_c.columns = df.columns.take(indexer) + + for c in cols: + obj_df = df[c] + obj_rs = rs_c[c] + if isinstance(obj_df, Series): + tm.assert_series_equal(obj_df, obj_rs) + else: + tm.assert_frame_equal(obj_df, obj_rs, check_names=False) + + # wrote in the same order + else: + rs_c.columns = df.columns + tm.assert_frame_equal(df, rs_c, check_names=False) + + @pytest.mark.slow + def test_to_csv_dtnat(self): + # GH3437 + def make_dtnat_arr(n, nnat=None): + if nnat is None: + nnat = int(n * 0.1) # 10% + s = list(date_range("2000", freq="5min", periods=n)) + if nnat: + for i in np.random.default_rng(2).integers(0, len(s), nnat): + s[i] = NaT + i = np.random.default_rng(2).integers(100) + s[-i] = NaT + s[i] = NaT + return s + + chunksize = 1000 + s1 = make_dtnat_arr(chunksize + 5) + s2 = make_dtnat_arr(chunksize + 5, 0) + + with tm.ensure_clean("1.csv") as pth: + df = DataFrame({"a": s1, "b": s2}) + df.to_csv(pth, chunksize=chunksize) + + recons = self.read_csv(pth).apply(to_datetime) + tm.assert_frame_equal(df, recons, check_names=False) + + def _return_result_expected( + self, + df, + chunksize, + r_dtype=None, + c_dtype=None, + rnlvl=None, + cnlvl=None, + dupe_col=False, + ): + kwargs = {"parse_dates": False} + if cnlvl: + if rnlvl is not None: + kwargs["index_col"] = list(range(rnlvl)) + kwargs["header"] = list(range(cnlvl)) + + with tm.ensure_clean("__tmp_to_csv_moar__") as path: + df.to_csv(path, encoding="utf8", chunksize=chunksize) + recons = self.read_csv(path, **kwargs) + else: + kwargs["header"] = 0 + + with tm.ensure_clean("__tmp_to_csv_moar__") as path: + df.to_csv(path, encoding="utf8", chunksize=chunksize) + recons = self.read_csv(path, **kwargs) + + def _to_uni(x): + if not isinstance(x, str): + return x.decode("utf8") + return x + + if dupe_col: + # read_Csv disambiguates the columns by + # labeling them dupe.1,dupe.2, etc'. monkey patch columns + recons.columns = df.columns + if rnlvl and not cnlvl: + delta_lvl = [recons.iloc[:, i].values for i in range(rnlvl - 1)] + ix = MultiIndex.from_arrays([list(recons.index)] + delta_lvl) + recons.index = ix + recons = recons.iloc[:, rnlvl - 1 :] + + type_map = {"i": "i", "f": "f", "s": "O", "u": "O", "dt": "O", "p": "O"} + if r_dtype: + if r_dtype == "u": # unicode + r_dtype = "O" + recons.index = np.array( + [_to_uni(label) for label in recons.index], dtype=r_dtype + ) + df.index = np.array( + [_to_uni(label) for label in df.index], dtype=r_dtype + ) + elif r_dtype == "dt": # unicode + r_dtype = "O" + recons.index = np.array( + [Timestamp(label) for label in recons.index], dtype=r_dtype + ) + df.index = np.array( + [Timestamp(label) for label in df.index], dtype=r_dtype + ) + elif r_dtype == "p": + r_dtype = "O" + idx_list = to_datetime(recons.index) + recons.index = np.array( + [Timestamp(label) for label in idx_list], dtype=r_dtype + ) + df.index = np.array( + list(map(Timestamp, df.index.to_timestamp())), dtype=r_dtype + ) + else: + r_dtype = type_map.get(r_dtype) + recons.index = np.array(recons.index, dtype=r_dtype) + df.index = np.array(df.index, dtype=r_dtype) + if c_dtype: + if c_dtype == "u": + c_dtype = "O" + recons.columns = np.array( + [_to_uni(label) for label in recons.columns], dtype=c_dtype + ) + df.columns = np.array( + [_to_uni(label) for label in df.columns], dtype=c_dtype + ) + elif c_dtype == "dt": + c_dtype = "O" + recons.columns = np.array( + [Timestamp(label) for label in recons.columns], dtype=c_dtype + ) + df.columns = np.array( + [Timestamp(label) for label in df.columns], dtype=c_dtype + ) + elif c_dtype == "p": + c_dtype = "O" + col_list = to_datetime(recons.columns) + recons.columns = np.array( + [Timestamp(label) for label in col_list], dtype=c_dtype + ) + col_list = df.columns.to_timestamp() + df.columns = np.array( + [Timestamp(label) for label in col_list], dtype=c_dtype + ) + else: + c_dtype = type_map.get(c_dtype) + recons.columns = np.array(recons.columns, dtype=c_dtype) + df.columns = np.array(df.columns, dtype=c_dtype) + return df, recons + + @pytest.mark.slow + @pytest.mark.parametrize( + "nrows", [2, 10, 99, 100, 101, 102, 198, 199, 200, 201, 202, 249, 250, 251] + ) + def test_to_csv_nrows(self, nrows): + df = DataFrame( + np.ones((nrows, 4)), + index=date_range("2020-01-01", periods=nrows), + columns=Index(list("abcd"), dtype=object), + ) + result, expected = self._return_result_expected(df, 1000, "dt", "s") + tm.assert_frame_equal(result, expected, check_names=False) + + @pytest.mark.slow + @pytest.mark.parametrize( + "nrows", [2, 10, 99, 100, 101, 102, 198, 199, 200, 201, 202, 249, 250, 251] + ) + @pytest.mark.parametrize( + "r_idx_type, c_idx_type", [("i", "i"), ("s", "s"), ("s", "dt"), ("p", "p")] + ) + @pytest.mark.parametrize("ncols", [1, 2, 3, 4]) + @pytest.mark.filterwarnings(r"ignore:PeriodDtype\[B\] is deprecated:FutureWarning") + def test_to_csv_idx_types(self, nrows, r_idx_type, c_idx_type, ncols): + axes = { + "i": lambda n: Index(np.arange(n), dtype=np.int64), + "s": lambda n: Index([f"{i}_{chr(i)}" for i in range(97, 97 + n)]), + "dt": lambda n: date_range("2020-01-01", periods=n), + "p": lambda n: period_range("2020-01-01", periods=n, freq="D"), + } + df = DataFrame( + np.ones((nrows, ncols)), + index=axes[r_idx_type](nrows), + columns=axes[c_idx_type](ncols), + ) + result, expected = self._return_result_expected( + df, + 1000, + r_idx_type, + c_idx_type, + ) + tm.assert_frame_equal(result, expected, check_names=False) + + @pytest.mark.slow + @pytest.mark.parametrize( + "nrows", [10, 98, 99, 100, 101, 102, 198, 199, 200, 201, 202, 249, 250, 251] + ) + @pytest.mark.parametrize("ncols", [1, 2, 3, 4]) + def test_to_csv_idx_ncols(self, nrows, ncols): + df = DataFrame( + np.ones((nrows, ncols)), + index=Index([f"i-{i}" for i in range(nrows)], name="a"), + columns=Index([f"i-{i}" for i in range(ncols)], name="a"), + ) + result, expected = self._return_result_expected(df, 1000) + tm.assert_frame_equal(result, expected, check_names=False) + + @pytest.mark.slow + @pytest.mark.parametrize("nrows", [10, 98, 99, 100, 101, 102]) + def test_to_csv_dup_cols(self, nrows): + df = DataFrame( + np.ones((nrows, 3)), + index=Index([f"i-{i}" for i in range(nrows)], name="a"), + columns=Index([f"i-{i}" for i in range(3)], name="a"), + ) + + cols = list(df.columns) + cols[:2] = ["dupe", "dupe"] + cols[-2:] = ["dupe", "dupe"] + ix = list(df.index) + ix[:2] = ["rdupe", "rdupe"] + ix[-2:] = ["rdupe", "rdupe"] + df.index = ix + df.columns = cols + result, expected = self._return_result_expected(df, 1000, dupe_col=True) + tm.assert_frame_equal(result, expected, check_names=False) + + @pytest.mark.slow + def test_to_csv_empty(self): + df = DataFrame(index=np.arange(10, dtype=np.int64)) + result, expected = self._return_result_expected(df, 1000) + tm.assert_frame_equal(result, expected, check_column_type=False) + + @pytest.mark.slow + def test_to_csv_chunksize(self): + chunksize = 1000 + rows = chunksize // 2 + 1 + df = DataFrame( + np.ones((rows, 2)), + columns=Index(list("ab"), dtype=object), + index=MultiIndex.from_arrays([range(rows) for _ in range(2)]), + ) + result, expected = self._return_result_expected(df, chunksize, rnlvl=2) + tm.assert_frame_equal(result, expected, check_names=False) + + @pytest.mark.slow + @pytest.mark.parametrize( + "nrows", [2, 10, 99, 100, 101, 102, 198, 199, 200, 201, 202, 249, 250, 251] + ) + @pytest.mark.parametrize("ncols", [2, 3, 4]) + @pytest.mark.parametrize( + "df_params, func_params", + [ + [{"r_idx_nlevels": 2}, {"rnlvl": 2}], + [{"c_idx_nlevels": 2}, {"cnlvl": 2}], + [{"r_idx_nlevels": 2, "c_idx_nlevels": 2}, {"rnlvl": 2, "cnlvl": 2}], + ], + ) + def test_to_csv_params(self, nrows, df_params, func_params, ncols): + if df_params.get("r_idx_nlevels"): + index = MultiIndex.from_arrays( + [f"i-{i}" for i in range(nrows)] + for _ in range(df_params["r_idx_nlevels"]) + ) + else: + index = None + + if df_params.get("c_idx_nlevels"): + columns = MultiIndex.from_arrays( + [f"i-{i}" for i in range(ncols)] + for _ in range(df_params["c_idx_nlevels"]) + ) + else: + columns = Index([f"i-{i}" for i in range(ncols)], dtype=object) + df = DataFrame(np.ones((nrows, ncols)), index=index, columns=columns) + result, expected = self._return_result_expected(df, 1000, **func_params) + tm.assert_frame_equal(result, expected, check_names=False) + + def test_to_csv_from_csv_w_some_infs(self, float_frame): + # test roundtrip with inf, -inf, nan, as full columns and mix + float_frame["G"] = np.nan + f = lambda x: [np.inf, np.nan][np.random.default_rng(2).random() < 0.5] + float_frame["h"] = float_frame.index.map(f) + + with tm.ensure_clean() as path: + float_frame.to_csv(path) + recons = self.read_csv(path) + + tm.assert_frame_equal(float_frame, recons) + tm.assert_frame_equal(np.isinf(float_frame), np.isinf(recons)) + + def test_to_csv_from_csv_w_all_infs(self, float_frame): + # test roundtrip with inf, -inf, nan, as full columns and mix + float_frame["E"] = np.inf + float_frame["F"] = -np.inf + + with tm.ensure_clean() as path: + float_frame.to_csv(path) + recons = self.read_csv(path) + + tm.assert_frame_equal(float_frame, recons) + tm.assert_frame_equal(np.isinf(float_frame), np.isinf(recons)) + + def test_to_csv_no_index(self): + # GH 3624, after appending columns, to_csv fails + with tm.ensure_clean("__tmp_to_csv_no_index__") as path: + df = DataFrame({"c1": [1, 2, 3], "c2": [4, 5, 6]}) + df.to_csv(path, index=False) + result = read_csv(path) + tm.assert_frame_equal(df, result) + df["c3"] = Series([7, 8, 9], dtype="int64") + df.to_csv(path, index=False) + result = read_csv(path) + tm.assert_frame_equal(df, result) + + def test_to_csv_with_mix_columns(self): + # gh-11637: incorrect output when a mix of integer and string column + # names passed as columns parameter in to_csv + + df = DataFrame({0: ["a", "b", "c"], 1: ["aa", "bb", "cc"]}) + df["test"] = "txt" + assert df.to_csv() == df.to_csv(columns=[0, 1, "test"]) + + def test_to_csv_headers(self): + # GH6186, the presence or absence of `index` incorrectly + # causes to_csv to have different header semantics. + from_df = DataFrame([[1, 2], [3, 4]], columns=["A", "B"]) + to_df = DataFrame([[1, 2], [3, 4]], columns=["X", "Y"]) + with tm.ensure_clean("__tmp_to_csv_headers__") as path: + from_df.to_csv(path, header=["X", "Y"]) + recons = self.read_csv(path) + + tm.assert_frame_equal(to_df, recons) + + from_df.to_csv(path, index=False, header=["X", "Y"]) + recons = self.read_csv(path) + + return_value = recons.reset_index(inplace=True) + assert return_value is None + tm.assert_frame_equal(to_df, recons) + + def test_to_csv_multiindex(self, float_frame, datetime_frame): + frame = float_frame + old_index = frame.index + arrays = np.arange(len(old_index) * 2, dtype=np.int64).reshape(2, -1) + new_index = MultiIndex.from_arrays(arrays, names=["first", "second"]) + frame.index = new_index + + with tm.ensure_clean("__tmp_to_csv_multiindex__") as path: + frame.to_csv(path, header=False) + frame.to_csv(path, columns=["A", "B"]) + + # round trip + frame.to_csv(path) + + df = self.read_csv(path, index_col=[0, 1], parse_dates=False) + + # TODO to_csv drops column name + tm.assert_frame_equal(frame, df, check_names=False) + assert frame.index.names == df.index.names + + # needed if setUp becomes a class method + float_frame.index = old_index + + # try multiindex with dates + tsframe = datetime_frame + old_index = tsframe.index + new_index = [old_index, np.arange(len(old_index), dtype=np.int64)] + tsframe.index = MultiIndex.from_arrays(new_index) + + tsframe.to_csv(path, index_label=["time", "foo"]) + with tm.assert_produces_warning( + UserWarning, match="Could not infer format" + ): + recons = self.read_csv(path, index_col=[0, 1], parse_dates=True) + + # TODO to_csv drops column name + tm.assert_frame_equal(tsframe, recons, check_names=False) + + # do not load index + tsframe.to_csv(path) + recons = self.read_csv(path, index_col=None) + assert len(recons.columns) == len(tsframe.columns) + 2 + + # no index + tsframe.to_csv(path, index=False) + recons = self.read_csv(path, index_col=None) + tm.assert_almost_equal(recons.values, datetime_frame.values) + + # needed if setUp becomes class method + datetime_frame.index = old_index + + with tm.ensure_clean("__tmp_to_csv_multiindex__") as path: + # GH3571, GH1651, GH3141 + + def _make_frame(names=None): + if names is True: + names = ["first", "second"] + return DataFrame( + np.random.default_rng(2).integers(0, 10, size=(3, 3)), + columns=MultiIndex.from_tuples( + [("bah", "foo"), ("bah", "bar"), ("ban", "baz")], names=names + ), + dtype="int64", + ) + + # column & index are multi-index + df = DataFrame( + np.ones((5, 3)), + columns=MultiIndex.from_arrays( + [[f"i-{i}" for i in range(3)] for _ in range(4)], names=list("abcd") + ), + index=MultiIndex.from_arrays( + [[f"i-{i}" for i in range(5)] for _ in range(2)], names=list("ab") + ), + ) + df.to_csv(path) + result = read_csv(path, header=[0, 1, 2, 3], index_col=[0, 1]) + tm.assert_frame_equal(df, result) + + # column is mi + df = DataFrame( + np.ones((5, 3)), + columns=MultiIndex.from_arrays( + [[f"i-{i}" for i in range(3)] for _ in range(4)], names=list("abcd") + ), + ) + df.to_csv(path) + result = read_csv(path, header=[0, 1, 2, 3], index_col=0) + tm.assert_frame_equal(df, result) + + # dup column names? + df = DataFrame( + np.ones((5, 3)), + columns=MultiIndex.from_arrays( + [[f"i-{i}" for i in range(3)] for _ in range(4)], names=list("abcd") + ), + index=MultiIndex.from_arrays( + [[f"i-{i}" for i in range(5)] for _ in range(3)], names=list("abc") + ), + ) + df.to_csv(path) + result = read_csv(path, header=[0, 1, 2, 3], index_col=[0, 1, 2]) + tm.assert_frame_equal(df, result) + + # writing with no index + df = _make_frame() + df.to_csv(path, index=False) + result = read_csv(path, header=[0, 1]) + tm.assert_frame_equal(df, result) + + # we lose the names here + df = _make_frame(True) + df.to_csv(path, index=False) + result = read_csv(path, header=[0, 1]) + assert com.all_none(*result.columns.names) + result.columns.names = df.columns.names + tm.assert_frame_equal(df, result) + + # whatsnew example + df = _make_frame() + df.to_csv(path) + result = read_csv(path, header=[0, 1], index_col=[0]) + tm.assert_frame_equal(df, result) + + df = _make_frame(True) + df.to_csv(path) + result = read_csv(path, header=[0, 1], index_col=[0]) + tm.assert_frame_equal(df, result) + + # invalid options + df = _make_frame(True) + df.to_csv(path) + + for i in [6, 7]: + msg = f"len of {i}, but only 5 lines in file" + with pytest.raises(ParserError, match=msg): + read_csv(path, header=list(range(i)), index_col=0) + + # write with cols + msg = "cannot specify cols with a MultiIndex" + with pytest.raises(TypeError, match=msg): + df.to_csv(path, columns=["foo", "bar"]) + + with tm.ensure_clean("__tmp_to_csv_multiindex__") as path: + # empty + tsframe[:0].to_csv(path) + recons = self.read_csv(path) + + exp = tsframe[:0] + exp.index = [] + + tm.assert_index_equal(recons.columns, exp.columns) + assert len(recons) == 0 + + def test_to_csv_interval_index(self, using_infer_string): + # GH 28210 + df = DataFrame({"A": list("abc"), "B": range(3)}, index=pd.interval_range(0, 3)) + + with tm.ensure_clean("__tmp_to_csv_interval_index__.csv") as path: + df.to_csv(path) + result = self.read_csv(path, index_col=0) + + # can't roundtrip intervalindex via read_csv so check string repr (GH 23595) + expected = df.copy() + if using_infer_string: + expected.index = expected.index.astype("string[pyarrow_numpy]") + else: + expected.index = expected.index.astype(str) + + tm.assert_frame_equal(result, expected) + + def test_to_csv_float32_nanrep(self): + df = DataFrame( + np.random.default_rng(2).standard_normal((1, 4)).astype(np.float32) + ) + df[1] = np.nan + + with tm.ensure_clean("__tmp_to_csv_float32_nanrep__.csv") as path: + df.to_csv(path, na_rep=999) + + with open(path, encoding="utf-8") as f: + lines = f.readlines() + assert lines[1].split(",")[2] == "999" + + def test_to_csv_withcommas(self): + # Commas inside fields should be correctly escaped when saving as CSV. + df = DataFrame({"A": [1, 2, 3], "B": ["5,6", "7,8", "9,0"]}) + + with tm.ensure_clean("__tmp_to_csv_withcommas__.csv") as path: + df.to_csv(path) + df2 = self.read_csv(path) + tm.assert_frame_equal(df2, df) + + def test_to_csv_mixed(self): + def create_cols(name): + return [f"{name}{i:03d}" for i in range(5)] + + df_float = DataFrame( + np.random.default_rng(2).standard_normal((100, 5)), + dtype="float64", + columns=create_cols("float"), + ) + df_int = DataFrame( + np.random.default_rng(2).standard_normal((100, 5)).astype("int64"), + dtype="int64", + columns=create_cols("int"), + ) + df_bool = DataFrame(True, index=df_float.index, columns=create_cols("bool")) + df_object = DataFrame( + "foo", index=df_float.index, columns=create_cols("object") + ) + df_dt = DataFrame( + Timestamp("20010101").as_unit("ns"), + index=df_float.index, + columns=create_cols("date"), + ) + + # add in some nans + df_float.iloc[30:50, 1:3] = np.nan + df_dt.iloc[30:50, 1:3] = np.nan + + df = pd.concat([df_float, df_int, df_bool, df_object, df_dt], axis=1) + + # dtype + dtypes = {} + for n, dtype in [ + ("float", np.float64), + ("int", np.int64), + ("bool", np.bool_), + ("object", object), + ]: + for c in create_cols(n): + dtypes[c] = dtype + + with tm.ensure_clean() as filename: + df.to_csv(filename) + rs = read_csv( + filename, index_col=0, dtype=dtypes, parse_dates=create_cols("date") + ) + tm.assert_frame_equal(rs, df) + + def test_to_csv_dups_cols(self): + df = DataFrame( + np.random.default_rng(2).standard_normal((1000, 30)), + columns=list(range(15)) + list(range(15)), + dtype="float64", + ) + + with tm.ensure_clean() as filename: + df.to_csv(filename) # single dtype, fine + result = read_csv(filename, index_col=0) + result.columns = df.columns + tm.assert_frame_equal(result, df) + + df_float = DataFrame( + np.random.default_rng(2).standard_normal((1000, 3)), dtype="float64" + ) + df_int = DataFrame(np.random.default_rng(2).standard_normal((1000, 3))).astype( + "int64" + ) + df_bool = DataFrame(True, index=df_float.index, columns=range(3)) + df_object = DataFrame("foo", index=df_float.index, columns=range(3)) + df_dt = DataFrame( + Timestamp("20010101").as_unit("ns"), index=df_float.index, columns=range(3) + ) + df = pd.concat( + [df_float, df_int, df_bool, df_object, df_dt], axis=1, ignore_index=True + ) + + df.columns = [0, 1, 2] * 5 + + with tm.ensure_clean() as filename: + df.to_csv(filename) + result = read_csv(filename, index_col=0) + + # date cols + for i in ["0.4", "1.4", "2.4"]: + result[i] = to_datetime(result[i]) + + result.columns = df.columns + tm.assert_frame_equal(result, df) + + def test_to_csv_dups_cols2(self): + # GH3457 + df = DataFrame( + np.ones((5, 3)), + index=Index([f"i-{i}" for i in range(5)], name="foo"), + columns=Index(["a", "a", "b"], dtype=object), + ) + + with tm.ensure_clean() as filename: + df.to_csv(filename) + + # read_csv will rename the dups columns + result = read_csv(filename, index_col=0) + result = result.rename(columns={"a.1": "a"}) + tm.assert_frame_equal(result, df) + + @pytest.mark.parametrize("chunksize", [10000, 50000, 100000]) + def test_to_csv_chunking(self, chunksize): + aa = DataFrame({"A": range(100000)}) + aa["B"] = aa.A + 1.0 + aa["C"] = aa.A + 2.0 + aa["D"] = aa.A + 3.0 + + with tm.ensure_clean() as filename: + aa.to_csv(filename, chunksize=chunksize) + rs = read_csv(filename, index_col=0) + tm.assert_frame_equal(rs, aa) + + @pytest.mark.slow + def test_to_csv_wide_frame_formatting(self, monkeypatch): + # Issue #8621 + chunksize = 100 + df = DataFrame( + np.random.default_rng(2).standard_normal((1, chunksize + 10)), + columns=None, + index=None, + ) + with tm.ensure_clean() as filename: + with monkeypatch.context() as m: + m.setattr("pandas.io.formats.csvs._DEFAULT_CHUNKSIZE_CELLS", chunksize) + df.to_csv(filename, header=False, index=False) + rs = read_csv(filename, header=None) + tm.assert_frame_equal(rs, df) + + def test_to_csv_bug(self): + f1 = StringIO("a,1.0\nb,2.0") + df = self.read_csv(f1, header=None) + newdf = DataFrame({"t": df[df.columns[0]]}) + + with tm.ensure_clean() as path: + newdf.to_csv(path) + + recons = read_csv(path, index_col=0) + # don't check_names as t != 1 + tm.assert_frame_equal(recons, newdf, check_names=False) + + def test_to_csv_unicode(self): + df = DataFrame({"c/\u03c3": [1, 2, 3]}) + with tm.ensure_clean() as path: + df.to_csv(path, encoding="UTF-8") + df2 = read_csv(path, index_col=0, encoding="UTF-8") + tm.assert_frame_equal(df, df2) + + df.to_csv(path, encoding="UTF-8", index=False) + df2 = read_csv(path, index_col=None, encoding="UTF-8") + tm.assert_frame_equal(df, df2) + + def test_to_csv_unicode_index_col(self): + buf = StringIO("") + df = DataFrame( + [["\u05d0", "d2", "d3", "d4"], ["a1", "a2", "a3", "a4"]], + columns=["\u05d0", "\u05d1", "\u05d2", "\u05d3"], + index=["\u05d0", "\u05d1"], + ) + + df.to_csv(buf, encoding="UTF-8") + buf.seek(0) + + df2 = read_csv(buf, index_col=0, encoding="UTF-8") + tm.assert_frame_equal(df, df2) + + def test_to_csv_stringio(self, float_frame): + buf = StringIO() + float_frame.to_csv(buf) + buf.seek(0) + recons = read_csv(buf, index_col=0) + tm.assert_frame_equal(recons, float_frame) + + def test_to_csv_float_format(self): + df = DataFrame( + [[0.123456, 0.234567, 0.567567], [12.32112, 123123.2, 321321.2]], + index=["A", "B"], + columns=["X", "Y", "Z"], + ) + + with tm.ensure_clean() as filename: + df.to_csv(filename, float_format="%.2f") + + rs = read_csv(filename, index_col=0) + xp = DataFrame( + [[0.12, 0.23, 0.57], [12.32, 123123.20, 321321.20]], + index=["A", "B"], + columns=["X", "Y", "Z"], + ) + tm.assert_frame_equal(rs, xp) + + def test_to_csv_float_format_over_decimal(self): + # GH#47436 + df = DataFrame({"a": [0.5, 1.0]}) + result = df.to_csv( + decimal=",", + float_format=lambda x: np.format_float_positional(x, trim="-"), + index=False, + ) + expected_rows = ["a", "0.5", "1"] + expected = tm.convert_rows_list_to_csv_str(expected_rows) + assert result == expected + + def test_to_csv_unicodewriter_quoting(self): + df = DataFrame({"A": [1, 2, 3], "B": ["foo", "bar", "baz"]}) + + buf = StringIO() + df.to_csv(buf, index=False, quoting=csv.QUOTE_NONNUMERIC, encoding="utf-8") + + result = buf.getvalue() + expected_rows = ['"A","B"', '1,"foo"', '2,"bar"', '3,"baz"'] + expected = tm.convert_rows_list_to_csv_str(expected_rows) + assert result == expected + + @pytest.mark.parametrize("encoding", [None, "utf-8"]) + def test_to_csv_quote_none(self, encoding): + # GH4328 + df = DataFrame({"A": ["hello", '{"hello"}']}) + buf = StringIO() + df.to_csv(buf, quoting=csv.QUOTE_NONE, encoding=encoding, index=False) + + result = buf.getvalue() + expected_rows = ["A", "hello", '{"hello"}'] + expected = tm.convert_rows_list_to_csv_str(expected_rows) + assert result == expected + + def test_to_csv_index_no_leading_comma(self): + df = DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]}, index=["one", "two", "three"]) + + buf = StringIO() + df.to_csv(buf, index_label=False) + + expected_rows = ["A,B", "one,1,4", "two,2,5", "three,3,6"] + expected = tm.convert_rows_list_to_csv_str(expected_rows) + assert buf.getvalue() == expected + + def test_to_csv_lineterminators(self): + # see gh-20353 + df = DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]}, index=["one", "two", "three"]) + + with tm.ensure_clean() as path: + # case 1: CRLF as line terminator + df.to_csv(path, lineterminator="\r\n") + expected = b",A,B\r\none,1,4\r\ntwo,2,5\r\nthree,3,6\r\n" + + with open(path, mode="rb") as f: + assert f.read() == expected + + with tm.ensure_clean() as path: + # case 2: LF as line terminator + df.to_csv(path, lineterminator="\n") + expected = b",A,B\none,1,4\ntwo,2,5\nthree,3,6\n" + + with open(path, mode="rb") as f: + assert f.read() == expected + + with tm.ensure_clean() as path: + # case 3: The default line terminator(=os.linesep)(gh-21406) + df.to_csv(path) + os_linesep = os.linesep.encode("utf-8") + expected = ( + b",A,B" + + os_linesep + + b"one,1,4" + + os_linesep + + b"two,2,5" + + os_linesep + + b"three,3,6" + + os_linesep + ) + + with open(path, mode="rb") as f: + assert f.read() == expected + + def test_to_csv_from_csv_categorical(self): + # CSV with categoricals should result in the same output + # as when one would add a "normal" Series/DataFrame. + s = Series(pd.Categorical(["a", "b", "b", "a", "a", "c", "c", "c"])) + s2 = Series(["a", "b", "b", "a", "a", "c", "c", "c"]) + res = StringIO() + + s.to_csv(res, header=False) + exp = StringIO() + + s2.to_csv(exp, header=False) + assert res.getvalue() == exp.getvalue() + + df = DataFrame({"s": s}) + df2 = DataFrame({"s": s2}) + + res = StringIO() + df.to_csv(res) + + exp = StringIO() + df2.to_csv(exp) + + assert res.getvalue() == exp.getvalue() + + def test_to_csv_path_is_none(self, float_frame): + # GH 8215 + # Make sure we return string for consistency with + # Series.to_csv() + csv_str = float_frame.to_csv(path_or_buf=None) + assert isinstance(csv_str, str) + recons = read_csv(StringIO(csv_str), index_col=0) + tm.assert_frame_equal(float_frame, recons) + + @pytest.mark.parametrize( + "df,encoding", + [ + ( + DataFrame( + [[0.123456, 0.234567, 0.567567], [12.32112, 123123.2, 321321.2]], + index=["A", "B"], + columns=["X", "Y", "Z"], + ), + None, + ), + # GH 21241, 21118 + (DataFrame([["abc", "def", "ghi"]], columns=["X", "Y", "Z"]), "ascii"), + (DataFrame(5 * [[123, "你好", "世界"]], columns=["X", "Y", "Z"]), "gb2312"), + ( + DataFrame( + 5 * [[123, "Γειά σου", "Κόσμε"]], # noqa: RUF001 + columns=["X", "Y", "Z"], + ), + "cp737", + ), + ], + ) + def test_to_csv_compression(self, df, encoding, compression): + with tm.ensure_clean() as filename: + df.to_csv(filename, compression=compression, encoding=encoding) + # test the round trip - to_csv -> read_csv + result = read_csv( + filename, compression=compression, index_col=0, encoding=encoding + ) + tm.assert_frame_equal(df, result) + + # test the round trip using file handle - to_csv -> read_csv + with get_handle( + filename, "w", compression=compression, encoding=encoding + ) as handles: + df.to_csv(handles.handle, encoding=encoding) + assert not handles.handle.closed + + result = read_csv( + filename, + compression=compression, + encoding=encoding, + index_col=0, + ).squeeze("columns") + tm.assert_frame_equal(df, result) + + # explicitly make sure file is compressed + with tm.decompress_file(filename, compression) as fh: + text = fh.read().decode(encoding or "utf8") + for col in df.columns: + assert col in text + + with tm.decompress_file(filename, compression) as fh: + tm.assert_frame_equal(df, read_csv(fh, index_col=0, encoding=encoding)) + + def test_to_csv_date_format(self, datetime_frame): + with tm.ensure_clean("__tmp_to_csv_date_format__") as path: + dt_index = datetime_frame.index + datetime_frame = DataFrame( + {"A": dt_index, "B": dt_index.shift(1)}, index=dt_index + ) + datetime_frame.to_csv(path, date_format="%Y%m%d") + + # Check that the data was put in the specified format + test = read_csv(path, index_col=0) + + datetime_frame_int = datetime_frame.map(lambda x: int(x.strftime("%Y%m%d"))) + datetime_frame_int.index = datetime_frame_int.index.map( + lambda x: int(x.strftime("%Y%m%d")) + ) + + tm.assert_frame_equal(test, datetime_frame_int) + + datetime_frame.to_csv(path, date_format="%Y-%m-%d") + + # Check that the data was put in the specified format + test = read_csv(path, index_col=0) + datetime_frame_str = datetime_frame.map(lambda x: x.strftime("%Y-%m-%d")) + datetime_frame_str.index = datetime_frame_str.index.map( + lambda x: x.strftime("%Y-%m-%d") + ) + + tm.assert_frame_equal(test, datetime_frame_str) + + # Check that columns get converted + datetime_frame_columns = datetime_frame.T + datetime_frame_columns.to_csv(path, date_format="%Y%m%d") + + test = read_csv(path, index_col=0) + + datetime_frame_columns = datetime_frame_columns.map( + lambda x: int(x.strftime("%Y%m%d")) + ) + # Columns don't get converted to ints by read_csv + datetime_frame_columns.columns = datetime_frame_columns.columns.map( + lambda x: x.strftime("%Y%m%d") + ) + + tm.assert_frame_equal(test, datetime_frame_columns) + + # test NaTs + nat_index = to_datetime( + ["NaT"] * 10 + ["2000-01-01", "2000-01-01", "2000-01-01"] + ) + nat_frame = DataFrame({"A": nat_index}, index=nat_index) + nat_frame.to_csv(path, date_format="%Y-%m-%d") + + test = read_csv(path, parse_dates=[0, 1], index_col=0) + + tm.assert_frame_equal(test, nat_frame) + + @pytest.mark.parametrize("td", [pd.Timedelta(0), pd.Timedelta("10s")]) + def test_to_csv_with_dst_transitions(self, td): + with tm.ensure_clean("csv_date_format_with_dst") as path: + # make sure we are not failing on transitions + times = date_range( + "2013-10-26 23:00", + "2013-10-27 01:00", + tz="Europe/London", + freq="h", + ambiguous="infer", + ) + i = times + td + i = i._with_freq(None) # freq is not preserved by read_csv + time_range = np.array(range(len(i)), dtype="int64") + df = DataFrame({"A": time_range}, index=i) + df.to_csv(path, index=True) + # we have to reconvert the index as we + # don't parse the tz's + result = read_csv(path, index_col=0) + result.index = to_datetime(result.index, utc=True).tz_convert( + "Europe/London" + ) + tm.assert_frame_equal(result, df) + + def test_to_csv_with_dst_transitions_with_pickle(self): + # GH11619 + idx = date_range("2015-01-01", "2015-12-31", freq="h", tz="Europe/Paris") + idx = idx._with_freq(None) # freq does not round-trip + idx._data._freq = None # otherwise there is trouble on unpickle + df = DataFrame({"values": 1, "idx": idx}, index=idx) + with tm.ensure_clean("csv_date_format_with_dst") as path: + df.to_csv(path, index=True) + result = read_csv(path, index_col=0) + result.index = to_datetime(result.index, utc=True).tz_convert( + "Europe/Paris" + ) + result["idx"] = to_datetime(result["idx"], utc=True).astype( + "datetime64[ns, Europe/Paris]" + ) + tm.assert_frame_equal(result, df) + + # assert working + df.astype(str) + + with tm.ensure_clean("csv_date_format_with_dst") as path: + df.to_pickle(path) + result = pd.read_pickle(path) + tm.assert_frame_equal(result, df) + + def test_to_csv_quoting(self): + df = DataFrame( + { + "c_bool": [True, False], + "c_float": [1.0, 3.2], + "c_int": [42, np.nan], + "c_string": ["a", "b,c"], + } + ) + + expected_rows = [ + ",c_bool,c_float,c_int,c_string", + "0,True,1.0,42.0,a", + '1,False,3.2,,"b,c"', + ] + expected = tm.convert_rows_list_to_csv_str(expected_rows) + + result = df.to_csv() + assert result == expected + + result = df.to_csv(quoting=None) + assert result == expected + + expected_rows = [ + ",c_bool,c_float,c_int,c_string", + "0,True,1.0,42.0,a", + '1,False,3.2,,"b,c"', + ] + expected = tm.convert_rows_list_to_csv_str(expected_rows) + + result = df.to_csv(quoting=csv.QUOTE_MINIMAL) + assert result == expected + + expected_rows = [ + '"","c_bool","c_float","c_int","c_string"', + '"0","True","1.0","42.0","a"', + '"1","False","3.2","","b,c"', + ] + expected = tm.convert_rows_list_to_csv_str(expected_rows) + + result = df.to_csv(quoting=csv.QUOTE_ALL) + assert result == expected + + # see gh-12922, gh-13259: make sure changes to + # the formatters do not break this behaviour + expected_rows = [ + '"","c_bool","c_float","c_int","c_string"', + '0,True,1.0,42.0,"a"', + '1,False,3.2,"","b,c"', + ] + expected = tm.convert_rows_list_to_csv_str(expected_rows) + result = df.to_csv(quoting=csv.QUOTE_NONNUMERIC) + assert result == expected + + msg = "need to escape, but no escapechar set" + with pytest.raises(csv.Error, match=msg): + df.to_csv(quoting=csv.QUOTE_NONE) + + with pytest.raises(csv.Error, match=msg): + df.to_csv(quoting=csv.QUOTE_NONE, escapechar=None) + + expected_rows = [ + ",c_bool,c_float,c_int,c_string", + "0,True,1.0,42.0,a", + "1,False,3.2,,b!,c", + ] + expected = tm.convert_rows_list_to_csv_str(expected_rows) + result = df.to_csv(quoting=csv.QUOTE_NONE, escapechar="!") + assert result == expected + + expected_rows = [ + ",c_bool,c_ffloat,c_int,c_string", + "0,True,1.0,42.0,a", + "1,False,3.2,,bf,c", + ] + expected = tm.convert_rows_list_to_csv_str(expected_rows) + result = df.to_csv(quoting=csv.QUOTE_NONE, escapechar="f") + assert result == expected + + # see gh-3503: quoting Windows line terminators + # presents with encoding? + text_rows = ["a,b,c", '1,"test \r\n",3'] + text = tm.convert_rows_list_to_csv_str(text_rows) + df = read_csv(StringIO(text)) + + buf = StringIO() + df.to_csv(buf, encoding="utf-8", index=False) + assert buf.getvalue() == text + + # xref gh-7791: make sure the quoting parameter is passed through + # with multi-indexes + df = DataFrame({"a": [1, 2], "b": [3, 4], "c": [5, 6]}) + df = df.set_index(["a", "b"]) + + expected_rows = ['"a","b","c"', '"1","3","5"', '"2","4","6"'] + expected = tm.convert_rows_list_to_csv_str(expected_rows) + assert df.to_csv(quoting=csv.QUOTE_ALL) == expected + + def test_period_index_date_overflow(self): + # see gh-15982 + + dates = ["1990-01-01", "2000-01-01", "3005-01-01"] + index = pd.PeriodIndex(dates, freq="D") + + df = DataFrame([4, 5, 6], index=index) + result = df.to_csv() + + expected_rows = [",0", "1990-01-01,4", "2000-01-01,5", "3005-01-01,6"] + expected = tm.convert_rows_list_to_csv_str(expected_rows) + assert result == expected + + date_format = "%m-%d-%Y" + result = df.to_csv(date_format=date_format) + + expected_rows = [",0", "01-01-1990,4", "01-01-2000,5", "01-01-3005,6"] + expected = tm.convert_rows_list_to_csv_str(expected_rows) + assert result == expected + + # Overflow with pd.NaT + dates = ["1990-01-01", NaT, "3005-01-01"] + index = pd.PeriodIndex(dates, freq="D") + + df = DataFrame([4, 5, 6], index=index) + result = df.to_csv() + + expected_rows = [",0", "1990-01-01,4", ",5", "3005-01-01,6"] + expected = tm.convert_rows_list_to_csv_str(expected_rows) + assert result == expected + + def test_multi_index_header(self): + # see gh-5539 + columns = MultiIndex.from_tuples([("a", 1), ("a", 2), ("b", 1), ("b", 2)]) + df = DataFrame([[1, 2, 3, 4], [5, 6, 7, 8]]) + df.columns = columns + + header = ["a", "b", "c", "d"] + result = df.to_csv(header=header) + + expected_rows = [",a,b,c,d", "0,1,2,3,4", "1,5,6,7,8"] + expected = tm.convert_rows_list_to_csv_str(expected_rows) + assert result == expected + + def test_to_csv_single_level_multi_index(self): + # see gh-26303 + index = Index([(1,), (2,), (3,)]) + df = DataFrame([[1, 2, 3]], columns=index) + df = df.reindex(columns=[(1,), (3,)]) + expected = ",1,3\n0,1,3\n" + result = df.to_csv(lineterminator="\n") + tm.assert_almost_equal(result, expected) + + def test_gz_lineend(self): + # GH 25311 + df = DataFrame({"a": [1, 2]}) + expected_rows = ["a", "1", "2"] + expected = tm.convert_rows_list_to_csv_str(expected_rows) + with tm.ensure_clean("__test_gz_lineend.csv.gz") as path: + df.to_csv(path, index=False) + with tm.decompress_file(path, compression="gzip") as f: + result = f.read().decode("utf-8") + + assert result == expected + + def test_to_csv_numpy_16_bug(self): + frame = DataFrame({"a": date_range("1/1/2000", periods=10)}) + + buf = StringIO() + frame.to_csv(buf) + + result = buf.getvalue() + assert "2000-01-01" in result + + def test_to_csv_na_quoting(self): + # GH 15891 + # Normalize carriage return for Windows OS + result = ( + DataFrame([None, None]) + .to_csv(None, header=False, index=False, na_rep="") + .replace("\r\n", "\n") + ) + expected = '""\n""\n' + assert result == expected + + def test_to_csv_categorical_and_ea(self): + # GH#46812 + df = DataFrame({"a": "x", "b": [1, pd.NA]}) + df["b"] = df["b"].astype("Int16") + df["b"] = df["b"].astype("category") + result = df.to_csv() + expected_rows = [",a,b", "0,x,1", "1,x,"] + expected = tm.convert_rows_list_to_csv_str(expected_rows) + assert result == expected + + def test_to_csv_categorical_and_interval(self): + # GH#46297 + df = DataFrame( + { + "a": [ + pd.Interval( + Timestamp("2020-01-01"), + Timestamp("2020-01-02"), + closed="both", + ) + ] + } + ) + df["a"] = df["a"].astype("category") + result = df.to_csv() + expected_rows = [",a", '0,"[2020-01-01 00:00:00, 2020-01-02 00:00:00]"'] + expected = tm.convert_rows_list_to_csv_str(expected_rows) + assert result == expected diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/test_to_dict.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/test_to_dict.py new file mode 100644 index 0000000000000000000000000000000000000000..570f85a4a31ee5f210a6ccd9c8c52a95b5c09b8d --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/test_to_dict.py @@ -0,0 +1,535 @@ +from collections import ( + OrderedDict, + defaultdict, +) +from datetime import datetime + +import numpy as np +import pytest +import pytz + +from pandas import ( + NA, + DataFrame, + Index, + Interval, + MultiIndex, + Period, + Series, + Timedelta, + Timestamp, +) +import pandas._testing as tm + + +class TestDataFrameToDict: + def test_to_dict_timestamp(self): + # GH#11247 + # split/records producing np.datetime64 rather than Timestamps + # on datetime64[ns] dtypes only + + tsmp = Timestamp("20130101") + test_data = DataFrame({"A": [tsmp, tsmp], "B": [tsmp, tsmp]}) + test_data_mixed = DataFrame({"A": [tsmp, tsmp], "B": [1, 2]}) + + expected_records = [{"A": tsmp, "B": tsmp}, {"A": tsmp, "B": tsmp}] + expected_records_mixed = [{"A": tsmp, "B": 1}, {"A": tsmp, "B": 2}] + + assert test_data.to_dict(orient="records") == expected_records + assert test_data_mixed.to_dict(orient="records") == expected_records_mixed + + expected_series = { + "A": Series([tsmp, tsmp], name="A"), + "B": Series([tsmp, tsmp], name="B"), + } + expected_series_mixed = { + "A": Series([tsmp, tsmp], name="A"), + "B": Series([1, 2], name="B"), + } + + tm.assert_dict_equal(test_data.to_dict(orient="series"), expected_series) + tm.assert_dict_equal( + test_data_mixed.to_dict(orient="series"), expected_series_mixed + ) + + expected_split = { + "index": [0, 1], + "data": [[tsmp, tsmp], [tsmp, tsmp]], + "columns": ["A", "B"], + } + expected_split_mixed = { + "index": [0, 1], + "data": [[tsmp, 1], [tsmp, 2]], + "columns": ["A", "B"], + } + + tm.assert_dict_equal(test_data.to_dict(orient="split"), expected_split) + tm.assert_dict_equal( + test_data_mixed.to_dict(orient="split"), expected_split_mixed + ) + + def test_to_dict_index_not_unique_with_index_orient(self): + # GH#22801 + # Data loss when indexes are not unique. Raise ValueError. + df = DataFrame({"a": [1, 2], "b": [0.5, 0.75]}, index=["A", "A"]) + msg = "DataFrame index must be unique for orient='index'" + with pytest.raises(ValueError, match=msg): + df.to_dict(orient="index") + + def test_to_dict_invalid_orient(self): + df = DataFrame({"A": [0, 1]}) + msg = "orient 'xinvalid' not understood" + with pytest.raises(ValueError, match=msg): + df.to_dict(orient="xinvalid") + + @pytest.mark.parametrize("orient", ["d", "l", "r", "sp", "s", "i"]) + def test_to_dict_short_orient_raises(self, orient): + # GH#32515 + df = DataFrame({"A": [0, 1]}) + with pytest.raises(ValueError, match="not understood"): + df.to_dict(orient=orient) + + @pytest.mark.parametrize("mapping", [dict, defaultdict(list), OrderedDict]) + def test_to_dict(self, mapping): + # orient= should only take the listed options + # see GH#32515 + test_data = {"A": {"1": 1, "2": 2}, "B": {"1": "1", "2": "2", "3": "3"}} + + # GH#16122 + recons_data = DataFrame(test_data).to_dict(into=mapping) + + for k, v in test_data.items(): + for k2, v2 in v.items(): + assert v2 == recons_data[k][k2] + + recons_data = DataFrame(test_data).to_dict("list", into=mapping) + + for k, v in test_data.items(): + for k2, v2 in v.items(): + assert v2 == recons_data[k][int(k2) - 1] + + recons_data = DataFrame(test_data).to_dict("series", into=mapping) + + for k, v in test_data.items(): + for k2, v2 in v.items(): + assert v2 == recons_data[k][k2] + + recons_data = DataFrame(test_data).to_dict("split", into=mapping) + expected_split = { + "columns": ["A", "B"], + "index": ["1", "2", "3"], + "data": [[1.0, "1"], [2.0, "2"], [np.nan, "3"]], + } + tm.assert_dict_equal(recons_data, expected_split) + + recons_data = DataFrame(test_data).to_dict("records", into=mapping) + expected_records = [ + {"A": 1.0, "B": "1"}, + {"A": 2.0, "B": "2"}, + {"A": np.nan, "B": "3"}, + ] + assert isinstance(recons_data, list) + assert len(recons_data) == 3 + for left, right in zip(recons_data, expected_records): + tm.assert_dict_equal(left, right) + + # GH#10844 + recons_data = DataFrame(test_data).to_dict("index") + + for k, v in test_data.items(): + for k2, v2 in v.items(): + assert v2 == recons_data[k2][k] + + df = DataFrame(test_data) + df["duped"] = df[df.columns[0]] + recons_data = df.to_dict("index") + comp_data = test_data.copy() + comp_data["duped"] = comp_data[df.columns[0]] + for k, v in comp_data.items(): + for k2, v2 in v.items(): + assert v2 == recons_data[k2][k] + + @pytest.mark.parametrize("mapping", [list, defaultdict, []]) + def test_to_dict_errors(self, mapping): + # GH#16122 + df = DataFrame(np.random.default_rng(2).standard_normal((3, 3))) + msg = "|".join( + [ + "unsupported type: ", + r"to_dict\(\) only accepts initialized defaultdicts", + ] + ) + with pytest.raises(TypeError, match=msg): + df.to_dict(into=mapping) + + def test_to_dict_not_unique_warning(self): + # GH#16927: When converting to a dict, if a column has a non-unique name + # it will be dropped, throwing a warning. + df = DataFrame([[1, 2, 3]], columns=["a", "a", "b"]) + with tm.assert_produces_warning(UserWarning): + df.to_dict() + + @pytest.mark.filterwarnings("ignore::UserWarning") + @pytest.mark.parametrize( + "orient,expected", + [ + ("list", {"A": [2, 5], "B": [3, 6]}), + ("dict", {"A": {0: 2, 1: 5}, "B": {0: 3, 1: 6}}), + ], + ) + def test_to_dict_not_unique(self, orient, expected): + # GH#54824: This is to make sure that dataframes with non-unique column + # would have uniform behavior throughout different orients + df = DataFrame([[1, 2, 3], [4, 5, 6]], columns=["A", "A", "B"]) + result = df.to_dict(orient) + assert result == expected + + # orient - orient argument to to_dict function + # item_getter - function for extracting value from + # the resulting dict using column name and index + @pytest.mark.parametrize( + "orient,item_getter", + [ + ("dict", lambda d, col, idx: d[col][idx]), + ("records", lambda d, col, idx: d[idx][col]), + ("list", lambda d, col, idx: d[col][idx]), + ("split", lambda d, col, idx: d["data"][idx][d["columns"].index(col)]), + ("index", lambda d, col, idx: d[idx][col]), + ], + ) + def test_to_dict_box_scalars(self, orient, item_getter): + # GH#14216, GH#23753 + # make sure that we are boxing properly + df = DataFrame({"a": [1, 2], "b": [0.1, 0.2]}) + result = df.to_dict(orient=orient) + assert isinstance(item_getter(result, "a", 0), int) + assert isinstance(item_getter(result, "b", 0), float) + + def test_to_dict_tz(self): + # GH#18372 When converting to dict with orient='records' columns of + # datetime that are tz-aware were not converted to required arrays + data = [ + (datetime(2017, 11, 18, 21, 53, 0, 219225, tzinfo=pytz.utc),), + (datetime(2017, 11, 18, 22, 6, 30, 61810, tzinfo=pytz.utc),), + ] + df = DataFrame(list(data), columns=["d"]) + + result = df.to_dict(orient="records") + expected = [ + {"d": Timestamp("2017-11-18 21:53:00.219225+0000", tz=pytz.utc)}, + {"d": Timestamp("2017-11-18 22:06:30.061810+0000", tz=pytz.utc)}, + ] + tm.assert_dict_equal(result[0], expected[0]) + tm.assert_dict_equal(result[1], expected[1]) + + @pytest.mark.parametrize( + "into, expected", + [ + ( + dict, + { + 0: {"int_col": 1, "float_col": 1.0}, + 1: {"int_col": 2, "float_col": 2.0}, + 2: {"int_col": 3, "float_col": 3.0}, + }, + ), + ( + OrderedDict, + OrderedDict( + [ + (0, {"int_col": 1, "float_col": 1.0}), + (1, {"int_col": 2, "float_col": 2.0}), + (2, {"int_col": 3, "float_col": 3.0}), + ] + ), + ), + ( + defaultdict(dict), + defaultdict( + dict, + { + 0: {"int_col": 1, "float_col": 1.0}, + 1: {"int_col": 2, "float_col": 2.0}, + 2: {"int_col": 3, "float_col": 3.0}, + }, + ), + ), + ], + ) + def test_to_dict_index_dtypes(self, into, expected): + # GH#18580 + # When using to_dict(orient='index') on a dataframe with int + # and float columns only the int columns were cast to float + + df = DataFrame({"int_col": [1, 2, 3], "float_col": [1.0, 2.0, 3.0]}) + + result = df.to_dict(orient="index", into=into) + cols = ["int_col", "float_col"] + result = DataFrame.from_dict(result, orient="index")[cols] + expected = DataFrame.from_dict(expected, orient="index")[cols] + tm.assert_frame_equal(result, expected) + + def test_to_dict_numeric_names(self): + # GH#24940 + df = DataFrame({str(i): [i] for i in range(5)}) + result = set(df.to_dict("records")[0].keys()) + expected = set(df.columns) + assert result == expected + + def test_to_dict_wide(self): + # GH#24939 + df = DataFrame({(f"A_{i:d}"): [i] for i in range(256)}) + result = df.to_dict("records")[0] + expected = {f"A_{i:d}": i for i in range(256)} + assert result == expected + + @pytest.mark.parametrize( + "data,dtype", + ( + ([True, True, False], bool), + [ + [ + datetime(2018, 1, 1), + datetime(2019, 2, 2), + datetime(2020, 3, 3), + ], + Timestamp, + ], + [[1.0, 2.0, 3.0], float], + [[1, 2, 3], int], + [["X", "Y", "Z"], str], + ), + ) + def test_to_dict_orient_dtype(self, data, dtype): + # GH22620 & GH21256 + + df = DataFrame({"a": data}) + d = df.to_dict(orient="records") + assert all(type(record["a"]) is dtype for record in d) + + @pytest.mark.parametrize( + "data,expected_dtype", + ( + [np.uint64(2), int], + [np.int64(-9), int], + [np.float64(1.1), float], + [np.bool_(True), bool], + [np.datetime64("2005-02-25"), Timestamp], + ), + ) + def test_to_dict_scalar_constructor_orient_dtype(self, data, expected_dtype): + # GH22620 & GH21256 + + df = DataFrame({"a": data}, index=[0]) + d = df.to_dict(orient="records") + result = type(d[0]["a"]) + assert result is expected_dtype + + def test_to_dict_mixed_numeric_frame(self): + # GH 12859 + df = DataFrame({"a": [1.0], "b": [9.0]}) + result = df.reset_index().to_dict("records") + expected = [{"index": 0, "a": 1.0, "b": 9.0}] + assert result == expected + + @pytest.mark.parametrize( + "index", + [ + None, + Index(["aa", "bb"]), + Index(["aa", "bb"], name="cc"), + MultiIndex.from_tuples([("a", "b"), ("a", "c")]), + MultiIndex.from_tuples([("a", "b"), ("a", "c")], names=["n1", "n2"]), + ], + ) + @pytest.mark.parametrize( + "columns", + [ + ["x", "y"], + Index(["x", "y"]), + Index(["x", "y"], name="z"), + MultiIndex.from_tuples([("x", 1), ("y", 2)]), + MultiIndex.from_tuples([("x", 1), ("y", 2)], names=["z1", "z2"]), + ], + ) + def test_to_dict_orient_tight(self, index, columns): + df = DataFrame.from_records( + [[1, 3], [2, 4]], + columns=columns, + index=index, + ) + roundtrip = DataFrame.from_dict(df.to_dict(orient="tight"), orient="tight") + + tm.assert_frame_equal(df, roundtrip) + + @pytest.mark.parametrize( + "orient", + ["dict", "list", "split", "records", "index", "tight"], + ) + @pytest.mark.parametrize( + "data,expected_types", + ( + ( + { + "a": [np.int64(1), 1, np.int64(3)], + "b": [np.float64(1.0), 2.0, np.float64(3.0)], + "c": [np.float64(1.0), 2, np.int64(3)], + "d": [np.float64(1.0), "a", np.int64(3)], + "e": [np.float64(1.0), ["a"], np.int64(3)], + "f": [np.float64(1.0), ("a",), np.int64(3)], + }, + { + "a": [int, int, int], + "b": [float, float, float], + "c": [float, float, float], + "d": [float, str, int], + "e": [float, list, int], + "f": [float, tuple, int], + }, + ), + ( + { + "a": [1, 2, 3], + "b": [1.1, 2.2, 3.3], + }, + { + "a": [int, int, int], + "b": [float, float, float], + }, + ), + ( # Make sure we have one df which is all object type cols + { + "a": [1, "hello", 3], + "b": [1.1, "world", 3.3], + }, + { + "a": [int, str, int], + "b": [float, str, float], + }, + ), + ), + ) + def test_to_dict_returns_native_types(self, orient, data, expected_types): + # GH 46751 + # Tests we get back native types for all orient types + df = DataFrame(data) + result = df.to_dict(orient) + if orient == "dict": + assertion_iterator = ( + (i, key, value) + for key, index_value_map in result.items() + for i, value in index_value_map.items() + ) + elif orient == "list": + assertion_iterator = ( + (i, key, value) + for key, values in result.items() + for i, value in enumerate(values) + ) + elif orient in {"split", "tight"}: + assertion_iterator = ( + (i, key, result["data"][i][j]) + for i in result["index"] + for j, key in enumerate(result["columns"]) + ) + elif orient == "records": + assertion_iterator = ( + (i, key, value) + for i, record in enumerate(result) + for key, value in record.items() + ) + elif orient == "index": + assertion_iterator = ( + (i, key, value) + for i, record in result.items() + for key, value in record.items() + ) + + for i, key, value in assertion_iterator: + assert value == data[key][i] + assert type(value) is expected_types[key][i] + + @pytest.mark.parametrize("orient", ["dict", "list", "series", "records", "index"]) + def test_to_dict_index_false_error(self, orient): + # GH#46398 + df = DataFrame({"col1": [1, 2], "col2": [3, 4]}, index=["row1", "row2"]) + msg = "'index=False' is only valid when 'orient' is 'split' or 'tight'" + with pytest.raises(ValueError, match=msg): + df.to_dict(orient=orient, index=False) + + @pytest.mark.parametrize( + "orient, expected", + [ + ("split", {"columns": ["col1", "col2"], "data": [[1, 3], [2, 4]]}), + ( + "tight", + { + "columns": ["col1", "col2"], + "data": [[1, 3], [2, 4]], + "column_names": [None], + }, + ), + ], + ) + def test_to_dict_index_false(self, orient, expected): + # GH#46398 + df = DataFrame({"col1": [1, 2], "col2": [3, 4]}, index=["row1", "row2"]) + result = df.to_dict(orient=orient, index=False) + tm.assert_dict_equal(result, expected) + + @pytest.mark.parametrize( + "orient, expected", + [ + ("dict", {"a": {0: 1, 1: None}}), + ("list", {"a": [1, None]}), + ("split", {"index": [0, 1], "columns": ["a"], "data": [[1], [None]]}), + ( + "tight", + { + "index": [0, 1], + "columns": ["a"], + "data": [[1], [None]], + "index_names": [None], + "column_names": [None], + }, + ), + ("records", [{"a": 1}, {"a": None}]), + ("index", {0: {"a": 1}, 1: {"a": None}}), + ], + ) + def test_to_dict_na_to_none(self, orient, expected): + # GH#50795 + df = DataFrame({"a": [1, NA]}, dtype="Int64") + result = df.to_dict(orient=orient) + assert result == expected + + def test_to_dict_masked_native_python(self): + # GH#34665 + df = DataFrame({"a": Series([1, 2], dtype="Int64"), "B": 1}) + result = df.to_dict(orient="records") + assert isinstance(result[0]["a"], int) + + df = DataFrame({"a": Series([1, NA], dtype="Int64"), "B": 1}) + result = df.to_dict(orient="records") + assert isinstance(result[0]["a"], int) + + def test_to_dict_pos_args_deprecation(self): + # GH-54229 + df = DataFrame({"a": [1, 2, 3]}) + msg = ( + r"Starting with pandas version 3.0 all arguments of to_dict except for the " + r"argument 'orient' will be keyword-only." + ) + with tm.assert_produces_warning(FutureWarning, match=msg): + df.to_dict("records", {}) + + +@pytest.mark.parametrize( + "val", [Timestamp(2020, 1, 1), Timedelta(1), Period("2020"), Interval(1, 2)] +) +def test_to_dict_list_pd_scalars(val): + # GH 54824 + df = DataFrame({"a": [val]}) + result = df.to_dict(orient="list") + expected = {"a": [val]} + assert result == expected diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/test_to_dict_of_blocks.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/test_to_dict_of_blocks.py new file mode 100644 index 0000000000000000000000000000000000000000..f64cfd5fe6a2db53a9fd1ca6b0972f1c82054781 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/test_to_dict_of_blocks.py @@ -0,0 +1,76 @@ +import numpy as np +import pytest + +import pandas.util._test_decorators as td + +from pandas import ( + DataFrame, + MultiIndex, +) +import pandas._testing as tm +from pandas.core.arrays import NumpyExtensionArray + +pytestmark = td.skip_array_manager_invalid_test + + +class TestToDictOfBlocks: + @pytest.mark.filterwarnings("ignore:Setting a value on a view:FutureWarning") + def test_no_copy_blocks(self, float_frame, using_copy_on_write): + # GH#9607 + df = DataFrame(float_frame, copy=True) + column = df.columns[0] + + _last_df = None + # use the copy=False, change a column + blocks = df._to_dict_of_blocks() + for _df in blocks.values(): + _last_df = _df + if column in _df: + _df.loc[:, column] = _df[column] + 1 + + if not using_copy_on_write: + # make sure we did change the original DataFrame + assert _last_df is not None and _last_df[column].equals(df[column]) + else: + assert _last_df is not None and not _last_df[column].equals(df[column]) + + +def test_to_dict_of_blocks_item_cache(using_copy_on_write, warn_copy_on_write): + # Calling to_dict_of_blocks should not poison item_cache + df = DataFrame({"a": [1, 2, 3, 4], "b": ["a", "b", "c", "d"]}) + df["c"] = NumpyExtensionArray(np.array([1, 2, None, 3], dtype=object)) + mgr = df._mgr + assert len(mgr.blocks) == 3 # i.e. not consolidated + + ser = df["b"] # populations item_cache["b"] + + df._to_dict_of_blocks() + + if using_copy_on_write: + with pytest.raises(ValueError, match="read-only"): + ser.values[0] = "foo" + elif warn_copy_on_write: + ser.values[0] = "foo" + assert df.loc[0, "b"] == "foo" + # with warning mode, the item cache is disabled + assert df["b"] is not ser + else: + # Check that the to_dict_of_blocks didn't break link between ser and df + ser.values[0] = "foo" + assert df.loc[0, "b"] == "foo" + + assert df["b"] is ser + + +def test_set_change_dtype_slice(): + # GH#8850 + cols = MultiIndex.from_tuples([("1st", "a"), ("2nd", "b"), ("3rd", "c")]) + df = DataFrame([[1.0, 2, 3], [4.0, 5, 6]], columns=cols) + df["2nd"] = df["2nd"] * 2.0 + + blocks = df._to_dict_of_blocks() + assert sorted(blocks.keys()) == ["float64", "int64"] + tm.assert_frame_equal( + blocks["float64"], DataFrame([[1.0, 4.0], [4.0, 10.0]], columns=cols[:2]) + ) + tm.assert_frame_equal(blocks["int64"], DataFrame([[3], [6]], columns=cols[2:])) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/test_to_numpy.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/test_to_numpy.py new file mode 100644 index 0000000000000000000000000000000000000000..bdb9b2c05506124abfbdbb656fd088c5e8c1cee0 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/test_to_numpy.py @@ -0,0 +1,49 @@ +import numpy as np + +import pandas.util._test_decorators as td + +from pandas import ( + DataFrame, + Timestamp, +) +import pandas._testing as tm + + +class TestToNumpy: + def test_to_numpy(self): + df = DataFrame({"A": [1, 2], "B": [3, 4.5]}) + expected = np.array([[1, 3], [2, 4.5]]) + result = df.to_numpy() + tm.assert_numpy_array_equal(result, expected) + + def test_to_numpy_dtype(self): + df = DataFrame({"A": [1, 2], "B": [3, 4.5]}) + expected = np.array([[1, 3], [2, 4]], dtype="int64") + result = df.to_numpy(dtype="int64") + tm.assert_numpy_array_equal(result, expected) + + @td.skip_array_manager_invalid_test + def test_to_numpy_copy(self, using_copy_on_write): + arr = np.random.default_rng(2).standard_normal((4, 3)) + df = DataFrame(arr) + if using_copy_on_write: + assert df.values.base is not arr + assert df.to_numpy(copy=False).base is df.values.base + else: + assert df.values.base is arr + assert df.to_numpy(copy=False).base is arr + assert df.to_numpy(copy=True).base is not arr + + # we still don't want a copy when na_value=np.nan is passed, + # and that can be respected because we are already numpy-float + if using_copy_on_write: + assert df.to_numpy(copy=False).base is df.values.base + else: + assert df.to_numpy(copy=False, na_value=np.nan).base is arr + + def test_to_numpy_mixed_dtype_to_str(self): + # https://github.com/pandas-dev/pandas/issues/35455 + df = DataFrame([[Timestamp("2020-01-01 00:00:00"), 100.0]]) + result = df.to_numpy(dtype=str) + expected = np.array([["2020-01-01 00:00:00", "100.0"]], dtype=str) + tm.assert_numpy_array_equal(result, expected) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/test_to_period.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/test_to_period.py new file mode 100644 index 0000000000000000000000000000000000000000..6a3e6b8c0e0596cfad38bfd1e02fd1b0f34e4ddb --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/test_to_period.py @@ -0,0 +1,89 @@ +import numpy as np +import pytest + +from pandas import ( + DataFrame, + DatetimeIndex, + PeriodIndex, + Series, + date_range, + period_range, +) +import pandas._testing as tm + + +class TestToPeriod: + def test_to_period(self, frame_or_series): + K = 5 + + dr = date_range("1/1/2000", "1/1/2001", freq="D") + obj = DataFrame( + np.random.default_rng(2).standard_normal((len(dr), K)), + index=dr, + columns=["A", "B", "C", "D", "E"], + ) + obj["mix"] = "a" + obj = tm.get_obj(obj, frame_or_series) + + pts = obj.to_period() + exp = obj.copy() + exp.index = period_range("1/1/2000", "1/1/2001") + tm.assert_equal(pts, exp) + + pts = obj.to_period("M") + exp.index = exp.index.asfreq("M") + tm.assert_equal(pts, exp) + + def test_to_period_without_freq(self, frame_or_series): + # GH#7606 without freq + idx = DatetimeIndex(["2011-01-01", "2011-01-02", "2011-01-03", "2011-01-04"]) + exp_idx = PeriodIndex( + ["2011-01-01", "2011-01-02", "2011-01-03", "2011-01-04"], freq="D" + ) + + obj = DataFrame( + np.random.default_rng(2).standard_normal((4, 4)), index=idx, columns=idx + ) + obj = tm.get_obj(obj, frame_or_series) + expected = obj.copy() + expected.index = exp_idx + tm.assert_equal(obj.to_period(), expected) + + if frame_or_series is DataFrame: + expected = obj.copy() + expected.columns = exp_idx + tm.assert_frame_equal(obj.to_period(axis=1), expected) + + def test_to_period_columns(self): + dr = date_range("1/1/2000", "1/1/2001") + df = DataFrame(np.random.default_rng(2).standard_normal((len(dr), 5)), index=dr) + df["mix"] = "a" + + df = df.T + pts = df.to_period(axis=1) + exp = df.copy() + exp.columns = period_range("1/1/2000", "1/1/2001") + tm.assert_frame_equal(pts, exp) + + pts = df.to_period("M", axis=1) + tm.assert_index_equal(pts.columns, exp.columns.asfreq("M")) + + def test_to_period_invalid_axis(self): + dr = date_range("1/1/2000", "1/1/2001") + df = DataFrame(np.random.default_rng(2).standard_normal((len(dr), 5)), index=dr) + df["mix"] = "a" + + msg = "No axis named 2 for object type DataFrame" + with pytest.raises(ValueError, match=msg): + df.to_period(axis=2) + + def test_to_period_raises(self, index, frame_or_series): + # https://github.com/pandas-dev/pandas/issues/33327 + obj = Series(index=index, dtype=object) + if frame_or_series is DataFrame: + obj = obj.to_frame() + + if not isinstance(index, DatetimeIndex): + msg = f"unsupported Type {type(index).__name__}" + with pytest.raises(TypeError, match=msg): + obj.to_period() diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/test_to_records.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/test_to_records.py new file mode 100644 index 0000000000000000000000000000000000000000..fab90b112fa94c9aa6bf6d8b9f0045e82f3ec92d --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/test_to_records.py @@ -0,0 +1,523 @@ +from collections import abc +import email +from email.parser import Parser + +import numpy as np +import pytest + +from pandas import ( + CategoricalDtype, + DataFrame, + MultiIndex, + Series, + Timestamp, + date_range, +) +import pandas._testing as tm + + +class TestDataFrameToRecords: + def test_to_records_timeseries(self): + index = date_range("1/1/2000", periods=10) + df = DataFrame( + np.random.default_rng(2).standard_normal((10, 3)), + index=index, + columns=["a", "b", "c"], + ) + + result = df.to_records() + assert result["index"].dtype == "M8[ns]" + + result = df.to_records(index=False) + + def test_to_records_dt64(self): + df = DataFrame( + [["one", "two", "three"], ["four", "five", "six"]], + index=date_range("2012-01-01", "2012-01-02"), + ) + + expected = df.index.values[0] + result = df.to_records()["index"][0] + assert expected == result + + def test_to_records_dt64tz_column(self): + # GH#32535 dont less tz in to_records + df = DataFrame({"A": date_range("2012-01-01", "2012-01-02", tz="US/Eastern")}) + + result = df.to_records() + + assert result.dtype["A"] == object + val = result[0][1] + assert isinstance(val, Timestamp) + assert val == df.loc[0, "A"] + + def test_to_records_with_multindex(self): + # GH#3189 + index = [ + ["bar", "bar", "baz", "baz", "foo", "foo", "qux", "qux"], + ["one", "two", "one", "two", "one", "two", "one", "two"], + ] + data = np.zeros((8, 4)) + df = DataFrame(data, index=index) + r = df.to_records(index=True)["level_0"] + assert "bar" in r + assert "one" not in r + + def test_to_records_with_Mapping_type(self): + abc.Mapping.register(email.message.Message) + + headers = Parser().parsestr( + "From: \n" + "To: \n" + "Subject: Test message\n" + "\n" + "Body would go here\n" + ) + + frame = DataFrame.from_records([headers]) + all(x in frame for x in ["Type", "Subject", "From"]) + + def test_to_records_floats(self): + df = DataFrame(np.random.default_rng(2).random((10, 10))) + df.to_records() + + def test_to_records_index_name(self): + df = DataFrame(np.random.default_rng(2).standard_normal((3, 3))) + df.index.name = "X" + rs = df.to_records() + assert "X" in rs.dtype.fields + + df = DataFrame(np.random.default_rng(2).standard_normal((3, 3))) + rs = df.to_records() + assert "index" in rs.dtype.fields + + df.index = MultiIndex.from_tuples([("a", "x"), ("a", "y"), ("b", "z")]) + df.index.names = ["A", None] + result = df.to_records() + expected = np.rec.fromarrays( + [np.array(["a", "a", "b"]), np.array(["x", "y", "z"])] + + [np.asarray(df.iloc[:, i]) for i in range(3)], + dtype={ + "names": ["A", "level_1", "0", "1", "2"], + "formats": [ + "O", + "O", + f"{tm.ENDIAN}f8", + f"{tm.ENDIAN}f8", + f"{tm.ENDIAN}f8", + ], + }, + ) + tm.assert_numpy_array_equal(result, expected) + + def test_to_records_with_unicode_index(self): + # GH#13172 + # unicode_literals conflict with to_records + result = DataFrame([{"a": "x", "b": "y"}]).set_index("a").to_records() + expected = np.rec.array([("x", "y")], dtype=[("a", "O"), ("b", "O")]) + tm.assert_almost_equal(result, expected) + + def test_to_records_index_dtype(self): + # GH 47263: consistent data types for Index and MultiIndex + df = DataFrame( + { + 1: date_range("2022-01-01", periods=2), + 2: date_range("2022-01-01", periods=2), + 3: date_range("2022-01-01", periods=2), + } + ) + + expected = np.rec.array( + [ + ("2022-01-01", "2022-01-01", "2022-01-01"), + ("2022-01-02", "2022-01-02", "2022-01-02"), + ], + dtype=[ + ("1", f"{tm.ENDIAN}M8[ns]"), + ("2", f"{tm.ENDIAN}M8[ns]"), + ("3", f"{tm.ENDIAN}M8[ns]"), + ], + ) + + result = df.to_records(index=False) + tm.assert_almost_equal(result, expected) + + result = df.set_index(1).to_records(index=True) + tm.assert_almost_equal(result, expected) + + result = df.set_index([1, 2]).to_records(index=True) + tm.assert_almost_equal(result, expected) + + def test_to_records_with_unicode_column_names(self): + # xref issue: https://github.com/numpy/numpy/issues/2407 + # Issue GH#11879. to_records used to raise an exception when used + # with column names containing non-ascii characters in Python 2 + result = DataFrame(data={"accented_name_é": [1.0]}).to_records() + + # Note that numpy allows for unicode field names but dtypes need + # to be specified using dictionary instead of list of tuples. + expected = np.rec.array( + [(0, 1.0)], + dtype={"names": ["index", "accented_name_é"], "formats": ["=i8", "=f8"]}, + ) + tm.assert_almost_equal(result, expected) + + def test_to_records_with_categorical(self): + # GH#8626 + + # dict creation + df = DataFrame({"A": list("abc")}, dtype="category") + expected = Series(list("abc"), dtype="category", name="A") + tm.assert_series_equal(df["A"], expected) + + # list-like creation + df = DataFrame(list("abc"), dtype="category") + expected = Series(list("abc"), dtype="category", name=0) + tm.assert_series_equal(df[0], expected) + + # to record array + # this coerces + result = df.to_records() + expected = np.rec.array( + [(0, "a"), (1, "b"), (2, "c")], dtype=[("index", "=i8"), ("0", "O")] + ) + tm.assert_almost_equal(result, expected) + + @pytest.mark.parametrize( + "kwargs,expected", + [ + # No dtypes --> default to array dtypes. + ( + {}, + np.rec.array( + [(0, 1, 0.2, "a"), (1, 2, 1.5, "bc")], + dtype=[ + ("index", f"{tm.ENDIAN}i8"), + ("A", f"{tm.ENDIAN}i8"), + ("B", f"{tm.ENDIAN}f8"), + ("C", "O"), + ], + ), + ), + # Should have no effect in this case. + ( + {"index": True}, + np.rec.array( + [(0, 1, 0.2, "a"), (1, 2, 1.5, "bc")], + dtype=[ + ("index", f"{tm.ENDIAN}i8"), + ("A", f"{tm.ENDIAN}i8"), + ("B", f"{tm.ENDIAN}f8"), + ("C", "O"), + ], + ), + ), + # Column dtype applied across the board. Index unaffected. + ( + {"column_dtypes": f"{tm.ENDIAN}U4"}, + np.rec.array( + [("0", "1", "0.2", "a"), ("1", "2", "1.5", "bc")], + dtype=[ + ("index", f"{tm.ENDIAN}i8"), + ("A", f"{tm.ENDIAN}U4"), + ("B", f"{tm.ENDIAN}U4"), + ("C", f"{tm.ENDIAN}U4"), + ], + ), + ), + # Index dtype applied across the board. Columns unaffected. + ( + {"index_dtypes": f"{tm.ENDIAN}U1"}, + np.rec.array( + [("0", 1, 0.2, "a"), ("1", 2, 1.5, "bc")], + dtype=[ + ("index", f"{tm.ENDIAN}U1"), + ("A", f"{tm.ENDIAN}i8"), + ("B", f"{tm.ENDIAN}f8"), + ("C", "O"), + ], + ), + ), + # Pass in a type instance. + ( + {"column_dtypes": str}, + np.rec.array( + [("0", "1", "0.2", "a"), ("1", "2", "1.5", "bc")], + dtype=[ + ("index", f"{tm.ENDIAN}i8"), + ("A", f"{tm.ENDIAN}U"), + ("B", f"{tm.ENDIAN}U"), + ("C", f"{tm.ENDIAN}U"), + ], + ), + ), + # Pass in a dtype instance. + ( + {"column_dtypes": np.dtype(np.str_)}, + np.rec.array( + [("0", "1", "0.2", "a"), ("1", "2", "1.5", "bc")], + dtype=[ + ("index", f"{tm.ENDIAN}i8"), + ("A", f"{tm.ENDIAN}U"), + ("B", f"{tm.ENDIAN}U"), + ("C", f"{tm.ENDIAN}U"), + ], + ), + ), + # Pass in a dictionary (name-only). + ( + { + "column_dtypes": { + "A": np.int8, + "B": np.float32, + "C": f"{tm.ENDIAN}U2", + } + }, + np.rec.array( + [("0", "1", "0.2", "a"), ("1", "2", "1.5", "bc")], + dtype=[ + ("index", f"{tm.ENDIAN}i8"), + ("A", "i1"), + ("B", f"{tm.ENDIAN}f4"), + ("C", f"{tm.ENDIAN}U2"), + ], + ), + ), + # Pass in a dictionary (indices-only). + ( + {"index_dtypes": {0: "int16"}}, + np.rec.array( + [(0, 1, 0.2, "a"), (1, 2, 1.5, "bc")], + dtype=[ + ("index", "i2"), + ("A", f"{tm.ENDIAN}i8"), + ("B", f"{tm.ENDIAN}f8"), + ("C", "O"), + ], + ), + ), + # Ignore index mappings if index is not True. + ( + {"index": False, "index_dtypes": f"{tm.ENDIAN}U2"}, + np.rec.array( + [(1, 0.2, "a"), (2, 1.5, "bc")], + dtype=[ + ("A", f"{tm.ENDIAN}i8"), + ("B", f"{tm.ENDIAN}f8"), + ("C", "O"), + ], + ), + ), + # Non-existent names / indices in mapping should not error. + ( + {"index_dtypes": {0: "int16", "not-there": "float32"}}, + np.rec.array( + [(0, 1, 0.2, "a"), (1, 2, 1.5, "bc")], + dtype=[ + ("index", "i2"), + ("A", f"{tm.ENDIAN}i8"), + ("B", f"{tm.ENDIAN}f8"), + ("C", "O"), + ], + ), + ), + # Names / indices not in mapping default to array dtype. + ( + {"column_dtypes": {"A": np.int8, "B": np.float32}}, + np.rec.array( + [("0", "1", "0.2", "a"), ("1", "2", "1.5", "bc")], + dtype=[ + ("index", f"{tm.ENDIAN}i8"), + ("A", "i1"), + ("B", f"{tm.ENDIAN}f4"), + ("C", "O"), + ], + ), + ), + # Names / indices not in dtype mapping default to array dtype. + ( + {"column_dtypes": {"A": np.dtype("int8"), "B": np.dtype("float32")}}, + np.rec.array( + [("0", "1", "0.2", "a"), ("1", "2", "1.5", "bc")], + dtype=[ + ("index", f"{tm.ENDIAN}i8"), + ("A", "i1"), + ("B", f"{tm.ENDIAN}f4"), + ("C", "O"), + ], + ), + ), + # Mixture of everything. + ( + { + "column_dtypes": {"A": np.int8, "B": np.float32}, + "index_dtypes": f"{tm.ENDIAN}U2", + }, + np.rec.array( + [("0", "1", "0.2", "a"), ("1", "2", "1.5", "bc")], + dtype=[ + ("index", f"{tm.ENDIAN}U2"), + ("A", "i1"), + ("B", f"{tm.ENDIAN}f4"), + ("C", "O"), + ], + ), + ), + # Invalid dype values. + ( + {"index": False, "column_dtypes": []}, + (ValueError, "Invalid dtype \\[\\] specified for column A"), + ), + ( + {"index": False, "column_dtypes": {"A": "int32", "B": 5}}, + (ValueError, "Invalid dtype 5 specified for column B"), + ), + # Numpy can't handle EA types, so check error is raised + ( + { + "index": False, + "column_dtypes": {"A": "int32", "B": CategoricalDtype(["a", "b"])}, + }, + (ValueError, "Invalid dtype category specified for column B"), + ), + # Check that bad types raise + ( + {"index": False, "column_dtypes": {"A": "int32", "B": "foo"}}, + (TypeError, "data type [\"']foo[\"'] not understood"), + ), + ], + ) + def test_to_records_dtype(self, kwargs, expected): + # see GH#18146 + df = DataFrame({"A": [1, 2], "B": [0.2, 1.5], "C": ["a", "bc"]}) + + if not isinstance(expected, np.rec.recarray): + with pytest.raises(expected[0], match=expected[1]): + df.to_records(**kwargs) + else: + result = df.to_records(**kwargs) + tm.assert_almost_equal(result, expected) + + @pytest.mark.parametrize( + "df,kwargs,expected", + [ + # MultiIndex in the index. + ( + DataFrame( + [[1, 2, 3], [4, 5, 6], [7, 8, 9]], columns=list("abc") + ).set_index(["a", "b"]), + {"column_dtypes": "float64", "index_dtypes": {0: "int32", 1: "int8"}}, + np.rec.array( + [(1, 2, 3.0), (4, 5, 6.0), (7, 8, 9.0)], + dtype=[ + ("a", f"{tm.ENDIAN}i4"), + ("b", "i1"), + ("c", f"{tm.ENDIAN}f8"), + ], + ), + ), + # MultiIndex in the columns. + ( + DataFrame( + [[1, 2, 3], [4, 5, 6], [7, 8, 9]], + columns=MultiIndex.from_tuples( + [("a", "d"), ("b", "e"), ("c", "f")] + ), + ), + { + "column_dtypes": {0: f"{tm.ENDIAN}U1", 2: "float32"}, + "index_dtypes": "float32", + }, + np.rec.array( + [(0.0, "1", 2, 3.0), (1.0, "4", 5, 6.0), (2.0, "7", 8, 9.0)], + dtype=[ + ("index", f"{tm.ENDIAN}f4"), + ("('a', 'd')", f"{tm.ENDIAN}U1"), + ("('b', 'e')", f"{tm.ENDIAN}i8"), + ("('c', 'f')", f"{tm.ENDIAN}f4"), + ], + ), + ), + # MultiIndex in both the columns and index. + ( + DataFrame( + [[1, 2, 3], [4, 5, 6], [7, 8, 9]], + columns=MultiIndex.from_tuples( + [("a", "d"), ("b", "e"), ("c", "f")], names=list("ab") + ), + index=MultiIndex.from_tuples( + [("d", -4), ("d", -5), ("f", -6)], names=list("cd") + ), + ), + { + "column_dtypes": "float64", + "index_dtypes": {0: f"{tm.ENDIAN}U2", 1: "int8"}, + }, + np.rec.array( + [ + ("d", -4, 1.0, 2.0, 3.0), + ("d", -5, 4.0, 5.0, 6.0), + ("f", -6, 7, 8, 9.0), + ], + dtype=[ + ("c", f"{tm.ENDIAN}U2"), + ("d", "i1"), + ("('a', 'd')", f"{tm.ENDIAN}f8"), + ("('b', 'e')", f"{tm.ENDIAN}f8"), + ("('c', 'f')", f"{tm.ENDIAN}f8"), + ], + ), + ), + ], + ) + def test_to_records_dtype_mi(self, df, kwargs, expected): + # see GH#18146 + result = df.to_records(**kwargs) + tm.assert_almost_equal(result, expected) + + def test_to_records_dict_like(self): + # see GH#18146 + class DictLike: + def __init__(self, **kwargs) -> None: + self.d = kwargs.copy() + + def __getitem__(self, key): + return self.d.__getitem__(key) + + def __contains__(self, key) -> bool: + return key in self.d + + def keys(self): + return self.d.keys() + + df = DataFrame({"A": [1, 2], "B": [0.2, 1.5], "C": ["a", "bc"]}) + + dtype_mappings = { + "column_dtypes": DictLike(A=np.int8, B=np.float32), + "index_dtypes": f"{tm.ENDIAN}U2", + } + + result = df.to_records(**dtype_mappings) + expected = np.rec.array( + [("0", "1", "0.2", "a"), ("1", "2", "1.5", "bc")], + dtype=[ + ("index", f"{tm.ENDIAN}U2"), + ("A", "i1"), + ("B", f"{tm.ENDIAN}f4"), + ("C", "O"), + ], + ) + tm.assert_almost_equal(result, expected) + + @pytest.mark.parametrize("tz", ["UTC", "GMT", "US/Eastern"]) + def test_to_records_datetimeindex_with_tz(self, tz): + # GH#13937 + dr = date_range("2016-01-01", periods=10, freq="s", tz=tz) + + df = DataFrame({"datetime": dr}, index=dr) + + expected = df.to_records() + result = df.tz_convert("UTC").to_records() + + # both converted to UTC, so they are equal + tm.assert_numpy_array_equal(result, expected) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/test_to_timestamp.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/test_to_timestamp.py new file mode 100644 index 0000000000000000000000000000000000000000..0e7e1d595d6be9250638932e7690f420b9a12fc0 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/test_to_timestamp.py @@ -0,0 +1,154 @@ +from datetime import timedelta + +import numpy as np +import pytest + +from pandas import ( + DataFrame, + DatetimeIndex, + PeriodIndex, + Series, + Timedelta, + date_range, + period_range, + to_datetime, +) +import pandas._testing as tm + + +def _get_with_delta(delta, freq="YE-DEC"): + return date_range( + to_datetime("1/1/2001") + delta, + to_datetime("12/31/2009") + delta, + freq=freq, + ) + + +class TestToTimestamp: + def test_to_timestamp(self, frame_or_series): + K = 5 + index = period_range(freq="Y", start="1/1/2001", end="12/1/2009") + obj = DataFrame( + np.random.default_rng(2).standard_normal((len(index), K)), + index=index, + columns=["A", "B", "C", "D", "E"], + ) + obj["mix"] = "a" + obj = tm.get_obj(obj, frame_or_series) + + exp_index = date_range("1/1/2001", end="12/31/2009", freq="YE-DEC") + exp_index = exp_index + Timedelta(1, "D") - Timedelta(1, "ns") + result = obj.to_timestamp("D", "end") + tm.assert_index_equal(result.index, exp_index) + tm.assert_numpy_array_equal(result.values, obj.values) + if frame_or_series is Series: + assert result.name == "A" + + exp_index = date_range("1/1/2001", end="1/1/2009", freq="YS-JAN") + result = obj.to_timestamp("D", "start") + tm.assert_index_equal(result.index, exp_index) + + result = obj.to_timestamp(how="start") + tm.assert_index_equal(result.index, exp_index) + + delta = timedelta(hours=23) + result = obj.to_timestamp("H", "end") + exp_index = _get_with_delta(delta) + exp_index = exp_index + Timedelta(1, "h") - Timedelta(1, "ns") + tm.assert_index_equal(result.index, exp_index) + + delta = timedelta(hours=23, minutes=59) + result = obj.to_timestamp("T", "end") + exp_index = _get_with_delta(delta) + exp_index = exp_index + Timedelta(1, "m") - Timedelta(1, "ns") + tm.assert_index_equal(result.index, exp_index) + + result = obj.to_timestamp("S", "end") + delta = timedelta(hours=23, minutes=59, seconds=59) + exp_index = _get_with_delta(delta) + exp_index = exp_index + Timedelta(1, "s") - Timedelta(1, "ns") + tm.assert_index_equal(result.index, exp_index) + + def test_to_timestamp_columns(self): + K = 5 + index = period_range(freq="Y", start="1/1/2001", end="12/1/2009") + df = DataFrame( + np.random.default_rng(2).standard_normal((len(index), K)), + index=index, + columns=["A", "B", "C", "D", "E"], + ) + df["mix"] = "a" + + # columns + df = df.T + + exp_index = date_range("1/1/2001", end="12/31/2009", freq="YE-DEC") + exp_index = exp_index + Timedelta(1, "D") - Timedelta(1, "ns") + result = df.to_timestamp("D", "end", axis=1) + tm.assert_index_equal(result.columns, exp_index) + tm.assert_numpy_array_equal(result.values, df.values) + + exp_index = date_range("1/1/2001", end="1/1/2009", freq="YS-JAN") + result = df.to_timestamp("D", "start", axis=1) + tm.assert_index_equal(result.columns, exp_index) + + delta = timedelta(hours=23) + result = df.to_timestamp("H", "end", axis=1) + exp_index = _get_with_delta(delta) + exp_index = exp_index + Timedelta(1, "h") - Timedelta(1, "ns") + tm.assert_index_equal(result.columns, exp_index) + + delta = timedelta(hours=23, minutes=59) + result = df.to_timestamp("min", "end", axis=1) + exp_index = _get_with_delta(delta) + exp_index = exp_index + Timedelta(1, "m") - Timedelta(1, "ns") + tm.assert_index_equal(result.columns, exp_index) + + result = df.to_timestamp("S", "end", axis=1) + delta = timedelta(hours=23, minutes=59, seconds=59) + exp_index = _get_with_delta(delta) + exp_index = exp_index + Timedelta(1, "s") - Timedelta(1, "ns") + tm.assert_index_equal(result.columns, exp_index) + + result1 = df.to_timestamp("5min", axis=1) + result2 = df.to_timestamp("min", axis=1) + expected = date_range("2001-01-01", "2009-01-01", freq="YS") + assert isinstance(result1.columns, DatetimeIndex) + assert isinstance(result2.columns, DatetimeIndex) + tm.assert_numpy_array_equal(result1.columns.asi8, expected.asi8) + tm.assert_numpy_array_equal(result2.columns.asi8, expected.asi8) + # PeriodIndex.to_timestamp always use 'infer' + assert result1.columns.freqstr == "YS-JAN" + assert result2.columns.freqstr == "YS-JAN" + + def test_to_timestamp_invalid_axis(self): + index = period_range(freq="Y", start="1/1/2001", end="12/1/2009") + obj = DataFrame( + np.random.default_rng(2).standard_normal((len(index), 5)), index=index + ) + + # invalid axis + with pytest.raises(ValueError, match="axis"): + obj.to_timestamp(axis=2) + + def test_to_timestamp_hourly(self, frame_or_series): + index = period_range(freq="h", start="1/1/2001", end="1/2/2001") + obj = Series(1, index=index, name="foo") + if frame_or_series is not Series: + obj = obj.to_frame() + + exp_index = date_range("1/1/2001 00:59:59", end="1/2/2001 00:59:59", freq="h") + result = obj.to_timestamp(how="end") + exp_index = exp_index + Timedelta(1, "s") - Timedelta(1, "ns") + tm.assert_index_equal(result.index, exp_index) + if frame_or_series is Series: + assert result.name == "foo" + + def test_to_timestamp_raises(self, index, frame_or_series): + # GH#33327 + obj = frame_or_series(index=index, dtype=object) + + if not isinstance(index, PeriodIndex): + msg = f"unsupported Type {type(index).__name__}" + with pytest.raises(TypeError, match=msg): + obj.to_timestamp() diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/test_transpose.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/test_transpose.py new file mode 100644 index 0000000000000000000000000000000000000000..3e74094f266d14b8752e562653cf490868dcd0b0 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/test_transpose.py @@ -0,0 +1,209 @@ +import numpy as np +import pytest + +import pandas.util._test_decorators as td + +import pandas as pd +from pandas import ( + DataFrame, + DatetimeIndex, + Index, + IntervalIndex, + Series, + Timestamp, + bdate_range, + date_range, + timedelta_range, +) +import pandas._testing as tm + + +class TestTranspose: + def test_transpose_td64_intervals(self): + # GH#44917 + tdi = timedelta_range("0 Days", "3 Days") + ii = IntervalIndex.from_breaks(tdi) + ii = ii.insert(-1, np.nan) + df = DataFrame(ii) + + result = df.T + expected = DataFrame({i: ii[i : i + 1] for i in range(len(ii))}) + tm.assert_frame_equal(result, expected) + + def test_transpose_empty_preserves_datetimeindex(self): + # GH#41382 + dti = DatetimeIndex([], dtype="M8[ns]") + df = DataFrame(index=dti) + + expected = DatetimeIndex([], dtype="datetime64[ns]", freq=None) + + result1 = df.T.sum().index + result2 = df.sum(axis=1).index + + tm.assert_index_equal(result1, expected) + tm.assert_index_equal(result2, expected) + + def test_transpose_tzaware_1col_single_tz(self): + # GH#26825 + dti = date_range("2016-04-05 04:30", periods=3, tz="UTC") + + df = DataFrame(dti) + assert (df.dtypes == dti.dtype).all() + res = df.T + assert (res.dtypes == dti.dtype).all() + + def test_transpose_tzaware_2col_single_tz(self): + # GH#26825 + dti = date_range("2016-04-05 04:30", periods=3, tz="UTC") + + df3 = DataFrame({"A": dti, "B": dti}) + assert (df3.dtypes == dti.dtype).all() + res3 = df3.T + assert (res3.dtypes == dti.dtype).all() + + def test_transpose_tzaware_2col_mixed_tz(self): + # GH#26825 + dti = date_range("2016-04-05 04:30", periods=3, tz="UTC") + dti2 = dti.tz_convert("US/Pacific") + + df4 = DataFrame({"A": dti, "B": dti2}) + assert (df4.dtypes == [dti.dtype, dti2.dtype]).all() + assert (df4.T.dtypes == object).all() + tm.assert_frame_equal(df4.T.T, df4.astype(object)) + + @pytest.mark.parametrize("tz", [None, "America/New_York"]) + def test_transpose_preserves_dtindex_equality_with_dst(self, tz): + # GH#19970 + idx = date_range("20161101", "20161130", freq="4h", tz=tz) + df = DataFrame({"a": range(len(idx)), "b": range(len(idx))}, index=idx) + result = df.T == df.T + expected = DataFrame(True, index=list("ab"), columns=idx) + tm.assert_frame_equal(result, expected) + + def test_transpose_object_to_tzaware_mixed_tz(self): + # GH#26825 + dti = date_range("2016-04-05 04:30", periods=3, tz="UTC") + dti2 = dti.tz_convert("US/Pacific") + + # mixed all-tzaware dtypes + df2 = DataFrame([dti, dti2]) + assert (df2.dtypes == object).all() + res2 = df2.T + assert (res2.dtypes == object).all() + + def test_transpose_uint64(self): + df = DataFrame( + {"A": np.arange(3), "B": [2**63, 2**63 + 5, 2**63 + 10]}, + dtype=np.uint64, + ) + result = df.T + expected = DataFrame(df.values.T) + expected.index = ["A", "B"] + tm.assert_frame_equal(result, expected) + + def test_transpose_float(self, float_frame): + frame = float_frame + dft = frame.T + for idx, series in dft.items(): + for col, value in series.items(): + if np.isnan(value): + assert np.isnan(frame[col][idx]) + else: + assert value == frame[col][idx] + + def test_transpose_mixed(self): + # mixed type + mixed = DataFrame( + { + "A": [0.0, 1.0, 2.0, 3.0, 4.0], + "B": [0.0, 1.0, 0.0, 1.0, 0.0], + "C": ["foo1", "foo2", "foo3", "foo4", "foo5"], + "D": bdate_range("1/1/2009", periods=5), + }, + index=Index(["a", "b", "c", "d", "e"], dtype=object), + ) + + mixed_T = mixed.T + for col, s in mixed_T.items(): + assert s.dtype == np.object_ + + @td.skip_array_manager_invalid_test + def test_transpose_get_view(self, float_frame, using_copy_on_write): + dft = float_frame.T + dft.iloc[:, 5:10] = 5 + + if using_copy_on_write: + assert (float_frame.values[5:10] != 5).all() + else: + assert (float_frame.values[5:10] == 5).all() + + @td.skip_array_manager_invalid_test + def test_transpose_get_view_dt64tzget_view(self, using_copy_on_write): + dti = date_range("2016-01-01", periods=6, tz="US/Pacific") + arr = dti._data.reshape(3, 2) + df = DataFrame(arr) + assert df._mgr.nblocks == 1 + + result = df.T + assert result._mgr.nblocks == 1 + + rtrip = result._mgr.blocks[0].values + if using_copy_on_write: + assert np.shares_memory(df._mgr.blocks[0].values._ndarray, rtrip._ndarray) + else: + assert np.shares_memory(arr._ndarray, rtrip._ndarray) + + def test_transpose_not_inferring_dt(self): + # GH#51546 + df = DataFrame( + { + "a": [Timestamp("2019-12-31"), Timestamp("2019-12-31")], + }, + dtype=object, + ) + result = df.T + expected = DataFrame( + [[Timestamp("2019-12-31"), Timestamp("2019-12-31")]], + columns=[0, 1], + index=["a"], + dtype=object, + ) + tm.assert_frame_equal(result, expected) + + def test_transpose_not_inferring_dt_mixed_blocks(self): + # GH#51546 + df = DataFrame( + { + "a": Series( + [Timestamp("2019-12-31"), Timestamp("2019-12-31")], dtype=object + ), + "b": [Timestamp("2019-12-31"), Timestamp("2019-12-31")], + } + ) + result = df.T + expected = DataFrame( + [ + [Timestamp("2019-12-31"), Timestamp("2019-12-31")], + [Timestamp("2019-12-31"), Timestamp("2019-12-31")], + ], + columns=[0, 1], + index=["a", "b"], + dtype=object, + ) + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize("dtype1", ["Int64", "Float64"]) + @pytest.mark.parametrize("dtype2", ["Int64", "Float64"]) + def test_transpose(self, dtype1, dtype2): + # GH#57315 - transpose should have F contiguous blocks + df = DataFrame( + { + "a": pd.array([1, 1, 2], dtype=dtype1), + "b": pd.array([3, 4, 5], dtype=dtype2), + } + ) + result = df.T + for blk in result._mgr.blocks: + # When dtypes are unequal, we get NumPy object array + data = blk.values._data if dtype1 == dtype2 else blk.values + assert data.flags["F_CONTIGUOUS"] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/test_truncate.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/test_truncate.py new file mode 100644 index 0000000000000000000000000000000000000000..12077952c2e0300257eb9029d2a1a231d5fa0a5c --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/test_truncate.py @@ -0,0 +1,154 @@ +import numpy as np +import pytest + +import pandas as pd +from pandas import ( + DataFrame, + DatetimeIndex, + Index, + Series, + date_range, +) +import pandas._testing as tm + + +class TestDataFrameTruncate: + def test_truncate(self, datetime_frame, frame_or_series): + ts = datetime_frame[::3] + ts = tm.get_obj(ts, frame_or_series) + + start, end = datetime_frame.index[3], datetime_frame.index[6] + + start_missing = datetime_frame.index[2] + end_missing = datetime_frame.index[7] + + # neither specified + truncated = ts.truncate() + tm.assert_equal(truncated, ts) + + # both specified + expected = ts[1:3] + + truncated = ts.truncate(start, end) + tm.assert_equal(truncated, expected) + + truncated = ts.truncate(start_missing, end_missing) + tm.assert_equal(truncated, expected) + + # start specified + expected = ts[1:] + + truncated = ts.truncate(before=start) + tm.assert_equal(truncated, expected) + + truncated = ts.truncate(before=start_missing) + tm.assert_equal(truncated, expected) + + # end specified + expected = ts[:3] + + truncated = ts.truncate(after=end) + tm.assert_equal(truncated, expected) + + truncated = ts.truncate(after=end_missing) + tm.assert_equal(truncated, expected) + + # corner case, empty series/frame returned + truncated = ts.truncate(after=ts.index[0] - ts.index.freq) + assert len(truncated) == 0 + + truncated = ts.truncate(before=ts.index[-1] + ts.index.freq) + assert len(truncated) == 0 + + msg = "Truncate: 2000-01-06 00:00:00 must be after 2000-05-16 00:00:00" + with pytest.raises(ValueError, match=msg): + ts.truncate( + before=ts.index[-1] - ts.index.freq, after=ts.index[0] + ts.index.freq + ) + + def test_truncate_nonsortedindex(self, frame_or_series): + # GH#17935 + + obj = DataFrame({"A": ["a", "b", "c", "d", "e"]}, index=[5, 3, 2, 9, 0]) + obj = tm.get_obj(obj, frame_or_series) + + msg = "truncate requires a sorted index" + with pytest.raises(ValueError, match=msg): + obj.truncate(before=3, after=9) + + def test_sort_values_nonsortedindex(self): + rng = date_range("2011-01-01", "2012-01-01", freq="W") + ts = DataFrame( + { + "A": np.random.default_rng(2).standard_normal(len(rng)), + "B": np.random.default_rng(2).standard_normal(len(rng)), + }, + index=rng, + ) + + decreasing = ts.sort_values("A", ascending=False) + + msg = "truncate requires a sorted index" + with pytest.raises(ValueError, match=msg): + decreasing.truncate(before="2011-11", after="2011-12") + + def test_truncate_nonsortedindex_axis1(self): + # GH#17935 + + df = DataFrame( + { + 3: np.random.default_rng(2).standard_normal(5), + 20: np.random.default_rng(2).standard_normal(5), + 2: np.random.default_rng(2).standard_normal(5), + 0: np.random.default_rng(2).standard_normal(5), + }, + columns=[3, 20, 2, 0], + ) + msg = "truncate requires a sorted index" + with pytest.raises(ValueError, match=msg): + df.truncate(before=2, after=20, axis=1) + + @pytest.mark.parametrize( + "before, after, indices", + [(1, 2, [2, 1]), (None, 2, [2, 1, 0]), (1, None, [3, 2, 1])], + ) + @pytest.mark.parametrize("dtyp", [*tm.ALL_REAL_NUMPY_DTYPES, "datetime64[ns]"]) + def test_truncate_decreasing_index( + self, before, after, indices, dtyp, frame_or_series + ): + # https://github.com/pandas-dev/pandas/issues/33756 + idx = Index([3, 2, 1, 0], dtype=dtyp) + if isinstance(idx, DatetimeIndex): + before = pd.Timestamp(before) if before is not None else None + after = pd.Timestamp(after) if after is not None else None + indices = [pd.Timestamp(i) for i in indices] + values = frame_or_series(range(len(idx)), index=idx) + result = values.truncate(before=before, after=after) + expected = values.loc[indices] + tm.assert_equal(result, expected) + + def test_truncate_multiindex(self, frame_or_series): + # GH 34564 + mi = pd.MultiIndex.from_product([[1, 2, 3, 4], ["A", "B"]], names=["L1", "L2"]) + s1 = DataFrame(range(mi.shape[0]), index=mi, columns=["col"]) + s1 = tm.get_obj(s1, frame_or_series) + + result = s1.truncate(before=2, after=3) + + df = DataFrame.from_dict( + {"L1": [2, 2, 3, 3], "L2": ["A", "B", "A", "B"], "col": [2, 3, 4, 5]} + ) + expected = df.set_index(["L1", "L2"]) + expected = tm.get_obj(expected, frame_or_series) + + tm.assert_equal(result, expected) + + def test_truncate_index_only_one_unique_value(self, frame_or_series): + # GH 42365 + obj = Series(0, index=date_range("2021-06-30", "2021-06-30")).repeat(5) + if frame_or_series is DataFrame: + obj = obj.to_frame(name="a") + + truncated = obj.truncate("2021-06-28", "2021-07-01") + + tm.assert_equal(truncated, obj) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/test_tz_convert.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/test_tz_convert.py new file mode 100644 index 0000000000000000000000000000000000000000..bcb8e423980fdc06195846a6d79afa00f8e691fd --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/test_tz_convert.py @@ -0,0 +1,131 @@ +import numpy as np +import pytest + +from pandas import ( + DataFrame, + Index, + MultiIndex, + Series, + date_range, +) +import pandas._testing as tm + + +class TestTZConvert: + def test_tz_convert(self, frame_or_series): + rng = date_range("1/1/2011", periods=200, freq="D", tz="US/Eastern") + + obj = DataFrame({"a": 1}, index=rng) + obj = tm.get_obj(obj, frame_or_series) + + result = obj.tz_convert("Europe/Berlin") + expected = DataFrame({"a": 1}, rng.tz_convert("Europe/Berlin")) + expected = tm.get_obj(expected, frame_or_series) + + assert result.index.tz.zone == "Europe/Berlin" + tm.assert_equal(result, expected) + + def test_tz_convert_axis1(self): + rng = date_range("1/1/2011", periods=200, freq="D", tz="US/Eastern") + + obj = DataFrame({"a": 1}, index=rng) + + obj = obj.T + result = obj.tz_convert("Europe/Berlin", axis=1) + assert result.columns.tz.zone == "Europe/Berlin" + + expected = DataFrame({"a": 1}, rng.tz_convert("Europe/Berlin")) + + tm.assert_equal(result, expected.T) + + def test_tz_convert_naive(self, frame_or_series): + # can't convert tz-naive + rng = date_range("1/1/2011", periods=200, freq="D") + ts = Series(1, index=rng) + ts = frame_or_series(ts) + + with pytest.raises(TypeError, match="Cannot convert tz-naive"): + ts.tz_convert("US/Eastern") + + @pytest.mark.parametrize("fn", ["tz_localize", "tz_convert"]) + def test_tz_convert_and_localize(self, fn): + l0 = date_range("20140701", periods=5, freq="D") + l1 = date_range("20140701", periods=5, freq="D") + + int_idx = Index(range(5)) + + if fn == "tz_convert": + l0 = l0.tz_localize("UTC") + l1 = l1.tz_localize("UTC") + + for idx in [l0, l1]: + l0_expected = getattr(idx, fn)("US/Pacific") + l1_expected = getattr(idx, fn)("US/Pacific") + + df1 = DataFrame(np.ones(5), index=l0) + df1 = getattr(df1, fn)("US/Pacific") + tm.assert_index_equal(df1.index, l0_expected) + + # MultiIndex + # GH7846 + df2 = DataFrame(np.ones(5), MultiIndex.from_arrays([l0, l1])) + + # freq is not preserved in MultiIndex construction + l1_expected = l1_expected._with_freq(None) + l0_expected = l0_expected._with_freq(None) + l1 = l1._with_freq(None) + l0 = l0._with_freq(None) + + df3 = getattr(df2, fn)("US/Pacific", level=0) + assert not df3.index.levels[0].equals(l0) + tm.assert_index_equal(df3.index.levels[0], l0_expected) + tm.assert_index_equal(df3.index.levels[1], l1) + assert not df3.index.levels[1].equals(l1_expected) + + df3 = getattr(df2, fn)("US/Pacific", level=1) + tm.assert_index_equal(df3.index.levels[0], l0) + assert not df3.index.levels[0].equals(l0_expected) + tm.assert_index_equal(df3.index.levels[1], l1_expected) + assert not df3.index.levels[1].equals(l1) + + df4 = DataFrame(np.ones(5), MultiIndex.from_arrays([int_idx, l0])) + + # TODO: untested + getattr(df4, fn)("US/Pacific", level=1) + + tm.assert_index_equal(df3.index.levels[0], l0) + assert not df3.index.levels[0].equals(l0_expected) + tm.assert_index_equal(df3.index.levels[1], l1_expected) + assert not df3.index.levels[1].equals(l1) + + # Bad Inputs + + # Not DatetimeIndex / PeriodIndex + with pytest.raises(TypeError, match="DatetimeIndex"): + df = DataFrame(index=int_idx) + getattr(df, fn)("US/Pacific") + + # Not DatetimeIndex / PeriodIndex + with pytest.raises(TypeError, match="DatetimeIndex"): + df = DataFrame(np.ones(5), MultiIndex.from_arrays([int_idx, l0])) + getattr(df, fn)("US/Pacific", level=0) + + # Invalid level + with pytest.raises(ValueError, match="not valid"): + df = DataFrame(index=l0) + getattr(df, fn)("US/Pacific", level=1) + + @pytest.mark.parametrize("copy", [True, False]) + def test_tz_convert_copy_inplace_mutate(self, copy, frame_or_series): + # GH#6326 + obj = frame_or_series( + np.arange(0, 5), + index=date_range("20131027", periods=5, freq="h", tz="Europe/Berlin"), + ) + orig = obj.copy() + result = obj.tz_convert("UTC", copy=copy) + expected = frame_or_series(np.arange(0, 5), index=obj.index.tz_convert("UTC")) + tm.assert_equal(result, expected) + tm.assert_equal(obj, orig) + assert result.index is not obj.index + assert result is not obj diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/test_tz_localize.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/test_tz_localize.py new file mode 100644 index 0000000000000000000000000000000000000000..b167afc17f484cce36c3909222b3f3d80ff4c926 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/test_tz_localize.py @@ -0,0 +1,68 @@ +from datetime import timezone + +import numpy as np +import pytest + +from pandas import ( + DataFrame, + Series, + date_range, +) +import pandas._testing as tm + + +class TestTZLocalize: + # See also: + # test_tz_convert_and_localize in test_tz_convert + + def test_tz_localize(self, frame_or_series): + rng = date_range("1/1/2011", periods=100, freq="h") + + obj = DataFrame({"a": 1}, index=rng) + obj = tm.get_obj(obj, frame_or_series) + + result = obj.tz_localize("utc") + expected = DataFrame({"a": 1}, rng.tz_localize("UTC")) + expected = tm.get_obj(expected, frame_or_series) + + assert result.index.tz is timezone.utc + tm.assert_equal(result, expected) + + def test_tz_localize_axis1(self): + rng = date_range("1/1/2011", periods=100, freq="h") + + df = DataFrame({"a": 1}, index=rng) + + df = df.T + result = df.tz_localize("utc", axis=1) + assert result.columns.tz is timezone.utc + + expected = DataFrame({"a": 1}, rng.tz_localize("UTC")) + + tm.assert_frame_equal(result, expected.T) + + def test_tz_localize_naive(self, frame_or_series): + # Can't localize if already tz-aware + rng = date_range("1/1/2011", periods=100, freq="h", tz="utc") + ts = Series(1, index=rng) + ts = frame_or_series(ts) + + with pytest.raises(TypeError, match="Already tz-aware"): + ts.tz_localize("US/Eastern") + + @pytest.mark.parametrize("copy", [True, False]) + def test_tz_localize_copy_inplace_mutate(self, copy, frame_or_series): + # GH#6326 + obj = frame_or_series( + np.arange(0, 5), index=date_range("20131027", periods=5, freq="1h", tz=None) + ) + orig = obj.copy() + result = obj.tz_localize("UTC", copy=copy) + expected = frame_or_series( + np.arange(0, 5), + index=date_range("20131027", periods=5, freq="1h", tz="UTC"), + ) + tm.assert_equal(result, expected) + tm.assert_equal(obj, orig) + assert result.index is not obj.index + assert result is not obj diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/test_update.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/test_update.py new file mode 100644 index 0000000000000000000000000000000000000000..8af1798aa8e00db3f0a2c2950008988d4d351fdb --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/test_update.py @@ -0,0 +1,204 @@ +import numpy as np +import pytest + +import pandas.util._test_decorators as td + +import pandas as pd +from pandas import ( + DataFrame, + Series, + date_range, +) +import pandas._testing as tm + + +class TestDataFrameUpdate: + def test_update_nan(self): + # #15593 #15617 + # test 1 + df1 = DataFrame({"A": [1.0, 2, 3], "B": date_range("2000", periods=3)}) + df2 = DataFrame({"A": [None, 2, 3]}) + expected = df1.copy() + df1.update(df2, overwrite=False) + + tm.assert_frame_equal(df1, expected) + + # test 2 + df1 = DataFrame({"A": [1.0, None, 3], "B": date_range("2000", periods=3)}) + df2 = DataFrame({"A": [None, 2, 3]}) + expected = DataFrame({"A": [1.0, 2, 3], "B": date_range("2000", periods=3)}) + df1.update(df2, overwrite=False) + + tm.assert_frame_equal(df1, expected) + + def test_update(self): + df = DataFrame( + [[1.5, np.nan, 3.0], [1.5, np.nan, 3.0], [1.5, np.nan, 3], [1.5, np.nan, 3]] + ) + + other = DataFrame([[3.6, 2.0, np.nan], [np.nan, np.nan, 7]], index=[1, 3]) + + df.update(other) + + expected = DataFrame( + [[1.5, np.nan, 3], [3.6, 2, 3], [1.5, np.nan, 3], [1.5, np.nan, 7.0]] + ) + tm.assert_frame_equal(df, expected) + + def test_update_dtypes(self): + # gh 3016 + df = DataFrame( + [[1.0, 2.0, 1, False, True], [4.0, 5.0, 2, True, False]], + columns=["A", "B", "int", "bool1", "bool2"], + ) + + other = DataFrame( + [[45, 45, 3, True]], index=[0], columns=["A", "B", "int", "bool1"] + ) + df.update(other) + + expected = DataFrame( + [[45.0, 45.0, 3, True, True], [4.0, 5.0, 2, True, False]], + columns=["A", "B", "int", "bool1", "bool2"], + ) + tm.assert_frame_equal(df, expected) + + def test_update_nooverwrite(self): + df = DataFrame( + [[1.5, np.nan, 3.0], [1.5, np.nan, 3.0], [1.5, np.nan, 3], [1.5, np.nan, 3]] + ) + + other = DataFrame([[3.6, 2.0, np.nan], [np.nan, np.nan, 7]], index=[1, 3]) + + df.update(other, overwrite=False) + + expected = DataFrame( + [[1.5, np.nan, 3], [1.5, 2, 3], [1.5, np.nan, 3], [1.5, np.nan, 3.0]] + ) + tm.assert_frame_equal(df, expected) + + def test_update_filtered(self): + df = DataFrame( + [[1.5, np.nan, 3.0], [1.5, np.nan, 3.0], [1.5, np.nan, 3], [1.5, np.nan, 3]] + ) + + other = DataFrame([[3.6, 2.0, np.nan], [np.nan, np.nan, 7]], index=[1, 3]) + + df.update(other, filter_func=lambda x: x > 2) + + expected = DataFrame( + [[1.5, np.nan, 3], [1.5, np.nan, 3], [1.5, np.nan, 3], [1.5, np.nan, 7.0]] + ) + tm.assert_frame_equal(df, expected) + + @pytest.mark.parametrize( + "bad_kwarg, exception, msg", + [ + # errors must be 'ignore' or 'raise' + ({"errors": "something"}, ValueError, "The parameter errors must.*"), + ({"join": "inner"}, NotImplementedError, "Only left join is supported"), + ], + ) + def test_update_raise_bad_parameter(self, bad_kwarg, exception, msg): + df = DataFrame([[1.5, 1, 3.0]]) + with pytest.raises(exception, match=msg): + df.update(df, **bad_kwarg) + + def test_update_raise_on_overlap(self): + df = DataFrame( + [[1.5, 1, 3.0], [1.5, np.nan, 3.0], [1.5, np.nan, 3], [1.5, np.nan, 3]] + ) + + other = DataFrame([[2.0, np.nan], [np.nan, 7]], index=[1, 3], columns=[1, 2]) + with pytest.raises(ValueError, match="Data overlaps"): + df.update(other, errors="raise") + + def test_update_from_non_df(self): + d = {"a": Series([1, 2, 3, 4]), "b": Series([5, 6, 7, 8])} + df = DataFrame(d) + + d["a"] = Series([5, 6, 7, 8]) + df.update(d) + + expected = DataFrame(d) + + tm.assert_frame_equal(df, expected) + + d = {"a": [1, 2, 3, 4], "b": [5, 6, 7, 8]} + df = DataFrame(d) + + d["a"] = [5, 6, 7, 8] + df.update(d) + + expected = DataFrame(d) + + tm.assert_frame_equal(df, expected) + + def test_update_datetime_tz(self): + # GH 25807 + result = DataFrame([pd.Timestamp("2019", tz="UTC")]) + with tm.assert_produces_warning(None): + result.update(result) + expected = DataFrame([pd.Timestamp("2019", tz="UTC")]) + tm.assert_frame_equal(result, expected) + + def test_update_datetime_tz_in_place(self, using_copy_on_write, warn_copy_on_write): + # https://github.com/pandas-dev/pandas/issues/56227 + result = DataFrame([pd.Timestamp("2019", tz="UTC")]) + orig = result.copy() + view = result[:] + with tm.assert_produces_warning( + FutureWarning if warn_copy_on_write else None, match="Setting a value" + ): + result.update(result + pd.Timedelta(days=1)) + expected = DataFrame([pd.Timestamp("2019-01-02", tz="UTC")]) + tm.assert_frame_equal(result, expected) + if not using_copy_on_write: + tm.assert_frame_equal(view, expected) + else: + tm.assert_frame_equal(view, orig) + + def test_update_with_different_dtype(self, using_copy_on_write): + # GH#3217 + df = DataFrame({"a": [1, 3], "b": [np.nan, 2]}) + df["c"] = np.nan + with tm.assert_produces_warning(FutureWarning, match="incompatible dtype"): + df.update({"c": Series(["foo"], index=[0])}) + + expected = DataFrame( + { + "a": [1, 3], + "b": [np.nan, 2], + "c": Series(["foo", np.nan], dtype="object"), + } + ) + tm.assert_frame_equal(df, expected) + + @td.skip_array_manager_invalid_test + def test_update_modify_view( + self, using_copy_on_write, warn_copy_on_write, using_infer_string + ): + # GH#47188 + df = DataFrame({"A": ["1", np.nan], "B": ["100", np.nan]}) + df2 = DataFrame({"A": ["a", "x"], "B": ["100", "200"]}) + df2_orig = df2.copy() + result_view = df2[:] + # TODO(CoW-warn) better warning message + with tm.assert_cow_warning(warn_copy_on_write): + df2.update(df) + expected = DataFrame({"A": ["1", "x"], "B": ["100", "200"]}) + tm.assert_frame_equal(df2, expected) + if using_copy_on_write or using_infer_string: + tm.assert_frame_equal(result_view, df2_orig) + else: + tm.assert_frame_equal(result_view, expected) + + def test_update_dt_column_with_NaT_create_column(self): + # GH#16713 + df = DataFrame({"A": [1, None], "B": [pd.NaT, pd.to_datetime("2016-01-01")]}) + df2 = DataFrame({"A": [2, 3]}) + df.update(df2, overwrite=False) + expected = DataFrame( + {"A": [1.0, 3.0], "B": [pd.NaT, pd.to_datetime("2016-01-01")]} + ) + tm.assert_frame_equal(df, expected) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/test_value_counts.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/test_value_counts.py new file mode 100644 index 0000000000000000000000000000000000000000..4136d641ef67f2d289142b34db7a2616de24ad24 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/test_value_counts.py @@ -0,0 +1,205 @@ +import numpy as np +import pytest + +import pandas as pd +import pandas._testing as tm + + +def test_data_frame_value_counts_unsorted(): + df = pd.DataFrame( + {"num_legs": [2, 4, 4, 6], "num_wings": [2, 0, 0, 0]}, + index=["falcon", "dog", "cat", "ant"], + ) + + result = df.value_counts(sort=False) + expected = pd.Series( + data=[1, 2, 1], + index=pd.MultiIndex.from_arrays( + [(2, 4, 6), (2, 0, 0)], names=["num_legs", "num_wings"] + ), + name="count", + ) + + tm.assert_series_equal(result, expected) + + +def test_data_frame_value_counts_ascending(): + df = pd.DataFrame( + {"num_legs": [2, 4, 4, 6], "num_wings": [2, 0, 0, 0]}, + index=["falcon", "dog", "cat", "ant"], + ) + + result = df.value_counts(ascending=True) + expected = pd.Series( + data=[1, 1, 2], + index=pd.MultiIndex.from_arrays( + [(2, 6, 4), (2, 0, 0)], names=["num_legs", "num_wings"] + ), + name="count", + ) + + tm.assert_series_equal(result, expected) + + +def test_data_frame_value_counts_default(): + df = pd.DataFrame( + {"num_legs": [2, 4, 4, 6], "num_wings": [2, 0, 0, 0]}, + index=["falcon", "dog", "cat", "ant"], + ) + + result = df.value_counts() + expected = pd.Series( + data=[2, 1, 1], + index=pd.MultiIndex.from_arrays( + [(4, 2, 6), (0, 2, 0)], names=["num_legs", "num_wings"] + ), + name="count", + ) + + tm.assert_series_equal(result, expected) + + +def test_data_frame_value_counts_normalize(): + df = pd.DataFrame( + {"num_legs": [2, 4, 4, 6], "num_wings": [2, 0, 0, 0]}, + index=["falcon", "dog", "cat", "ant"], + ) + + result = df.value_counts(normalize=True) + expected = pd.Series( + data=[0.5, 0.25, 0.25], + index=pd.MultiIndex.from_arrays( + [(4, 2, 6), (0, 2, 0)], names=["num_legs", "num_wings"] + ), + name="proportion", + ) + + tm.assert_series_equal(result, expected) + + +def test_data_frame_value_counts_single_col_default(): + df = pd.DataFrame({"num_legs": [2, 4, 4, 6]}) + + result = df.value_counts() + expected = pd.Series( + data=[2, 1, 1], + index=pd.MultiIndex.from_arrays([[4, 2, 6]], names=["num_legs"]), + name="count", + ) + + tm.assert_series_equal(result, expected) + + +def test_data_frame_value_counts_empty(): + df_no_cols = pd.DataFrame() + + result = df_no_cols.value_counts() + expected = pd.Series( + [], dtype=np.int64, name="count", index=np.array([], dtype=np.intp) + ) + + tm.assert_series_equal(result, expected) + + +def test_data_frame_value_counts_empty_normalize(): + df_no_cols = pd.DataFrame() + + result = df_no_cols.value_counts(normalize=True) + expected = pd.Series( + [], dtype=np.float64, name="proportion", index=np.array([], dtype=np.intp) + ) + + tm.assert_series_equal(result, expected) + + +def test_data_frame_value_counts_dropna_true(nulls_fixture): + # GH 41334 + df = pd.DataFrame( + { + "first_name": ["John", "Anne", "John", "Beth"], + "middle_name": ["Smith", nulls_fixture, nulls_fixture, "Louise"], + }, + ) + result = df.value_counts() + expected = pd.Series( + data=[1, 1], + index=pd.MultiIndex.from_arrays( + [("Beth", "John"), ("Louise", "Smith")], names=["first_name", "middle_name"] + ), + name="count", + ) + + tm.assert_series_equal(result, expected) + + +def test_data_frame_value_counts_dropna_false(nulls_fixture): + # GH 41334 + df = pd.DataFrame( + { + "first_name": ["John", "Anne", "John", "Beth"], + "middle_name": ["Smith", nulls_fixture, nulls_fixture, "Louise"], + }, + ) + + result = df.value_counts(dropna=False) + expected = pd.Series( + data=[1, 1, 1, 1], + index=pd.MultiIndex( + levels=[ + pd.Index(["Anne", "Beth", "John"]), + pd.Index(["Louise", "Smith", np.nan]), + ], + codes=[[0, 1, 2, 2], [2, 0, 1, 2]], + names=["first_name", "middle_name"], + ), + name="count", + ) + + tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize("columns", (["first_name", "middle_name"], [0, 1])) +def test_data_frame_value_counts_subset(nulls_fixture, columns): + # GH 50829 + df = pd.DataFrame( + { + columns[0]: ["John", "Anne", "John", "Beth"], + columns[1]: ["Smith", nulls_fixture, nulls_fixture, "Louise"], + }, + ) + result = df.value_counts(columns[0]) + expected = pd.Series( + data=[2, 1, 1], + index=pd.Index(["John", "Anne", "Beth"], name=columns[0]), + name="count", + ) + + tm.assert_series_equal(result, expected) + + +def test_value_counts_categorical_future_warning(): + # GH#54775 + df = pd.DataFrame({"a": [1, 2, 3]}, dtype="category") + result = df.value_counts() + expected = pd.Series( + 1, + index=pd.MultiIndex.from_arrays( + [pd.Index([1, 2, 3], name="a", dtype="category")] + ), + name="count", + ) + tm.assert_series_equal(result, expected) + + +def test_value_counts_with_missing_category(): + # GH-54836 + df = pd.DataFrame({"a": pd.Categorical([1, 2, 4], categories=[1, 2, 3, 4])}) + result = df.value_counts() + expected = pd.Series( + [1, 1, 1, 0], + index=pd.MultiIndex.from_arrays( + [pd.CategoricalIndex([1, 2, 4, 3], categories=[1, 2, 3, 4], name="a")] + ), + name="count", + ) + tm.assert_series_equal(result, expected) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/test_values.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/test_values.py new file mode 100644 index 0000000000000000000000000000000000000000..bbca4ee1b88b1b756ea27140d2944d349049c37c --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/frame/methods/test_values.py @@ -0,0 +1,280 @@ +import numpy as np +import pytest + +import pandas.util._test_decorators as td + +from pandas import ( + DataFrame, + NaT, + Series, + Timestamp, + date_range, + period_range, +) +import pandas._testing as tm + + +class TestDataFrameValues: + @td.skip_array_manager_invalid_test + def test_values(self, float_frame, using_copy_on_write): + if using_copy_on_write: + with pytest.raises(ValueError, match="read-only"): + float_frame.values[:, 0] = 5.0 + assert (float_frame.values[:, 0] != 5).all() + else: + float_frame.values[:, 0] = 5.0 + assert (float_frame.values[:, 0] == 5).all() + + def test_more_values(self, float_string_frame): + values = float_string_frame.values + assert values.shape[1] == len(float_string_frame.columns) + + def test_values_mixed_dtypes(self, float_frame, float_string_frame): + frame = float_frame + arr = frame.values + + frame_cols = frame.columns + for i, row in enumerate(arr): + for j, value in enumerate(row): + col = frame_cols[j] + if np.isnan(value): + assert np.isnan(frame[col].iloc[i]) + else: + assert value == frame[col].iloc[i] + + # mixed type + arr = float_string_frame[["foo", "A"]].values + assert arr[0, 0] == "bar" + + df = DataFrame({"complex": [1j, 2j, 3j], "real": [1, 2, 3]}) + arr = df.values + assert arr[0, 0] == 1j + + def test_values_duplicates(self): + df = DataFrame( + [[1, 2, "a", "b"], [1, 2, "a", "b"]], columns=["one", "one", "two", "two"] + ) + + result = df.values + expected = np.array([[1, 2, "a", "b"], [1, 2, "a", "b"]], dtype=object) + + tm.assert_numpy_array_equal(result, expected) + + def test_values_with_duplicate_columns(self): + df = DataFrame([[1, 2.5], [3, 4.5]], index=[1, 2], columns=["x", "x"]) + result = df.values + expected = np.array([[1, 2.5], [3, 4.5]]) + assert (result == expected).all().all() + + @pytest.mark.parametrize("constructor", [date_range, period_range]) + def test_values_casts_datetimelike_to_object(self, constructor): + series = Series(constructor("2000-01-01", periods=10, freq="D")) + + expected = series.astype("object") + + df = DataFrame( + {"a": series, "b": np.random.default_rng(2).standard_normal(len(series))} + ) + + result = df.values.squeeze() + assert (result[:, 0] == expected.values).all() + + df = DataFrame({"a": series, "b": ["foo"] * len(series)}) + + result = df.values.squeeze() + assert (result[:, 0] == expected.values).all() + + def test_frame_values_with_tz(self): + tz = "US/Central" + df = DataFrame({"A": date_range("2000", periods=4, tz=tz)}) + result = df.values + expected = np.array( + [ + [Timestamp("2000-01-01", tz=tz)], + [Timestamp("2000-01-02", tz=tz)], + [Timestamp("2000-01-03", tz=tz)], + [Timestamp("2000-01-04", tz=tz)], + ] + ) + tm.assert_numpy_array_equal(result, expected) + + # two columns, homogeneous + + df["B"] = df["A"] + result = df.values + expected = np.concatenate([expected, expected], axis=1) + tm.assert_numpy_array_equal(result, expected) + + # three columns, heterogeneous + est = "US/Eastern" + df["C"] = df["A"].dt.tz_convert(est) + + new = np.array( + [ + [Timestamp("2000-01-01T01:00:00", tz=est)], + [Timestamp("2000-01-02T01:00:00", tz=est)], + [Timestamp("2000-01-03T01:00:00", tz=est)], + [Timestamp("2000-01-04T01:00:00", tz=est)], + ] + ) + expected = np.concatenate([expected, new], axis=1) + result = df.values + tm.assert_numpy_array_equal(result, expected) + + def test_interleave_with_tzaware(self, timezone_frame): + # interleave with object + result = timezone_frame.assign(D="foo").values + expected = np.array( + [ + [ + Timestamp("2013-01-01 00:00:00"), + Timestamp("2013-01-02 00:00:00"), + Timestamp("2013-01-03 00:00:00"), + ], + [ + Timestamp("2013-01-01 00:00:00-0500", tz="US/Eastern"), + NaT, + Timestamp("2013-01-03 00:00:00-0500", tz="US/Eastern"), + ], + [ + Timestamp("2013-01-01 00:00:00+0100", tz="CET"), + NaT, + Timestamp("2013-01-03 00:00:00+0100", tz="CET"), + ], + ["foo", "foo", "foo"], + ], + dtype=object, + ).T + tm.assert_numpy_array_equal(result, expected) + + # interleave with only datetime64[ns] + result = timezone_frame.values + expected = np.array( + [ + [ + Timestamp("2013-01-01 00:00:00"), + Timestamp("2013-01-02 00:00:00"), + Timestamp("2013-01-03 00:00:00"), + ], + [ + Timestamp("2013-01-01 00:00:00-0500", tz="US/Eastern"), + NaT, + Timestamp("2013-01-03 00:00:00-0500", tz="US/Eastern"), + ], + [ + Timestamp("2013-01-01 00:00:00+0100", tz="CET"), + NaT, + Timestamp("2013-01-03 00:00:00+0100", tz="CET"), + ], + ], + dtype=object, + ).T + tm.assert_numpy_array_equal(result, expected) + + def test_values_interleave_non_unique_cols(self): + df = DataFrame( + [[Timestamp("20130101"), 3.5], [Timestamp("20130102"), 4.5]], + columns=["x", "x"], + index=[1, 2], + ) + + df_unique = df.copy() + df_unique.columns = ["x", "y"] + assert df_unique.values.shape == df.values.shape + tm.assert_numpy_array_equal(df_unique.values[0], df.values[0]) + tm.assert_numpy_array_equal(df_unique.values[1], df.values[1]) + + def test_values_numeric_cols(self, float_frame): + float_frame["foo"] = "bar" + + values = float_frame[["A", "B", "C", "D"]].values + assert values.dtype == np.float64 + + def test_values_lcd(self, mixed_float_frame, mixed_int_frame): + # mixed lcd + values = mixed_float_frame[["A", "B", "C", "D"]].values + assert values.dtype == np.float64 + + values = mixed_float_frame[["A", "B", "C"]].values + assert values.dtype == np.float32 + + values = mixed_float_frame[["C"]].values + assert values.dtype == np.float16 + + # GH#10364 + # B uint64 forces float because there are other signed int types + values = mixed_int_frame[["A", "B", "C", "D"]].values + assert values.dtype == np.float64 + + values = mixed_int_frame[["A", "D"]].values + assert values.dtype == np.int64 + + # B uint64 forces float because there are other signed int types + values = mixed_int_frame[["A", "B", "C"]].values + assert values.dtype == np.float64 + + # as B and C are both unsigned, no forcing to float is needed + values = mixed_int_frame[["B", "C"]].values + assert values.dtype == np.uint64 + + values = mixed_int_frame[["A", "C"]].values + assert values.dtype == np.int32 + + values = mixed_int_frame[["C", "D"]].values + assert values.dtype == np.int64 + + values = mixed_int_frame[["A"]].values + assert values.dtype == np.int32 + + values = mixed_int_frame[["C"]].values + assert values.dtype == np.uint8 + + +class TestPrivateValues: + @td.skip_array_manager_invalid_test + def test_private_values_dt64tz(self, using_copy_on_write): + dta = date_range("2000", periods=4, tz="US/Central")._data.reshape(-1, 1) + + df = DataFrame(dta, columns=["A"]) + tm.assert_equal(df._values, dta) + + if using_copy_on_write: + assert not np.shares_memory(df._values._ndarray, dta._ndarray) + else: + # we have a view + assert np.shares_memory(df._values._ndarray, dta._ndarray) + + # TimedeltaArray + tda = dta - dta + df2 = df - df + tm.assert_equal(df2._values, tda) + + @td.skip_array_manager_invalid_test + def test_private_values_dt64tz_multicol(self, using_copy_on_write): + dta = date_range("2000", periods=8, tz="US/Central")._data.reshape(-1, 2) + + df = DataFrame(dta, columns=["A", "B"]) + tm.assert_equal(df._values, dta) + + if using_copy_on_write: + assert not np.shares_memory(df._values._ndarray, dta._ndarray) + else: + # we have a view + assert np.shares_memory(df._values._ndarray, dta._ndarray) + + # TimedeltaArray + tda = dta - dta + df2 = df - df + tm.assert_equal(df2._values, tda) + + def test_private_values_dt64_multiblock(self): + dta = date_range("2000", periods=8)._data + + df = DataFrame({"A": dta[:4]}, copy=False) + df["B"] = dta[4:] + + assert len(df._mgr.arrays) == 2 + + result = df._values + expected = dta.reshape(2, 4).T + tm.assert_equal(result, expected) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/base_class/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/base_class/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..5fa08eba0d284695827f1ebfd36e68622b916492 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/base_class/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/base_class/__pycache__/test_constructors.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/base_class/__pycache__/test_constructors.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f89f7d57eef805cfc41972e95c2f36caa66bd523 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/base_class/__pycache__/test_constructors.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/base_class/__pycache__/test_formats.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/base_class/__pycache__/test_formats.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..5442a01f516e5754f8a8b06535b7a24bcd7fbb41 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/base_class/__pycache__/test_formats.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/base_class/__pycache__/test_indexing.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/base_class/__pycache__/test_indexing.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..2a23a4c1939d414014c45725c2f5501c6920feba Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/base_class/__pycache__/test_indexing.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/base_class/__pycache__/test_pickle.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/base_class/__pycache__/test_pickle.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ea25efc16add75c3c5e090092f5b794de3b4282b Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/base_class/__pycache__/test_pickle.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/base_class/__pycache__/test_setops.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/base_class/__pycache__/test_setops.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a4497eddb543203eafa14edfffcd4cd839949257 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/base_class/__pycache__/test_setops.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/base_class/__pycache__/test_where.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/base_class/__pycache__/test_where.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a987f824970851801d1bac88a36e793c14db94dc Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/base_class/__pycache__/test_where.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/period/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/period/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1d25f78c18bb5b7b08f163bf8ce21787ad4da661 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/period/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/period/__pycache__/test_constructors.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/period/__pycache__/test_constructors.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..9f74beefb644af4ec8b22f8aef402de5da58ff88 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/period/__pycache__/test_constructors.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/period/__pycache__/test_formats.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/period/__pycache__/test_formats.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..992c7859ca80c8ca5dd91e2e67143036eb997091 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/period/__pycache__/test_formats.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/period/__pycache__/test_freq_attr.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/period/__pycache__/test_freq_attr.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..8daa5558846373dedcf07e0f428ef881dc2978b9 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/period/__pycache__/test_freq_attr.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/period/__pycache__/test_indexing.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/period/__pycache__/test_indexing.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..0a295848e2e3dfc8625b7f0bfb63304f46368c17 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/period/__pycache__/test_indexing.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/period/__pycache__/test_join.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/period/__pycache__/test_join.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..57a4d50db2e63237492a49deddcfc8ff5b6be45d Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/period/__pycache__/test_join.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/period/__pycache__/test_monotonic.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/period/__pycache__/test_monotonic.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..5cf95d9ca5c85992d3bd3a03bbf1760bf1e19274 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/period/__pycache__/test_monotonic.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/period/__pycache__/test_partial_slicing.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/period/__pycache__/test_partial_slicing.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..072ba6d5cfd96f42c3d0569340daf0b552c1be8c Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/period/__pycache__/test_partial_slicing.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/period/__pycache__/test_period.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/period/__pycache__/test_period.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..358f28828f627e1887aacc0b8ac52a036a5ef796 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/period/__pycache__/test_period.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/period/__pycache__/test_period_range.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/period/__pycache__/test_period_range.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..58b4eea0254e2d485bb6a807951ad984d2337497 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/period/__pycache__/test_period_range.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/period/__pycache__/test_pickle.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/period/__pycache__/test_pickle.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..55a3be40126fdb91ac3185eb92be752ccfb63258 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/period/__pycache__/test_pickle.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/period/__pycache__/test_resolution.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/period/__pycache__/test_resolution.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..70be9005201626b2d142fd9369c4a739b805837f Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/period/__pycache__/test_resolution.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/period/__pycache__/test_scalar_compat.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/period/__pycache__/test_scalar_compat.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..dd0ffa297d7b8181806c20de9fed2dd277ce15e2 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/period/__pycache__/test_scalar_compat.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/period/__pycache__/test_searchsorted.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/period/__pycache__/test_searchsorted.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..4b49771e4bbbf126e2c70c5f800b26d525a4c0ce Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/period/__pycache__/test_searchsorted.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/period/__pycache__/test_setops.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/period/__pycache__/test_setops.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..6600bcde3d2e9ac3edcbd155f19207c7633a1509 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/period/__pycache__/test_setops.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/period/__pycache__/test_tools.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/period/__pycache__/test_tools.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..670d8df7a4a02962dfabd4a5de927e8f4ecf0060 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/period/__pycache__/test_tools.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/period/methods/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/period/methods/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..089179fbf32eac7129ce6b49ecd8f1041cc594ca Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/period/methods/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/period/methods/__pycache__/test_asfreq.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/period/methods/__pycache__/test_asfreq.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..0d0874fb0193eeebd4f27c573cb53a77ee98d669 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/period/methods/__pycache__/test_asfreq.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/period/methods/__pycache__/test_astype.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/period/methods/__pycache__/test_astype.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..8a38644a4cf7b498503b06aee9f6cdcf36650152 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/period/methods/__pycache__/test_astype.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/period/methods/__pycache__/test_factorize.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/period/methods/__pycache__/test_factorize.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b23f9d0d8e74dfa15ee8cd00081987e22f175275 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/period/methods/__pycache__/test_factorize.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/period/methods/__pycache__/test_fillna.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/period/methods/__pycache__/test_fillna.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..94fccbff4cab600b87f5dfe6122ac3bb644b2877 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/period/methods/__pycache__/test_fillna.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/period/methods/__pycache__/test_insert.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/period/methods/__pycache__/test_insert.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..cf5f6ea14912416f74e7e06fff9c11e3168aca96 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/period/methods/__pycache__/test_insert.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/period/methods/__pycache__/test_is_full.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/period/methods/__pycache__/test_is_full.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..2f519ab756246aef1c994b5bd00725cbbebe3e47 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/period/methods/__pycache__/test_is_full.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/period/methods/__pycache__/test_repeat.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/period/methods/__pycache__/test_repeat.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ba1e63f93f6e33ccc7d38b4e406a73bfe364b31a Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/period/methods/__pycache__/test_repeat.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/period/methods/__pycache__/test_shift.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/period/methods/__pycache__/test_shift.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f5c22329a604af1d3ad971875558467d1a2649d5 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/period/methods/__pycache__/test_shift.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/period/methods/__pycache__/test_to_timestamp.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/period/methods/__pycache__/test_to_timestamp.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..9705139abdbbee1871572a005445c37daeb39cd2 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/period/methods/__pycache__/test_to_timestamp.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/period/methods/test_asfreq.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/period/methods/test_asfreq.py new file mode 100644 index 0000000000000000000000000000000000000000..865bae69d91c7960e286646e22d0fa2646333303 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/period/methods/test_asfreq.py @@ -0,0 +1,189 @@ +import re + +import pytest + +from pandas import ( + PeriodIndex, + Series, + period_range, +) +import pandas._testing as tm + +from pandas.tseries import offsets + + +class TestPeriodIndex: + def test_asfreq(self): + pi1 = period_range(freq="Y", start="1/1/2001", end="1/1/2001") + pi2 = period_range(freq="Q", start="1/1/2001", end="1/1/2001") + pi3 = period_range(freq="M", start="1/1/2001", end="1/1/2001") + pi4 = period_range(freq="D", start="1/1/2001", end="1/1/2001") + pi5 = period_range(freq="h", start="1/1/2001", end="1/1/2001 00:00") + pi6 = period_range(freq="Min", start="1/1/2001", end="1/1/2001 00:00") + pi7 = period_range(freq="s", start="1/1/2001", end="1/1/2001 00:00:00") + + assert pi1.asfreq("Q", "s") == pi2 + assert pi1.asfreq("Q", "s") == pi2 + assert pi1.asfreq("M", "start") == pi3 + assert pi1.asfreq("D", "StarT") == pi4 + assert pi1.asfreq("h", "beGIN") == pi5 + assert pi1.asfreq("Min", "s") == pi6 + assert pi1.asfreq("s", "s") == pi7 + + assert pi2.asfreq("Y", "s") == pi1 + assert pi2.asfreq("M", "s") == pi3 + assert pi2.asfreq("D", "s") == pi4 + assert pi2.asfreq("h", "s") == pi5 + assert pi2.asfreq("Min", "s") == pi6 + assert pi2.asfreq("s", "s") == pi7 + + assert pi3.asfreq("Y", "s") == pi1 + assert pi3.asfreq("Q", "s") == pi2 + assert pi3.asfreq("D", "s") == pi4 + assert pi3.asfreq("h", "s") == pi5 + assert pi3.asfreq("Min", "s") == pi6 + assert pi3.asfreq("s", "s") == pi7 + + assert pi4.asfreq("Y", "s") == pi1 + assert pi4.asfreq("Q", "s") == pi2 + assert pi4.asfreq("M", "s") == pi3 + assert pi4.asfreq("h", "s") == pi5 + assert pi4.asfreq("Min", "s") == pi6 + assert pi4.asfreq("s", "s") == pi7 + + assert pi5.asfreq("Y", "s") == pi1 + assert pi5.asfreq("Q", "s") == pi2 + assert pi5.asfreq("M", "s") == pi3 + assert pi5.asfreq("D", "s") == pi4 + assert pi5.asfreq("Min", "s") == pi6 + assert pi5.asfreq("s", "s") == pi7 + + assert pi6.asfreq("Y", "s") == pi1 + assert pi6.asfreq("Q", "s") == pi2 + assert pi6.asfreq("M", "s") == pi3 + assert pi6.asfreq("D", "s") == pi4 + assert pi6.asfreq("h", "s") == pi5 + assert pi6.asfreq("s", "s") == pi7 + + assert pi7.asfreq("Y", "s") == pi1 + assert pi7.asfreq("Q", "s") == pi2 + assert pi7.asfreq("M", "s") == pi3 + assert pi7.asfreq("D", "s") == pi4 + assert pi7.asfreq("h", "s") == pi5 + assert pi7.asfreq("Min", "s") == pi6 + + msg = "How must be one of S or E" + with pytest.raises(ValueError, match=msg): + pi7.asfreq("T", "foo") + result1 = pi1.asfreq("3M") + result2 = pi1.asfreq("M") + expected = period_range(freq="M", start="2001-12", end="2001-12") + tm.assert_numpy_array_equal(result1.asi8, expected.asi8) + assert result1.freqstr == "3M" + tm.assert_numpy_array_equal(result2.asi8, expected.asi8) + assert result2.freqstr == "M" + + def test_asfreq_nat(self): + idx = PeriodIndex(["2011-01", "2011-02", "NaT", "2011-04"], freq="M") + result = idx.asfreq(freq="Q") + expected = PeriodIndex(["2011Q1", "2011Q1", "NaT", "2011Q2"], freq="Q") + tm.assert_index_equal(result, expected) + + @pytest.mark.parametrize("freq", ["D", "3D"]) + def test_asfreq_mult_pi(self, freq): + pi = PeriodIndex(["2001-01", "2001-02", "NaT", "2001-03"], freq="2M") + + result = pi.asfreq(freq) + exp = PeriodIndex(["2001-02-28", "2001-03-31", "NaT", "2001-04-30"], freq=freq) + tm.assert_index_equal(result, exp) + assert result.freq == exp.freq + + result = pi.asfreq(freq, how="S") + exp = PeriodIndex(["2001-01-01", "2001-02-01", "NaT", "2001-03-01"], freq=freq) + tm.assert_index_equal(result, exp) + assert result.freq == exp.freq + + def test_asfreq_combined_pi(self): + pi = PeriodIndex(["2001-01-01 00:00", "2001-01-02 02:00", "NaT"], freq="h") + exp = PeriodIndex(["2001-01-01 00:00", "2001-01-02 02:00", "NaT"], freq="25h") + for freq, how in zip(["1D1h", "1h1D"], ["S", "E"]): + result = pi.asfreq(freq, how=how) + tm.assert_index_equal(result, exp) + assert result.freq == exp.freq + + for freq in ["1D1h", "1h1D"]: + pi = PeriodIndex(["2001-01-01 00:00", "2001-01-02 02:00", "NaT"], freq=freq) + result = pi.asfreq("h") + exp = PeriodIndex(["2001-01-02 00:00", "2001-01-03 02:00", "NaT"], freq="h") + tm.assert_index_equal(result, exp) + assert result.freq == exp.freq + + pi = PeriodIndex(["2001-01-01 00:00", "2001-01-02 02:00", "NaT"], freq=freq) + result = pi.asfreq("h", how="S") + exp = PeriodIndex(["2001-01-01 00:00", "2001-01-02 02:00", "NaT"], freq="h") + tm.assert_index_equal(result, exp) + assert result.freq == exp.freq + + def test_astype_asfreq(self): + pi1 = PeriodIndex(["2011-01-01", "2011-02-01", "2011-03-01"], freq="D") + exp = PeriodIndex(["2011-01", "2011-02", "2011-03"], freq="M") + tm.assert_index_equal(pi1.asfreq("M"), exp) + tm.assert_index_equal(pi1.astype("period[M]"), exp) + + exp = PeriodIndex(["2011-01", "2011-02", "2011-03"], freq="3M") + tm.assert_index_equal(pi1.asfreq("3M"), exp) + tm.assert_index_equal(pi1.astype("period[3M]"), exp) + + def test_asfreq_with_different_n(self): + ser = Series([1, 2], index=PeriodIndex(["2020-01", "2020-03"], freq="2M")) + result = ser.asfreq("M") + + excepted = Series([1, 2], index=PeriodIndex(["2020-02", "2020-04"], freq="M")) + tm.assert_series_equal(result, excepted) + + @pytest.mark.parametrize( + "freq", + [ + "2BMS", + "2YS-MAR", + "2bh", + ], + ) + def test_pi_asfreq_not_supported_frequency(self, freq): + # GH#55785 + msg = f"{freq[1:]} is not supported as period frequency" + + pi = PeriodIndex(["2020-01-01", "2021-01-01"], freq="M") + with pytest.raises(ValueError, match=msg): + pi.asfreq(freq=freq) + + @pytest.mark.parametrize( + "freq", + [ + "2BME", + "2YE-MAR", + "2QE", + ], + ) + def test_pi_asfreq_invalid_frequency(self, freq): + # GH#55785 + msg = f"Invalid frequency: {freq}" + + pi = PeriodIndex(["2020-01-01", "2021-01-01"], freq="M") + with pytest.raises(ValueError, match=msg): + pi.asfreq(freq=freq) + + @pytest.mark.parametrize( + "freq", + [ + offsets.MonthBegin(2), + offsets.BusinessMonthEnd(2), + ], + ) + def test_pi_asfreq_invalid_baseoffset(self, freq): + # GH#56945 + msg = re.escape(f"{freq} is not supported as period frequency") + + pi = PeriodIndex(["2020-01-01", "2021-01-01"], freq="M") + with pytest.raises(ValueError, match=msg): + pi.asfreq(freq=freq) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/period/methods/test_factorize.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/period/methods/test_factorize.py new file mode 100644 index 0000000000000000000000000000000000000000..1239eae6091b81dfcc1ac049996296f6af565df8 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/period/methods/test_factorize.py @@ -0,0 +1,41 @@ +import numpy as np + +from pandas import PeriodIndex +import pandas._testing as tm + + +class TestFactorize: + def test_factorize_period(self): + idx1 = PeriodIndex( + ["2014-01", "2014-01", "2014-02", "2014-02", "2014-03", "2014-03"], + freq="M", + ) + + exp_arr = np.array([0, 0, 1, 1, 2, 2], dtype=np.intp) + exp_idx = PeriodIndex(["2014-01", "2014-02", "2014-03"], freq="M") + + arr, idx = idx1.factorize() + tm.assert_numpy_array_equal(arr, exp_arr) + tm.assert_index_equal(idx, exp_idx) + + arr, idx = idx1.factorize(sort=True) + tm.assert_numpy_array_equal(arr, exp_arr) + tm.assert_index_equal(idx, exp_idx) + + def test_factorize_period_nonmonotonic(self): + idx2 = PeriodIndex( + ["2014-03", "2014-03", "2014-02", "2014-01", "2014-03", "2014-01"], + freq="M", + ) + exp_idx = PeriodIndex(["2014-01", "2014-02", "2014-03"], freq="M") + + exp_arr = np.array([2, 2, 1, 0, 2, 0], dtype=np.intp) + arr, idx = idx2.factorize(sort=True) + tm.assert_numpy_array_equal(arr, exp_arr) + tm.assert_index_equal(idx, exp_idx) + + exp_arr = np.array([0, 0, 1, 2, 0, 2], dtype=np.intp) + exp_idx = PeriodIndex(["2014-03", "2014-02", "2014-01"], freq="M") + arr, idx = idx2.factorize() + tm.assert_numpy_array_equal(arr, exp_arr) + tm.assert_index_equal(idx, exp_idx) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/period/methods/test_fillna.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/period/methods/test_fillna.py new file mode 100644 index 0000000000000000000000000000000000000000..ed6b4686a06defdc3eac4e1f6427fb0569c2d48d --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/period/methods/test_fillna.py @@ -0,0 +1,41 @@ +from pandas import ( + Index, + NaT, + Period, + PeriodIndex, +) +import pandas._testing as tm + + +class TestFillNA: + def test_fillna_period(self): + # GH#11343 + idx = PeriodIndex(["2011-01-01 09:00", NaT, "2011-01-01 11:00"], freq="h") + + exp = PeriodIndex( + ["2011-01-01 09:00", "2011-01-01 10:00", "2011-01-01 11:00"], freq="h" + ) + result = idx.fillna(Period("2011-01-01 10:00", freq="h")) + tm.assert_index_equal(result, exp) + + exp = Index( + [ + Period("2011-01-01 09:00", freq="h"), + "x", + Period("2011-01-01 11:00", freq="h"), + ], + dtype=object, + ) + result = idx.fillna("x") + tm.assert_index_equal(result, exp) + + exp = Index( + [ + Period("2011-01-01 09:00", freq="h"), + Period("2011-01-01", freq="D"), + Period("2011-01-01 11:00", freq="h"), + ], + dtype=object, + ) + result = idx.fillna(Period("2011-01-01", freq="D")) + tm.assert_index_equal(result, exp) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/period/methods/test_is_full.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/period/methods/test_is_full.py new file mode 100644 index 0000000000000000000000000000000000000000..b4105bedbe21d6dc85379f1a6eefb298db954056 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/period/methods/test_is_full.py @@ -0,0 +1,23 @@ +import pytest + +from pandas import PeriodIndex + + +def test_is_full(): + index = PeriodIndex([2005, 2007, 2009], freq="Y") + assert not index.is_full + + index = PeriodIndex([2005, 2006, 2007], freq="Y") + assert index.is_full + + index = PeriodIndex([2005, 2005, 2007], freq="Y") + assert not index.is_full + + index = PeriodIndex([2005, 2005, 2006], freq="Y") + assert index.is_full + + index = PeriodIndex([2006, 2005, 2005], freq="Y") + with pytest.raises(ValueError, match="Index is not monotonic"): + index.is_full + + assert index[:0].is_full diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/period/methods/test_to_timestamp.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/period/methods/test_to_timestamp.py new file mode 100644 index 0000000000000000000000000000000000000000..3867f9e3245dc10a90ab4fcb1458b861ee7e2f86 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/period/methods/test_to_timestamp.py @@ -0,0 +1,142 @@ +from datetime import datetime + +import numpy as np +import pytest + +from pandas import ( + DatetimeIndex, + NaT, + PeriodIndex, + Timedelta, + Timestamp, + date_range, + period_range, +) +import pandas._testing as tm + + +class TestToTimestamp: + def test_to_timestamp_non_contiguous(self): + # GH#44100 + dti = date_range("2021-10-18", periods=9, freq="D") + pi = dti.to_period() + + result = pi[::2].to_timestamp() + expected = dti[::2] + tm.assert_index_equal(result, expected) + + result = pi._data[::2].to_timestamp() + expected = dti._data[::2] + # TODO: can we get the freq to round-trip? + tm.assert_datetime_array_equal(result, expected, check_freq=False) + + result = pi[::-1].to_timestamp() + expected = dti[::-1] + tm.assert_index_equal(result, expected) + + result = pi._data[::-1].to_timestamp() + expected = dti._data[::-1] + tm.assert_datetime_array_equal(result, expected, check_freq=False) + + result = pi[::2][::-1].to_timestamp() + expected = dti[::2][::-1] + tm.assert_index_equal(result, expected) + + result = pi._data[::2][::-1].to_timestamp() + expected = dti._data[::2][::-1] + tm.assert_datetime_array_equal(result, expected, check_freq=False) + + def test_to_timestamp_freq(self): + idx = period_range("2017", periods=12, freq="Y-DEC") + result = idx.to_timestamp() + expected = date_range("2017", periods=12, freq="YS-JAN") + tm.assert_index_equal(result, expected) + + def test_to_timestamp_pi_nat(self): + # GH#7228 + index = PeriodIndex(["NaT", "2011-01", "2011-02"], freq="M", name="idx") + + result = index.to_timestamp("D") + expected = DatetimeIndex( + [NaT, datetime(2011, 1, 1), datetime(2011, 2, 1)], + dtype="M8[ns]", + name="idx", + ) + tm.assert_index_equal(result, expected) + assert result.name == "idx" + + result2 = result.to_period(freq="M") + tm.assert_index_equal(result2, index) + assert result2.name == "idx" + + result3 = result.to_period(freq="3M") + exp = PeriodIndex(["NaT", "2011-01", "2011-02"], freq="3M", name="idx") + tm.assert_index_equal(result3, exp) + assert result3.freqstr == "3M" + + msg = "Frequency must be positive, because it represents span: -2Y" + with pytest.raises(ValueError, match=msg): + result.to_period(freq="-2Y") + + def test_to_timestamp_preserve_name(self): + index = period_range(freq="Y", start="1/1/2001", end="12/1/2009", name="foo") + assert index.name == "foo" + + conv = index.to_timestamp("D") + assert conv.name == "foo" + + def test_to_timestamp_quarterly_bug(self): + years = np.arange(1960, 2000).repeat(4) + quarters = np.tile(list(range(1, 5)), 40) + + pindex = PeriodIndex.from_fields(year=years, quarter=quarters) + + stamps = pindex.to_timestamp("D", "end") + expected = DatetimeIndex([x.to_timestamp("D", "end") for x in pindex]) + tm.assert_index_equal(stamps, expected) + assert stamps.freq == expected.freq + + def test_to_timestamp_pi_mult(self): + idx = PeriodIndex(["2011-01", "NaT", "2011-02"], freq="2M", name="idx") + + result = idx.to_timestamp() + expected = DatetimeIndex( + ["2011-01-01", "NaT", "2011-02-01"], dtype="M8[ns]", name="idx" + ) + tm.assert_index_equal(result, expected) + + result = idx.to_timestamp(how="E") + expected = DatetimeIndex( + ["2011-02-28", "NaT", "2011-03-31"], dtype="M8[ns]", name="idx" + ) + expected = expected + Timedelta(1, "D") - Timedelta(1, "ns") + tm.assert_index_equal(result, expected) + + def test_to_timestamp_pi_combined(self): + idx = period_range(start="2011", periods=2, freq="1D1h", name="idx") + + result = idx.to_timestamp() + expected = DatetimeIndex( + ["2011-01-01 00:00", "2011-01-02 01:00"], dtype="M8[ns]", name="idx" + ) + tm.assert_index_equal(result, expected) + + result = idx.to_timestamp(how="E") + expected = DatetimeIndex( + ["2011-01-02 00:59:59", "2011-01-03 01:59:59"], name="idx", dtype="M8[ns]" + ) + expected = expected + Timedelta(1, "s") - Timedelta(1, "ns") + tm.assert_index_equal(result, expected) + + result = idx.to_timestamp(how="E", freq="h") + expected = DatetimeIndex( + ["2011-01-02 00:00", "2011-01-03 01:00"], dtype="M8[ns]", name="idx" + ) + expected = expected + Timedelta(1, "h") - Timedelta(1, "ns") + tm.assert_index_equal(result, expected) + + def test_to_timestamp_1703(self): + index = period_range("1/1/2012", periods=4, freq="D") + + result = index.to_timestamp() + assert result[0] == Timestamp("1/1/2012") diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/timedeltas/__pycache__/test_delete.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/timedeltas/__pycache__/test_delete.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b54246b3513abd242ea550bfb330526ca96ced62 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/timedeltas/__pycache__/test_delete.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/timedeltas/__pycache__/test_freq_attr.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/timedeltas/__pycache__/test_freq_attr.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..7ac92f91d21cbfa25ed80827d96b2354da12636a Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/timedeltas/__pycache__/test_freq_attr.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/timedeltas/__pycache__/test_join.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/timedeltas/__pycache__/test_join.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..0159b72d846a00f85c5573fcff65a9ea78db7bf3 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/timedeltas/__pycache__/test_join.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/timedeltas/__pycache__/test_ops.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/timedeltas/__pycache__/test_ops.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..18a1d30544c16f76bb90ff0ce18a297d5699e71e Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/indexes/timedeltas/__pycache__/test_ops.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/excel/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/excel/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..17fb6ceb9f5997df580f67bb561207bbd1c921e9 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/excel/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/excel/__pycache__/test_odf.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/excel/__pycache__/test_odf.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a86e4034367b8361e643d1d1fa4bc3e226781a65 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/excel/__pycache__/test_odf.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/excel/__pycache__/test_odswriter.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/excel/__pycache__/test_odswriter.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..914f0595a066898aa55f0baed072a68f8e03d3ac Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/excel/__pycache__/test_odswriter.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/excel/__pycache__/test_openpyxl.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/excel/__pycache__/test_openpyxl.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d4937441f41e2b42e6f74c83a023dcc012f13222 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/excel/__pycache__/test_openpyxl.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/excel/__pycache__/test_readers.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/excel/__pycache__/test_readers.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f659ae3b40c0753d62133f37d3cc939432d755d9 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/excel/__pycache__/test_readers.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/excel/__pycache__/test_style.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/excel/__pycache__/test_style.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..bb632f07eff0e94ad587467350cf3a22c5601b97 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/excel/__pycache__/test_style.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/excel/__pycache__/test_writers.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/excel/__pycache__/test_writers.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..6fd0bf99fa69eb4c837f9904224c2ab2f680158c Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/excel/__pycache__/test_writers.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/excel/__pycache__/test_xlrd.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/excel/__pycache__/test_xlrd.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..17e8a99f4bc23a619ef8eb70313224c860aaab40 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/excel/__pycache__/test_xlrd.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/excel/__pycache__/test_xlsxwriter.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/excel/__pycache__/test_xlsxwriter.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..36a8616098853810b4dc630192de59263deebc94 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/excel/__pycache__/test_xlsxwriter.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/parser/common/__pycache__/test_chunksize.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/parser/common/__pycache__/test_chunksize.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e28389e9b5f21c289f9ef36d3321a69fc7e044c8 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/parser/common/__pycache__/test_chunksize.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/parser/common/__pycache__/test_file_buffer_url.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/parser/common/__pycache__/test_file_buffer_url.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..0fa607d6fae456034498b5ee110de32c93f1a6fa Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/parser/common/__pycache__/test_file_buffer_url.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/parser/common/__pycache__/test_inf.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/parser/common/__pycache__/test_inf.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..13ca83fef0c5b8be2587e24ac9d1ae25628f2bb1 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/parser/common/__pycache__/test_inf.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/pytables/__pycache__/common.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/pytables/__pycache__/common.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..27f63c9b1f394ff0aa9a49ffdde41727bf8a0cd5 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/pytables/__pycache__/common.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/pytables/__pycache__/conftest.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/pytables/__pycache__/conftest.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..7227e9a2b33cdc272c655b45b35ba67e06c3d796 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/pytables/__pycache__/conftest.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/pytables/__pycache__/test_append.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/pytables/__pycache__/test_append.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ac3824229ae73c99af0599e1baf9121a9638bc30 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/pytables/__pycache__/test_append.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/pytables/__pycache__/test_categorical.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/pytables/__pycache__/test_categorical.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e97d649f212f621f75af24a91098f199e007b663 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/pytables/__pycache__/test_categorical.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/pytables/__pycache__/test_errors.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/pytables/__pycache__/test_errors.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..174c832f7189abf5e7c7336d47dfc293c14d4202 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/pytables/__pycache__/test_errors.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/pytables/__pycache__/test_file_handling.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/pytables/__pycache__/test_file_handling.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d32dc8ef9c6f3031045c642708d006274117c3ec Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/pytables/__pycache__/test_file_handling.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/pytables/__pycache__/test_keys.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/pytables/__pycache__/test_keys.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d3333047e7c486af123d02c4bbed8ad864e14848 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/pytables/__pycache__/test_keys.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/pytables/__pycache__/test_put.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/pytables/__pycache__/test_put.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e67f62b47e9216e08620c9d5255d56e47e0e2929 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/pytables/__pycache__/test_put.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/pytables/__pycache__/test_read.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/pytables/__pycache__/test_read.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..842b47a8363a0177b1e8961bc54afe96aa09373e Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/pytables/__pycache__/test_read.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/pytables/__pycache__/test_select.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/pytables/__pycache__/test_select.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..0058a7217b92e195490dd5b6d0274400097de717 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/pytables/__pycache__/test_select.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/pytables/__pycache__/test_subclass.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/pytables/__pycache__/test_subclass.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c6498ebef7b9148ee48d14702ee2616c0ea65f4d Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/io/pytables/__pycache__/test_subclass.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/reductions/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/reductions/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..8baebe1cd164a805b53ff7b454a328eff5f0f05c Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/reductions/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/reductions/__pycache__/test_reductions.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/reductions/__pycache__/test_reductions.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b6a859fb237d32d3415ccf32ddbe788c1cd5def2 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/reductions/__pycache__/test_reductions.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/reductions/__pycache__/test_stat_reductions.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/reductions/__pycache__/test_stat_reductions.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..3f6c67b18ba96c73dd425aaae6c3db111e3e70e7 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/reductions/__pycache__/test_stat_reductions.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/scalar/timedelta/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/scalar/timedelta/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..292831db7b65cc18a2f264f4ebd2095c8ac6900d Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/scalar/timedelta/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/scalar/timedelta/__pycache__/test_arithmetic.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/scalar/timedelta/__pycache__/test_arithmetic.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..0873ba10f915cf5034cb06c47dd77b0151addc05 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/scalar/timedelta/__pycache__/test_arithmetic.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/scalar/timedelta/__pycache__/test_constructors.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/scalar/timedelta/__pycache__/test_constructors.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..bee1a0e4a5f3ec394bd68398b16a8c45f46e320e Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/scalar/timedelta/__pycache__/test_constructors.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/scalar/timedelta/__pycache__/test_formats.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/scalar/timedelta/__pycache__/test_formats.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..389f7a5ee782e7ecd755ea5beb85f8b9f702cdf9 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/scalar/timedelta/__pycache__/test_formats.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/scalar/timedelta/__pycache__/test_timedelta.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/scalar/timedelta/__pycache__/test_timedelta.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e2c9f44ae93d8c7d8e82efe44b46b4769b951f5f Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/scalar/timedelta/__pycache__/test_timedelta.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/scalar/timedelta/methods/__init__.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/scalar/timedelta/methods/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/scalar/timedelta/methods/test_as_unit.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/scalar/timedelta/methods/test_as_unit.py new file mode 100644 index 0000000000000000000000000000000000000000..8660141e5a5372d4bb8e921bc8b3a5ee148e8900 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/scalar/timedelta/methods/test_as_unit.py @@ -0,0 +1,80 @@ +import pytest + +from pandas._libs.tslibs.dtypes import NpyDatetimeUnit +from pandas.errors import OutOfBoundsTimedelta + +from pandas import Timedelta + + +class TestAsUnit: + def test_as_unit(self): + td = Timedelta(days=1) + + assert td.as_unit("ns") is td + + res = td.as_unit("us") + assert res._value == td._value // 1000 + assert res._creso == NpyDatetimeUnit.NPY_FR_us.value + + rt = res.as_unit("ns") + assert rt._value == td._value + assert rt._creso == td._creso + + res = td.as_unit("ms") + assert res._value == td._value // 1_000_000 + assert res._creso == NpyDatetimeUnit.NPY_FR_ms.value + + rt = res.as_unit("ns") + assert rt._value == td._value + assert rt._creso == td._creso + + res = td.as_unit("s") + assert res._value == td._value // 1_000_000_000 + assert res._creso == NpyDatetimeUnit.NPY_FR_s.value + + rt = res.as_unit("ns") + assert rt._value == td._value + assert rt._creso == td._creso + + def test_as_unit_overflows(self): + # microsecond that would be just out of bounds for nano + us = 9223372800000000 + td = Timedelta._from_value_and_reso(us, NpyDatetimeUnit.NPY_FR_us.value) + + msg = "Cannot cast 106752 days 00:00:00 to unit='ns' without overflow" + with pytest.raises(OutOfBoundsTimedelta, match=msg): + td.as_unit("ns") + + res = td.as_unit("ms") + assert res._value == us // 1000 + assert res._creso == NpyDatetimeUnit.NPY_FR_ms.value + + def test_as_unit_rounding(self): + td = Timedelta(microseconds=1500) + res = td.as_unit("ms") + + expected = Timedelta(milliseconds=1) + assert res == expected + + assert res._creso == NpyDatetimeUnit.NPY_FR_ms.value + assert res._value == 1 + + with pytest.raises(ValueError, match="Cannot losslessly convert units"): + td.as_unit("ms", round_ok=False) + + def test_as_unit_non_nano(self): + # case where we are going neither to nor from nano + td = Timedelta(days=1).as_unit("ms") + assert td.days == 1 + assert td._value == 86_400_000 + assert td.components.days == 1 + assert td._d == 1 + assert td.total_seconds() == 86400 + + res = td.as_unit("us") + assert res._value == 86_400_000_000 + assert res.components.days == 1 + assert res.components.hours == 0 + assert res._d == 1 + assert res._h == 0 + assert res.total_seconds() == 86400 diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/scalar/timedelta/methods/test_round.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/scalar/timedelta/methods/test_round.py new file mode 100644 index 0000000000000000000000000000000000000000..e54adb27d126bf13b454c513cecf847cdbb623bb --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/scalar/timedelta/methods/test_round.py @@ -0,0 +1,187 @@ +from hypothesis import ( + given, + strategies as st, +) +import numpy as np +import pytest + +from pandas._libs import lib +from pandas._libs.tslibs import iNaT +from pandas.errors import OutOfBoundsTimedelta + +from pandas import Timedelta + + +class TestTimedeltaRound: + @pytest.mark.parametrize( + "freq,s1,s2", + [ + # This first case has s1, s2 being the same as t1,t2 below + ( + "ns", + Timedelta("1 days 02:34:56.789123456"), + Timedelta("-1 days 02:34:56.789123456"), + ), + ( + "us", + Timedelta("1 days 02:34:56.789123000"), + Timedelta("-1 days 02:34:56.789123000"), + ), + ( + "ms", + Timedelta("1 days 02:34:56.789000000"), + Timedelta("-1 days 02:34:56.789000000"), + ), + ("s", Timedelta("1 days 02:34:57"), Timedelta("-1 days 02:34:57")), + ("2s", Timedelta("1 days 02:34:56"), Timedelta("-1 days 02:34:56")), + ("5s", Timedelta("1 days 02:34:55"), Timedelta("-1 days 02:34:55")), + ("min", Timedelta("1 days 02:35:00"), Timedelta("-1 days 02:35:00")), + ("12min", Timedelta("1 days 02:36:00"), Timedelta("-1 days 02:36:00")), + ("h", Timedelta("1 days 03:00:00"), Timedelta("-1 days 03:00:00")), + ("d", Timedelta("1 days"), Timedelta("-1 days")), + ], + ) + def test_round(self, freq, s1, s2): + t1 = Timedelta("1 days 02:34:56.789123456") + t2 = Timedelta("-1 days 02:34:56.789123456") + + r1 = t1.round(freq) + assert r1 == s1 + r2 = t2.round(freq) + assert r2 == s2 + + def test_round_invalid(self): + t1 = Timedelta("1 days 02:34:56.789123456") + + for freq, msg in [ + ("YE", " is a non-fixed frequency"), + ("ME", " is a non-fixed frequency"), + ("foobar", "Invalid frequency: foobar"), + ]: + with pytest.raises(ValueError, match=msg): + t1.round(freq) + + @pytest.mark.skip_ubsan + def test_round_implementation_bounds(self): + # See also: analogous test for Timestamp + # GH#38964 + result = Timedelta.min.ceil("s") + expected = Timedelta.min + Timedelta(seconds=1) - Timedelta(145224193) + assert result == expected + + result = Timedelta.max.floor("s") + expected = Timedelta.max - Timedelta(854775807) + assert result == expected + + msg = ( + r"Cannot round -106752 days \+00:12:43.145224193 to freq=s without overflow" + ) + with pytest.raises(OutOfBoundsTimedelta, match=msg): + Timedelta.min.floor("s") + with pytest.raises(OutOfBoundsTimedelta, match=msg): + Timedelta.min.round("s") + + msg = "Cannot round 106751 days 23:47:16.854775807 to freq=s without overflow" + with pytest.raises(OutOfBoundsTimedelta, match=msg): + Timedelta.max.ceil("s") + with pytest.raises(OutOfBoundsTimedelta, match=msg): + Timedelta.max.round("s") + + @pytest.mark.skip_ubsan + @given(val=st.integers(min_value=iNaT + 1, max_value=lib.i8max)) + @pytest.mark.parametrize( + "method", [Timedelta.round, Timedelta.floor, Timedelta.ceil] + ) + def test_round_sanity(self, val, method): + cls = Timedelta + err_cls = OutOfBoundsTimedelta + + val = np.int64(val) + td = cls(val) + + def checker(ts, nanos, unit): + # First check that we do raise in cases where we should + if nanos == 1: + pass + else: + div, mod = divmod(ts._value, nanos) + diff = int(nanos - mod) + lb = ts._value - mod + assert lb <= ts._value # i.e. no overflows with python ints + ub = ts._value + diff + assert ub > ts._value # i.e. no overflows with python ints + + msg = "without overflow" + if mod == 0: + # We should never be raising in this + pass + elif method is cls.ceil: + if ub > cls.max._value: + with pytest.raises(err_cls, match=msg): + method(ts, unit) + return + elif method is cls.floor: + if lb < cls.min._value: + with pytest.raises(err_cls, match=msg): + method(ts, unit) + return + elif mod >= diff: + if ub > cls.max._value: + with pytest.raises(err_cls, match=msg): + method(ts, unit) + return + elif lb < cls.min._value: + with pytest.raises(err_cls, match=msg): + method(ts, unit) + return + + res = method(ts, unit) + + td = res - ts + diff = abs(td._value) + assert diff < nanos + assert res._value % nanos == 0 + + if method is cls.round: + assert diff <= nanos / 2 + elif method is cls.floor: + assert res <= ts + elif method is cls.ceil: + assert res >= ts + + nanos = 1 + checker(td, nanos, "ns") + + nanos = 1000 + checker(td, nanos, "us") + + nanos = 1_000_000 + checker(td, nanos, "ms") + + nanos = 1_000_000_000 + checker(td, nanos, "s") + + nanos = 60 * 1_000_000_000 + checker(td, nanos, "min") + + nanos = 60 * 60 * 1_000_000_000 + checker(td, nanos, "h") + + nanos = 24 * 60 * 60 * 1_000_000_000 + checker(td, nanos, "D") + + @pytest.mark.parametrize("unit", ["ns", "us", "ms", "s"]) + def test_round_non_nano(self, unit): + td = Timedelta("1 days 02:34:57").as_unit(unit) + + res = td.round("min") + assert res == Timedelta("1 days 02:35:00") + assert res._creso == td._creso + + res = td.floor("min") + assert res == Timedelta("1 days 02:34:00") + assert res._creso == td._creso + + res = td.ceil("min") + assert res == Timedelta("1 days 02:35:00") + assert res._creso == td._creso diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/scalar/timestamp/methods/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/scalar/timestamp/methods/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ae5f34b073d5b58a9faa2a15818e7cde9e1baeea Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/scalar/timestamp/methods/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/scalar/timestamp/methods/__pycache__/test_as_unit.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/scalar/timestamp/methods/__pycache__/test_as_unit.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..3f6b2e25ea0fa41d0b701096a1479fec7655af09 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/scalar/timestamp/methods/__pycache__/test_as_unit.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/scalar/timestamp/methods/__pycache__/test_timestamp_method.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/scalar/timestamp/methods/__pycache__/test_timestamp_method.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c1055a6406785678798f8a2aead8dac9ace77c00 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/scalar/timestamp/methods/__pycache__/test_timestamp_method.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/scalar/timestamp/methods/__pycache__/test_to_pydatetime.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/scalar/timestamp/methods/__pycache__/test_to_pydatetime.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..09b618a07d673a212c4904490e4662081dc6a1e7 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/scalar/timestamp/methods/__pycache__/test_to_pydatetime.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/scalar/timestamp/methods/__pycache__/test_tz_localize.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/scalar/timestamp/methods/__pycache__/test_tz_localize.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c6ad1e5c125a32abcfb8643cb077912b16c809d9 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/scalar/timestamp/methods/__pycache__/test_tz_localize.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/series/indexing/__init__.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/series/indexing/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/series/indexing/test_delitem.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/series/indexing/test_delitem.py new file mode 100644 index 0000000000000000000000000000000000000000..3d1082c3d040b95063d10c7160154b62dc1c84f6 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/series/indexing/test_delitem.py @@ -0,0 +1,70 @@ +import pytest + +from pandas import ( + Index, + Series, + date_range, +) +import pandas._testing as tm + + +class TestSeriesDelItem: + def test_delitem(self): + # GH#5542 + # should delete the item inplace + s = Series(range(5)) + del s[0] + + expected = Series(range(1, 5), index=range(1, 5)) + tm.assert_series_equal(s, expected) + + del s[1] + expected = Series(range(2, 5), index=range(2, 5)) + tm.assert_series_equal(s, expected) + + # only 1 left, del, add, del + s = Series(1) + del s[0] + tm.assert_series_equal(s, Series(dtype="int64", index=Index([], dtype="int64"))) + s[0] = 1 + tm.assert_series_equal(s, Series(1)) + del s[0] + tm.assert_series_equal(s, Series(dtype="int64", index=Index([], dtype="int64"))) + + def test_delitem_object_index(self, using_infer_string): + # Index(dtype=object) + dtype = "string[pyarrow_numpy]" if using_infer_string else object + s = Series(1, index=Index(["a"], dtype=dtype)) + del s["a"] + tm.assert_series_equal(s, Series(dtype="int64", index=Index([], dtype=dtype))) + s["a"] = 1 + tm.assert_series_equal(s, Series(1, index=Index(["a"], dtype=dtype))) + del s["a"] + tm.assert_series_equal(s, Series(dtype="int64", index=Index([], dtype=dtype))) + + def test_delitem_missing_key(self): + # empty + s = Series(dtype=object) + + with pytest.raises(KeyError, match=r"^0$"): + del s[0] + + def test_delitem_extension_dtype(self): + # GH#40386 + # DatetimeTZDtype + dti = date_range("2016-01-01", periods=3, tz="US/Pacific") + ser = Series(dti) + + expected = ser[[0, 2]] + del ser[1] + assert ser.dtype == dti.dtype + tm.assert_series_equal(ser, expected) + + # PeriodDtype + pi = dti.tz_localize(None).to_period("D") + ser = Series(pi) + + expected = ser[:2] + del ser[2] + assert ser.dtype == pi.dtype + tm.assert_series_equal(ser, expected) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/series/indexing/test_getitem.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/series/indexing/test_getitem.py new file mode 100644 index 0000000000000000000000000000000000000000..596a225c288b890461d4ea5d03bef6f6eb1bc04b --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/series/indexing/test_getitem.py @@ -0,0 +1,735 @@ +""" +Series.__getitem__ test classes are organized by the type of key passed. +""" +from datetime import ( + date, + datetime, + time, +) + +import numpy as np +import pytest + +from pandas._libs.tslibs import ( + conversion, + timezones, +) + +from pandas.core.dtypes.common import is_scalar + +import pandas as pd +from pandas import ( + Categorical, + DataFrame, + DatetimeIndex, + Index, + Series, + Timestamp, + date_range, + period_range, + timedelta_range, +) +import pandas._testing as tm +from pandas.core.indexing import IndexingError + +from pandas.tseries.offsets import BDay + + +class TestSeriesGetitemScalars: + def test_getitem_object_index_float_string(self): + # GH#17286 + ser = Series([1] * 4, index=Index(["a", "b", "c", 1.0])) + assert ser["a"] == 1 + assert ser[1.0] == 1 + + def test_getitem_float_keys_tuple_values(self): + # see GH#13509 + + # unique Index + ser = Series([(1, 1), (2, 2), (3, 3)], index=[0.0, 0.1, 0.2], name="foo") + result = ser[0.0] + assert result == (1, 1) + + # non-unique Index + expected = Series([(1, 1), (2, 2)], index=[0.0, 0.0], name="foo") + ser = Series([(1, 1), (2, 2), (3, 3)], index=[0.0, 0.0, 0.2], name="foo") + + result = ser[0.0] + tm.assert_series_equal(result, expected) + + def test_getitem_unrecognized_scalar(self): + # GH#32684 a scalar key that is not recognized by lib.is_scalar + + # a series that might be produced via `frame.dtypes` + ser = Series([1, 2], index=[np.dtype("O"), np.dtype("i8")]) + + key = ser.index[1] + + result = ser[key] + assert result == 2 + + def test_getitem_negative_out_of_bounds(self): + ser = Series(["a"] * 10, index=["a"] * 10) + + msg = "index -11 is out of bounds for axis 0 with size 10|index out of bounds" + warn_msg = "Series.__getitem__ treating keys as positions is deprecated" + with pytest.raises(IndexError, match=msg): + with tm.assert_produces_warning(FutureWarning, match=warn_msg): + ser[-11] + + def test_getitem_out_of_bounds_indexerror(self, datetime_series): + # don't segfault, GH#495 + msg = r"index \d+ is out of bounds for axis 0 with size \d+" + warn_msg = "Series.__getitem__ treating keys as positions is deprecated" + with pytest.raises(IndexError, match=msg): + with tm.assert_produces_warning(FutureWarning, match=warn_msg): + datetime_series[len(datetime_series)] + + def test_getitem_out_of_bounds_empty_rangeindex_keyerror(self): + # GH#917 + # With a RangeIndex, an int key gives a KeyError + ser = Series([], dtype=object) + with pytest.raises(KeyError, match="-1"): + ser[-1] + + def test_getitem_keyerror_with_integer_index(self, any_int_numpy_dtype): + dtype = any_int_numpy_dtype + ser = Series( + np.random.default_rng(2).standard_normal(6), + index=Index([0, 0, 1, 1, 2, 2], dtype=dtype), + ) + + with pytest.raises(KeyError, match=r"^5$"): + ser[5] + + with pytest.raises(KeyError, match=r"^'c'$"): + ser["c"] + + # not monotonic + ser = Series( + np.random.default_rng(2).standard_normal(6), index=[2, 2, 0, 0, 1, 1] + ) + + with pytest.raises(KeyError, match=r"^5$"): + ser[5] + + with pytest.raises(KeyError, match=r"^'c'$"): + ser["c"] + + def test_getitem_int64(self, datetime_series): + idx = np.int64(5) + msg = "Series.__getitem__ treating keys as positions is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + res = datetime_series[idx] + assert res == datetime_series.iloc[5] + + def test_getitem_full_range(self): + # github.com/pandas-dev/pandas/commit/4f433773141d2eb384325714a2776bcc5b2e20f7 + ser = Series(range(5), index=list(range(5))) + result = ser[list(range(5))] + tm.assert_series_equal(result, ser) + + # ------------------------------------------------------------------ + # Series with DatetimeIndex + + @pytest.mark.parametrize("tzstr", ["Europe/Berlin", "dateutil/Europe/Berlin"]) + def test_getitem_pydatetime_tz(self, tzstr): + tz = timezones.maybe_get_tz(tzstr) + + index = date_range( + start="2012-12-24 16:00", end="2012-12-24 18:00", freq="h", tz=tzstr + ) + ts = Series(index=index, data=index.hour) + time_pandas = Timestamp("2012-12-24 17:00", tz=tzstr) + + dt = datetime(2012, 12, 24, 17, 0) + time_datetime = conversion.localize_pydatetime(dt, tz) + assert ts[time_pandas] == ts[time_datetime] + + @pytest.mark.parametrize("tz", ["US/Eastern", "dateutil/US/Eastern"]) + def test_string_index_alias_tz_aware(self, tz): + rng = date_range("1/1/2000", periods=10, tz=tz) + ser = Series(np.random.default_rng(2).standard_normal(len(rng)), index=rng) + + result = ser["1/3/2000"] + tm.assert_almost_equal(result, ser.iloc[2]) + + def test_getitem_time_object(self): + rng = date_range("1/1/2000", "1/5/2000", freq="5min") + ts = Series(np.random.default_rng(2).standard_normal(len(rng)), index=rng) + + mask = (rng.hour == 9) & (rng.minute == 30) + result = ts[time(9, 30)] + expected = ts[mask] + result.index = result.index._with_freq(None) + tm.assert_series_equal(result, expected) + + # ------------------------------------------------------------------ + # Series with CategoricalIndex + + def test_getitem_scalar_categorical_index(self): + cats = Categorical([Timestamp("12-31-1999"), Timestamp("12-31-2000")]) + + ser = Series([1, 2], index=cats) + + expected = ser.iloc[0] + result = ser[cats[0]] + assert result == expected + + def test_getitem_numeric_categorical_listlike_matches_scalar(self): + # GH#15470 + ser = Series(["a", "b", "c"], index=pd.CategoricalIndex([2, 1, 0])) + + # 0 is treated as a label + assert ser[0] == "c" + + # the listlike analogue should also be treated as labels + res = ser[[0]] + expected = ser.iloc[-1:] + tm.assert_series_equal(res, expected) + + res2 = ser[[0, 1, 2]] + tm.assert_series_equal(res2, ser.iloc[::-1]) + + def test_getitem_integer_categorical_not_positional(self): + # GH#14865 + ser = Series(["a", "b", "c"], index=Index([1, 2, 3], dtype="category")) + assert ser.get(3) == "c" + assert ser[3] == "c" + + def test_getitem_str_with_timedeltaindex(self): + rng = timedelta_range("1 day 10:11:12", freq="h", periods=500) + ser = Series(np.arange(len(rng)), index=rng) + + key = "6 days, 23:11:12" + indexer = rng.get_loc(key) + assert indexer == 133 + + result = ser[key] + assert result == ser.iloc[133] + + msg = r"^Timedelta\('50 days 00:00:00'\)$" + with pytest.raises(KeyError, match=msg): + rng.get_loc("50 days") + with pytest.raises(KeyError, match=msg): + ser["50 days"] + + def test_getitem_bool_index_positional(self): + # GH#48653 + ser = Series({True: 1, False: 0}) + msg = "Series.__getitem__ treating keys as positions is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = ser[0] + assert result == 1 + + +class TestSeriesGetitemSlices: + def test_getitem_partial_str_slice_with_datetimeindex(self): + # GH#34860 + arr = date_range("1/1/2008", "1/1/2009") + ser = arr.to_series() + result = ser["2008"] + + rng = date_range(start="2008-01-01", end="2008-12-31") + expected = Series(rng, index=rng) + + tm.assert_series_equal(result, expected) + + def test_getitem_slice_strings_with_datetimeindex(self): + idx = DatetimeIndex( + ["1/1/2000", "1/2/2000", "1/2/2000", "1/3/2000", "1/4/2000"] + ) + + ts = Series(np.random.default_rng(2).standard_normal(len(idx)), index=idx) + + result = ts["1/2/2000":] + expected = ts[1:] + tm.assert_series_equal(result, expected) + + result = ts["1/2/2000":"1/3/2000"] + expected = ts[1:4] + tm.assert_series_equal(result, expected) + + def test_getitem_partial_str_slice_with_timedeltaindex(self): + rng = timedelta_range("1 day 10:11:12", freq="h", periods=500) + ser = Series(np.arange(len(rng)), index=rng) + + result = ser["5 day":"6 day"] + expected = ser.iloc[86:134] + tm.assert_series_equal(result, expected) + + result = ser["5 day":] + expected = ser.iloc[86:] + tm.assert_series_equal(result, expected) + + result = ser[:"6 day"] + expected = ser.iloc[:134] + tm.assert_series_equal(result, expected) + + def test_getitem_partial_str_slice_high_reso_with_timedeltaindex(self): + # higher reso + rng = timedelta_range("1 day 10:11:12", freq="us", periods=2000) + ser = Series(np.arange(len(rng)), index=rng) + + result = ser["1 day 10:11:12":] + expected = ser.iloc[0:] + tm.assert_series_equal(result, expected) + + result = ser["1 day 10:11:12.001":] + expected = ser.iloc[1000:] + tm.assert_series_equal(result, expected) + + result = ser["1 days, 10:11:12.001001"] + assert result == ser.iloc[1001] + + def test_getitem_slice_2d(self, datetime_series): + # GH#30588 multi-dimensional indexing deprecated + with pytest.raises(ValueError, match="Multi-dimensional indexing"): + datetime_series[:, np.newaxis] + + def test_getitem_median_slice_bug(self): + index = date_range("20090415", "20090519", freq="2B") + ser = Series(np.random.default_rng(2).standard_normal(13), index=index) + + indexer = [slice(6, 7, None)] + msg = "Indexing with a single-item list" + with pytest.raises(ValueError, match=msg): + # GH#31299 + ser[indexer] + # but we're OK with a single-element tuple + result = ser[(indexer[0],)] + expected = ser[indexer[0]] + tm.assert_series_equal(result, expected) + + @pytest.mark.parametrize( + "slc, positions", + [ + [slice(date(2018, 1, 1), None), [0, 1, 2]], + [slice(date(2019, 1, 2), None), [2]], + [slice(date(2020, 1, 1), None), []], + [slice(None, date(2020, 1, 1)), [0, 1, 2]], + [slice(None, date(2019, 1, 1)), [0]], + ], + ) + def test_getitem_slice_date(self, slc, positions): + # https://github.com/pandas-dev/pandas/issues/31501 + ser = Series( + [0, 1, 2], + DatetimeIndex(["2019-01-01", "2019-01-01T06:00:00", "2019-01-02"]), + ) + result = ser[slc] + expected = ser.take(positions) + tm.assert_series_equal(result, expected) + + def test_getitem_slice_float_raises(self, datetime_series): + msg = ( + "cannot do slice indexing on DatetimeIndex with these indexers " + r"\[{key}\] of type float" + ) + with pytest.raises(TypeError, match=msg.format(key=r"4\.0")): + datetime_series[4.0:10.0] + + with pytest.raises(TypeError, match=msg.format(key=r"4\.5")): + datetime_series[4.5:10.0] + + def test_getitem_slice_bug(self): + ser = Series(range(10), index=list(range(10))) + result = ser[-12:] + tm.assert_series_equal(result, ser) + + result = ser[-7:] + tm.assert_series_equal(result, ser[3:]) + + result = ser[:-12] + tm.assert_series_equal(result, ser[:0]) + + def test_getitem_slice_integers(self): + ser = Series( + np.random.default_rng(2).standard_normal(8), + index=[2, 4, 6, 8, 10, 12, 14, 16], + ) + + result = ser[:4] + expected = Series(ser.values[:4], index=[2, 4, 6, 8]) + tm.assert_series_equal(result, expected) + + +class TestSeriesGetitemListLike: + @pytest.mark.parametrize("box", [list, np.array, Index, Series]) + def test_getitem_no_matches(self, box): + # GH#33462 we expect the same behavior for list/ndarray/Index/Series + ser = Series(["A", "B"]) + + key = Series(["C"], dtype=object) + key = box(key) + + msg = ( + r"None of \[Index\(\['C'\], dtype='object|string'\)\] are in the \[index\]" + ) + with pytest.raises(KeyError, match=msg): + ser[key] + + def test_getitem_intlist_intindex_periodvalues(self): + ser = Series(period_range("2000-01-01", periods=10, freq="D")) + + result = ser[[2, 4]] + exp = Series( + [pd.Period("2000-01-03", freq="D"), pd.Period("2000-01-05", freq="D")], + index=[2, 4], + dtype="Period[D]", + ) + tm.assert_series_equal(result, exp) + assert result.dtype == "Period[D]" + + @pytest.mark.parametrize("box", [list, np.array, Index]) + def test_getitem_intlist_intervalindex_non_int(self, box): + # GH#33404 fall back to positional since ints are unambiguous + dti = date_range("2000-01-03", periods=3)._with_freq(None) + ii = pd.IntervalIndex.from_breaks(dti) + ser = Series(range(len(ii)), index=ii) + + expected = ser.iloc[:1] + key = box([0]) + msg = "Series.__getitem__ treating keys as positions is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = ser[key] + tm.assert_series_equal(result, expected) + + @pytest.mark.parametrize("box", [list, np.array, Index]) + @pytest.mark.parametrize("dtype", [np.int64, np.float64, np.uint64]) + def test_getitem_intlist_multiindex_numeric_level(self, dtype, box): + # GH#33404 do _not_ fall back to positional since ints are ambiguous + idx = Index(range(4)).astype(dtype) + dti = date_range("2000-01-03", periods=3) + mi = pd.MultiIndex.from_product([idx, dti]) + ser = Series(range(len(mi))[::-1], index=mi) + + key = box([5]) + with pytest.raises(KeyError, match="5"): + ser[key] + + def test_getitem_uint_array_key(self, any_unsigned_int_numpy_dtype): + # GH #37218 + ser = Series([1, 2, 3]) + key = np.array([4], dtype=any_unsigned_int_numpy_dtype) + + with pytest.raises(KeyError, match="4"): + ser[key] + with pytest.raises(KeyError, match="4"): + ser.loc[key] + + +class TestGetitemBooleanMask: + def test_getitem_boolean(self, string_series): + ser = string_series + mask = ser > ser.median() + + # passing list is OK + result = ser[list(mask)] + expected = ser[mask] + tm.assert_series_equal(result, expected) + tm.assert_index_equal(result.index, ser.index[mask]) + + def test_getitem_boolean_empty(self): + ser = Series([], dtype=np.int64) + ser.index.name = "index_name" + ser = ser[ser.isna()] + assert ser.index.name == "index_name" + assert ser.dtype == np.int64 + + # GH#5877 + # indexing with empty series + ser = Series(["A", "B"], dtype=object) + expected = Series(dtype=object, index=Index([], dtype="int64")) + result = ser[Series([], dtype=object)] + tm.assert_series_equal(result, expected) + + # invalid because of the boolean indexer + # that's empty or not-aligned + msg = ( + r"Unalignable boolean Series provided as indexer \(index of " + r"the boolean Series and of the indexed object do not match" + ) + with pytest.raises(IndexingError, match=msg): + ser[Series([], dtype=bool)] + + with pytest.raises(IndexingError, match=msg): + ser[Series([True], dtype=bool)] + + def test_getitem_boolean_object(self, string_series): + # using column from DataFrame + + ser = string_series + mask = ser > ser.median() + omask = mask.astype(object) + + # getitem + result = ser[omask] + expected = ser[mask] + tm.assert_series_equal(result, expected) + + # setitem + s2 = ser.copy() + cop = ser.copy() + cop[omask] = 5 + s2[mask] = 5 + tm.assert_series_equal(cop, s2) + + # nans raise exception + omask[5:10] = np.nan + msg = "Cannot mask with non-boolean array containing NA / NaN values" + with pytest.raises(ValueError, match=msg): + ser[omask] + with pytest.raises(ValueError, match=msg): + ser[omask] = 5 + + def test_getitem_boolean_dt64_copies(self): + # GH#36210 + dti = date_range("2016-01-01", periods=4, tz="US/Pacific") + key = np.array([True, True, False, False]) + + ser = Series(dti._data) + + res = ser[key] + assert res._values._ndarray.base is None + + # compare with numeric case for reference + ser2 = Series(range(4)) + res2 = ser2[key] + assert res2._values.base is None + + def test_getitem_boolean_corner(self, datetime_series): + ts = datetime_series + mask_shifted = ts.shift(1, freq=BDay()) > ts.median() + + msg = ( + r"Unalignable boolean Series provided as indexer \(index of " + r"the boolean Series and of the indexed object do not match" + ) + with pytest.raises(IndexingError, match=msg): + ts[mask_shifted] + + with pytest.raises(IndexingError, match=msg): + ts.loc[mask_shifted] + + def test_getitem_boolean_different_order(self, string_series): + ordered = string_series.sort_values() + + sel = string_series[ordered > 0] + exp = string_series[string_series > 0] + tm.assert_series_equal(sel, exp) + + def test_getitem_boolean_contiguous_preserve_freq(self): + rng = date_range("1/1/2000", "3/1/2000", freq="B") + + mask = np.zeros(len(rng), dtype=bool) + mask[10:20] = True + + masked = rng[mask] + expected = rng[10:20] + assert expected.freq == rng.freq + tm.assert_index_equal(masked, expected) + + mask[22] = True + masked = rng[mask] + assert masked.freq is None + + +class TestGetitemCallable: + def test_getitem_callable(self): + # GH#12533 + ser = Series(4, index=list("ABCD")) + result = ser[lambda x: "A"] + assert result == ser.loc["A"] + + result = ser[lambda x: ["A", "B"]] + expected = ser.loc[["A", "B"]] + tm.assert_series_equal(result, expected) + + result = ser[lambda x: [True, False, True, True]] + expected = ser.iloc[[0, 2, 3]] + tm.assert_series_equal(result, expected) + + +def test_getitem_generator(string_series): + gen = (x > 0 for x in string_series) + result = string_series[gen] + result2 = string_series[iter(string_series > 0)] + expected = string_series[string_series > 0] + tm.assert_series_equal(result, expected) + tm.assert_series_equal(result2, expected) + + +@pytest.mark.parametrize( + "series", + [ + Series([0, 1]), + Series(date_range("2012-01-01", periods=2)), + Series(date_range("2012-01-01", periods=2, tz="CET")), + ], +) +def test_getitem_ndim_deprecated(series): + with pytest.raises(ValueError, match="Multi-dimensional indexing"): + series[:, None] + + +def test_getitem_multilevel_scalar_slice_not_implemented( + multiindex_year_month_day_dataframe_random_data, +): + # not implementing this for now + df = multiindex_year_month_day_dataframe_random_data + ser = df["A"] + + msg = r"\(2000, slice\(3, 4, None\)\)" + with pytest.raises(TypeError, match=msg): + ser[2000, 3:4] + + +def test_getitem_dataframe_raises(): + rng = list(range(10)) + ser = Series(10, index=rng) + df = DataFrame(rng, index=rng) + msg = ( + "Indexing a Series with DataFrame is not supported, " + "use the appropriate DataFrame column" + ) + with pytest.raises(TypeError, match=msg): + ser[df > 5] + + +def test_getitem_assignment_series_alignment(): + # https://github.com/pandas-dev/pandas/issues/37427 + # with getitem, when assigning with a Series, it is not first aligned + ser = Series(range(10)) + idx = np.array([2, 4, 9]) + ser[idx] = Series([10, 11, 12]) + expected = Series([0, 1, 10, 3, 11, 5, 6, 7, 8, 12]) + tm.assert_series_equal(ser, expected) + + +def test_getitem_duplicate_index_mistyped_key_raises_keyerror(): + # GH#29189 float_index.get_loc(None) should raise KeyError, not TypeError + ser = Series([2, 5, 6, 8], index=[2.0, 4.0, 4.0, 5.0]) + with pytest.raises(KeyError, match="None"): + ser[None] + + with pytest.raises(KeyError, match="None"): + ser.index.get_loc(None) + + with pytest.raises(KeyError, match="None"): + ser.index._engine.get_loc(None) + + +def test_getitem_1tuple_slice_without_multiindex(): + ser = Series(range(5)) + key = (slice(3),) + + result = ser[key] + expected = ser[key[0]] + tm.assert_series_equal(result, expected) + + +def test_getitem_preserve_name(datetime_series): + result = datetime_series[datetime_series > 0] + assert result.name == datetime_series.name + + msg = "Series.__getitem__ treating keys as positions is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = datetime_series[[0, 2, 4]] + assert result.name == datetime_series.name + + result = datetime_series[5:10] + assert result.name == datetime_series.name + + +def test_getitem_with_integer_labels(): + # integer indexes, be careful + ser = Series( + np.random.default_rng(2).standard_normal(10), index=list(range(0, 20, 2)) + ) + inds = [0, 2, 5, 7, 8] + arr_inds = np.array([0, 2, 5, 7, 8]) + with pytest.raises(KeyError, match="not in index"): + ser[inds] + + with pytest.raises(KeyError, match="not in index"): + ser[arr_inds] + + +def test_getitem_missing(datetime_series): + # missing + d = datetime_series.index[0] - BDay() + msg = r"Timestamp\('1999-12-31 00:00:00'\)" + with pytest.raises(KeyError, match=msg): + datetime_series[d] + + +def test_getitem_fancy(string_series, object_series): + msg = "Series.__getitem__ treating keys as positions is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + slice1 = string_series[[1, 2, 3]] + slice2 = object_series[[1, 2, 3]] + assert string_series.index[2] == slice1.index[1] + assert object_series.index[2] == slice2.index[1] + assert string_series.iloc[2] == slice1.iloc[1] + assert object_series.iloc[2] == slice2.iloc[1] + + +def test_getitem_box_float64(datetime_series): + msg = "Series.__getitem__ treating keys as positions is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + value = datetime_series[5] + assert isinstance(value, np.float64) + + +def test_getitem_unordered_dup(): + obj = Series(range(5), index=["c", "a", "a", "b", "b"]) + assert is_scalar(obj["c"]) + assert obj["c"] == 0 + + +def test_getitem_dups(): + ser = Series(range(5), index=["A", "A", "B", "C", "C"], dtype=np.int64) + expected = Series([3, 4], index=["C", "C"], dtype=np.int64) + result = ser["C"] + tm.assert_series_equal(result, expected) + + +def test_getitem_categorical_str(): + # GH#31765 + ser = Series(range(5), index=Categorical(["a", "b", "c", "a", "b"])) + result = ser["a"] + expected = ser.iloc[[0, 3]] + tm.assert_series_equal(result, expected) + + +def test_slice_can_reorder_not_uniquely_indexed(): + ser = Series(1, index=["a", "a", "b", "b", "c"]) + ser[::-1] # it works! + + +@pytest.mark.parametrize("index_vals", ["aabcd", "aadcb"]) +def test_duplicated_index_getitem_positional_indexer(index_vals): + # GH 11747 + s = Series(range(5), index=list(index_vals)) + + msg = "Series.__getitem__ treating keys as positions is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = s[3] + assert result == 3 + + +class TestGetitemDeprecatedIndexers: + @pytest.mark.parametrize("key", [{1}, {1: 1}]) + def test_getitem_dict_and_set_deprecated(self, key): + # GH#42825 enforced in 2.0 + ser = Series([1, 2, 3]) + with pytest.raises(TypeError, match="as an indexer is not supported"): + ser[key] + + @pytest.mark.parametrize("key", [{1}, {1: 1}]) + def test_setitem_dict_and_set_disallowed(self, key): + # GH#42825 enforced in 2.0 + ser = Series([1, 2, 3]) + with pytest.raises(TypeError, match="as an indexer is not supported"): + ser[key] = 1 diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/series/indexing/test_indexing.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/series/indexing/test_indexing.py new file mode 100644 index 0000000000000000000000000000000000000000..f4992b758af74610286c45a386ccbf469a9a0b9d --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/series/indexing/test_indexing.py @@ -0,0 +1,518 @@ +""" test get/set & misc """ +from datetime import timedelta +import re + +import numpy as np +import pytest + +from pandas.errors import IndexingError + +from pandas import ( + NA, + DataFrame, + Index, + IndexSlice, + MultiIndex, + NaT, + Series, + Timedelta, + Timestamp, + concat, + date_range, + isna, + period_range, + timedelta_range, +) +import pandas._testing as tm + + +def test_basic_indexing(): + s = Series( + np.random.default_rng(2).standard_normal(5), index=["a", "b", "a", "a", "b"] + ) + + warn_msg = "Series.__[sg]etitem__ treating keys as positions is deprecated" + msg = "index 5 is out of bounds for axis 0 with size 5" + with pytest.raises(IndexError, match=msg): + with tm.assert_produces_warning(FutureWarning, match=warn_msg): + s[5] + with pytest.raises(IndexError, match=msg): + with tm.assert_produces_warning(FutureWarning, match=warn_msg): + s[5] = 0 + + with pytest.raises(KeyError, match=r"^'c'$"): + s["c"] + + s = s.sort_index() + + with pytest.raises(IndexError, match=msg): + with tm.assert_produces_warning(FutureWarning, match=warn_msg): + s[5] + msg = r"index 5 is out of bounds for axis (0|1) with size 5|^5$" + with pytest.raises(IndexError, match=msg): + with tm.assert_produces_warning(FutureWarning, match=warn_msg): + s[5] = 0 + + +def test_getitem_numeric_should_not_fallback_to_positional(any_numeric_dtype): + # GH51053 + dtype = any_numeric_dtype + idx = Index([1, 0, 1], dtype=dtype) + ser = Series(range(3), index=idx) + result = ser[1] + expected = Series([0, 2], index=Index([1, 1], dtype=dtype)) + tm.assert_series_equal(result, expected, check_exact=True) + + +def test_setitem_numeric_should_not_fallback_to_positional(any_numeric_dtype): + # GH51053 + dtype = any_numeric_dtype + idx = Index([1, 0, 1], dtype=dtype) + ser = Series(range(3), index=idx) + ser[1] = 10 + expected = Series([10, 1, 10], index=idx) + tm.assert_series_equal(ser, expected, check_exact=True) + + +def test_basic_getitem_with_labels(datetime_series): + indices = datetime_series.index[[5, 10, 15]] + + result = datetime_series[indices] + expected = datetime_series.reindex(indices) + tm.assert_series_equal(result, expected) + + result = datetime_series[indices[0] : indices[2]] + expected = datetime_series.loc[indices[0] : indices[2]] + tm.assert_series_equal(result, expected) + + +def test_basic_getitem_dt64tz_values(): + # GH12089 + # with tz for values + ser = Series( + date_range("2011-01-01", periods=3, tz="US/Eastern"), index=["a", "b", "c"] + ) + expected = Timestamp("2011-01-01", tz="US/Eastern") + result = ser.loc["a"] + assert result == expected + result = ser.iloc[0] + assert result == expected + result = ser["a"] + assert result == expected + + +def test_getitem_setitem_ellipsis(using_copy_on_write, warn_copy_on_write): + s = Series(np.random.default_rng(2).standard_normal(10)) + + result = s[...] + tm.assert_series_equal(result, s) + + with tm.assert_cow_warning(warn_copy_on_write): + s[...] = 5 + if not using_copy_on_write: + assert (result == 5).all() + + +@pytest.mark.parametrize( + "result_1, duplicate_item, expected_1", + [ + [ + Series({1: 12, 2: [1, 2, 2, 3]}), + Series({1: 313}), + Series({1: 12}, dtype=object), + ], + [ + Series({1: [1, 2, 3], 2: [1, 2, 2, 3]}), + Series({1: [1, 2, 3]}), + Series({1: [1, 2, 3]}), + ], + ], +) +def test_getitem_with_duplicates_indices(result_1, duplicate_item, expected_1): + # GH 17610 + result = result_1._append(duplicate_item) + expected = expected_1._append(duplicate_item) + tm.assert_series_equal(result[1], expected) + assert result[2] == result_1[2] + + +def test_getitem_setitem_integers(): + # caused bug without test + s = Series([1, 2, 3], ["a", "b", "c"]) + + assert s.iloc[0] == s["a"] + s.iloc[0] = 5 + tm.assert_almost_equal(s["a"], 5) + + +def test_series_box_timestamp(): + rng = date_range("20090415", "20090519", freq="B") + ser = Series(rng) + assert isinstance(ser[0], Timestamp) + assert isinstance(ser.at[1], Timestamp) + assert isinstance(ser.iat[2], Timestamp) + assert isinstance(ser.loc[3], Timestamp) + assert isinstance(ser.iloc[4], Timestamp) + + ser = Series(rng, index=rng) + msg = "Series.__getitem__ treating keys as positions is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + assert isinstance(ser[0], Timestamp) + assert isinstance(ser.at[rng[1]], Timestamp) + assert isinstance(ser.iat[2], Timestamp) + assert isinstance(ser.loc[rng[3]], Timestamp) + assert isinstance(ser.iloc[4], Timestamp) + + +def test_series_box_timedelta(): + rng = timedelta_range("1 day 1 s", periods=5, freq="h") + ser = Series(rng) + assert isinstance(ser[0], Timedelta) + assert isinstance(ser.at[1], Timedelta) + assert isinstance(ser.iat[2], Timedelta) + assert isinstance(ser.loc[3], Timedelta) + assert isinstance(ser.iloc[4], Timedelta) + + +def test_getitem_ambiguous_keyerror(indexer_sl): + ser = Series(range(10), index=list(range(0, 20, 2))) + with pytest.raises(KeyError, match=r"^1$"): + indexer_sl(ser)[1] + + +def test_getitem_dups_with_missing(indexer_sl): + # breaks reindex, so need to use .loc internally + # GH 4246 + ser = Series([1, 2, 3, 4], ["foo", "bar", "foo", "bah"]) + with pytest.raises(KeyError, match=re.escape("['bam'] not in index")): + indexer_sl(ser)[["foo", "bar", "bah", "bam"]] + + +def test_setitem_ambiguous_keyerror(indexer_sl): + s = Series(range(10), index=list(range(0, 20, 2))) + + # equivalent of an append + s2 = s.copy() + indexer_sl(s2)[1] = 5 + expected = concat([s, Series([5], index=[1])]) + tm.assert_series_equal(s2, expected) + + +def test_setitem(datetime_series): + datetime_series[datetime_series.index[5]] = np.nan + datetime_series.iloc[[1, 2, 17]] = np.nan + datetime_series.iloc[6] = np.nan + assert np.isnan(datetime_series.iloc[6]) + assert np.isnan(datetime_series.iloc[2]) + datetime_series[np.isnan(datetime_series)] = 5 + assert not np.isnan(datetime_series.iloc[2]) + + +def test_setslice(datetime_series): + sl = datetime_series[5:20] + assert len(sl) == len(sl.index) + assert sl.index.is_unique is True + + +def test_basic_getitem_setitem_corner(datetime_series): + # invalid tuples, e.g. td.ts[:, None] vs. td.ts[:, 2] + msg = "key of type tuple not found and not a MultiIndex" + with pytest.raises(KeyError, match=msg): + datetime_series[:, 2] + with pytest.raises(KeyError, match=msg): + datetime_series[:, 2] = 2 + + # weird lists. [slice(0, 5)] raises but not two slices + msg = "Indexing with a single-item list" + with pytest.raises(ValueError, match=msg): + # GH#31299 + datetime_series[[slice(None, 5)]] + + # but we're OK with a single-element tuple + result = datetime_series[(slice(None, 5),)] + expected = datetime_series[:5] + tm.assert_series_equal(result, expected) + + # OK + msg = r"unhashable type(: 'slice')?" + with pytest.raises(TypeError, match=msg): + datetime_series[[5, [None, None]]] + with pytest.raises(TypeError, match=msg): + datetime_series[[5, [None, None]]] = 2 + + +def test_slice(string_series, object_series, using_copy_on_write, warn_copy_on_write): + original = string_series.copy() + numSlice = string_series[10:20] + numSliceEnd = string_series[-10:] + objSlice = object_series[10:20] + + assert string_series.index[9] not in numSlice.index + assert object_series.index[9] not in objSlice.index + + assert len(numSlice) == len(numSlice.index) + assert string_series[numSlice.index[0]] == numSlice[numSlice.index[0]] + + assert numSlice.index[1] == string_series.index[11] + tm.assert_numpy_array_equal(np.array(numSliceEnd), np.array(string_series)[-10:]) + + # Test return view. + sl = string_series[10:20] + with tm.assert_cow_warning(warn_copy_on_write): + sl[:] = 0 + + if using_copy_on_write: + # Doesn't modify parent (CoW) + tm.assert_series_equal(string_series, original) + else: + assert (string_series[10:20] == 0).all() + + +def test_timedelta_assignment(): + # GH 8209 + s = Series([], dtype=object) + s.loc["B"] = timedelta(1) + tm.assert_series_equal(s, Series(Timedelta("1 days"), index=["B"])) + + s = s.reindex(s.index.insert(0, "A")) + tm.assert_series_equal(s, Series([np.nan, Timedelta("1 days")], index=["A", "B"])) + + s.loc["A"] = timedelta(1) + expected = Series(Timedelta("1 days"), index=["A", "B"]) + tm.assert_series_equal(s, expected) + + +def test_underlying_data_conversion(using_copy_on_write): + # GH 4080 + df = DataFrame({c: [1, 2, 3] for c in ["a", "b", "c"]}) + return_value = df.set_index(["a", "b", "c"], inplace=True) + assert return_value is None + s = Series([1], index=[(2, 2, 2)]) + df["val"] = 0 + df_original = df.copy() + df + + if using_copy_on_write: + with tm.raises_chained_assignment_error(): + df["val"].update(s) + expected = df_original + else: + with tm.assert_produces_warning(FutureWarning, match="inplace method"): + df["val"].update(s) + expected = DataFrame( + {"a": [1, 2, 3], "b": [1, 2, 3], "c": [1, 2, 3], "val": [0, 1, 0]} + ) + return_value = expected.set_index(["a", "b", "c"], inplace=True) + assert return_value is None + tm.assert_frame_equal(df, expected) + + +def test_preserve_refs(datetime_series): + seq = datetime_series.iloc[[5, 10, 15]] + seq.iloc[1] = np.nan + assert not np.isnan(datetime_series.iloc[10]) + + +def test_multilevel_preserve_name(lexsorted_two_level_string_multiindex, indexer_sl): + index = lexsorted_two_level_string_multiindex + ser = Series( + np.random.default_rng(2).standard_normal(len(index)), index=index, name="sth" + ) + + result = indexer_sl(ser)["foo"] + assert result.name == ser.name + + +# miscellaneous methods + + +@pytest.mark.parametrize( + "index", + [ + date_range("2014-01-01", periods=20, freq="MS"), + period_range("2014-01", periods=20, freq="M"), + timedelta_range("0", periods=20, freq="h"), + ], +) +def test_slice_with_negative_step(index): + keystr1 = str(index[9]) + keystr2 = str(index[13]) + + ser = Series(np.arange(20), index) + SLC = IndexSlice + + for key in [keystr1, index[9]]: + tm.assert_indexing_slices_equivalent(ser, SLC[key::-1], SLC[9::-1]) + tm.assert_indexing_slices_equivalent(ser, SLC[:key:-1], SLC[:8:-1]) + + for key2 in [keystr2, index[13]]: + tm.assert_indexing_slices_equivalent(ser, SLC[key2:key:-1], SLC[13:8:-1]) + tm.assert_indexing_slices_equivalent(ser, SLC[key:key2:-1], SLC[0:0:-1]) + + +def test_tuple_index(): + # GH 35534 - Selecting values when a Series has an Index of tuples + s = Series([1, 2], index=[("a",), ("b",)]) + assert s[("a",)] == 1 + assert s[("b",)] == 2 + s[("b",)] = 3 + assert s[("b",)] == 3 + + +def test_frozenset_index(): + # GH35747 - Selecting values when a Series has an Index of frozenset + idx0, idx1 = frozenset("a"), frozenset("b") + s = Series([1, 2], index=[idx0, idx1]) + assert s[idx0] == 1 + assert s[idx1] == 2 + s[idx1] = 3 + assert s[idx1] == 3 + + +def test_loc_setitem_all_false_indexer(): + # GH#45778 + ser = Series([1, 2], index=["a", "b"]) + expected = ser.copy() + rhs = Series([6, 7], index=["a", "b"]) + ser.loc[ser > 100] = rhs + tm.assert_series_equal(ser, expected) + + +def test_loc_boolean_indexer_non_matching_index(): + # GH#46551 + ser = Series([1]) + result = ser.loc[Series([NA, False], dtype="boolean")] + expected = Series([], dtype="int64") + tm.assert_series_equal(result, expected) + + +def test_loc_boolean_indexer_miss_matching_index(): + # GH#46551 + ser = Series([1]) + indexer = Series([NA, False], dtype="boolean", index=[1, 2]) + with pytest.raises(IndexingError, match="Unalignable"): + ser.loc[indexer] + + +def test_loc_setitem_nested_data_enlargement(): + # GH#48614 + df = DataFrame({"a": [1]}) + ser = Series({"label": df}) + ser.loc["new_label"] = df + expected = Series({"label": df, "new_label": df}) + tm.assert_series_equal(ser, expected) + + +def test_loc_ea_numeric_index_oob_slice_end(): + # GH#50161 + ser = Series(1, index=Index([0, 1, 2], dtype="Int64")) + result = ser.loc[2:3] + expected = Series(1, index=Index([2], dtype="Int64")) + tm.assert_series_equal(result, expected) + + +def test_getitem_bool_int_key(): + # GH#48653 + ser = Series({True: 1, False: 0}) + with pytest.raises(KeyError, match="0"): + ser.loc[0] + + +@pytest.mark.parametrize("val", [{}, {"b": "x"}]) +@pytest.mark.parametrize("indexer", [[], [False, False], slice(0, -1), np.array([])]) +def test_setitem_empty_indexer(indexer, val): + # GH#45981 + df = DataFrame({"a": [1, 2], **val}) + expected = df.copy() + df.loc[indexer] = 1.5 + tm.assert_frame_equal(df, expected) + + +class TestDeprecatedIndexers: + @pytest.mark.parametrize("key", [{1}, {1: 1}]) + def test_getitem_dict_and_set_deprecated(self, key): + # GH#42825 enforced in 2.0 + ser = Series([1, 2]) + with pytest.raises(TypeError, match="as an indexer is not supported"): + ser.loc[key] + + @pytest.mark.parametrize("key", [{1}, {1: 1}, ({1}, 2), ({1: 1}, 2)]) + def test_getitem_dict_and_set_deprecated_multiindex(self, key): + # GH#42825 enforced in 2.0 + ser = Series([1, 2], index=MultiIndex.from_tuples([(1, 2), (3, 4)])) + with pytest.raises(TypeError, match="as an indexer is not supported"): + ser.loc[key] + + @pytest.mark.parametrize("key", [{1}, {1: 1}]) + def test_setitem_dict_and_set_disallowed(self, key): + # GH#42825 enforced in 2.0 + ser = Series([1, 2]) + with pytest.raises(TypeError, match="as an indexer is not supported"): + ser.loc[key] = 1 + + @pytest.mark.parametrize("key", [{1}, {1: 1}, ({1}, 2), ({1: 1}, 2)]) + def test_setitem_dict_and_set_disallowed_multiindex(self, key): + # GH#42825 enforced in 2.0 + ser = Series([1, 2], index=MultiIndex.from_tuples([(1, 2), (3, 4)])) + with pytest.raises(TypeError, match="as an indexer is not supported"): + ser.loc[key] = 1 + + +class TestSetitemValidation: + # This is adapted from pandas/tests/arrays/masked/test_indexing.py + # but checks for warnings instead of errors. + def _check_setitem_invalid(self, ser, invalid, indexer, warn): + msg = "Setting an item of incompatible dtype is deprecated" + msg = re.escape(msg) + + orig_ser = ser.copy() + + with tm.assert_produces_warning(warn, match=msg): + ser[indexer] = invalid + ser = orig_ser.copy() + + with tm.assert_produces_warning(warn, match=msg): + ser.iloc[indexer] = invalid + ser = orig_ser.copy() + + with tm.assert_produces_warning(warn, match=msg): + ser.loc[indexer] = invalid + ser = orig_ser.copy() + + with tm.assert_produces_warning(warn, match=msg): + ser[:] = invalid + + _invalid_scalars = [ + 1 + 2j, + "True", + "1", + "1.0", + NaT, + np.datetime64("NaT"), + np.timedelta64("NaT"), + ] + _indexers = [0, [0], slice(0, 1), [True, False, False], slice(None, None, None)] + + @pytest.mark.parametrize( + "invalid", _invalid_scalars + [1, 1.0, np.int64(1), np.float64(1)] + ) + @pytest.mark.parametrize("indexer", _indexers) + def test_setitem_validation_scalar_bool(self, invalid, indexer): + ser = Series([True, False, False], dtype="bool") + self._check_setitem_invalid(ser, invalid, indexer, FutureWarning) + + @pytest.mark.parametrize("invalid", _invalid_scalars + [True, 1.5, np.float64(1.5)]) + @pytest.mark.parametrize("indexer", _indexers) + def test_setitem_validation_scalar_int(self, invalid, any_int_numpy_dtype, indexer): + ser = Series([1, 2, 3], dtype=any_int_numpy_dtype) + if isna(invalid) and invalid is not NaT and not np.isnat(invalid): + warn = None + else: + warn = FutureWarning + self._check_setitem_invalid(ser, invalid, indexer, warn) + + @pytest.mark.parametrize("invalid", _invalid_scalars + [True]) + @pytest.mark.parametrize("indexer", _indexers) + def test_setitem_validation_scalar_float(self, invalid, float_numpy_dtype, indexer): + ser = Series([1, 2, None], dtype=float_numpy_dtype) + self._check_setitem_invalid(ser, invalid, indexer, FutureWarning) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/series/indexing/test_setitem.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/series/indexing/test_setitem.py new file mode 100644 index 0000000000000000000000000000000000000000..ed681563f6fcd0535efd21b0d4c8809126b7b726 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/series/indexing/test_setitem.py @@ -0,0 +1,1855 @@ +from datetime import ( + date, + datetime, +) +from decimal import Decimal +import os + +import numpy as np +import pytest + +from pandas.compat.numpy import ( + np_version_gt2, + np_version_gte1p24, +) +from pandas.errors import IndexingError + +from pandas.core.dtypes.common import is_list_like + +from pandas import ( + NA, + Categorical, + DataFrame, + DatetimeIndex, + Index, + Interval, + IntervalIndex, + MultiIndex, + NaT, + Period, + Series, + Timedelta, + Timestamp, + array, + concat, + date_range, + interval_range, + period_range, + timedelta_range, +) +import pandas._testing as tm + +from pandas.tseries.offsets import BDay + + +class TestSetitemDT64Values: + def test_setitem_none_nan(self): + series = Series(date_range("1/1/2000", periods=10)) + series[3] = None + assert series[3] is NaT + + series[3:5] = None + assert series[4] is NaT + + series[5] = np.nan + assert series[5] is NaT + + series[5:7] = np.nan + assert series[6] is NaT + + def test_setitem_multiindex_empty_slice(self): + # https://github.com/pandas-dev/pandas/issues/35878 + idx = MultiIndex.from_tuples([("a", 1), ("b", 2)]) + result = Series([1, 2], index=idx) + expected = result.copy() + result.loc[[]] = 0 + tm.assert_series_equal(result, expected) + + def test_setitem_with_string_index(self): + # GH#23451 + # Set object dtype to avoid upcast when setting date.today() + ser = Series([1, 2, 3], index=["Date", "b", "other"], dtype=object) + ser["Date"] = date.today() + assert ser.Date == date.today() + assert ser["Date"] == date.today() + + def test_setitem_tuple_with_datetimetz_values(self): + # GH#20441 + arr = date_range("2017", periods=4, tz="US/Eastern") + index = [(0, 1), (0, 2), (0, 3), (0, 4)] + result = Series(arr, index=index) + expected = result.copy() + result[(0, 1)] = np.nan + expected.iloc[0] = np.nan + tm.assert_series_equal(result, expected) + + @pytest.mark.parametrize("tz", ["US/Eastern", "UTC", "Asia/Tokyo"]) + def test_setitem_with_tz(self, tz, indexer_sli): + orig = Series(date_range("2016-01-01", freq="h", periods=3, tz=tz)) + assert orig.dtype == f"datetime64[ns, {tz}]" + + exp = Series( + [ + Timestamp("2016-01-01 00:00", tz=tz), + Timestamp("2011-01-01 00:00", tz=tz), + Timestamp("2016-01-01 02:00", tz=tz), + ], + dtype=orig.dtype, + ) + + # scalar + ser = orig.copy() + indexer_sli(ser)[1] = Timestamp("2011-01-01", tz=tz) + tm.assert_series_equal(ser, exp) + + # vector + vals = Series( + [Timestamp("2011-01-01", tz=tz), Timestamp("2012-01-01", tz=tz)], + index=[1, 2], + dtype=orig.dtype, + ) + assert vals.dtype == f"datetime64[ns, {tz}]" + + exp = Series( + [ + Timestamp("2016-01-01 00:00", tz=tz), + Timestamp("2011-01-01 00:00", tz=tz), + Timestamp("2012-01-01 00:00", tz=tz), + ], + dtype=orig.dtype, + ) + + ser = orig.copy() + indexer_sli(ser)[[1, 2]] = vals + tm.assert_series_equal(ser, exp) + + def test_setitem_with_tz_dst(self, indexer_sli): + # GH#14146 trouble setting values near DST boundary + tz = "US/Eastern" + orig = Series(date_range("2016-11-06", freq="h", periods=3, tz=tz)) + assert orig.dtype == f"datetime64[ns, {tz}]" + + exp = Series( + [ + Timestamp("2016-11-06 00:00-04:00", tz=tz), + Timestamp("2011-01-01 00:00-05:00", tz=tz), + Timestamp("2016-11-06 01:00-05:00", tz=tz), + ], + dtype=orig.dtype, + ) + + # scalar + ser = orig.copy() + indexer_sli(ser)[1] = Timestamp("2011-01-01", tz=tz) + tm.assert_series_equal(ser, exp) + + # vector + vals = Series( + [Timestamp("2011-01-01", tz=tz), Timestamp("2012-01-01", tz=tz)], + index=[1, 2], + dtype=orig.dtype, + ) + assert vals.dtype == f"datetime64[ns, {tz}]" + + exp = Series( + [ + Timestamp("2016-11-06 00:00", tz=tz), + Timestamp("2011-01-01 00:00", tz=tz), + Timestamp("2012-01-01 00:00", tz=tz), + ], + dtype=orig.dtype, + ) + + ser = orig.copy() + indexer_sli(ser)[[1, 2]] = vals + tm.assert_series_equal(ser, exp) + + def test_object_series_setitem_dt64array_exact_match(self): + # make sure the dt64 isn't cast by numpy to integers + # https://github.com/numpy/numpy/issues/12550 + + ser = Series({"X": np.nan}, dtype=object) + + indexer = [True] + + # "exact_match" -> size of array being set matches size of ser + value = np.array([4], dtype="M8[ns]") + + ser.iloc[indexer] = value + + expected = Series([value[0]], index=["X"], dtype=object) + assert all(isinstance(x, np.datetime64) for x in expected.values) + + tm.assert_series_equal(ser, expected) + + +class TestSetitemScalarIndexer: + def test_setitem_negative_out_of_bounds(self): + ser = Series(["a"] * 10, index=["a"] * 10) + + # string index falls back to positional + msg = "index -11|-1 is out of bounds for axis 0 with size 10" + warn_msg = "Series.__setitem__ treating keys as positions is deprecated" + with pytest.raises(IndexError, match=msg): + with tm.assert_produces_warning(FutureWarning, match=warn_msg): + ser[-11] = "foo" + + @pytest.mark.parametrize("indexer", [tm.loc, tm.at]) + @pytest.mark.parametrize("ser_index", [0, 1]) + def test_setitem_series_object_dtype(self, indexer, ser_index): + # GH#38303 + ser = Series([0, 0], dtype="object") + idxr = indexer(ser) + idxr[0] = Series([42], index=[ser_index]) + expected = Series([Series([42], index=[ser_index]), 0], dtype="object") + tm.assert_series_equal(ser, expected) + + @pytest.mark.parametrize("index, exp_value", [(0, 42), (1, np.nan)]) + def test_setitem_series(self, index, exp_value): + # GH#38303 + ser = Series([0, 0]) + ser.loc[0] = Series([42], index=[index]) + expected = Series([exp_value, 0]) + tm.assert_series_equal(ser, expected) + + +class TestSetitemSlices: + def test_setitem_slice_float_raises(self, datetime_series): + msg = ( + "cannot do slice indexing on DatetimeIndex with these indexers " + r"\[{key}\] of type float" + ) + with pytest.raises(TypeError, match=msg.format(key=r"4\.0")): + datetime_series[4.0:10.0] = 0 + + with pytest.raises(TypeError, match=msg.format(key=r"4\.5")): + datetime_series[4.5:10.0] = 0 + + def test_setitem_slice(self): + ser = Series(range(10), index=list(range(10))) + ser[-12:] = 0 + assert (ser == 0).all() + + ser[:-12] = 5 + assert (ser == 0).all() + + def test_setitem_slice_integers(self): + ser = Series( + np.random.default_rng(2).standard_normal(8), + index=[2, 4, 6, 8, 10, 12, 14, 16], + ) + + ser[:4] = 0 + assert (ser[:4] == 0).all() + assert not (ser[4:] == 0).any() + + def test_setitem_slicestep(self): + # caught this bug when writing tests + series = Series( + np.arange(20, dtype=np.float64), index=np.arange(20, dtype=np.int64) + ) + + series[::2] = 0 + assert (series[::2] == 0).all() + + def test_setitem_multiindex_slice(self, indexer_sli): + # GH 8856 + mi = MultiIndex.from_product(([0, 1], list("abcde"))) + result = Series(np.arange(10, dtype=np.int64), mi) + indexer_sli(result)[::4] = 100 + expected = Series([100, 1, 2, 3, 100, 5, 6, 7, 100, 9], mi) + tm.assert_series_equal(result, expected) + + +class TestSetitemBooleanMask: + def test_setitem_mask_cast(self): + # GH#2746 + # need to upcast + ser = Series([1, 2], index=[1, 2], dtype="int64") + ser[[True, False]] = Series([0], index=[1], dtype="int64") + expected = Series([0, 2], index=[1, 2], dtype="int64") + + tm.assert_series_equal(ser, expected) + + def test_setitem_mask_align_and_promote(self): + # GH#8387: test that changing types does not break alignment + ts = Series( + np.random.default_rng(2).standard_normal(100), index=np.arange(100, 0, -1) + ).round(5) + mask = ts > 0 + left = ts.copy() + right = ts[mask].copy().map(str) + with tm.assert_produces_warning( + FutureWarning, match="item of incompatible dtype" + ): + left[mask] = right + expected = ts.map(lambda t: str(t) if t > 0 else t) + tm.assert_series_equal(left, expected) + + def test_setitem_mask_promote_strs(self): + ser = Series([0, 1, 2, 0]) + mask = ser > 0 + ser2 = ser[mask].map(str) + with tm.assert_produces_warning( + FutureWarning, match="item of incompatible dtype" + ): + ser[mask] = ser2 + + expected = Series([0, "1", "2", 0]) + tm.assert_series_equal(ser, expected) + + def test_setitem_mask_promote(self): + ser = Series([0, "foo", "bar", 0]) + mask = Series([False, True, True, False]) + ser2 = ser[mask] + ser[mask] = ser2 + + expected = Series([0, "foo", "bar", 0]) + tm.assert_series_equal(ser, expected) + + def test_setitem_boolean(self, string_series): + mask = string_series > string_series.median() + + # similar indexed series + result = string_series.copy() + result[mask] = string_series * 2 + expected = string_series * 2 + tm.assert_series_equal(result[mask], expected[mask]) + + # needs alignment + result = string_series.copy() + result[mask] = (string_series * 2)[0:5] + expected = (string_series * 2)[0:5].reindex_like(string_series) + expected[-mask] = string_series[mask] + tm.assert_series_equal(result[mask], expected[mask]) + + def test_setitem_boolean_corner(self, datetime_series): + ts = datetime_series + mask_shifted = ts.shift(1, freq=BDay()) > ts.median() + + msg = ( + r"Unalignable boolean Series provided as indexer \(index of " + r"the boolean Series and of the indexed object do not match" + ) + with pytest.raises(IndexingError, match=msg): + ts[mask_shifted] = 1 + + with pytest.raises(IndexingError, match=msg): + ts.loc[mask_shifted] = 1 + + def test_setitem_boolean_different_order(self, string_series): + ordered = string_series.sort_values() + + copy = string_series.copy() + copy[ordered > 0] = 0 + + expected = string_series.copy() + expected[expected > 0] = 0 + + tm.assert_series_equal(copy, expected) + + @pytest.mark.parametrize("func", [list, np.array, Series]) + def test_setitem_boolean_python_list(self, func): + # GH19406 + ser = Series([None, "b", None]) + mask = func([True, False, True]) + ser[mask] = ["a", "c"] + expected = Series(["a", "b", "c"]) + tm.assert_series_equal(ser, expected) + + def test_setitem_boolean_nullable_int_types(self, any_numeric_ea_dtype): + # GH: 26468 + ser = Series([5, 6, 7, 8], dtype=any_numeric_ea_dtype) + ser[ser > 6] = Series(range(4), dtype=any_numeric_ea_dtype) + expected = Series([5, 6, 2, 3], dtype=any_numeric_ea_dtype) + tm.assert_series_equal(ser, expected) + + ser = Series([5, 6, 7, 8], dtype=any_numeric_ea_dtype) + ser.loc[ser > 6] = Series(range(4), dtype=any_numeric_ea_dtype) + tm.assert_series_equal(ser, expected) + + ser = Series([5, 6, 7, 8], dtype=any_numeric_ea_dtype) + loc_ser = Series(range(4), dtype=any_numeric_ea_dtype) + ser.loc[ser > 6] = loc_ser.loc[loc_ser > 1] + tm.assert_series_equal(ser, expected) + + def test_setitem_with_bool_mask_and_values_matching_n_trues_in_length(self): + # GH#30567 + ser = Series([None] * 10) + mask = [False] * 3 + [True] * 5 + [False] * 2 + ser[mask] = range(5) + result = ser + expected = Series([None] * 3 + list(range(5)) + [None] * 2, dtype=object) + tm.assert_series_equal(result, expected) + + def test_setitem_nan_with_bool(self): + # GH 13034 + result = Series([True, False, True]) + with tm.assert_produces_warning( + FutureWarning, match="item of incompatible dtype" + ): + result[0] = np.nan + expected = Series([np.nan, False, True], dtype=object) + tm.assert_series_equal(result, expected) + + def test_setitem_mask_smallint_upcast(self): + orig = Series([1, 2, 3], dtype="int8") + alt = np.array([999, 1000, 1001], dtype=np.int64) + + mask = np.array([True, False, True]) + + ser = orig.copy() + with tm.assert_produces_warning( + FutureWarning, match="item of incompatible dtype" + ): + ser[mask] = Series(alt) + expected = Series([999, 2, 1001]) + tm.assert_series_equal(ser, expected) + + ser2 = orig.copy() + with tm.assert_produces_warning( + FutureWarning, match="item of incompatible dtype" + ): + ser2.mask(mask, alt, inplace=True) + tm.assert_series_equal(ser2, expected) + + ser3 = orig.copy() + res = ser3.where(~mask, Series(alt)) + tm.assert_series_equal(res, expected) + + def test_setitem_mask_smallint_no_upcast(self): + # like test_setitem_mask_smallint_upcast, but while we can't hold 'alt', + # we *can* hold alt[mask] without casting + orig = Series([1, 2, 3], dtype="uint8") + alt = Series([245, 1000, 246], dtype=np.int64) + + mask = np.array([True, False, True]) + + ser = orig.copy() + ser[mask] = alt + expected = Series([245, 2, 246], dtype="uint8") + tm.assert_series_equal(ser, expected) + + ser2 = orig.copy() + ser2.mask(mask, alt, inplace=True) + tm.assert_series_equal(ser2, expected) + + # TODO: ser.where(~mask, alt) unnecessarily upcasts to int64 + ser3 = orig.copy() + res = ser3.where(~mask, alt) + tm.assert_series_equal(res, expected, check_dtype=False) + + +class TestSetitemViewCopySemantics: + def test_setitem_invalidates_datetime_index_freq(self, using_copy_on_write): + # GH#24096 altering a datetime64tz Series inplace invalidates the + # `freq` attribute on the underlying DatetimeIndex + + dti = date_range("20130101", periods=3, tz="US/Eastern") + ts = dti[1] + ser = Series(dti) + assert ser._values is not dti + if using_copy_on_write: + assert ser._values._ndarray.base is dti._data._ndarray.base + else: + assert ser._values._ndarray.base is not dti._data._ndarray.base + assert dti.freq == "D" + ser.iloc[1] = NaT + assert ser._values.freq is None + + # check that the DatetimeIndex was not altered in place + assert ser._values is not dti + assert ser._values._ndarray.base is not dti._data._ndarray.base + assert dti[1] == ts + assert dti.freq == "D" + + def test_dt64tz_setitem_does_not_mutate_dti(self, using_copy_on_write): + # GH#21907, GH#24096 + dti = date_range("2016-01-01", periods=10, tz="US/Pacific") + ts = dti[0] + ser = Series(dti) + assert ser._values is not dti + if using_copy_on_write: + assert ser._values._ndarray.base is dti._data._ndarray.base + assert ser._mgr.arrays[0]._ndarray.base is dti._data._ndarray.base + else: + assert ser._values._ndarray.base is not dti._data._ndarray.base + assert ser._mgr.arrays[0]._ndarray.base is not dti._data._ndarray.base + + assert ser._mgr.arrays[0] is not dti + + ser[::3] = NaT + assert ser[0] is NaT + assert dti[0] == ts + + +class TestSetitemCallable: + def test_setitem_callable_key(self): + # GH#12533 + ser = Series([1, 2, 3, 4], index=list("ABCD")) + ser[lambda x: "A"] = -1 + + expected = Series([-1, 2, 3, 4], index=list("ABCD")) + tm.assert_series_equal(ser, expected) + + def test_setitem_callable_other(self): + # GH#13299 + inc = lambda x: x + 1 + + # set object dtype to avoid upcast when setting inc + ser = Series([1, 2, -1, 4], dtype=object) + ser[ser < 0] = inc + + expected = Series([1, 2, inc, 4]) + tm.assert_series_equal(ser, expected) + + +class TestSetitemWithExpansion: + def test_setitem_empty_series(self): + # GH#10193 + key = Timestamp("2012-01-01") + series = Series(dtype=object) + series[key] = 47 + expected = Series(47, [key]) + tm.assert_series_equal(series, expected) + + def test_setitem_empty_series_datetimeindex_preserves_freq(self): + # GH#33573 our index should retain its freq + dti = DatetimeIndex([], freq="D", dtype="M8[ns]") + series = Series([], index=dti, dtype=object) + key = Timestamp("2012-01-01") + series[key] = 47 + expected = Series(47, DatetimeIndex([key], freq="D").as_unit("ns")) + tm.assert_series_equal(series, expected) + assert series.index.freq == expected.index.freq + + def test_setitem_empty_series_timestamp_preserves_dtype(self): + # GH 21881 + timestamp = Timestamp(1412526600000000000) + series = Series([timestamp], index=["timestamp"], dtype=object) + expected = series["timestamp"] + + series = Series([], dtype=object) + series["anything"] = 300.0 + series["timestamp"] = timestamp + result = series["timestamp"] + assert result == expected + + @pytest.mark.parametrize( + "td", + [ + Timedelta("9 days"), + Timedelta("9 days").to_timedelta64(), + Timedelta("9 days").to_pytimedelta(), + ], + ) + def test_append_timedelta_does_not_cast(self, td, using_infer_string, request): + # GH#22717 inserting a Timedelta should _not_ cast to int64 + if using_infer_string and not isinstance(td, Timedelta): + # TODO: GH#56010 + request.applymarker(pytest.mark.xfail(reason="inferred as string")) + + expected = Series(["x", td], index=[0, "td"], dtype=object) + + ser = Series(["x"]) + ser["td"] = td + tm.assert_series_equal(ser, expected) + assert isinstance(ser["td"], Timedelta) + + ser = Series(["x"]) + ser.loc["td"] = Timedelta("9 days") + tm.assert_series_equal(ser, expected) + assert isinstance(ser["td"], Timedelta) + + def test_setitem_with_expansion_type_promotion(self): + # GH#12599 + ser = Series(dtype=object) + ser["a"] = Timestamp("2016-01-01") + ser["b"] = 3.0 + ser["c"] = "foo" + expected = Series([Timestamp("2016-01-01"), 3.0, "foo"], index=["a", "b", "c"]) + tm.assert_series_equal(ser, expected) + + def test_setitem_not_contained(self, string_series): + # set item that's not contained + ser = string_series.copy() + assert "foobar" not in ser.index + ser["foobar"] = 1 + + app = Series([1], index=["foobar"], name="series") + expected = concat([string_series, app]) + tm.assert_series_equal(ser, expected) + + def test_setitem_keep_precision(self, any_numeric_ea_dtype): + # GH#32346 + ser = Series([1, 2], dtype=any_numeric_ea_dtype) + ser[2] = 10 + expected = Series([1, 2, 10], dtype=any_numeric_ea_dtype) + tm.assert_series_equal(ser, expected) + + @pytest.mark.parametrize( + "na, target_na, dtype, target_dtype, indexer, warn", + [ + (NA, NA, "Int64", "Int64", 1, None), + (NA, NA, "Int64", "Int64", 2, None), + (NA, np.nan, "int64", "float64", 1, None), + (NA, np.nan, "int64", "float64", 2, None), + (NaT, NaT, "int64", "object", 1, FutureWarning), + (NaT, NaT, "int64", "object", 2, None), + (np.nan, NA, "Int64", "Int64", 1, None), + (np.nan, NA, "Int64", "Int64", 2, None), + (np.nan, NA, "Float64", "Float64", 1, None), + (np.nan, NA, "Float64", "Float64", 2, None), + (np.nan, np.nan, "int64", "float64", 1, None), + (np.nan, np.nan, "int64", "float64", 2, None), + ], + ) + def test_setitem_enlarge_with_na( + self, na, target_na, dtype, target_dtype, indexer, warn + ): + # GH#32346 + ser = Series([1, 2], dtype=dtype) + with tm.assert_produces_warning(warn, match="incompatible dtype"): + ser[indexer] = na + expected_values = [1, target_na] if indexer == 1 else [1, 2, target_na] + expected = Series(expected_values, dtype=target_dtype) + tm.assert_series_equal(ser, expected) + + def test_setitem_enlargement_object_none(self, nulls_fixture, using_infer_string): + # GH#48665 + ser = Series(["a", "b"]) + ser[3] = nulls_fixture + dtype = ( + "string[pyarrow_numpy]" + if using_infer_string and not isinstance(nulls_fixture, Decimal) + else object + ) + expected = Series(["a", "b", nulls_fixture], index=[0, 1, 3], dtype=dtype) + tm.assert_series_equal(ser, expected) + if using_infer_string: + ser[3] is np.nan + else: + assert ser[3] is nulls_fixture + + +def test_setitem_scalar_into_readonly_backing_data(): + # GH#14359: test that you cannot mutate a read only buffer + + array = np.zeros(5) + array.flags.writeable = False # make the array immutable + series = Series(array, copy=False) + + for n in series.index: + msg = "assignment destination is read-only" + with pytest.raises(ValueError, match=msg): + series[n] = 1 + + assert array[n] == 0 + + +def test_setitem_slice_into_readonly_backing_data(): + # GH#14359: test that you cannot mutate a read only buffer + + array = np.zeros(5) + array.flags.writeable = False # make the array immutable + series = Series(array, copy=False) + + msg = "assignment destination is read-only" + with pytest.raises(ValueError, match=msg): + series[1:3] = 1 + + assert not array.any() + + +def test_setitem_categorical_assigning_ops(): + orig = Series(Categorical(["b", "b"], categories=["a", "b"])) + ser = orig.copy() + ser[:] = "a" + exp = Series(Categorical(["a", "a"], categories=["a", "b"])) + tm.assert_series_equal(ser, exp) + + ser = orig.copy() + ser[1] = "a" + exp = Series(Categorical(["b", "a"], categories=["a", "b"])) + tm.assert_series_equal(ser, exp) + + ser = orig.copy() + ser[ser.index > 0] = "a" + exp = Series(Categorical(["b", "a"], categories=["a", "b"])) + tm.assert_series_equal(ser, exp) + + ser = orig.copy() + ser[[False, True]] = "a" + exp = Series(Categorical(["b", "a"], categories=["a", "b"])) + tm.assert_series_equal(ser, exp) + + ser = orig.copy() + ser.index = ["x", "y"] + ser["y"] = "a" + exp = Series(Categorical(["b", "a"], categories=["a", "b"]), index=["x", "y"]) + tm.assert_series_equal(ser, exp) + + +def test_setitem_nan_into_categorical(): + # ensure that one can set something to np.nan + ser = Series(Categorical([1, 2, 3])) + exp = Series(Categorical([1, np.nan, 3], categories=[1, 2, 3])) + ser[1] = np.nan + tm.assert_series_equal(ser, exp) + + +class TestSetitemCasting: + @pytest.mark.parametrize("unique", [True, False]) + @pytest.mark.parametrize("val", [3, 3.0, "3"], ids=type) + def test_setitem_non_bool_into_bool(self, val, indexer_sli, unique): + # dont cast these 3-like values to bool + ser = Series([True, False]) + if not unique: + ser.index = [1, 1] + + with tm.assert_produces_warning(FutureWarning, match="incompatible dtype"): + indexer_sli(ser)[1] = val + assert type(ser.iloc[1]) == type(val) + + expected = Series([True, val], dtype=object, index=ser.index) + if not unique and indexer_sli is not tm.iloc: + expected = Series([val, val], dtype=object, index=[1, 1]) + tm.assert_series_equal(ser, expected) + + def test_setitem_boolean_array_into_npbool(self): + # GH#45462 + ser = Series([True, False, True]) + values = ser._values + arr = array([True, False, None]) + + ser[:2] = arr[:2] # no NAs -> can set inplace + assert ser._values is values + + with tm.assert_produces_warning(FutureWarning, match="incompatible dtype"): + ser[1:] = arr[1:] # has an NA -> cast to boolean dtype + expected = Series(arr) + tm.assert_series_equal(ser, expected) + + +class SetitemCastingEquivalents: + """ + Check each of several methods that _should_ be equivalent to `obj[key] = val` + + We assume that + - obj.index is the default Index(range(len(obj))) + - the setitem does not expand the obj + """ + + @pytest.fixture + def is_inplace(self, obj, expected): + """ + Whether we expect the setting to be in-place or not. + """ + return expected.dtype == obj.dtype + + def check_indexer(self, obj, key, expected, val, indexer, is_inplace): + orig = obj + obj = obj.copy() + arr = obj._values + + indexer(obj)[key] = val + tm.assert_series_equal(obj, expected) + + self._check_inplace(is_inplace, orig, arr, obj) + + def _check_inplace(self, is_inplace, orig, arr, obj): + if is_inplace is None: + # We are not (yet) checking whether setting is inplace or not + pass + elif is_inplace: + if arr.dtype.kind in ["m", "M"]: + # We may not have the same DTA/TDA, but will have the same + # underlying data + assert arr._ndarray is obj._values._ndarray + else: + assert obj._values is arr + else: + # otherwise original array should be unchanged + tm.assert_equal(arr, orig._values) + + def test_int_key(self, obj, key, expected, warn, val, indexer_sli, is_inplace): + if not isinstance(key, int): + pytest.skip("Not relevant for int key") + + with tm.assert_produces_warning(warn, match="incompatible dtype"): + self.check_indexer(obj, key, expected, val, indexer_sli, is_inplace) + + if indexer_sli is tm.loc: + with tm.assert_produces_warning(warn, match="incompatible dtype"): + self.check_indexer(obj, key, expected, val, tm.at, is_inplace) + elif indexer_sli is tm.iloc: + with tm.assert_produces_warning(warn, match="incompatible dtype"): + self.check_indexer(obj, key, expected, val, tm.iat, is_inplace) + + rng = range(key, key + 1) + with tm.assert_produces_warning(warn, match="incompatible dtype"): + self.check_indexer(obj, rng, expected, val, indexer_sli, is_inplace) + + if indexer_sli is not tm.loc: + # Note: no .loc because that handles slice edges differently + slc = slice(key, key + 1) + with tm.assert_produces_warning(warn, match="incompatible dtype"): + self.check_indexer(obj, slc, expected, val, indexer_sli, is_inplace) + + ilkey = [key] + with tm.assert_produces_warning(warn, match="incompatible dtype"): + self.check_indexer(obj, ilkey, expected, val, indexer_sli, is_inplace) + + indkey = np.array(ilkey) + with tm.assert_produces_warning(warn, match="incompatible dtype"): + self.check_indexer(obj, indkey, expected, val, indexer_sli, is_inplace) + + genkey = (x for x in [key]) + with tm.assert_produces_warning(warn, match="incompatible dtype"): + self.check_indexer(obj, genkey, expected, val, indexer_sli, is_inplace) + + def test_slice_key(self, obj, key, expected, warn, val, indexer_sli, is_inplace): + if not isinstance(key, slice): + pytest.skip("Not relevant for slice key") + + if indexer_sli is not tm.loc: + # Note: no .loc because that handles slice edges differently + with tm.assert_produces_warning(warn, match="incompatible dtype"): + self.check_indexer(obj, key, expected, val, indexer_sli, is_inplace) + + ilkey = list(range(len(obj)))[key] + with tm.assert_produces_warning(warn, match="incompatible dtype"): + self.check_indexer(obj, ilkey, expected, val, indexer_sli, is_inplace) + + indkey = np.array(ilkey) + with tm.assert_produces_warning(warn, match="incompatible dtype"): + self.check_indexer(obj, indkey, expected, val, indexer_sli, is_inplace) + + genkey = (x for x in indkey) + with tm.assert_produces_warning(warn, match="incompatible dtype"): + self.check_indexer(obj, genkey, expected, val, indexer_sli, is_inplace) + + def test_mask_key(self, obj, key, expected, warn, val, indexer_sli): + # setitem with boolean mask + mask = np.zeros(obj.shape, dtype=bool) + mask[key] = True + + obj = obj.copy() + + if is_list_like(val) and len(val) < mask.sum(): + msg = "boolean index did not match indexed array along dimension" + with pytest.raises(IndexError, match=msg): + indexer_sli(obj)[mask] = val + return + + with tm.assert_produces_warning(warn, match="incompatible dtype"): + indexer_sli(obj)[mask] = val + tm.assert_series_equal(obj, expected) + + def test_series_where(self, obj, key, expected, warn, val, is_inplace): + mask = np.zeros(obj.shape, dtype=bool) + mask[key] = True + + if is_list_like(val) and len(val) < len(obj): + # Series.where is not valid here + msg = "operands could not be broadcast together with shapes" + with pytest.raises(ValueError, match=msg): + obj.where(~mask, val) + return + + orig = obj + obj = obj.copy() + arr = obj._values + + res = obj.where(~mask, val) + + if val is NA and res.dtype == object: + expected = expected.fillna(NA) + elif val is None and res.dtype == object: + assert expected.dtype == object + expected = expected.copy() + expected[expected.isna()] = None + tm.assert_series_equal(res, expected) + + self._check_inplace(is_inplace, orig, arr, obj) + + def test_index_where(self, obj, key, expected, warn, val, using_infer_string): + mask = np.zeros(obj.shape, dtype=bool) + mask[key] = True + + if using_infer_string and obj.dtype == object: + with pytest.raises(TypeError, match="Scalar must"): + Index(obj).where(~mask, val) + else: + res = Index(obj).where(~mask, val) + expected_idx = Index(expected, dtype=expected.dtype) + tm.assert_index_equal(res, expected_idx) + + def test_index_putmask(self, obj, key, expected, warn, val, using_infer_string): + mask = np.zeros(obj.shape, dtype=bool) + mask[key] = True + + if using_infer_string and obj.dtype == object: + with pytest.raises(TypeError, match="Scalar must"): + Index(obj).putmask(mask, val) + else: + res = Index(obj).putmask(mask, val) + tm.assert_index_equal(res, Index(expected, dtype=expected.dtype)) + + +@pytest.mark.parametrize( + "obj,expected,key,warn", + [ + pytest.param( + # GH#45568 setting a valid NA value into IntervalDtype[int] should + # cast to IntervalDtype[float] + Series(interval_range(1, 5)), + Series( + [Interval(1, 2), np.nan, Interval(3, 4), Interval(4, 5)], + dtype="interval[float64]", + ), + 1, + FutureWarning, + id="interval_int_na_value", + ), + pytest.param( + # these induce dtype changes + Series([2, 3, 4, 5, 6, 7, 8, 9, 10]), + Series([np.nan, 3, np.nan, 5, np.nan, 7, np.nan, 9, np.nan]), + slice(None, None, 2), + None, + id="int_series_slice_key_step", + ), + pytest.param( + Series([True, True, False, False]), + Series([np.nan, True, np.nan, False], dtype=object), + slice(None, None, 2), + FutureWarning, + id="bool_series_slice_key_step", + ), + pytest.param( + # these induce dtype changes + Series(np.arange(10)), + Series([np.nan, np.nan, np.nan, np.nan, np.nan, 5, 6, 7, 8, 9]), + slice(None, 5), + None, + id="int_series_slice_key", + ), + pytest.param( + # changes dtype GH#4463 + Series([1, 2, 3]), + Series([np.nan, 2, 3]), + 0, + None, + id="int_series_int_key", + ), + pytest.param( + # changes dtype GH#4463 + Series([False]), + Series([np.nan], dtype=object), + # TODO: maybe go to float64 since we are changing the _whole_ Series? + 0, + FutureWarning, + id="bool_series_int_key_change_all", + ), + pytest.param( + # changes dtype GH#4463 + Series([False, True]), + Series([np.nan, True], dtype=object), + 0, + FutureWarning, + id="bool_series_int_key", + ), + ], +) +class TestSetitemCastingEquivalents(SetitemCastingEquivalents): + @pytest.fixture(params=[np.nan, np.float64("NaN"), None, NA]) + def val(self, request): + """ + NA values that should generally be valid_na for *all* dtypes. + + Include both python float NaN and np.float64; only np.float64 has a + `dtype` attribute. + """ + return request.param + + +class TestSetitemTimedelta64IntoNumeric(SetitemCastingEquivalents): + # timedelta64 should not be treated as integers when setting into + # numeric Series + + @pytest.fixture + def val(self): + td = np.timedelta64(4, "ns") + return td + # TODO: could also try np.full((1,), td) + + @pytest.fixture(params=[complex, int, float]) + def dtype(self, request): + return request.param + + @pytest.fixture + def obj(self, dtype): + arr = np.arange(5).astype(dtype) + ser = Series(arr) + return ser + + @pytest.fixture + def expected(self, dtype): + arr = np.arange(5).astype(dtype) + ser = Series(arr) + ser = ser.astype(object) + ser.iloc[0] = np.timedelta64(4, "ns") + return ser + + @pytest.fixture + def key(self): + return 0 + + @pytest.fixture + def warn(self): + return FutureWarning + + +class TestSetitemDT64IntoInt(SetitemCastingEquivalents): + # GH#39619 dont cast dt64 to int when doing this setitem + + @pytest.fixture(params=["M8[ns]", "m8[ns]"]) + def dtype(self, request): + return request.param + + @pytest.fixture + def scalar(self, dtype): + val = np.datetime64("2021-01-18 13:25:00", "ns") + if dtype == "m8[ns]": + val = val - val + return val + + @pytest.fixture + def expected(self, scalar): + expected = Series([scalar, scalar, 3], dtype=object) + assert isinstance(expected[0], type(scalar)) + return expected + + @pytest.fixture + def obj(self): + return Series([1, 2, 3]) + + @pytest.fixture + def key(self): + return slice(None, -1) + + @pytest.fixture(params=[None, list, np.array]) + def val(self, scalar, request): + box = request.param + if box is None: + return scalar + return box([scalar, scalar]) + + @pytest.fixture + def warn(self): + return FutureWarning + + +class TestSetitemNAPeriodDtype(SetitemCastingEquivalents): + # Setting compatible NA values into Series with PeriodDtype + + @pytest.fixture + def expected(self, key): + exp = Series(period_range("2000-01-01", periods=10, freq="D")) + exp._values.view("i8")[key] = NaT._value + assert exp[key] is NaT or all(x is NaT for x in exp[key]) + return exp + + @pytest.fixture + def obj(self): + return Series(period_range("2000-01-01", periods=10, freq="D")) + + @pytest.fixture(params=[3, slice(3, 5)]) + def key(self, request): + return request.param + + @pytest.fixture(params=[None, np.nan]) + def val(self, request): + return request.param + + @pytest.fixture + def warn(self): + return None + + +class TestSetitemNADatetimeLikeDtype(SetitemCastingEquivalents): + # some nat-like values should be cast to datetime64/timedelta64 when + # inserting into a datetime64/timedelta64 series. Others should coerce + # to object and retain their dtypes. + # GH#18586 for td64 and boolean mask case + + @pytest.fixture( + params=["m8[ns]", "M8[ns]", "datetime64[ns, UTC]", "datetime64[ns, US/Central]"] + ) + def dtype(self, request): + return request.param + + @pytest.fixture + def obj(self, dtype): + i8vals = date_range("2016-01-01", periods=3).asi8 + idx = Index(i8vals, dtype=dtype) + assert idx.dtype == dtype + return Series(idx) + + @pytest.fixture( + params=[ + None, + np.nan, + NaT, + np.timedelta64("NaT", "ns"), + np.datetime64("NaT", "ns"), + ] + ) + def val(self, request): + return request.param + + @pytest.fixture + def is_inplace(self, val, obj): + # td64 -> cast to object iff val is datetime64("NaT") + # dt64 -> cast to object iff val is timedelta64("NaT") + # dt64tz -> cast to object with anything _but_ NaT + return val is NaT or val is None or val is np.nan or obj.dtype == val.dtype + + @pytest.fixture + def expected(self, obj, val, is_inplace): + dtype = obj.dtype if is_inplace else object + expected = Series([val] + list(obj[1:]), dtype=dtype) + return expected + + @pytest.fixture + def key(self): + return 0 + + @pytest.fixture + def warn(self, is_inplace): + return None if is_inplace else FutureWarning + + +class TestSetitemMismatchedTZCastsToObject(SetitemCastingEquivalents): + # GH#24024 + @pytest.fixture + def obj(self): + return Series(date_range("2000", periods=2, tz="US/Central")) + + @pytest.fixture + def val(self): + return Timestamp("2000", tz="US/Eastern") + + @pytest.fixture + def key(self): + return 0 + + @pytest.fixture + def expected(self, obj, val): + # pre-2.0 this would cast to object, in 2.0 we cast the val to + # the target tz + expected = Series( + [ + val.tz_convert("US/Central"), + Timestamp("2000-01-02 00:00:00-06:00", tz="US/Central"), + ], + dtype=obj.dtype, + ) + return expected + + @pytest.fixture + def warn(self): + return None + + +@pytest.mark.parametrize( + "obj,expected,warn", + [ + # For numeric series, we should coerce to NaN. + (Series([1, 2, 3]), Series([np.nan, 2, 3]), None), + (Series([1.0, 2.0, 3.0]), Series([np.nan, 2.0, 3.0]), None), + # For datetime series, we should coerce to NaT. + ( + Series([datetime(2000, 1, 1), datetime(2000, 1, 2), datetime(2000, 1, 3)]), + Series([NaT, datetime(2000, 1, 2), datetime(2000, 1, 3)]), + None, + ), + # For objects, we should preserve the None value. + (Series(["foo", "bar", "baz"]), Series([None, "bar", "baz"]), None), + ], +) +class TestSeriesNoneCoercion(SetitemCastingEquivalents): + @pytest.fixture + def key(self): + return 0 + + @pytest.fixture + def val(self): + return None + + +class TestSetitemFloatIntervalWithIntIntervalValues(SetitemCastingEquivalents): + # GH#44201 Cast to shared IntervalDtype rather than object + + def test_setitem_example(self): + # Just a case here to make obvious what this test class is aimed at + idx = IntervalIndex.from_breaks(range(4)) + obj = Series(idx) + val = Interval(0.5, 1.5) + + with tm.assert_produces_warning( + FutureWarning, match="Setting an item of incompatible dtype" + ): + obj[0] = val + assert obj.dtype == "Interval[float64, right]" + + @pytest.fixture + def obj(self): + idx = IntervalIndex.from_breaks(range(4)) + return Series(idx) + + @pytest.fixture + def val(self): + return Interval(0.5, 1.5) + + @pytest.fixture + def key(self): + return 0 + + @pytest.fixture + def expected(self, obj, val): + data = [val] + list(obj[1:]) + idx = IntervalIndex(data, dtype="Interval[float64]") + return Series(idx) + + @pytest.fixture + def warn(self): + return FutureWarning + + +class TestSetitemRangeIntoIntegerSeries(SetitemCastingEquivalents): + # GH#44261 Setting a range with sufficiently-small integers into + # small-itemsize integer dtypes should not need to upcast + + @pytest.fixture + def obj(self, any_int_numpy_dtype): + dtype = np.dtype(any_int_numpy_dtype) + ser = Series(range(5), dtype=dtype) + return ser + + @pytest.fixture + def val(self): + return range(2, 4) + + @pytest.fixture + def key(self): + return slice(0, 2) + + @pytest.fixture + def expected(self, any_int_numpy_dtype): + dtype = np.dtype(any_int_numpy_dtype) + exp = Series([2, 3, 2, 3, 4], dtype=dtype) + return exp + + @pytest.fixture + def warn(self): + return None + + +@pytest.mark.parametrize( + "val, warn", + [ + (np.array([2.0, 3.0]), None), + (np.array([2.5, 3.5]), FutureWarning), + ( + np.array([2**65, 2**65 + 1], dtype=np.float64), + FutureWarning, + ), # all ints, but can't cast + ], +) +class TestSetitemFloatNDarrayIntoIntegerSeries(SetitemCastingEquivalents): + @pytest.fixture + def obj(self): + return Series(range(5), dtype=np.int64) + + @pytest.fixture + def key(self): + return slice(0, 2) + + @pytest.fixture + def expected(self, val): + if val[0] == 2: + # NB: this condition is based on currently-hardcoded "val" cases + dtype = np.int64 + else: + dtype = np.float64 + res_values = np.array(range(5), dtype=dtype) + res_values[:2] = val + return Series(res_values) + + +@pytest.mark.parametrize("val", [512, np.int16(512)]) +class TestSetitemIntoIntegerSeriesNeedsUpcast(SetitemCastingEquivalents): + @pytest.fixture + def obj(self): + return Series([1, 2, 3], dtype=np.int8) + + @pytest.fixture + def key(self): + return 1 + + @pytest.fixture + def expected(self): + return Series([1, 512, 3], dtype=np.int16) + + @pytest.fixture + def warn(self): + return FutureWarning + + +@pytest.mark.parametrize("val", [2**33 + 1.0, 2**33 + 1.1, 2**62]) +class TestSmallIntegerSetitemUpcast(SetitemCastingEquivalents): + # https://github.com/pandas-dev/pandas/issues/39584#issuecomment-941212124 + @pytest.fixture + def obj(self): + return Series([1, 2, 3], dtype="i4") + + @pytest.fixture + def key(self): + return 0 + + @pytest.fixture + def expected(self, val): + if val % 1 != 0: + dtype = "f8" + else: + dtype = "i8" + return Series([val, 2, 3], dtype=dtype) + + @pytest.fixture + def warn(self): + return FutureWarning + + +class CoercionTest(SetitemCastingEquivalents): + # Tests ported from tests.indexing.test_coercion + + @pytest.fixture + def key(self): + return 1 + + @pytest.fixture + def expected(self, obj, key, val, exp_dtype): + vals = list(obj) + vals[key] = val + return Series(vals, dtype=exp_dtype) + + +@pytest.mark.parametrize( + "val,exp_dtype,warn", + [(np.int32(1), np.int8, None), (np.int16(2**9), np.int16, FutureWarning)], +) +class TestCoercionInt8(CoercionTest): + # previously test_setitem_series_int8 in tests.indexing.test_coercion + @pytest.fixture + def obj(self): + return Series([1, 2, 3, 4], dtype=np.int8) + + +@pytest.mark.parametrize("val", [1, 1.1, 1 + 1j, True]) +@pytest.mark.parametrize("exp_dtype", [object]) +class TestCoercionObject(CoercionTest): + # previously test_setitem_series_object in tests.indexing.test_coercion + @pytest.fixture + def obj(self): + return Series(["a", "b", "c", "d"], dtype=object) + + @pytest.fixture + def warn(self): + return None + + +@pytest.mark.parametrize( + "val,exp_dtype,warn", + [ + (1, np.complex128, None), + (1.1, np.complex128, None), + (1 + 1j, np.complex128, None), + (True, object, FutureWarning), + ], +) +class TestCoercionComplex(CoercionTest): + # previously test_setitem_series_complex128 in tests.indexing.test_coercion + @pytest.fixture + def obj(self): + return Series([1 + 1j, 2 + 2j, 3 + 3j, 4 + 4j]) + + +@pytest.mark.parametrize( + "val,exp_dtype,warn", + [ + (1, object, FutureWarning), + ("3", object, FutureWarning), + (3, object, FutureWarning), + (1.1, object, FutureWarning), + (1 + 1j, object, FutureWarning), + (True, bool, None), + ], +) +class TestCoercionBool(CoercionTest): + # previously test_setitem_series_bool in tests.indexing.test_coercion + @pytest.fixture + def obj(self): + return Series([True, False, True, False], dtype=bool) + + +@pytest.mark.parametrize( + "val,exp_dtype,warn", + [ + (1, np.int64, None), + (1.1, np.float64, FutureWarning), + (1 + 1j, np.complex128, FutureWarning), + (True, object, FutureWarning), + ], +) +class TestCoercionInt64(CoercionTest): + # previously test_setitem_series_int64 in tests.indexing.test_coercion + @pytest.fixture + def obj(self): + return Series([1, 2, 3, 4]) + + +@pytest.mark.parametrize( + "val,exp_dtype,warn", + [ + (1, np.float64, None), + (1.1, np.float64, None), + (1 + 1j, np.complex128, FutureWarning), + (True, object, FutureWarning), + ], +) +class TestCoercionFloat64(CoercionTest): + # previously test_setitem_series_float64 in tests.indexing.test_coercion + @pytest.fixture + def obj(self): + return Series([1.1, 2.2, 3.3, 4.4]) + + +@pytest.mark.parametrize( + "val,exp_dtype,warn", + [ + (1, np.float32, None), + pytest.param( + 1.1, + np.float32, + None, + marks=pytest.mark.xfail( + ( + not np_version_gte1p24 + or ( + np_version_gte1p24 + and not np_version_gt2 + and os.environ.get("NPY_PROMOTION_STATE", "legacy") != "weak" + ) + ), + reason="np.float32(1.1) ends up as 1.100000023841858, so " + "np_can_hold_element raises and we cast to float64", + ), + ), + (1 + 1j, np.complex128, FutureWarning), + (True, object, FutureWarning), + (np.uint8(2), np.float32, None), + (np.uint32(2), np.float32, None), + # float32 cannot hold np.iinfo(np.uint32).max exactly + # (closest it can hold is 4294967300.0 which off by 5.0), so + # we cast to float64 + (np.uint32(np.iinfo(np.uint32).max), np.float64, FutureWarning), + (np.uint64(2), np.float32, None), + (np.int64(2), np.float32, None), + ], +) +class TestCoercionFloat32(CoercionTest): + @pytest.fixture + def obj(self): + return Series([1.1, 2.2, 3.3, 4.4], dtype=np.float32) + + def test_slice_key(self, obj, key, expected, warn, val, indexer_sli, is_inplace): + super().test_slice_key(obj, key, expected, warn, val, indexer_sli, is_inplace) + + if isinstance(val, float): + # the xfail would xpass bc test_slice_key short-circuits + raise AssertionError("xfail not relevant for this test.") + + +@pytest.mark.parametrize( + "val,exp_dtype,warn", + [ + (Timestamp("2012-01-01"), "datetime64[ns]", None), + (1, object, FutureWarning), + ("x", object, FutureWarning), + ], +) +class TestCoercionDatetime64(CoercionTest): + # previously test_setitem_series_datetime64 in tests.indexing.test_coercion + + @pytest.fixture + def obj(self): + return Series(date_range("2011-01-01", freq="D", periods=4)) + + @pytest.fixture + def warn(self): + return None + + +@pytest.mark.parametrize( + "val,exp_dtype,warn", + [ + (Timestamp("2012-01-01", tz="US/Eastern"), "datetime64[ns, US/Eastern]", None), + # pre-2.0, a mis-matched tz would end up casting to object + (Timestamp("2012-01-01", tz="US/Pacific"), "datetime64[ns, US/Eastern]", None), + (Timestamp("2012-01-01"), object, FutureWarning), + (1, object, FutureWarning), + ], +) +class TestCoercionDatetime64TZ(CoercionTest): + # previously test_setitem_series_datetime64tz in tests.indexing.test_coercion + @pytest.fixture + def obj(self): + tz = "US/Eastern" + return Series(date_range("2011-01-01", freq="D", periods=4, tz=tz)) + + @pytest.fixture + def warn(self): + return None + + +@pytest.mark.parametrize( + "val,exp_dtype,warn", + [ + (Timedelta("12 day"), "timedelta64[ns]", None), + (1, object, FutureWarning), + ("x", object, FutureWarning), + ], +) +class TestCoercionTimedelta64(CoercionTest): + # previously test_setitem_series_timedelta64 in tests.indexing.test_coercion + @pytest.fixture + def obj(self): + return Series(timedelta_range("1 day", periods=4)) + + @pytest.fixture + def warn(self): + return None + + +@pytest.mark.parametrize( + "val", ["foo", Period("2016", freq="Y"), Interval(1, 2, closed="both")] +) +@pytest.mark.parametrize("exp_dtype", [object]) +class TestPeriodIntervalCoercion(CoercionTest): + # GH#45768 + @pytest.fixture( + params=[ + period_range("2016-01-01", periods=3, freq="D"), + interval_range(1, 5), + ] + ) + def obj(self, request): + return Series(request.param) + + @pytest.fixture + def warn(self): + return FutureWarning + + +def test_20643(): + # closed by GH#45121 + orig = Series([0, 1, 2], index=["a", "b", "c"]) + + expected = Series([0, 2.7, 2], index=["a", "b", "c"]) + + ser = orig.copy() + with tm.assert_produces_warning(FutureWarning, match="incompatible dtype"): + ser.at["b"] = 2.7 + tm.assert_series_equal(ser, expected) + + ser = orig.copy() + with tm.assert_produces_warning(FutureWarning, match="incompatible dtype"): + ser.loc["b"] = 2.7 + tm.assert_series_equal(ser, expected) + + ser = orig.copy() + with tm.assert_produces_warning(FutureWarning, match="incompatible dtype"): + ser["b"] = 2.7 + tm.assert_series_equal(ser, expected) + + ser = orig.copy() + with tm.assert_produces_warning(FutureWarning, match="incompatible dtype"): + ser.iat[1] = 2.7 + tm.assert_series_equal(ser, expected) + + ser = orig.copy() + with tm.assert_produces_warning(FutureWarning, match="incompatible dtype"): + ser.iloc[1] = 2.7 + tm.assert_series_equal(ser, expected) + + orig_df = orig.to_frame("A") + expected_df = expected.to_frame("A") + + df = orig_df.copy() + with tm.assert_produces_warning(FutureWarning, match="incompatible dtype"): + df.at["b", "A"] = 2.7 + tm.assert_frame_equal(df, expected_df) + + df = orig_df.copy() + with tm.assert_produces_warning(FutureWarning, match="incompatible dtype"): + df.loc["b", "A"] = 2.7 + tm.assert_frame_equal(df, expected_df) + + df = orig_df.copy() + with tm.assert_produces_warning(FutureWarning, match="incompatible dtype"): + df.iloc[1, 0] = 2.7 + tm.assert_frame_equal(df, expected_df) + + df = orig_df.copy() + with tm.assert_produces_warning(FutureWarning, match="incompatible dtype"): + df.iat[1, 0] = 2.7 + tm.assert_frame_equal(df, expected_df) + + +def test_20643_comment(): + # https://github.com/pandas-dev/pandas/issues/20643#issuecomment-431244590 + # fixed sometime prior to GH#45121 + orig = Series([0, 1, 2], index=["a", "b", "c"]) + expected = Series([np.nan, 1, 2], index=["a", "b", "c"]) + + ser = orig.copy() + ser.iat[0] = None + tm.assert_series_equal(ser, expected) + + ser = orig.copy() + ser.iloc[0] = None + tm.assert_series_equal(ser, expected) + + +def test_15413(): + # fixed by GH#45121 + ser = Series([1, 2, 3]) + + with tm.assert_produces_warning(FutureWarning, match="incompatible dtype"): + ser[ser == 2] += 0.5 + expected = Series([1, 2.5, 3]) + tm.assert_series_equal(ser, expected) + + ser = Series([1, 2, 3]) + with tm.assert_produces_warning(FutureWarning, match="incompatible dtype"): + ser[1] += 0.5 + tm.assert_series_equal(ser, expected) + + ser = Series([1, 2, 3]) + with tm.assert_produces_warning(FutureWarning, match="incompatible dtype"): + ser.loc[1] += 0.5 + tm.assert_series_equal(ser, expected) + + ser = Series([1, 2, 3]) + with tm.assert_produces_warning(FutureWarning, match="incompatible dtype"): + ser.iloc[1] += 0.5 + tm.assert_series_equal(ser, expected) + + ser = Series([1, 2, 3]) + with tm.assert_produces_warning(FutureWarning, match="incompatible dtype"): + ser.iat[1] += 0.5 + tm.assert_series_equal(ser, expected) + + ser = Series([1, 2, 3]) + with tm.assert_produces_warning(FutureWarning, match="incompatible dtype"): + ser.at[1] += 0.5 + tm.assert_series_equal(ser, expected) + + +def test_32878_int_itemsize(): + # Fixed by GH#45121 + arr = np.arange(5).astype("i4") + ser = Series(arr) + val = np.int64(np.iinfo(np.int64).max) + with tm.assert_produces_warning(FutureWarning, match="incompatible dtype"): + ser[0] = val + expected = Series([val, 1, 2, 3, 4], dtype=np.int64) + tm.assert_series_equal(ser, expected) + + +def test_32878_complex_itemsize(): + arr = np.arange(5).astype("c8") + ser = Series(arr) + val = np.finfo(np.float64).max + val = val.astype("c16") + + # GH#32878 used to coerce val to inf+0.000000e+00j + with tm.assert_produces_warning(FutureWarning, match="incompatible dtype"): + ser[0] = val + assert ser[0] == val + expected = Series([val, 1, 2, 3, 4], dtype="c16") + tm.assert_series_equal(ser, expected) + + +def test_37692(indexer_al): + # GH#37692 + ser = Series([1, 2, 3], index=["a", "b", "c"]) + with tm.assert_produces_warning(FutureWarning, match="incompatible dtype"): + indexer_al(ser)["b"] = "test" + expected = Series([1, "test", 3], index=["a", "b", "c"], dtype=object) + tm.assert_series_equal(ser, expected) + + +def test_setitem_bool_int_float_consistency(indexer_sli): + # GH#21513 + # bool-with-int and bool-with-float both upcast to object + # int-with-float and float-with-int are both non-casting so long + # as the setitem can be done losslessly + for dtype in [np.float64, np.int64]: + ser = Series(0, index=range(3), dtype=dtype) + with tm.assert_produces_warning(FutureWarning, match="incompatible dtype"): + indexer_sli(ser)[0] = True + assert ser.dtype == object + + ser = Series(0, index=range(3), dtype=bool) + with tm.assert_produces_warning(FutureWarning, match="incompatible dtype"): + ser[0] = dtype(1) + assert ser.dtype == object + + # 1.0 can be held losslessly, so no casting + ser = Series(0, index=range(3), dtype=np.int64) + indexer_sli(ser)[0] = np.float64(1.0) + assert ser.dtype == np.int64 + + # 1 can be held losslessly, so no casting + ser = Series(0, index=range(3), dtype=np.float64) + indexer_sli(ser)[0] = np.int64(1) + + +def test_setitem_positional_with_casting(): + # GH#45070 case where in __setitem__ we get a KeyError, then when + # we fallback we *also* get a ValueError if we try to set inplace. + ser = Series([1, 2, 3], index=["a", "b", "c"]) + + warn_msg = "Series.__setitem__ treating keys as positions is deprecated" + with tm.assert_produces_warning(FutureWarning, match=warn_msg): + ser[0] = "X" + expected = Series(["X", 2, 3], index=["a", "b", "c"], dtype=object) + tm.assert_series_equal(ser, expected) + + +def test_setitem_positional_float_into_int_coerces(): + # Case where we hit a KeyError and then trying to set in-place incorrectly + # casts a float to an int + ser = Series([1, 2, 3], index=["a", "b", "c"]) + + warn_msg = "Series.__setitem__ treating keys as positions is deprecated" + with tm.assert_produces_warning(FutureWarning, match=warn_msg): + ser[0] = 1.5 + expected = Series([1.5, 2, 3], index=["a", "b", "c"]) + tm.assert_series_equal(ser, expected) + + +def test_setitem_int_not_positional(): + # GH#42215 deprecated falling back to positional on __setitem__ with an + # int not contained in the index; enforced in 2.0 + ser = Series([1, 2, 3, 4], index=[1.1, 2.1, 3.0, 4.1]) + assert not ser.index._should_fallback_to_positional + # assert not ser.index.astype(object)._should_fallback_to_positional + + # 3.0 is in our index, so post-enforcement behavior is unchanged + ser[3] = 10 + expected = Series([1, 2, 10, 4], index=ser.index) + tm.assert_series_equal(ser, expected) + + # pre-enforcement `ser[5] = 5` raised IndexError + ser[5] = 5 + expected = Series([1, 2, 10, 4, 5], index=[1.1, 2.1, 3.0, 4.1, 5.0]) + tm.assert_series_equal(ser, expected) + + ii = IntervalIndex.from_breaks(range(10))[::2] + ser2 = Series(range(len(ii)), index=ii) + exp_index = ii.astype(object).append(Index([4])) + expected2 = Series([0, 1, 2, 3, 4, 9], index=exp_index) + # pre-enforcement `ser2[4] = 9` interpreted 4 as positional + ser2[4] = 9 + tm.assert_series_equal(ser2, expected2) + + mi = MultiIndex.from_product([ser.index, ["A", "B"]]) + ser3 = Series(range(len(mi)), index=mi) + expected3 = ser3.copy() + expected3.loc[4] = 99 + # pre-enforcement `ser3[4] = 99` interpreted 4 as positional + ser3[4] = 99 + tm.assert_series_equal(ser3, expected3) + + +def test_setitem_with_bool_indexer(): + # GH#42530 + + df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) + result = df.pop("b").copy() + result[[True, False, False]] = 9 + expected = Series(data=[9, 5, 6], name="b") + tm.assert_series_equal(result, expected) + + df.loc[[True, False, False], "a"] = 10 + expected = DataFrame({"a": [10, 2, 3]}) + tm.assert_frame_equal(df, expected) + + +@pytest.mark.parametrize("size", range(2, 6)) +@pytest.mark.parametrize( + "mask", [[True, False, False, False, False], [True, False], [False]] +) +@pytest.mark.parametrize( + "item", [2.0, np.nan, np.finfo(float).max, np.finfo(float).min] +) +# Test numpy arrays, lists and tuples as the input to be +# broadcast +@pytest.mark.parametrize( + "box", [lambda x: np.array([x]), lambda x: [x], lambda x: (x,)] +) +def test_setitem_bool_indexer_dont_broadcast_length1_values(size, mask, item, box): + # GH#44265 + # see also tests.series.indexing.test_where.test_broadcast + + selection = np.resize(mask, size) + + data = np.arange(size, dtype=float) + + ser = Series(data) + + if selection.sum() != 1: + msg = ( + "cannot set using a list-like indexer with a different " + "length than the value" + ) + with pytest.raises(ValueError, match=msg): + # GH#44265 + ser[selection] = box(item) + else: + # In this corner case setting is equivalent to setting with the unboxed + # item + ser[selection] = box(item) + + expected = Series(np.arange(size, dtype=float)) + expected[selection] = item + tm.assert_series_equal(ser, expected) + + +def test_setitem_empty_mask_dont_upcast_dt64(): + dti = date_range("2016-01-01", periods=3) + ser = Series(dti) + orig = ser.copy() + mask = np.zeros(3, dtype=bool) + + ser[mask] = "foo" + assert ser.dtype == dti.dtype # no-op -> dont upcast + tm.assert_series_equal(ser, orig) + + ser.mask(mask, "foo", inplace=True) + assert ser.dtype == dti.dtype # no-op -> dont upcast + tm.assert_series_equal(ser, orig) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/series/methods/test_add_prefix_suffix.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/series/methods/test_add_prefix_suffix.py new file mode 100644 index 0000000000000000000000000000000000000000..289a56b98b7e123fb1c1edf5cc9ff41369d51122 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/series/methods/test_add_prefix_suffix.py @@ -0,0 +1,41 @@ +import pytest + +from pandas import Index +import pandas._testing as tm + + +def test_add_prefix_suffix(string_series): + with_prefix = string_series.add_prefix("foo#") + expected = Index([f"foo#{c}" for c in string_series.index]) + tm.assert_index_equal(with_prefix.index, expected) + + with_suffix = string_series.add_suffix("#foo") + expected = Index([f"{c}#foo" for c in string_series.index]) + tm.assert_index_equal(with_suffix.index, expected) + + with_pct_prefix = string_series.add_prefix("%") + expected = Index([f"%{c}" for c in string_series.index]) + tm.assert_index_equal(with_pct_prefix.index, expected) + + with_pct_suffix = string_series.add_suffix("%") + expected = Index([f"{c}%" for c in string_series.index]) + tm.assert_index_equal(with_pct_suffix.index, expected) + + +def test_add_prefix_suffix_axis(string_series): + # GH 47819 + with_prefix = string_series.add_prefix("foo#", axis=0) + expected = Index([f"foo#{c}" for c in string_series.index]) + tm.assert_index_equal(with_prefix.index, expected) + + with_pct_suffix = string_series.add_suffix("#foo", axis=0) + expected = Index([f"{c}#foo" for c in string_series.index]) + tm.assert_index_equal(with_pct_suffix.index, expected) + + +def test_add_prefix_suffix_invalid_axis(string_series): + with pytest.raises(ValueError, match="No axis named 1 for object type Series"): + string_series.add_prefix("foo#", axis=1) + + with pytest.raises(ValueError, match="No axis named 1 for object type Series"): + string_series.add_suffix("foo#", axis=1) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/series/methods/test_argsort.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/series/methods/test_argsort.py new file mode 100644 index 0000000000000000000000000000000000000000..432c0eceee01107cdcd23056a39e0ef4bd55545b --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/series/methods/test_argsort.py @@ -0,0 +1,84 @@ +import numpy as np +import pytest + +from pandas import ( + Series, + Timestamp, + isna, +) +import pandas._testing as tm + + +class TestSeriesArgsort: + def test_argsort_axis(self): + # GH#54257 + ser = Series(range(3)) + + msg = "No axis named 2 for object type Series" + with pytest.raises(ValueError, match=msg): + ser.argsort(axis=2) + + def test_argsort_numpy(self, datetime_series): + ser = datetime_series + + res = np.argsort(ser).values + expected = np.argsort(np.array(ser)) + tm.assert_numpy_array_equal(res, expected) + + # with missing values + ts = ser.copy() + ts[::2] = np.nan + + msg = "The behavior of Series.argsort in the presence of NA values" + with tm.assert_produces_warning( + FutureWarning, match=msg, check_stacklevel=False + ): + result = np.argsort(ts)[1::2] + expected = np.argsort(np.array(ts.dropna())) + + tm.assert_numpy_array_equal(result.values, expected) + + def test_argsort(self, datetime_series): + argsorted = datetime_series.argsort() + assert issubclass(argsorted.dtype.type, np.integer) + + def test_argsort_dt64(self, unit): + # GH#2967 (introduced bug in 0.11-dev I think) + ser = Series( + [Timestamp(f"201301{i:02d}") for i in range(1, 6)], dtype=f"M8[{unit}]" + ) + assert ser.dtype == f"datetime64[{unit}]" + shifted = ser.shift(-1) + assert shifted.dtype == f"datetime64[{unit}]" + assert isna(shifted[4]) + + result = ser.argsort() + expected = Series(range(5), dtype=np.intp) + tm.assert_series_equal(result, expected) + + msg = "The behavior of Series.argsort in the presence of NA values" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = shifted.argsort() + expected = Series(list(range(4)) + [-1], dtype=np.intp) + tm.assert_series_equal(result, expected) + + def test_argsort_stable(self): + ser = Series(np.random.default_rng(2).integers(0, 100, size=10000)) + mindexer = ser.argsort(kind="mergesort") + qindexer = ser.argsort() + + mexpected = np.argsort(ser.values, kind="mergesort") + qexpected = np.argsort(ser.values, kind="quicksort") + + tm.assert_series_equal(mindexer.astype(np.intp), Series(mexpected)) + tm.assert_series_equal(qindexer.astype(np.intp), Series(qexpected)) + msg = ( + r"ndarray Expected type , " + r"found instead" + ) + with pytest.raises(AssertionError, match=msg): + tm.assert_numpy_array_equal(qindexer, mindexer) + + def test_argsort_preserve_name(self, datetime_series): + result = datetime_series.argsort() + assert result.name == datetime_series.name diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/series/methods/test_asof.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/series/methods/test_asof.py new file mode 100644 index 0000000000000000000000000000000000000000..2acc2921e5efc089b1dd4ed7aa9a6cebc1a3d0fe --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/series/methods/test_asof.py @@ -0,0 +1,205 @@ +import numpy as np +import pytest + +from pandas._libs.tslibs import IncompatibleFrequency + +from pandas import ( + DatetimeIndex, + PeriodIndex, + Series, + Timestamp, + date_range, + isna, + notna, + offsets, + period_range, +) +import pandas._testing as tm + + +class TestSeriesAsof: + def test_asof_nanosecond_index_access(self): + ts = Timestamp("20130101").as_unit("ns")._value + dti = DatetimeIndex([ts + 50 + i for i in range(100)]) + ser = Series(np.random.default_rng(2).standard_normal(100), index=dti) + + first_value = ser.asof(ser.index[0]) + + # GH#46903 previously incorrectly was "day" + assert dti.resolution == "nanosecond" + + # this used to not work bc parsing was done by dateutil that didn't + # handle nanoseconds + assert first_value == ser["2013-01-01 00:00:00.000000050"] + + expected_ts = np.datetime64("2013-01-01 00:00:00.000000050", "ns") + assert first_value == ser[Timestamp(expected_ts)] + + def test_basic(self): + # array or list or dates + N = 50 + rng = date_range("1/1/1990", periods=N, freq="53s") + ts = Series(np.random.default_rng(2).standard_normal(N), index=rng) + ts.iloc[15:30] = np.nan + dates = date_range("1/1/1990", periods=N * 3, freq="25s") + + result = ts.asof(dates) + assert notna(result).all() + lb = ts.index[14] + ub = ts.index[30] + + result = ts.asof(list(dates)) + assert notna(result).all() + lb = ts.index[14] + ub = ts.index[30] + + mask = (result.index >= lb) & (result.index < ub) + rs = result[mask] + assert (rs == ts[lb]).all() + + val = result[result.index[result.index >= ub][0]] + assert ts[ub] == val + + def test_scalar(self): + N = 30 + rng = date_range("1/1/1990", periods=N, freq="53s") + # Explicit cast to float avoid implicit cast when setting nan + ts = Series(np.arange(N), index=rng, dtype="float") + ts.iloc[5:10] = np.nan + ts.iloc[15:20] = np.nan + + val1 = ts.asof(ts.index[7]) + val2 = ts.asof(ts.index[19]) + + assert val1 == ts.iloc[4] + assert val2 == ts.iloc[14] + + # accepts strings + val1 = ts.asof(str(ts.index[7])) + assert val1 == ts.iloc[4] + + # in there + result = ts.asof(ts.index[3]) + assert result == ts.iloc[3] + + # no as of value + d = ts.index[0] - offsets.BDay() + assert np.isnan(ts.asof(d)) + + def test_with_nan(self): + # basic asof test + rng = date_range("1/1/2000", "1/2/2000", freq="4h") + s = Series(np.arange(len(rng)), index=rng) + r = s.resample("2h").mean() + + result = r.asof(r.index) + expected = Series( + [0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6.0], + index=date_range("1/1/2000", "1/2/2000", freq="2h"), + ) + tm.assert_series_equal(result, expected) + + r.iloc[3:5] = np.nan + result = r.asof(r.index) + expected = Series( + [0, 0, 1, 1, 1, 1, 3, 3, 4, 4, 5, 5, 6.0], + index=date_range("1/1/2000", "1/2/2000", freq="2h"), + ) + tm.assert_series_equal(result, expected) + + r.iloc[-3:] = np.nan + result = r.asof(r.index) + expected = Series( + [0, 0, 1, 1, 1, 1, 3, 3, 4, 4, 4, 4, 4.0], + index=date_range("1/1/2000", "1/2/2000", freq="2h"), + ) + tm.assert_series_equal(result, expected) + + def test_periodindex(self): + # array or list or dates + N = 50 + rng = period_range("1/1/1990", periods=N, freq="h") + ts = Series(np.random.default_rng(2).standard_normal(N), index=rng) + ts.iloc[15:30] = np.nan + dates = date_range("1/1/1990", periods=N * 3, freq="37min") + + result = ts.asof(dates) + assert notna(result).all() + lb = ts.index[14] + ub = ts.index[30] + + result = ts.asof(list(dates)) + assert notna(result).all() + lb = ts.index[14] + ub = ts.index[30] + + pix = PeriodIndex(result.index.values, freq="h") + mask = (pix >= lb) & (pix < ub) + rs = result[mask] + assert (rs == ts[lb]).all() + + ts.iloc[5:10] = np.nan + ts.iloc[15:20] = np.nan + + val1 = ts.asof(ts.index[7]) + val2 = ts.asof(ts.index[19]) + + assert val1 == ts.iloc[4] + assert val2 == ts.iloc[14] + + # accepts strings + val1 = ts.asof(str(ts.index[7])) + assert val1 == ts.iloc[4] + + # in there + assert ts.asof(ts.index[3]) == ts.iloc[3] + + # no as of value + d = ts.index[0].to_timestamp() - offsets.BDay() + assert isna(ts.asof(d)) + + # Mismatched freq + msg = "Input has different freq" + with pytest.raises(IncompatibleFrequency, match=msg): + ts.asof(rng.asfreq("D")) + + def test_errors(self): + s = Series( + [1, 2, 3], + index=[Timestamp("20130101"), Timestamp("20130103"), Timestamp("20130102")], + ) + + # non-monotonic + assert not s.index.is_monotonic_increasing + with pytest.raises(ValueError, match="requires a sorted index"): + s.asof(s.index[0]) + + # subset with Series + N = 10 + rng = date_range("1/1/1990", periods=N, freq="53s") + s = Series(np.random.default_rng(2).standard_normal(N), index=rng) + with pytest.raises(ValueError, match="not valid for Series"): + s.asof(s.index[0], subset="foo") + + def test_all_nans(self): + # GH 15713 + # series is all nans + + # testing non-default indexes + N = 50 + rng = date_range("1/1/1990", periods=N, freq="53s") + + dates = date_range("1/1/1990", periods=N * 3, freq="25s") + result = Series(np.nan, index=rng).asof(dates) + expected = Series(np.nan, index=dates) + tm.assert_series_equal(result, expected) + + # testing scalar input + date = date_range("1/1/1990", periods=N * 3, freq="25s")[0] + result = Series(np.nan, index=rng).asof(date) + assert isna(result) + + # test name is propagated + result = Series(np.nan, index=[1, 2, 3, 4], name="test").asof([4, 5]) + expected = Series(np.nan, index=[4, 5], name="test") + tm.assert_series_equal(result, expected) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/series/methods/test_case_when.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/series/methods/test_case_when.py new file mode 100644 index 0000000000000000000000000000000000000000..7cb60a11644a357811136c991143554c4477d485 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/series/methods/test_case_when.py @@ -0,0 +1,148 @@ +import numpy as np +import pytest + +from pandas import ( + DataFrame, + Series, + array as pd_array, + date_range, +) +import pandas._testing as tm + + +@pytest.fixture +def df(): + """ + base dataframe for testing + """ + return DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) + + +def test_case_when_caselist_is_not_a_list(df): + """ + Raise ValueError if caselist is not a list. + """ + msg = "The caselist argument should be a list; " + msg += "instead got.+" + with pytest.raises(TypeError, match=msg): # GH39154 + df["a"].case_when(caselist=()) + + +def test_case_when_no_caselist(df): + """ + Raise ValueError if no caselist is provided. + """ + msg = "provide at least one boolean condition, " + msg += "with a corresponding replacement." + with pytest.raises(ValueError, match=msg): # GH39154 + df["a"].case_when([]) + + +def test_case_when_odd_caselist(df): + """ + Raise ValueError if no of caselist is odd. + """ + msg = "Argument 0 must have length 2; " + msg += "a condition and replacement; instead got length 3." + + with pytest.raises(ValueError, match=msg): + df["a"].case_when([(df["a"].eq(1), 1, df.a.gt(1))]) + + +def test_case_when_raise_error_from_mask(df): + """ + Raise Error from within Series.mask + """ + msg = "Failed to apply condition0 and replacement0." + with pytest.raises(ValueError, match=msg): + df["a"].case_when([(df["a"].eq(1), [1, 2])]) + + +def test_case_when_single_condition(df): + """ + Test output on a single condition. + """ + result = Series([np.nan, np.nan, np.nan]).case_when([(df.a.eq(1), 1)]) + expected = Series([1, np.nan, np.nan]) + tm.assert_series_equal(result, expected) + + +def test_case_when_multiple_conditions(df): + """ + Test output when booleans are derived from a computation + """ + result = Series([np.nan, np.nan, np.nan]).case_when( + [(df.a.eq(1), 1), (Series([False, True, False]), 2)] + ) + expected = Series([1, 2, np.nan]) + tm.assert_series_equal(result, expected) + + +def test_case_when_multiple_conditions_replacement_list(df): + """ + Test output when replacement is a list + """ + result = Series([np.nan, np.nan, np.nan]).case_when( + [([True, False, False], 1), (df["a"].gt(1) & df["b"].eq(5), [1, 2, 3])] + ) + expected = Series([1, 2, np.nan]) + tm.assert_series_equal(result, expected) + + +def test_case_when_multiple_conditions_replacement_extension_dtype(df): + """ + Test output when replacement has an extension dtype + """ + result = Series([np.nan, np.nan, np.nan]).case_when( + [ + ([True, False, False], 1), + (df["a"].gt(1) & df["b"].eq(5), pd_array([1, 2, 3], dtype="Int64")), + ], + ) + expected = Series([1, 2, np.nan], dtype="Float64") + tm.assert_series_equal(result, expected) + + +def test_case_when_multiple_conditions_replacement_series(df): + """ + Test output when replacement is a Series + """ + result = Series([np.nan, np.nan, np.nan]).case_when( + [ + (np.array([True, False, False]), 1), + (df["a"].gt(1) & df["b"].eq(5), Series([1, 2, 3])), + ], + ) + expected = Series([1, 2, np.nan]) + tm.assert_series_equal(result, expected) + + +def test_case_when_non_range_index(): + """ + Test output if index is not RangeIndex + """ + rng = np.random.default_rng(seed=123) + dates = date_range("1/1/2000", periods=8) + df = DataFrame( + rng.standard_normal(size=(8, 4)), index=dates, columns=["A", "B", "C", "D"] + ) + result = Series(5, index=df.index, name="A").case_when([(df.A.gt(0), df.B)]) + expected = df.A.mask(df.A.gt(0), df.B).where(df.A.gt(0), 5) + tm.assert_series_equal(result, expected) + + +def test_case_when_callable(): + """ + Test output on a callable + """ + # https://numpy.org/doc/stable/reference/generated/numpy.piecewise.html + x = np.linspace(-2.5, 2.5, 6) + ser = Series(x) + result = ser.case_when( + caselist=[ + (lambda df: df < 0, lambda df: -df), + (lambda df: df >= 0, lambda df: df), + ] + ) + expected = np.piecewise(x, [x < 0, x >= 0], [lambda x: -x, lambda x: x]) + tm.assert_series_equal(result, Series(expected)) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/series/methods/test_clip.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/series/methods/test_clip.py new file mode 100644 index 0000000000000000000000000000000000000000..5bbf35f6e39bb237ef1a3853403d47f1fed5b3de --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/series/methods/test_clip.py @@ -0,0 +1,146 @@ +from datetime import datetime + +import numpy as np +import pytest + +import pandas as pd +from pandas import ( + Series, + Timestamp, + isna, + notna, +) +import pandas._testing as tm + + +class TestSeriesClip: + def test_clip(self, datetime_series): + val = datetime_series.median() + + assert datetime_series.clip(lower=val).min() == val + assert datetime_series.clip(upper=val).max() == val + + result = datetime_series.clip(-0.5, 0.5) + expected = np.clip(datetime_series, -0.5, 0.5) + tm.assert_series_equal(result, expected) + assert isinstance(expected, Series) + + def test_clip_types_and_nulls(self): + sers = [ + Series([np.nan, 1.0, 2.0, 3.0]), + Series([None, "a", "b", "c"]), + Series(pd.to_datetime([np.nan, 1, 2, 3], unit="D")), + ] + + for s in sers: + thresh = s[2] + lower = s.clip(lower=thresh) + upper = s.clip(upper=thresh) + assert lower[notna(lower)].min() == thresh + assert upper[notna(upper)].max() == thresh + assert list(isna(s)) == list(isna(lower)) + assert list(isna(s)) == list(isna(upper)) + + def test_series_clipping_with_na_values(self, any_numeric_ea_dtype, nulls_fixture): + # Ensure that clipping method can handle NA values with out failing + # GH#40581 + + if nulls_fixture is pd.NaT: + # constructor will raise, see + # test_constructor_mismatched_null_nullable_dtype + pytest.skip("See test_constructor_mismatched_null_nullable_dtype") + + ser = Series([nulls_fixture, 1.0, 3.0], dtype=any_numeric_ea_dtype) + s_clipped_upper = ser.clip(upper=2.0) + s_clipped_lower = ser.clip(lower=2.0) + + expected_upper = Series([nulls_fixture, 1.0, 2.0], dtype=any_numeric_ea_dtype) + expected_lower = Series([nulls_fixture, 2.0, 3.0], dtype=any_numeric_ea_dtype) + + tm.assert_series_equal(s_clipped_upper, expected_upper) + tm.assert_series_equal(s_clipped_lower, expected_lower) + + def test_clip_with_na_args(self): + """Should process np.nan argument as None""" + # GH#17276 + s = Series([1, 2, 3]) + + tm.assert_series_equal(s.clip(np.nan), Series([1, 2, 3])) + tm.assert_series_equal(s.clip(upper=np.nan, lower=np.nan), Series([1, 2, 3])) + + # GH#19992 + msg = "Downcasting behavior in Series and DataFrame methods 'where'" + # TODO: avoid this warning here? seems like we should never be upcasting + # in the first place? + with tm.assert_produces_warning(FutureWarning, match=msg): + res = s.clip(lower=[0, 4, np.nan]) + tm.assert_series_equal(res, Series([1, 4, 3])) + with tm.assert_produces_warning(FutureWarning, match=msg): + res = s.clip(upper=[1, np.nan, 1]) + tm.assert_series_equal(res, Series([1, 2, 1])) + + # GH#40420 + s = Series([1, 2, 3]) + result = s.clip(0, [np.nan, np.nan, np.nan]) + tm.assert_series_equal(s, result) + + def test_clip_against_series(self): + # GH#6966 + + s = Series([1.0, 1.0, 4.0]) + + lower = Series([1.0, 2.0, 3.0]) + upper = Series([1.5, 2.5, 3.5]) + + tm.assert_series_equal(s.clip(lower, upper), Series([1.0, 2.0, 3.5])) + tm.assert_series_equal(s.clip(1.5, upper), Series([1.5, 1.5, 3.5])) + + @pytest.mark.parametrize("inplace", [True, False]) + @pytest.mark.parametrize("upper", [[1, 2, 3], np.asarray([1, 2, 3])]) + def test_clip_against_list_like(self, inplace, upper): + # GH#15390 + original = Series([5, 6, 7]) + result = original.clip(upper=upper, inplace=inplace) + expected = Series([1, 2, 3]) + + if inplace: + result = original + tm.assert_series_equal(result, expected, check_exact=True) + + def test_clip_with_datetimes(self): + # GH#11838 + # naive and tz-aware datetimes + + t = Timestamp("2015-12-01 09:30:30") + s = Series([Timestamp("2015-12-01 09:30:00"), Timestamp("2015-12-01 09:31:00")]) + result = s.clip(upper=t) + expected = Series( + [Timestamp("2015-12-01 09:30:00"), Timestamp("2015-12-01 09:30:30")] + ) + tm.assert_series_equal(result, expected) + + t = Timestamp("2015-12-01 09:30:30", tz="US/Eastern") + s = Series( + [ + Timestamp("2015-12-01 09:30:00", tz="US/Eastern"), + Timestamp("2015-12-01 09:31:00", tz="US/Eastern"), + ] + ) + result = s.clip(upper=t) + expected = Series( + [ + Timestamp("2015-12-01 09:30:00", tz="US/Eastern"), + Timestamp("2015-12-01 09:30:30", tz="US/Eastern"), + ] + ) + tm.assert_series_equal(result, expected) + + @pytest.mark.parametrize("dtype", [object, "M8[us]"]) + def test_clip_with_timestamps_and_oob_datetimes(self, dtype): + # GH-42794 + ser = Series([datetime(1, 1, 1), datetime(9999, 9, 9)], dtype=dtype) + + result = ser.clip(lower=Timestamp.min, upper=Timestamp.max) + expected = Series([Timestamp.min, Timestamp.max], dtype=dtype) + + tm.assert_series_equal(result, expected) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/series/methods/test_combine.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/series/methods/test_combine.py new file mode 100644 index 0000000000000000000000000000000000000000..75d47e3daa10339f4c4cc7b35c52f24bbb20277a --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/series/methods/test_combine.py @@ -0,0 +1,17 @@ +from pandas import Series +import pandas._testing as tm + + +class TestCombine: + def test_combine_scalar(self): + # GH#21248 + # Note - combine() with another Series is tested elsewhere because + # it is used when testing operators + ser = Series([i * 10 for i in range(5)]) + result = ser.combine(3, lambda x, y: x + y) + expected = Series([i * 10 + 3 for i in range(5)]) + tm.assert_series_equal(result, expected) + + result = ser.combine(22, lambda x, y: min(x, y)) + expected = Series([min(i * 10, 22) for i in range(5)]) + tm.assert_series_equal(result, expected) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/series/methods/test_combine_first.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/series/methods/test_combine_first.py new file mode 100644 index 0000000000000000000000000000000000000000..e1ec8afda33a9740806cc993474780aaf37d435e --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/series/methods/test_combine_first.py @@ -0,0 +1,149 @@ +from datetime import datetime + +import numpy as np + +import pandas as pd +from pandas import ( + Period, + Series, + date_range, + period_range, + to_datetime, +) +import pandas._testing as tm + + +class TestCombineFirst: + def test_combine_first_period_datetime(self): + # GH#3367 + didx = date_range(start="1950-01-31", end="1950-07-31", freq="ME") + pidx = period_range(start=Period("1950-1"), end=Period("1950-7"), freq="M") + # check to be consistent with DatetimeIndex + for idx in [didx, pidx]: + a = Series([1, np.nan, np.nan, 4, 5, np.nan, 7], index=idx) + b = Series([9, 9, 9, 9, 9, 9, 9], index=idx) + + result = a.combine_first(b) + expected = Series([1, 9, 9, 4, 5, 9, 7], index=idx, dtype=np.float64) + tm.assert_series_equal(result, expected) + + def test_combine_first_name(self, datetime_series): + result = datetime_series.combine_first(datetime_series[:5]) + assert result.name == datetime_series.name + + def test_combine_first(self): + values = np.arange(20, dtype=np.float64) + series = Series(values, index=np.arange(20, dtype=np.int64)) + + series_copy = series * 2 + series_copy[::2] = np.nan + + # nothing used from the input + combined = series.combine_first(series_copy) + + tm.assert_series_equal(combined, series) + + # Holes filled from input + combined = series_copy.combine_first(series) + assert np.isfinite(combined).all() + + tm.assert_series_equal(combined[::2], series[::2]) + tm.assert_series_equal(combined[1::2], series_copy[1::2]) + + # mixed types + index = pd.Index([str(i) for i in range(20)]) + floats = Series(np.random.default_rng(2).standard_normal(20), index=index) + strings = Series([str(i) for i in range(10)], index=index[::2], dtype=object) + + combined = strings.combine_first(floats) + + tm.assert_series_equal(strings, combined.loc[index[::2]]) + tm.assert_series_equal(floats[1::2].astype(object), combined.loc[index[1::2]]) + + # corner case + ser = Series([1.0, 2, 3], index=[0, 1, 2]) + empty = Series([], index=[], dtype=object) + msg = "The behavior of array concatenation with empty entries is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = ser.combine_first(empty) + ser.index = ser.index.astype("O") + tm.assert_series_equal(ser, result) + + def test_combine_first_dt64(self, unit): + s0 = to_datetime(Series(["2010", np.nan])).dt.as_unit(unit) + s1 = to_datetime(Series([np.nan, "2011"])).dt.as_unit(unit) + rs = s0.combine_first(s1) + xp = to_datetime(Series(["2010", "2011"])).dt.as_unit(unit) + tm.assert_series_equal(rs, xp) + + s0 = to_datetime(Series(["2010", np.nan])).dt.as_unit(unit) + s1 = Series([np.nan, "2011"]) + rs = s0.combine_first(s1) + + xp = Series([datetime(2010, 1, 1), "2011"], dtype="datetime64[ns]") + + tm.assert_series_equal(rs, xp) + + def test_combine_first_dt_tz_values(self, tz_naive_fixture): + ser1 = Series( + pd.DatetimeIndex(["20150101", "20150102", "20150103"], tz=tz_naive_fixture), + name="ser1", + ) + ser2 = Series( + pd.DatetimeIndex(["20160514", "20160515", "20160516"], tz=tz_naive_fixture), + index=[2, 3, 4], + name="ser2", + ) + result = ser1.combine_first(ser2) + exp_vals = pd.DatetimeIndex( + ["20150101", "20150102", "20150103", "20160515", "20160516"], + tz=tz_naive_fixture, + ) + exp = Series(exp_vals, name="ser1") + tm.assert_series_equal(exp, result) + + def test_combine_first_timezone_series_with_empty_series(self): + # GH 41800 + time_index = date_range( + datetime(2021, 1, 1, 1), + datetime(2021, 1, 1, 10), + freq="h", + tz="Europe/Rome", + ) + s1 = Series(range(10), index=time_index) + s2 = Series(index=time_index) + msg = "The behavior of array concatenation with empty entries is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = s1.combine_first(s2) + tm.assert_series_equal(result, s1) + + def test_combine_first_preserves_dtype(self): + # GH51764 + s1 = Series([1666880195890293744, 1666880195890293837]) + s2 = Series([1, 2, 3]) + result = s1.combine_first(s2) + expected = Series([1666880195890293744, 1666880195890293837, 3]) + tm.assert_series_equal(result, expected) + + def test_combine_mixed_timezone(self): + # GH 26283 + uniform_tz = Series({pd.Timestamp("2019-05-01", tz="UTC"): 1.0}) + multi_tz = Series( + { + pd.Timestamp("2019-05-01 01:00:00+0100", tz="Europe/London"): 2.0, + pd.Timestamp("2019-05-02", tz="UTC"): 3.0, + } + ) + + result = uniform_tz.combine_first(multi_tz) + expected = Series( + [1.0, 3.0], + index=pd.Index( + [ + pd.Timestamp("2019-05-01 00:00:00+00:00", tz="UTC"), + pd.Timestamp("2019-05-02 00:00:00+00:00", tz="UTC"), + ], + dtype="object", + ), + ) + tm.assert_series_equal(result, expected) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/series/methods/test_compare.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/series/methods/test_compare.py new file mode 100644 index 0000000000000000000000000000000000000000..fe2016a245ec7c1373c72f40c7b6e7d899cf4f96 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/series/methods/test_compare.py @@ -0,0 +1,141 @@ +import numpy as np +import pytest + +import pandas as pd +import pandas._testing as tm + + +@pytest.mark.parametrize("align_axis", [0, 1, "index", "columns"]) +def test_compare_axis(align_axis): + # GH#30429 + s1 = pd.Series(["a", "b", "c"]) + s2 = pd.Series(["x", "b", "z"]) + + result = s1.compare(s2, align_axis=align_axis) + + if align_axis in (1, "columns"): + indices = pd.Index([0, 2]) + columns = pd.Index(["self", "other"]) + expected = pd.DataFrame( + [["a", "x"], ["c", "z"]], index=indices, columns=columns + ) + tm.assert_frame_equal(result, expected) + else: + indices = pd.MultiIndex.from_product([[0, 2], ["self", "other"]]) + expected = pd.Series(["a", "x", "c", "z"], index=indices) + tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize( + "keep_shape, keep_equal", + [ + (True, False), + (False, True), + (True, True), + # False, False case is already covered in test_compare_axis + ], +) +def test_compare_various_formats(keep_shape, keep_equal): + s1 = pd.Series(["a", "b", "c"]) + s2 = pd.Series(["x", "b", "z"]) + + result = s1.compare(s2, keep_shape=keep_shape, keep_equal=keep_equal) + + if keep_shape: + indices = pd.Index([0, 1, 2]) + columns = pd.Index(["self", "other"]) + if keep_equal: + expected = pd.DataFrame( + [["a", "x"], ["b", "b"], ["c", "z"]], index=indices, columns=columns + ) + else: + expected = pd.DataFrame( + [["a", "x"], [np.nan, np.nan], ["c", "z"]], + index=indices, + columns=columns, + ) + else: + indices = pd.Index([0, 2]) + columns = pd.Index(["self", "other"]) + expected = pd.DataFrame( + [["a", "x"], ["c", "z"]], index=indices, columns=columns + ) + tm.assert_frame_equal(result, expected) + + +def test_compare_with_equal_nulls(): + # We want to make sure two NaNs are considered the same + # and dropped where applicable + s1 = pd.Series(["a", "b", np.nan]) + s2 = pd.Series(["x", "b", np.nan]) + + result = s1.compare(s2) + expected = pd.DataFrame([["a", "x"]], columns=["self", "other"]) + tm.assert_frame_equal(result, expected) + + +def test_compare_with_non_equal_nulls(): + # We want to make sure the relevant NaNs do not get dropped + s1 = pd.Series(["a", "b", "c"]) + s2 = pd.Series(["x", "b", np.nan]) + + result = s1.compare(s2, align_axis=0) + + indices = pd.MultiIndex.from_product([[0, 2], ["self", "other"]]) + expected = pd.Series(["a", "x", "c", np.nan], index=indices) + tm.assert_series_equal(result, expected) + + +def test_compare_multi_index(): + index = pd.MultiIndex.from_arrays([[0, 0, 1], [0, 1, 2]]) + s1 = pd.Series(["a", "b", "c"], index=index) + s2 = pd.Series(["x", "b", "z"], index=index) + + result = s1.compare(s2, align_axis=0) + + indices = pd.MultiIndex.from_arrays( + [[0, 0, 1, 1], [0, 0, 2, 2], ["self", "other", "self", "other"]] + ) + expected = pd.Series(["a", "x", "c", "z"], index=indices) + tm.assert_series_equal(result, expected) + + +def test_compare_unaligned_objects(): + # test Series with different indices + msg = "Can only compare identically-labeled Series objects" + with pytest.raises(ValueError, match=msg): + ser1 = pd.Series([1, 2, 3], index=["a", "b", "c"]) + ser2 = pd.Series([1, 2, 3], index=["a", "b", "d"]) + ser1.compare(ser2) + + # test Series with different lengths + msg = "Can only compare identically-labeled Series objects" + with pytest.raises(ValueError, match=msg): + ser1 = pd.Series([1, 2, 3]) + ser2 = pd.Series([1, 2, 3, 4]) + ser1.compare(ser2) + + +def test_compare_datetime64_and_string(): + # Issue https://github.com/pandas-dev/pandas/issues/45506 + # Catch OverflowError when comparing datetime64 and string + data = [ + {"a": "2015-07-01", "b": "08335394550"}, + {"a": "2015-07-02", "b": "+49 (0) 0345 300033"}, + {"a": "2015-07-03", "b": "+49(0)2598 04457"}, + {"a": "2015-07-04", "b": "0741470003"}, + {"a": "2015-07-05", "b": "04181 83668"}, + ] + dtypes = {"a": "datetime64[ns]", "b": "string"} + df = pd.DataFrame(data=data).astype(dtypes) + + result_eq1 = df["a"].eq(df["b"]) + result_eq2 = df["a"] == df["b"] + result_neq = df["a"] != df["b"] + + expected_eq = pd.Series([False] * 5) # For .eq and == + expected_neq = pd.Series([True] * 5) # For != + + tm.assert_series_equal(result_eq1, expected_eq) + tm.assert_series_equal(result_eq2, expected_eq) + tm.assert_series_equal(result_neq, expected_neq) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/series/methods/test_convert_dtypes.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/series/methods/test_convert_dtypes.py new file mode 100644 index 0000000000000000000000000000000000000000..b0a920ba02cadeb21a7c0cc93f615cb1bc49dcbb --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/series/methods/test_convert_dtypes.py @@ -0,0 +1,306 @@ +from itertools import product + +import numpy as np +import pytest + +from pandas._libs import lib + +import pandas as pd +import pandas._testing as tm + +# Each test case consists of a tuple with the data and dtype to create the +# test Series, the default dtype for the expected result (which is valid +# for most cases), and the specific cases where the result deviates from +# this default. Those overrides are defined as a dict with (keyword, val) as +# dictionary key. In case of multiple items, the last override takes precedence. + + +@pytest.fixture( + params=[ + ( + # data + [1, 2, 3], + # original dtype + np.dtype("int32"), + # default expected dtype + "Int32", + # exceptions on expected dtype + {("convert_integer", False): np.dtype("int32")}, + ), + ( + [1, 2, 3], + np.dtype("int64"), + "Int64", + {("convert_integer", False): np.dtype("int64")}, + ), + ( + ["x", "y", "z"], + np.dtype("O"), + pd.StringDtype(), + {("convert_string", False): np.dtype("O")}, + ), + ( + [True, False, np.nan], + np.dtype("O"), + pd.BooleanDtype(), + {("convert_boolean", False): np.dtype("O")}, + ), + ( + ["h", "i", np.nan], + np.dtype("O"), + pd.StringDtype(), + {("convert_string", False): np.dtype("O")}, + ), + ( # GH32117 + ["h", "i", 1], + np.dtype("O"), + np.dtype("O"), + {}, + ), + ( + [10, np.nan, 20], + np.dtype("float"), + "Int64", + { + ("convert_integer", False, "convert_floating", True): "Float64", + ("convert_integer", False, "convert_floating", False): np.dtype( + "float" + ), + }, + ), + ( + [np.nan, 100.5, 200], + np.dtype("float"), + "Float64", + {("convert_floating", False): np.dtype("float")}, + ), + ( + [3, 4, 5], + "Int8", + "Int8", + {}, + ), + ( + [[1, 2], [3, 4], [5]], + None, + np.dtype("O"), + {}, + ), + ( + [4, 5, 6], + np.dtype("uint32"), + "UInt32", + {("convert_integer", False): np.dtype("uint32")}, + ), + ( + [-10, 12, 13], + np.dtype("i1"), + "Int8", + {("convert_integer", False): np.dtype("i1")}, + ), + ( + [1.2, 1.3], + np.dtype("float32"), + "Float32", + {("convert_floating", False): np.dtype("float32")}, + ), + ( + [1, 2.0], + object, + "Int64", + { + ("convert_integer", False): "Float64", + ("convert_integer", False, "convert_floating", False): np.dtype( + "float" + ), + ("infer_objects", False): np.dtype("object"), + }, + ), + ( + [1, 2.5], + object, + "Float64", + { + ("convert_floating", False): np.dtype("float"), + ("infer_objects", False): np.dtype("object"), + }, + ), + (["a", "b"], pd.CategoricalDtype(), pd.CategoricalDtype(), {}), + ( + pd.to_datetime(["2020-01-14 10:00", "2020-01-15 11:11"]).as_unit("s"), + pd.DatetimeTZDtype(tz="UTC"), + pd.DatetimeTZDtype(tz="UTC"), + {}, + ), + ( + pd.to_datetime(["2020-01-14 10:00", "2020-01-15 11:11"]).as_unit("ms"), + pd.DatetimeTZDtype(tz="UTC"), + pd.DatetimeTZDtype(tz="UTC"), + {}, + ), + ( + pd.to_datetime(["2020-01-14 10:00", "2020-01-15 11:11"]).as_unit("us"), + pd.DatetimeTZDtype(tz="UTC"), + pd.DatetimeTZDtype(tz="UTC"), + {}, + ), + ( + pd.to_datetime(["2020-01-14 10:00", "2020-01-15 11:11"]).as_unit("ns"), + pd.DatetimeTZDtype(tz="UTC"), + pd.DatetimeTZDtype(tz="UTC"), + {}, + ), + ( + pd.to_datetime(["2020-01-14 10:00", "2020-01-15 11:11"]).as_unit("ns"), + "datetime64[ns]", + np.dtype("datetime64[ns]"), + {}, + ), + ( + pd.to_datetime(["2020-01-14 10:00", "2020-01-15 11:11"]).as_unit("ns"), + object, + np.dtype("datetime64[ns]"), + {("infer_objects", False): np.dtype("object")}, + ), + ( + pd.period_range("1/1/2011", freq="M", periods=3), + None, + pd.PeriodDtype("M"), + {}, + ), + ( + pd.arrays.IntervalArray([pd.Interval(0, 1), pd.Interval(1, 5)]), + None, + pd.IntervalDtype("int64", "right"), + {}, + ), + ] +) +def test_cases(request): + return request.param + + +class TestSeriesConvertDtypes: + @pytest.mark.parametrize("params", product(*[(True, False)] * 5)) + def test_convert_dtypes( + self, + test_cases, + params, + using_infer_string, + ): + data, maindtype, expected_default, expected_other = test_cases + if ( + hasattr(data, "dtype") + and lib.is_np_dtype(data.dtype, "M") + and isinstance(maindtype, pd.DatetimeTZDtype) + ): + # this astype is deprecated in favor of tz_localize + msg = "Cannot use .astype to convert from timezone-naive dtype" + with pytest.raises(TypeError, match=msg): + pd.Series(data, dtype=maindtype) + return + + if maindtype is not None: + series = pd.Series(data, dtype=maindtype) + else: + series = pd.Series(data) + + result = series.convert_dtypes(*params) + + param_names = [ + "infer_objects", + "convert_string", + "convert_integer", + "convert_boolean", + "convert_floating", + ] + params_dict = dict(zip(param_names, params)) + + expected_dtype = expected_default + for spec, dtype in expected_other.items(): + if all(params_dict[key] is val for key, val in zip(spec[::2], spec[1::2])): + expected_dtype = dtype + if ( + using_infer_string + and expected_default == "string" + and expected_dtype == object + and params[0] + and not params[1] + ): + # If we would convert with convert strings then infer_objects converts + # with the option + expected_dtype = "string[pyarrow_numpy]" + + expected = pd.Series(data, dtype=expected_dtype) + tm.assert_series_equal(result, expected) + + # Test that it is a copy + copy = series.copy(deep=True) + + if result.notna().sum() > 0 and result.dtype in ["interval[int64, right]"]: + with tm.assert_produces_warning(FutureWarning, match="incompatible dtype"): + result[result.notna()] = np.nan + else: + result[result.notna()] = np.nan + + # Make sure original not changed + tm.assert_series_equal(series, copy) + + def test_convert_string_dtype(self, nullable_string_dtype): + # https://github.com/pandas-dev/pandas/issues/31731 -> converting columns + # that are already string dtype + df = pd.DataFrame( + {"A": ["a", "b", pd.NA], "B": ["ä", "ö", "ü"]}, dtype=nullable_string_dtype + ) + result = df.convert_dtypes() + tm.assert_frame_equal(df, result) + + def test_convert_bool_dtype(self): + # GH32287 + df = pd.DataFrame({"A": pd.array([True])}) + tm.assert_frame_equal(df, df.convert_dtypes()) + + def test_convert_byte_string_dtype(self): + # GH-43183 + byte_str = b"binary-string" + + df = pd.DataFrame(data={"A": byte_str}, index=[0]) + result = df.convert_dtypes() + expected = df + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize( + "infer_objects, dtype", [(True, "Int64"), (False, "object")] + ) + def test_convert_dtype_object_with_na(self, infer_objects, dtype): + # GH#48791 + ser = pd.Series([1, pd.NA]) + result = ser.convert_dtypes(infer_objects=infer_objects) + expected = pd.Series([1, pd.NA], dtype=dtype) + tm.assert_series_equal(result, expected) + + @pytest.mark.parametrize( + "infer_objects, dtype", [(True, "Float64"), (False, "object")] + ) + def test_convert_dtype_object_with_na_float(self, infer_objects, dtype): + # GH#48791 + ser = pd.Series([1.5, pd.NA]) + result = ser.convert_dtypes(infer_objects=infer_objects) + expected = pd.Series([1.5, pd.NA], dtype=dtype) + tm.assert_series_equal(result, expected) + + def test_convert_dtypes_pyarrow_to_np_nullable(self): + # GH 53648 + pytest.importorskip("pyarrow") + ser = pd.Series(range(2), dtype="int32[pyarrow]") + result = ser.convert_dtypes(dtype_backend="numpy_nullable") + expected = pd.Series(range(2), dtype="Int32") + tm.assert_series_equal(result, expected) + + def test_convert_dtypes_pyarrow_null(self): + # GH#55346 + pa = pytest.importorskip("pyarrow") + ser = pd.Series([None, None]) + result = ser.convert_dtypes(dtype_backend="pyarrow") + expected = pd.Series([None, None], dtype=pd.ArrowDtype(pa.null())) + tm.assert_series_equal(result, expected) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/series/methods/test_diff.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/series/methods/test_diff.py new file mode 100644 index 0000000000000000000000000000000000000000..18de81a927c3a7697ee67290c8a6c73336de5643 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/series/methods/test_diff.py @@ -0,0 +1,88 @@ +import numpy as np +import pytest + +from pandas import ( + Series, + TimedeltaIndex, + date_range, +) +import pandas._testing as tm + + +class TestSeriesDiff: + def test_diff_np(self): + # TODO(__array_function__): could make np.diff return a Series + # matching ser.diff() + + ser = Series(np.arange(5)) + + res = np.diff(ser) + expected = np.array([1, 1, 1, 1]) + tm.assert_numpy_array_equal(res, expected) + + def test_diff_int(self): + # int dtype + a = 10000000000000000 + b = a + 1 + ser = Series([a, b]) + + result = ser.diff() + assert result[1] == 1 + + def test_diff_tz(self): + # Combined datetime diff, normal diff and boolean diff test + ts = Series( + np.arange(10, dtype=np.float64), + index=date_range("2020-01-01", periods=10), + name="ts", + ) + ts.diff() + + # neg n + result = ts.diff(-1) + expected = ts - ts.shift(-1) + tm.assert_series_equal(result, expected) + + # 0 + result = ts.diff(0) + expected = ts - ts + tm.assert_series_equal(result, expected) + + def test_diff_dt64(self): + # datetime diff (GH#3100) + ser = Series(date_range("20130102", periods=5)) + result = ser.diff() + expected = ser - ser.shift(1) + tm.assert_series_equal(result, expected) + + # timedelta diff + result = result - result.shift(1) # previous result + expected = expected.diff() # previously expected + tm.assert_series_equal(result, expected) + + def test_diff_dt64tz(self): + # with tz + ser = Series( + date_range("2000-01-01 09:00:00", periods=5, tz="US/Eastern"), name="foo" + ) + result = ser.diff() + expected = Series(TimedeltaIndex(["NaT"] + ["1 days"] * 4), name="foo") + tm.assert_series_equal(result, expected) + + @pytest.mark.parametrize( + "input,output,diff", + [([False, True, True, False, False], [np.nan, True, False, True, False], 1)], + ) + def test_diff_bool(self, input, output, diff): + # boolean series (test for fixing #17294) + ser = Series(input) + result = ser.diff() + expected = Series(output) + tm.assert_series_equal(result, expected) + + def test_diff_object_dtype(self): + # object series + ser = Series([False, True, 5.0, np.nan, True, False]) + result = ser.diff() + expected = ser - ser.shift(1) + tm.assert_series_equal(result, expected) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/series/methods/test_dropna.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/series/methods/test_dropna.py new file mode 100644 index 0000000000000000000000000000000000000000..d03fcac24003e99b724a2e7ac43b66d3d9b51bcf --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/series/methods/test_dropna.py @@ -0,0 +1,117 @@ +import numpy as np +import pytest + +from pandas import ( + DatetimeIndex, + IntervalIndex, + NaT, + Period, + Series, + Timestamp, +) +import pandas._testing as tm + + +class TestDropna: + def test_dropna_empty(self): + ser = Series([], dtype=object) + + assert len(ser.dropna()) == 0 + return_value = ser.dropna(inplace=True) + assert return_value is None + assert len(ser) == 0 + + # invalid axis + msg = "No axis named 1 for object type Series" + with pytest.raises(ValueError, match=msg): + ser.dropna(axis=1) + + def test_dropna_preserve_name(self, datetime_series): + datetime_series[:5] = np.nan + result = datetime_series.dropna() + assert result.name == datetime_series.name + name = datetime_series.name + ts = datetime_series.copy() + return_value = ts.dropna(inplace=True) + assert return_value is None + assert ts.name == name + + def test_dropna_no_nan(self): + for ser in [ + Series([1, 2, 3], name="x"), + Series([False, True, False], name="x"), + ]: + result = ser.dropna() + tm.assert_series_equal(result, ser) + assert result is not ser + + s2 = ser.copy() + return_value = s2.dropna(inplace=True) + assert return_value is None + tm.assert_series_equal(s2, ser) + + def test_dropna_intervals(self): + ser = Series( + [np.nan, 1, 2, 3], + IntervalIndex.from_arrays([np.nan, 0, 1, 2], [np.nan, 1, 2, 3]), + ) + + result = ser.dropna() + expected = ser.iloc[1:] + tm.assert_series_equal(result, expected) + + def test_dropna_period_dtype(self): + # GH#13737 + ser = Series([Period("2011-01", freq="M"), Period("NaT", freq="M")]) + result = ser.dropna() + expected = Series([Period("2011-01", freq="M")]) + + tm.assert_series_equal(result, expected) + + def test_datetime64_tz_dropna(self, unit): + # DatetimeLikeBlock + ser = Series( + [ + Timestamp("2011-01-01 10:00"), + NaT, + Timestamp("2011-01-03 10:00"), + NaT, + ], + dtype=f"M8[{unit}]", + ) + result = ser.dropna() + expected = Series( + [Timestamp("2011-01-01 10:00"), Timestamp("2011-01-03 10:00")], + index=[0, 2], + dtype=f"M8[{unit}]", + ) + tm.assert_series_equal(result, expected) + + # DatetimeTZBlock + idx = DatetimeIndex( + ["2011-01-01 10:00", NaT, "2011-01-03 10:00", NaT], tz="Asia/Tokyo" + ).as_unit(unit) + ser = Series(idx) + assert ser.dtype == f"datetime64[{unit}, Asia/Tokyo]" + result = ser.dropna() + expected = Series( + [ + Timestamp("2011-01-01 10:00", tz="Asia/Tokyo"), + Timestamp("2011-01-03 10:00", tz="Asia/Tokyo"), + ], + index=[0, 2], + dtype=f"datetime64[{unit}, Asia/Tokyo]", + ) + assert result.dtype == f"datetime64[{unit}, Asia/Tokyo]" + tm.assert_series_equal(result, expected) + + @pytest.mark.parametrize("val", [1, 1.5]) + def test_dropna_ignore_index(self, val): + # GH#31725 + ser = Series([1, 2, val], index=[3, 2, 1]) + result = ser.dropna(ignore_index=True) + expected = Series([1, 2, val]) + tm.assert_series_equal(result, expected) + + ser.dropna(ignore_index=True, inplace=True) + tm.assert_series_equal(ser, expected) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/series/methods/test_dtypes.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/series/methods/test_dtypes.py new file mode 100644 index 0000000000000000000000000000000000000000..82260bc2a65b9f7e387532b380b9ce013d29cbb7 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/series/methods/test_dtypes.py @@ -0,0 +1,7 @@ +import numpy as np + + +class TestSeriesDtypes: + def test_dtype(self, datetime_series): + assert datetime_series.dtype == np.dtype("float64") + assert datetime_series.dtypes == np.dtype("float64") diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/series/methods/test_equals.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/series/methods/test_equals.py new file mode 100644 index 0000000000000000000000000000000000000000..875ffdd3fe8514edb4c313c8f558330c787cef08 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/series/methods/test_equals.py @@ -0,0 +1,145 @@ +from contextlib import nullcontext +import copy + +import numpy as np +import pytest + +from pandas._libs.missing import is_matching_na +from pandas.compat.numpy import np_version_gte1p25 + +from pandas.core.dtypes.common import is_float + +from pandas import ( + Index, + MultiIndex, + Series, +) +import pandas._testing as tm + + +@pytest.mark.parametrize( + "arr, idx", + [ + ([1, 2, 3, 4], [0, 2, 1, 3]), + ([1, np.nan, 3, np.nan], [0, 2, 1, 3]), + ( + [1, np.nan, 3, np.nan], + MultiIndex.from_tuples([(0, "a"), (1, "b"), (2, "c"), (3, "c")]), + ), + ], +) +def test_equals(arr, idx): + s1 = Series(arr, index=idx) + s2 = s1.copy() + assert s1.equals(s2) + + s1[1] = 9 + assert not s1.equals(s2) + + +@pytest.mark.parametrize( + "val", [1, 1.1, 1 + 1j, True, "abc", [1, 2], (1, 2), {1, 2}, {"a": 1}, None] +) +def test_equals_list_array(val): + # GH20676 Verify equals operator for list of Numpy arrays + arr = np.array([1, 2]) + s1 = Series([arr, arr]) + s2 = s1.copy() + assert s1.equals(s2) + + s1[1] = val + + cm = ( + tm.assert_produces_warning(FutureWarning, check_stacklevel=False) + if isinstance(val, str) and not np_version_gte1p25 + else nullcontext() + ) + with cm: + assert not s1.equals(s2) + + +def test_equals_false_negative(): + # GH8437 Verify false negative behavior of equals function for dtype object + arr = [False, np.nan] + s1 = Series(arr) + s2 = s1.copy() + s3 = Series(index=range(2), dtype=object) + s4 = s3.copy() + s5 = s3.copy() + s6 = s3.copy() + + s3[:-1] = s4[:-1] = s5[0] = s6[0] = False + assert s1.equals(s1) + assert s1.equals(s2) + assert s1.equals(s3) + assert s1.equals(s4) + assert s1.equals(s5) + assert s5.equals(s6) + + +def test_equals_matching_nas(): + # matching but not identical NAs + left = Series([np.datetime64("NaT")], dtype=object) + right = Series([np.datetime64("NaT")], dtype=object) + assert left.equals(right) + with tm.assert_produces_warning(FutureWarning, match="Dtype inference"): + assert Index(left).equals(Index(right)) + assert left.array.equals(right.array) + + left = Series([np.timedelta64("NaT")], dtype=object) + right = Series([np.timedelta64("NaT")], dtype=object) + assert left.equals(right) + with tm.assert_produces_warning(FutureWarning, match="Dtype inference"): + assert Index(left).equals(Index(right)) + assert left.array.equals(right.array) + + left = Series([np.float64("NaN")], dtype=object) + right = Series([np.float64("NaN")], dtype=object) + assert left.equals(right) + assert Index(left, dtype=left.dtype).equals(Index(right, dtype=right.dtype)) + assert left.array.equals(right.array) + + +def test_equals_mismatched_nas(nulls_fixture, nulls_fixture2): + # GH#39650 + left = nulls_fixture + right = nulls_fixture2 + if hasattr(right, "copy"): + right = right.copy() + else: + right = copy.copy(right) + + ser = Series([left], dtype=object) + ser2 = Series([right], dtype=object) + + if is_matching_na(left, right): + assert ser.equals(ser2) + elif (left is None and is_float(right)) or (right is None and is_float(left)): + assert ser.equals(ser2) + else: + assert not ser.equals(ser2) + + +def test_equals_none_vs_nan(): + # GH#39650 + ser = Series([1, None], dtype=object) + ser2 = Series([1, np.nan], dtype=object) + + assert ser.equals(ser2) + assert Index(ser, dtype=ser.dtype).equals(Index(ser2, dtype=ser2.dtype)) + assert ser.array.equals(ser2.array) + + +def test_equals_None_vs_float(): + # GH#44190 + left = Series([-np.inf, np.nan, -1.0, 0.0, 1.0, 10 / 3, np.inf], dtype=object) + right = Series([None] * len(left)) + + # these series were found to be equal due to a bug, check that they are correctly + # found to not equal + assert not left.equals(right) + assert not right.equals(left) + assert not left.to_frame().equals(right.to_frame()) + assert not right.to_frame().equals(left.to_frame()) + assert not Index(left, dtype="object").equals(Index(right, dtype="object")) + assert not Index(right, dtype="object").equals(Index(left, dtype="object")) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/series/methods/test_explode.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/series/methods/test_explode.py new file mode 100644 index 0000000000000000000000000000000000000000..5a0188585ef30c12f7222a054714957df4ed2ff2 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/series/methods/test_explode.py @@ -0,0 +1,175 @@ +import numpy as np +import pytest + +import pandas as pd +import pandas._testing as tm + + +def test_basic(): + s = pd.Series([[0, 1, 2], np.nan, [], (3, 4)], index=list("abcd"), name="foo") + result = s.explode() + expected = pd.Series( + [0, 1, 2, np.nan, np.nan, 3, 4], index=list("aaabcdd"), dtype=object, name="foo" + ) + tm.assert_series_equal(result, expected) + + +def test_mixed_type(): + s = pd.Series( + [[0, 1, 2], np.nan, None, np.array([]), pd.Series(["a", "b"])], name="foo" + ) + result = s.explode() + expected = pd.Series( + [0, 1, 2, np.nan, None, np.nan, "a", "b"], + index=[0, 0, 0, 1, 2, 3, 4, 4], + dtype=object, + name="foo", + ) + tm.assert_series_equal(result, expected) + + +def test_empty(): + s = pd.Series(dtype=object) + result = s.explode() + expected = s.copy() + tm.assert_series_equal(result, expected) + + +def test_nested_lists(): + s = pd.Series([[[1, 2, 3]], [1, 2], 1]) + result = s.explode() + expected = pd.Series([[1, 2, 3], 1, 2, 1], index=[0, 1, 1, 2]) + tm.assert_series_equal(result, expected) + + +def test_multi_index(): + s = pd.Series( + [[0, 1, 2], np.nan, [], (3, 4)], + name="foo", + index=pd.MultiIndex.from_product([list("ab"), range(2)], names=["foo", "bar"]), + ) + result = s.explode() + index = pd.MultiIndex.from_tuples( + [("a", 0), ("a", 0), ("a", 0), ("a", 1), ("b", 0), ("b", 1), ("b", 1)], + names=["foo", "bar"], + ) + expected = pd.Series( + [0, 1, 2, np.nan, np.nan, 3, 4], index=index, dtype=object, name="foo" + ) + tm.assert_series_equal(result, expected) + + +def test_large(): + s = pd.Series([range(256)]).explode() + result = s.explode() + tm.assert_series_equal(result, s) + + +def test_invert_array(): + df = pd.DataFrame({"a": pd.date_range("20190101", periods=3, tz="UTC")}) + + listify = df.apply(lambda x: x.array, axis=1) + result = listify.explode() + tm.assert_series_equal(result, df["a"].rename()) + + +@pytest.mark.parametrize( + "s", [pd.Series([1, 2, 3]), pd.Series(pd.date_range("2019", periods=3, tz="UTC"))] +) +def test_non_object_dtype(s): + result = s.explode() + tm.assert_series_equal(result, s) + + +def test_typical_usecase(): + df = pd.DataFrame( + [{"var1": "a,b,c", "var2": 1}, {"var1": "d,e,f", "var2": 2}], + columns=["var1", "var2"], + ) + exploded = df.var1.str.split(",").explode() + result = df[["var2"]].join(exploded) + expected = pd.DataFrame( + {"var2": [1, 1, 1, 2, 2, 2], "var1": list("abcdef")}, + columns=["var2", "var1"], + index=[0, 0, 0, 1, 1, 1], + ) + tm.assert_frame_equal(result, expected) + + +def test_nested_EA(): + # a nested EA array + s = pd.Series( + [ + pd.date_range("20170101", periods=3, tz="UTC"), + pd.date_range("20170104", periods=3, tz="UTC"), + ] + ) + result = s.explode() + expected = pd.Series( + pd.date_range("20170101", periods=6, tz="UTC"), index=[0, 0, 0, 1, 1, 1] + ) + tm.assert_series_equal(result, expected) + + +def test_duplicate_index(): + # GH 28005 + s = pd.Series([[1, 2], [3, 4]], index=[0, 0]) + result = s.explode() + expected = pd.Series([1, 2, 3, 4], index=[0, 0, 0, 0], dtype=object) + tm.assert_series_equal(result, expected) + + +def test_ignore_index(): + # GH 34932 + s = pd.Series([[1, 2], [3, 4]]) + result = s.explode(ignore_index=True) + expected = pd.Series([1, 2, 3, 4], index=[0, 1, 2, 3], dtype=object) + tm.assert_series_equal(result, expected) + + +def test_explode_sets(): + # https://github.com/pandas-dev/pandas/issues/35614 + s = pd.Series([{"a", "b", "c"}], index=[1]) + result = s.explode().sort_values() + expected = pd.Series(["a", "b", "c"], index=[1, 1, 1]) + tm.assert_series_equal(result, expected) + + +def test_explode_scalars_can_ignore_index(): + # https://github.com/pandas-dev/pandas/issues/40487 + s = pd.Series([1, 2, 3], index=["a", "b", "c"]) + result = s.explode(ignore_index=True) + expected = pd.Series([1, 2, 3]) + tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize("ignore_index", [True, False]) +def test_explode_pyarrow_list_type(ignore_index): + # GH 53602 + pa = pytest.importorskip("pyarrow") + + data = [ + [None, None], + [1], + [], + [2, 3], + None, + ] + ser = pd.Series(data, dtype=pd.ArrowDtype(pa.list_(pa.int64()))) + result = ser.explode(ignore_index=ignore_index) + expected = pd.Series( + data=[None, None, 1, None, 2, 3, None], + index=None if ignore_index else [0, 0, 1, 2, 3, 3, 4], + dtype=pd.ArrowDtype(pa.int64()), + ) + tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize("ignore_index", [True, False]) +def test_explode_pyarrow_non_list_type(ignore_index): + pa = pytest.importorskip("pyarrow") + data = [1, 2, 3] + ser = pd.Series(data, dtype=pd.ArrowDtype(pa.int64())) + result = ser.explode(ignore_index=ignore_index) + expected = pd.Series([1, 2, 3], dtype="int64[pyarrow]", index=[0, 1, 2]) + tm.assert_series_equal(result, expected) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/series/methods/test_get_numeric_data.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/series/methods/test_get_numeric_data.py new file mode 100644 index 0000000000000000000000000000000000000000..8325cc884ebcba069c18f457371cfa061893cf81 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/series/methods/test_get_numeric_data.py @@ -0,0 +1,38 @@ +from pandas import ( + Index, + Series, + date_range, +) +import pandas._testing as tm + + +class TestGetNumericData: + def test_get_numeric_data_preserve_dtype( + self, using_copy_on_write, warn_copy_on_write + ): + # get the numeric data + obj = Series([1, 2, 3]) + result = obj._get_numeric_data() + tm.assert_series_equal(result, obj) + + # returned object is a shallow copy + with tm.assert_cow_warning(warn_copy_on_write): + result.iloc[0] = 0 + if using_copy_on_write: + assert obj.iloc[0] == 1 + else: + assert obj.iloc[0] == 0 + + obj = Series([1, "2", 3.0]) + result = obj._get_numeric_data() + expected = Series([], dtype=object, index=Index([], dtype=object)) + tm.assert_series_equal(result, expected) + + obj = Series([True, False, True]) + result = obj._get_numeric_data() + tm.assert_series_equal(result, obj) + + obj = Series(date_range("20130101", periods=3)) + result = obj._get_numeric_data() + expected = Series([], dtype="M8[ns]", index=Index([], dtype=object)) + tm.assert_series_equal(result, expected) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/series/methods/test_head_tail.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/series/methods/test_head_tail.py new file mode 100644 index 0000000000000000000000000000000000000000..d9f8d85eda350e0627790626d52b7703d23ead02 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/series/methods/test_head_tail.py @@ -0,0 +1,8 @@ +import pandas._testing as tm + + +def test_head_tail(string_series): + tm.assert_series_equal(string_series.head(), string_series[:5]) + tm.assert_series_equal(string_series.head(0), string_series[0:0]) + tm.assert_series_equal(string_series.tail(), string_series[-5:]) + tm.assert_series_equal(string_series.tail(0), string_series[0:0]) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/series/methods/test_infer_objects.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/series/methods/test_infer_objects.py new file mode 100644 index 0000000000000000000000000000000000000000..29abac6b3780ec500ef0c635785c9624e4264934 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/series/methods/test_infer_objects.py @@ -0,0 +1,56 @@ +import numpy as np + +from pandas import ( + Series, + interval_range, +) +import pandas._testing as tm + + +class TestInferObjects: + def test_copy(self, index_or_series): + # GH#50096 + # case where we don't need to do inference because it is already non-object + obj = index_or_series(np.array([1, 2, 3], dtype="int64")) + + result = obj.infer_objects(copy=False) + assert tm.shares_memory(result, obj) + + # case where we try to do inference but can't do better than object + obj2 = index_or_series(np.array(["foo", 2], dtype=object)) + result2 = obj2.infer_objects(copy=False) + assert tm.shares_memory(result2, obj2) + + def test_infer_objects_series(self, index_or_series): + # GH#11221 + actual = index_or_series(np.array([1, 2, 3], dtype="O")).infer_objects() + expected = index_or_series([1, 2, 3]) + tm.assert_equal(actual, expected) + + actual = index_or_series(np.array([1, 2, 3, None], dtype="O")).infer_objects() + expected = index_or_series([1.0, 2.0, 3.0, np.nan]) + tm.assert_equal(actual, expected) + + # only soft conversions, unconvertible pass thru unchanged + + obj = index_or_series(np.array([1, 2, 3, None, "a"], dtype="O")) + actual = obj.infer_objects() + expected = index_or_series([1, 2, 3, None, "a"], dtype=object) + + assert actual.dtype == "object" + tm.assert_equal(actual, expected) + + def test_infer_objects_interval(self, index_or_series): + # GH#50090 + ii = interval_range(1, 10) + obj = index_or_series(ii) + + result = obj.astype(object).infer_objects() + tm.assert_equal(result, obj) + + def test_infer_objects_bytes(self): + # GH#49650 + ser = Series([b"a"], dtype="bytes") + expected = ser.copy() + result = ser.infer_objects() + tm.assert_series_equal(result, expected) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/series/methods/test_isin.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/series/methods/test_isin.py new file mode 100644 index 0000000000000000000000000000000000000000..f94f67b8cc40a2031cab0791ccaf2fbc207b5023 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/series/methods/test_isin.py @@ -0,0 +1,252 @@ +import numpy as np +import pytest + +import pandas as pd +from pandas import ( + Series, + date_range, +) +import pandas._testing as tm +from pandas.core import algorithms +from pandas.core.arrays import PeriodArray + + +class TestSeriesIsIn: + def test_isin(self): + s = Series(["A", "B", "C", "a", "B", "B", "A", "C"]) + + result = s.isin(["A", "C"]) + expected = Series([True, False, True, False, False, False, True, True]) + tm.assert_series_equal(result, expected) + + # GH#16012 + # This specific issue has to have a series over 1e6 in len, but the + # comparison array (in_list) must be large enough so that numpy doesn't + # do a manual masking trick that will avoid this issue altogether + s = Series(list("abcdefghijk" * 10**5)) + # If numpy doesn't do the manual comparison/mask, these + # unorderable mixed types are what cause the exception in numpy + in_list = [-1, "a", "b", "G", "Y", "Z", "E", "K", "E", "S", "I", "R", "R"] * 6 + + assert s.isin(in_list).sum() == 200000 + + def test_isin_with_string_scalar(self): + # GH#4763 + s = Series(["A", "B", "C", "a", "B", "B", "A", "C"]) + msg = ( + r"only list-like objects are allowed to be passed to isin\(\), " + r"you passed a `str`" + ) + with pytest.raises(TypeError, match=msg): + s.isin("a") + + s = Series(["aaa", "b", "c"]) + with pytest.raises(TypeError, match=msg): + s.isin("aaa") + + def test_isin_datetimelike_mismatched_reso(self): + expected = Series([True, True, False, False, False]) + + ser = Series(date_range("jan-01-2013", "jan-05-2013")) + + # fails on dtype conversion in the first place + day_values = np.asarray(ser[0:2].values).astype("datetime64[D]") + result = ser.isin(day_values) + tm.assert_series_equal(result, expected) + + dta = ser[:2]._values.astype("M8[s]") + result = ser.isin(dta) + tm.assert_series_equal(result, expected) + + def test_isin_datetimelike_mismatched_reso_list(self): + expected = Series([True, True, False, False, False]) + + ser = Series(date_range("jan-01-2013", "jan-05-2013")) + + dta = ser[:2]._values.astype("M8[s]") + result = ser.isin(list(dta)) + tm.assert_series_equal(result, expected) + + def test_isin_with_i8(self): + # GH#5021 + + expected = Series([True, True, False, False, False]) + expected2 = Series([False, True, False, False, False]) + + # datetime64[ns] + s = Series(date_range("jan-01-2013", "jan-05-2013")) + + result = s.isin(s[0:2]) + tm.assert_series_equal(result, expected) + + result = s.isin(s[0:2].values) + tm.assert_series_equal(result, expected) + + result = s.isin([s[1]]) + tm.assert_series_equal(result, expected2) + + result = s.isin([np.datetime64(s[1])]) + tm.assert_series_equal(result, expected2) + + result = s.isin(set(s[0:2])) + tm.assert_series_equal(result, expected) + + # timedelta64[ns] + s = Series(pd.to_timedelta(range(5), unit="d")) + result = s.isin(s[0:2]) + tm.assert_series_equal(result, expected) + + @pytest.mark.parametrize("empty", [[], Series(dtype=object), np.array([])]) + def test_isin_empty(self, empty): + # see GH#16991 + s = Series(["a", "b"]) + expected = Series([False, False]) + + result = s.isin(empty) + tm.assert_series_equal(expected, result) + + def test_isin_read_only(self): + # https://github.com/pandas-dev/pandas/issues/37174 + arr = np.array([1, 2, 3]) + arr.setflags(write=False) + s = Series([1, 2, 3]) + result = s.isin(arr) + expected = Series([True, True, True]) + tm.assert_series_equal(result, expected) + + @pytest.mark.parametrize("dtype", [object, None]) + def test_isin_dt64_values_vs_ints(self, dtype): + # GH#36621 dont cast integers to datetimes for isin + dti = date_range("2013-01-01", "2013-01-05") + ser = Series(dti) + + comps = np.asarray([1356998400000000000], dtype=dtype) + + res = dti.isin(comps) + expected = np.array([False] * len(dti), dtype=bool) + tm.assert_numpy_array_equal(res, expected) + + res = ser.isin(comps) + tm.assert_series_equal(res, Series(expected)) + + res = pd.core.algorithms.isin(ser, comps) + tm.assert_numpy_array_equal(res, expected) + + def test_isin_tzawareness_mismatch(self): + dti = date_range("2013-01-01", "2013-01-05") + ser = Series(dti) + + other = dti.tz_localize("UTC") + + res = dti.isin(other) + expected = np.array([False] * len(dti), dtype=bool) + tm.assert_numpy_array_equal(res, expected) + + res = ser.isin(other) + tm.assert_series_equal(res, Series(expected)) + + res = pd.core.algorithms.isin(ser, other) + tm.assert_numpy_array_equal(res, expected) + + def test_isin_period_freq_mismatch(self): + dti = date_range("2013-01-01", "2013-01-05") + pi = dti.to_period("M") + ser = Series(pi) + + # We construct another PeriodIndex with the same i8 values + # but different dtype + dtype = dti.to_period("Y").dtype + other = PeriodArray._simple_new(pi.asi8, dtype=dtype) + + res = pi.isin(other) + expected = np.array([False] * len(pi), dtype=bool) + tm.assert_numpy_array_equal(res, expected) + + res = ser.isin(other) + tm.assert_series_equal(res, Series(expected)) + + res = pd.core.algorithms.isin(ser, other) + tm.assert_numpy_array_equal(res, expected) + + @pytest.mark.parametrize("values", [[-9.0, 0.0], [-9, 0]]) + def test_isin_float_in_int_series(self, values): + # GH#19356 GH#21804 + ser = Series(values) + result = ser.isin([-9, -0.5]) + expected = Series([True, False]) + tm.assert_series_equal(result, expected) + + @pytest.mark.parametrize("dtype", ["boolean", "Int64", "Float64"]) + @pytest.mark.parametrize( + "data,values,expected", + [ + ([0, 1, 0], [1], [False, True, False]), + ([0, 1, 0], [1, pd.NA], [False, True, False]), + ([0, pd.NA, 0], [1, 0], [True, False, True]), + ([0, 1, pd.NA], [1, pd.NA], [False, True, True]), + ([0, 1, pd.NA], [1, np.nan], [False, True, False]), + ([0, pd.NA, pd.NA], [np.nan, pd.NaT, None], [False, False, False]), + ], + ) + def test_isin_masked_types(self, dtype, data, values, expected): + # GH#42405 + ser = Series(data, dtype=dtype) + + result = ser.isin(values) + expected = Series(expected, dtype="boolean") + + tm.assert_series_equal(result, expected) + + +def test_isin_large_series_mixed_dtypes_and_nan(monkeypatch): + # https://github.com/pandas-dev/pandas/issues/37094 + # combination of object dtype for the values + # and > _MINIMUM_COMP_ARR_LEN elements + min_isin_comp = 5 + ser = Series([1, 2, np.nan] * min_isin_comp) + with monkeypatch.context() as m: + m.setattr(algorithms, "_MINIMUM_COMP_ARR_LEN", min_isin_comp) + result = ser.isin({"foo", "bar"}) + expected = Series([False] * 3 * min_isin_comp) + tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize( + "array,expected", + [ + ( + [0, 1j, 1j, 1, 1 + 1j, 1 + 2j, 1 + 1j], + Series([False, True, True, False, True, True, True], dtype=bool), + ) + ], +) +def test_isin_complex_numbers(array, expected): + # GH 17927 + result = Series(array).isin([1j, 1 + 1j, 1 + 2j]) + tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize( + "data,is_in", + [([1, [2]], [1]), (["simple str", [{"values": 3}]], ["simple str"])], +) +def test_isin_filtering_with_mixed_object_types(data, is_in): + # GH 20883 + + ser = Series(data) + result = ser.isin(is_in) + expected = Series([True, False]) + + tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize("data", [[1, 2, 3], [1.0, 2.0, 3.0]]) +@pytest.mark.parametrize("isin", [[1, 2], [1.0, 2.0]]) +def test_isin_filtering_on_iterable(data, isin): + # GH 50234 + + ser = Series(data) + result = ser.isin(i for i in isin) + expected_result = Series([True, True, False]) + + tm.assert_series_equal(result, expected_result) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/series/methods/test_matmul.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/series/methods/test_matmul.py new file mode 100644 index 0000000000000000000000000000000000000000..4ca3ad3f7031e20b8d6cec36caadbcefef8c4196 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/series/methods/test_matmul.py @@ -0,0 +1,82 @@ +import operator + +import numpy as np +import pytest + +from pandas import ( + DataFrame, + Series, +) +import pandas._testing as tm + + +class TestMatmul: + def test_matmul(self): + # matmul test is for GH#10259 + a = Series( + np.random.default_rng(2).standard_normal(4), index=["p", "q", "r", "s"] + ) + b = DataFrame( + np.random.default_rng(2).standard_normal((3, 4)), + index=["1", "2", "3"], + columns=["p", "q", "r", "s"], + ).T + + # Series @ DataFrame -> Series + result = operator.matmul(a, b) + expected = Series(np.dot(a.values, b.values), index=["1", "2", "3"]) + tm.assert_series_equal(result, expected) + + # DataFrame @ Series -> Series + result = operator.matmul(b.T, a) + expected = Series(np.dot(b.T.values, a.T.values), index=["1", "2", "3"]) + tm.assert_series_equal(result, expected) + + # Series @ Series -> scalar + result = operator.matmul(a, a) + expected = np.dot(a.values, a.values) + tm.assert_almost_equal(result, expected) + + # GH#21530 + # vector (1D np.array) @ Series (__rmatmul__) + result = operator.matmul(a.values, a) + expected = np.dot(a.values, a.values) + tm.assert_almost_equal(result, expected) + + # GH#21530 + # vector (1D list) @ Series (__rmatmul__) + result = operator.matmul(a.values.tolist(), a) + expected = np.dot(a.values, a.values) + tm.assert_almost_equal(result, expected) + + # GH#21530 + # matrix (2D np.array) @ Series (__rmatmul__) + result = operator.matmul(b.T.values, a) + expected = np.dot(b.T.values, a.values) + tm.assert_almost_equal(result, expected) + + # GH#21530 + # matrix (2D nested lists) @ Series (__rmatmul__) + result = operator.matmul(b.T.values.tolist(), a) + expected = np.dot(b.T.values, a.values) + tm.assert_almost_equal(result, expected) + + # mixed dtype DataFrame @ Series + a["p"] = int(a.p) + result = operator.matmul(b.T, a) + expected = Series(np.dot(b.T.values, a.T.values), index=["1", "2", "3"]) + tm.assert_series_equal(result, expected) + + # different dtypes DataFrame @ Series + a = a.astype(int) + result = operator.matmul(b.T, a) + expected = Series(np.dot(b.T.values, a.T.values), index=["1", "2", "3"]) + tm.assert_series_equal(result, expected) + + msg = r"Dot product shape mismatch, \(4,\) vs \(3,\)" + # exception raised is of type Exception + with pytest.raises(Exception, match=msg): + a.dot(a.values[:3]) + msg = "matrices are not aligned" + with pytest.raises(ValueError, match=msg): + a.dot(b.T) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/series/methods/test_rank.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/series/methods/test_rank.py new file mode 100644 index 0000000000000000000000000000000000000000..24cf97c05c0a810bac00a8843b21d0ee88a1c00d --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/series/methods/test_rank.py @@ -0,0 +1,519 @@ +from itertools import chain +import operator + +import numpy as np +import pytest + +from pandas._libs.algos import ( + Infinity, + NegInfinity, +) +import pandas.util._test_decorators as td + +from pandas import ( + NA, + NaT, + Series, + Timestamp, + date_range, +) +import pandas._testing as tm +from pandas.api.types import CategoricalDtype + + +@pytest.fixture +def ser(): + return Series([1, 3, 4, 2, np.nan, 2, 1, 5, np.nan, 3]) + + +@pytest.fixture( + params=[ + ["average", np.array([1.5, 5.5, 7.0, 3.5, np.nan, 3.5, 1.5, 8.0, np.nan, 5.5])], + ["min", np.array([1, 5, 7, 3, np.nan, 3, 1, 8, np.nan, 5])], + ["max", np.array([2, 6, 7, 4, np.nan, 4, 2, 8, np.nan, 6])], + ["first", np.array([1, 5, 7, 3, np.nan, 4, 2, 8, np.nan, 6])], + ["dense", np.array([1, 3, 4, 2, np.nan, 2, 1, 5, np.nan, 3])], + ] +) +def results(request): + return request.param + + +@pytest.fixture( + params=[ + "object", + "float64", + "int64", + "Float64", + "Int64", + pytest.param("float64[pyarrow]", marks=td.skip_if_no("pyarrow")), + pytest.param("int64[pyarrow]", marks=td.skip_if_no("pyarrow")), + ] +) +def dtype(request): + return request.param + + +class TestSeriesRank: + def test_rank(self, datetime_series): + sp_stats = pytest.importorskip("scipy.stats") + + datetime_series[::2] = np.nan + datetime_series[:10:3] = 4.0 + + ranks = datetime_series.rank() + oranks = datetime_series.astype("O").rank() + + tm.assert_series_equal(ranks, oranks) + + mask = np.isnan(datetime_series) + filled = datetime_series.fillna(np.inf) + + # rankdata returns a ndarray + exp = Series(sp_stats.rankdata(filled), index=filled.index, name="ts") + exp[mask] = np.nan + + tm.assert_series_equal(ranks, exp) + + iseries = Series(np.arange(5).repeat(2)) + + iranks = iseries.rank() + exp = iseries.astype(float).rank() + tm.assert_series_equal(iranks, exp) + iseries = Series(np.arange(5)) + 1.0 + exp = iseries / 5.0 + iranks = iseries.rank(pct=True) + + tm.assert_series_equal(iranks, exp) + + iseries = Series(np.repeat(1, 100)) + exp = Series(np.repeat(0.505, 100)) + iranks = iseries.rank(pct=True) + tm.assert_series_equal(iranks, exp) + + # Explicit cast to float to avoid implicit cast when setting nan + iseries = iseries.astype("float") + iseries[1] = np.nan + exp = Series(np.repeat(50.0 / 99.0, 100)) + exp[1] = np.nan + iranks = iseries.rank(pct=True) + tm.assert_series_equal(iranks, exp) + + iseries = Series(np.arange(5)) + 1.0 + iseries[4] = np.nan + exp = iseries / 4.0 + iranks = iseries.rank(pct=True) + tm.assert_series_equal(iranks, exp) + + iseries = Series(np.repeat(np.nan, 100)) + exp = iseries.copy() + iranks = iseries.rank(pct=True) + tm.assert_series_equal(iranks, exp) + + # Explicit cast to float to avoid implicit cast when setting nan + iseries = Series(np.arange(5), dtype="float") + 1 + iseries[4] = np.nan + exp = iseries / 4.0 + iranks = iseries.rank(pct=True) + tm.assert_series_equal(iranks, exp) + + rng = date_range("1/1/1990", periods=5) + # Explicit cast to float to avoid implicit cast when setting nan + iseries = Series(np.arange(5), rng, dtype="float") + 1 + iseries.iloc[4] = np.nan + exp = iseries / 4.0 + iranks = iseries.rank(pct=True) + tm.assert_series_equal(iranks, exp) + + iseries = Series([1e-50, 1e-100, 1e-20, 1e-2, 1e-20 + 1e-30, 1e-1]) + exp = Series([2, 1, 3, 5, 4, 6.0]) + iranks = iseries.rank() + tm.assert_series_equal(iranks, exp) + + # GH 5968 + iseries = Series(["3 day", "1 day 10m", "-2 day", NaT], dtype="m8[ns]") + exp = Series([3, 2, 1, np.nan]) + iranks = iseries.rank() + tm.assert_series_equal(iranks, exp) + + values = np.array( + [-50, -1, -1e-20, -1e-25, -1e-50, 0, 1e-40, 1e-20, 1e-10, 2, 40], + dtype="float64", + ) + random_order = np.random.default_rng(2).permutation(len(values)) + iseries = Series(values[random_order]) + exp = Series(random_order + 1.0, dtype="float64") + iranks = iseries.rank() + tm.assert_series_equal(iranks, exp) + + def test_rank_categorical(self): + # GH issue #15420 rank incorrectly orders ordered categories + + # Test ascending/descending ranking for ordered categoricals + exp = Series([1.0, 2.0, 3.0, 4.0, 5.0, 6.0]) + exp_desc = Series([6.0, 5.0, 4.0, 3.0, 2.0, 1.0]) + ordered = Series( + ["first", "second", "third", "fourth", "fifth", "sixth"] + ).astype( + CategoricalDtype( + categories=["first", "second", "third", "fourth", "fifth", "sixth"], + ordered=True, + ) + ) + tm.assert_series_equal(ordered.rank(), exp) + tm.assert_series_equal(ordered.rank(ascending=False), exp_desc) + + # Unordered categoricals should be ranked as objects + unordered = Series( + ["first", "second", "third", "fourth", "fifth", "sixth"] + ).astype( + CategoricalDtype( + categories=["first", "second", "third", "fourth", "fifth", "sixth"], + ordered=False, + ) + ) + exp_unordered = Series([2.0, 4.0, 6.0, 3.0, 1.0, 5.0]) + res = unordered.rank() + tm.assert_series_equal(res, exp_unordered) + + unordered1 = Series([1, 2, 3, 4, 5, 6]).astype( + CategoricalDtype([1, 2, 3, 4, 5, 6], False) + ) + exp_unordered1 = Series([1.0, 2.0, 3.0, 4.0, 5.0, 6.0]) + res1 = unordered1.rank() + tm.assert_series_equal(res1, exp_unordered1) + + # Test na_option for rank data + na_ser = Series( + ["first", "second", "third", "fourth", "fifth", "sixth", np.nan] + ).astype( + CategoricalDtype( + ["first", "second", "third", "fourth", "fifth", "sixth", "seventh"], + True, + ) + ) + + exp_top = Series([2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 1.0]) + exp_bot = Series([1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0]) + exp_keep = Series([1.0, 2.0, 3.0, 4.0, 5.0, 6.0, np.nan]) + + tm.assert_series_equal(na_ser.rank(na_option="top"), exp_top) + tm.assert_series_equal(na_ser.rank(na_option="bottom"), exp_bot) + tm.assert_series_equal(na_ser.rank(na_option="keep"), exp_keep) + + # Test na_option for rank data with ascending False + exp_top = Series([7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0]) + exp_bot = Series([6.0, 5.0, 4.0, 3.0, 2.0, 1.0, 7.0]) + exp_keep = Series([6.0, 5.0, 4.0, 3.0, 2.0, 1.0, np.nan]) + + tm.assert_series_equal(na_ser.rank(na_option="top", ascending=False), exp_top) + tm.assert_series_equal( + na_ser.rank(na_option="bottom", ascending=False), exp_bot + ) + tm.assert_series_equal(na_ser.rank(na_option="keep", ascending=False), exp_keep) + + # Test invalid values for na_option + msg = "na_option must be one of 'keep', 'top', or 'bottom'" + + with pytest.raises(ValueError, match=msg): + na_ser.rank(na_option="bad", ascending=False) + + # invalid type + with pytest.raises(ValueError, match=msg): + na_ser.rank(na_option=True, ascending=False) + + # Test with pct=True + na_ser = Series(["first", "second", "third", "fourth", np.nan]).astype( + CategoricalDtype(["first", "second", "third", "fourth"], True) + ) + exp_top = Series([0.4, 0.6, 0.8, 1.0, 0.2]) + exp_bot = Series([0.2, 0.4, 0.6, 0.8, 1.0]) + exp_keep = Series([0.25, 0.5, 0.75, 1.0, np.nan]) + + tm.assert_series_equal(na_ser.rank(na_option="top", pct=True), exp_top) + tm.assert_series_equal(na_ser.rank(na_option="bottom", pct=True), exp_bot) + tm.assert_series_equal(na_ser.rank(na_option="keep", pct=True), exp_keep) + + def test_rank_signature(self): + s = Series([0, 1]) + s.rank(method="average") + msg = "No axis named average for object type Series" + with pytest.raises(ValueError, match=msg): + s.rank("average") + + @pytest.mark.parametrize("dtype", [None, object]) + def test_rank_tie_methods(self, ser, results, dtype): + method, exp = results + ser = ser if dtype is None else ser.astype(dtype) + result = ser.rank(method=method) + tm.assert_series_equal(result, Series(exp)) + + @pytest.mark.parametrize("ascending", [True, False]) + @pytest.mark.parametrize("method", ["average", "min", "max", "first", "dense"]) + @pytest.mark.parametrize("na_option", ["top", "bottom", "keep"]) + @pytest.mark.parametrize( + "dtype, na_value, pos_inf, neg_inf", + [ + ("object", None, Infinity(), NegInfinity()), + ("float64", np.nan, np.inf, -np.inf), + ("Float64", NA, np.inf, -np.inf), + pytest.param( + "float64[pyarrow]", + NA, + np.inf, + -np.inf, + marks=td.skip_if_no("pyarrow"), + ), + ], + ) + def test_rank_tie_methods_on_infs_nans( + self, method, na_option, ascending, dtype, na_value, pos_inf, neg_inf + ): + pytest.importorskip("scipy") + if dtype == "float64[pyarrow]": + if method == "average": + exp_dtype = "float64[pyarrow]" + else: + exp_dtype = "uint64[pyarrow]" + else: + exp_dtype = "float64" + + chunk = 3 + in_arr = [neg_inf] * chunk + [na_value] * chunk + [pos_inf] * chunk + iseries = Series(in_arr, dtype=dtype) + exp_ranks = { + "average": ([2, 2, 2], [5, 5, 5], [8, 8, 8]), + "min": ([1, 1, 1], [4, 4, 4], [7, 7, 7]), + "max": ([3, 3, 3], [6, 6, 6], [9, 9, 9]), + "first": ([1, 2, 3], [4, 5, 6], [7, 8, 9]), + "dense": ([1, 1, 1], [2, 2, 2], [3, 3, 3]), + } + ranks = exp_ranks[method] + if na_option == "top": + order = [ranks[1], ranks[0], ranks[2]] + elif na_option == "bottom": + order = [ranks[0], ranks[2], ranks[1]] + else: + order = [ranks[0], [np.nan] * chunk, ranks[1]] + expected = order if ascending else order[::-1] + expected = list(chain.from_iterable(expected)) + result = iseries.rank(method=method, na_option=na_option, ascending=ascending) + tm.assert_series_equal(result, Series(expected, dtype=exp_dtype)) + + def test_rank_desc_mix_nans_infs(self): + # GH 19538 + # check descending ranking when mix nans and infs + iseries = Series([1, np.nan, np.inf, -np.inf, 25]) + result = iseries.rank(ascending=False) + exp = Series([3, np.nan, 1, 4, 2], dtype="float64") + tm.assert_series_equal(result, exp) + + @pytest.mark.parametrize("method", ["average", "min", "max", "first", "dense"]) + @pytest.mark.parametrize( + "op, value", + [ + [operator.add, 0], + [operator.add, 1e6], + [operator.mul, 1e-6], + ], + ) + def test_rank_methods_series(self, method, op, value): + sp_stats = pytest.importorskip("scipy.stats") + + xs = np.random.default_rng(2).standard_normal(9) + xs = np.concatenate([xs[i:] for i in range(0, 9, 2)]) # add duplicates + np.random.default_rng(2).shuffle(xs) + + index = [chr(ord("a") + i) for i in range(len(xs))] + vals = op(xs, value) + ts = Series(vals, index=index) + result = ts.rank(method=method) + sprank = sp_stats.rankdata(vals, method if method != "first" else "ordinal") + expected = Series(sprank, index=index).astype("float64") + tm.assert_series_equal(result, expected) + + @pytest.mark.parametrize( + "ser, exp", + [ + ([1], [1]), + ([2], [1]), + ([0], [1]), + ([2, 2], [1, 1]), + ([1, 2, 3], [1, 2, 3]), + ([4, 2, 1], [3, 2, 1]), + ([1, 1, 5, 5, 3], [1, 1, 3, 3, 2]), + ([-5, -4, -3, -2, -1], [1, 2, 3, 4, 5]), + ], + ) + def test_rank_dense_method(self, dtype, ser, exp): + s = Series(ser).astype(dtype) + result = s.rank(method="dense") + expected = Series(exp).astype(result.dtype) + tm.assert_series_equal(result, expected) + + def test_rank_descending(self, ser, results, dtype): + method, _ = results + if "i" in dtype: + s = ser.dropna() + else: + s = ser.astype(dtype) + + res = s.rank(ascending=False) + expected = (s.max() - s).rank() + tm.assert_series_equal(res, expected) + + expected = (s.max() - s).rank(method=method) + res2 = s.rank(method=method, ascending=False) + tm.assert_series_equal(res2, expected) + + def test_rank_int(self, ser, results): + method, exp = results + s = ser.dropna().astype("i8") + + result = s.rank(method=method) + expected = Series(exp).dropna() + expected.index = result.index + tm.assert_series_equal(result, expected) + + def test_rank_object_bug(self): + # GH 13445 + + # smoke tests + Series([np.nan] * 32).astype(object).rank(ascending=True) + Series([np.nan] * 32).astype(object).rank(ascending=False) + + def test_rank_modify_inplace(self): + # GH 18521 + # Check rank does not mutate series + s = Series([Timestamp("2017-01-05 10:20:27.569000"), NaT]) + expected = s.copy() + + s.rank() + result = s + tm.assert_series_equal(result, expected) + + def test_rank_ea_small_values(self): + # GH#52471 + ser = Series( + [5.4954145e29, -9.791984e-21, 9.3715776e-26, NA, 1.8790257e-28], + dtype="Float64", + ) + result = ser.rank(method="min") + expected = Series([4, 1, 3, np.nan, 2]) + tm.assert_series_equal(result, expected) + + +# GH15630, pct should be on 100% basis when method='dense' + + +@pytest.mark.parametrize( + "ser, exp", + [ + ([1], [1.0]), + ([1, 2], [1.0 / 2, 2.0 / 2]), + ([2, 2], [1.0, 1.0]), + ([1, 2, 3], [1.0 / 3, 2.0 / 3, 3.0 / 3]), + ([1, 2, 2], [1.0 / 2, 2.0 / 2, 2.0 / 2]), + ([4, 2, 1], [3.0 / 3, 2.0 / 3, 1.0 / 3]), + ([1, 1, 5, 5, 3], [1.0 / 3, 1.0 / 3, 3.0 / 3, 3.0 / 3, 2.0 / 3]), + ([1, 1, 3, 3, 5, 5], [1.0 / 3, 1.0 / 3, 2.0 / 3, 2.0 / 3, 3.0 / 3, 3.0 / 3]), + ([-5, -4, -3, -2, -1], [1.0 / 5, 2.0 / 5, 3.0 / 5, 4.0 / 5, 5.0 / 5]), + ], +) +def test_rank_dense_pct(dtype, ser, exp): + s = Series(ser).astype(dtype) + result = s.rank(method="dense", pct=True) + expected = Series(exp).astype(result.dtype) + tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize( + "ser, exp", + [ + ([1], [1.0]), + ([1, 2], [1.0 / 2, 2.0 / 2]), + ([2, 2], [1.0 / 2, 1.0 / 2]), + ([1, 2, 3], [1.0 / 3, 2.0 / 3, 3.0 / 3]), + ([1, 2, 2], [1.0 / 3, 2.0 / 3, 2.0 / 3]), + ([4, 2, 1], [3.0 / 3, 2.0 / 3, 1.0 / 3]), + ([1, 1, 5, 5, 3], [1.0 / 5, 1.0 / 5, 4.0 / 5, 4.0 / 5, 3.0 / 5]), + ([1, 1, 3, 3, 5, 5], [1.0 / 6, 1.0 / 6, 3.0 / 6, 3.0 / 6, 5.0 / 6, 5.0 / 6]), + ([-5, -4, -3, -2, -1], [1.0 / 5, 2.0 / 5, 3.0 / 5, 4.0 / 5, 5.0 / 5]), + ], +) +def test_rank_min_pct(dtype, ser, exp): + s = Series(ser).astype(dtype) + result = s.rank(method="min", pct=True) + expected = Series(exp).astype(result.dtype) + tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize( + "ser, exp", + [ + ([1], [1.0]), + ([1, 2], [1.0 / 2, 2.0 / 2]), + ([2, 2], [1.0, 1.0]), + ([1, 2, 3], [1.0 / 3, 2.0 / 3, 3.0 / 3]), + ([1, 2, 2], [1.0 / 3, 3.0 / 3, 3.0 / 3]), + ([4, 2, 1], [3.0 / 3, 2.0 / 3, 1.0 / 3]), + ([1, 1, 5, 5, 3], [2.0 / 5, 2.0 / 5, 5.0 / 5, 5.0 / 5, 3.0 / 5]), + ([1, 1, 3, 3, 5, 5], [2.0 / 6, 2.0 / 6, 4.0 / 6, 4.0 / 6, 6.0 / 6, 6.0 / 6]), + ([-5, -4, -3, -2, -1], [1.0 / 5, 2.0 / 5, 3.0 / 5, 4.0 / 5, 5.0 / 5]), + ], +) +def test_rank_max_pct(dtype, ser, exp): + s = Series(ser).astype(dtype) + result = s.rank(method="max", pct=True) + expected = Series(exp).astype(result.dtype) + tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize( + "ser, exp", + [ + ([1], [1.0]), + ([1, 2], [1.0 / 2, 2.0 / 2]), + ([2, 2], [1.5 / 2, 1.5 / 2]), + ([1, 2, 3], [1.0 / 3, 2.0 / 3, 3.0 / 3]), + ([1, 2, 2], [1.0 / 3, 2.5 / 3, 2.5 / 3]), + ([4, 2, 1], [3.0 / 3, 2.0 / 3, 1.0 / 3]), + ([1, 1, 5, 5, 3], [1.5 / 5, 1.5 / 5, 4.5 / 5, 4.5 / 5, 3.0 / 5]), + ([1, 1, 3, 3, 5, 5], [1.5 / 6, 1.5 / 6, 3.5 / 6, 3.5 / 6, 5.5 / 6, 5.5 / 6]), + ([-5, -4, -3, -2, -1], [1.0 / 5, 2.0 / 5, 3.0 / 5, 4.0 / 5, 5.0 / 5]), + ], +) +def test_rank_average_pct(dtype, ser, exp): + s = Series(ser).astype(dtype) + result = s.rank(method="average", pct=True) + expected = Series(exp).astype(result.dtype) + tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize( + "ser, exp", + [ + ([1], [1.0]), + ([1, 2], [1.0 / 2, 2.0 / 2]), + ([2, 2], [1.0 / 2, 2.0 / 2.0]), + ([1, 2, 3], [1.0 / 3, 2.0 / 3, 3.0 / 3]), + ([1, 2, 2], [1.0 / 3, 2.0 / 3, 3.0 / 3]), + ([4, 2, 1], [3.0 / 3, 2.0 / 3, 1.0 / 3]), + ([1, 1, 5, 5, 3], [1.0 / 5, 2.0 / 5, 4.0 / 5, 5.0 / 5, 3.0 / 5]), + ([1, 1, 3, 3, 5, 5], [1.0 / 6, 2.0 / 6, 3.0 / 6, 4.0 / 6, 5.0 / 6, 6.0 / 6]), + ([-5, -4, -3, -2, -1], [1.0 / 5, 2.0 / 5, 3.0 / 5, 4.0 / 5, 5.0 / 5]), + ], +) +def test_rank_first_pct(dtype, ser, exp): + s = Series(ser).astype(dtype) + result = s.rank(method="first", pct=True) + expected = Series(exp).astype(result.dtype) + tm.assert_series_equal(result, expected) + + +@pytest.mark.single_cpu +def test_pct_max_many_rows(): + # GH 18271 + s = Series(np.arange(2**24 + 1)) + result = s.rank(pct=True).max() + assert result == 1 diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/series/methods/test_reindex_like.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/series/methods/test_reindex_like.py new file mode 100644 index 0000000000000000000000000000000000000000..7f24c778feb1b4556587773f711e21521efc537c --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/series/methods/test_reindex_like.py @@ -0,0 +1,41 @@ +from datetime import datetime + +import numpy as np + +from pandas import Series +import pandas._testing as tm + + +def test_reindex_like(datetime_series): + other = datetime_series[::2] + tm.assert_series_equal( + datetime_series.reindex(other.index), datetime_series.reindex_like(other) + ) + + # GH#7179 + day1 = datetime(2013, 3, 5) + day2 = datetime(2013, 5, 5) + day3 = datetime(2014, 3, 5) + + series1 = Series([5, None, None], [day1, day2, day3]) + series2 = Series([None, None], [day1, day3]) + + result = series1.reindex_like(series2, method="pad") + expected = Series([5, np.nan], index=[day1, day3]) + tm.assert_series_equal(result, expected) + + +def test_reindex_like_nearest(): + ser = Series(np.arange(10, dtype="int64")) + + target = [0.1, 0.9, 1.5, 2.0] + other = ser.reindex(target, method="nearest") + expected = Series(np.around(target).astype("int64"), target) + + result = ser.reindex_like(other, method="nearest") + tm.assert_series_equal(expected, result) + + result = ser.reindex_like(other, method="nearest", tolerance=1) + tm.assert_series_equal(expected, result) + result = ser.reindex_like(other, method="nearest", tolerance=[1, 2, 3, 4]) + tm.assert_series_equal(expected, result) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/series/methods/test_rename.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/series/methods/test_rename.py new file mode 100644 index 0000000000000000000000000000000000000000..119654bd19b3fa1611499b9f5bbd4676cc372bc8 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/series/methods/test_rename.py @@ -0,0 +1,184 @@ +from datetime import datetime +import re + +import numpy as np +import pytest + +from pandas import ( + Index, + MultiIndex, + Series, + array, +) +import pandas._testing as tm + + +class TestRename: + def test_rename(self, datetime_series): + ts = datetime_series + renamer = lambda x: x.strftime("%Y%m%d") + renamed = ts.rename(renamer) + assert renamed.index[0] == renamer(ts.index[0]) + + # dict + rename_dict = dict(zip(ts.index, renamed.index)) + renamed2 = ts.rename(rename_dict) + tm.assert_series_equal(renamed, renamed2) + + def test_rename_partial_dict(self): + # partial dict + ser = Series(np.arange(4), index=["a", "b", "c", "d"], dtype="int64") + renamed = ser.rename({"b": "foo", "d": "bar"}) + tm.assert_index_equal(renamed.index, Index(["a", "foo", "c", "bar"])) + + def test_rename_retain_index_name(self): + # index with name + renamer = Series( + np.arange(4), index=Index(["a", "b", "c", "d"], name="name"), dtype="int64" + ) + renamed = renamer.rename({}) + assert renamed.index.name == renamer.index.name + + def test_rename_by_series(self): + ser = Series(range(5), name="foo") + renamer = Series({1: 10, 2: 20}) + result = ser.rename(renamer) + expected = Series(range(5), index=[0, 10, 20, 3, 4], name="foo") + tm.assert_series_equal(result, expected) + + def test_rename_set_name(self, using_infer_string): + ser = Series(range(4), index=list("abcd")) + for name in ["foo", 123, 123.0, datetime(2001, 11, 11), ("foo",)]: + result = ser.rename(name) + assert result.name == name + if using_infer_string: + tm.assert_extension_array_equal(result.index.values, ser.index.values) + else: + tm.assert_numpy_array_equal(result.index.values, ser.index.values) + assert ser.name is None + + def test_rename_set_name_inplace(self, using_infer_string): + ser = Series(range(3), index=list("abc")) + for name in ["foo", 123, 123.0, datetime(2001, 11, 11), ("foo",)]: + ser.rename(name, inplace=True) + assert ser.name == name + exp = np.array(["a", "b", "c"], dtype=np.object_) + if using_infer_string: + exp = array(exp, dtype="string[pyarrow_numpy]") + tm.assert_extension_array_equal(ser.index.values, exp) + else: + tm.assert_numpy_array_equal(ser.index.values, exp) + + def test_rename_axis_supported(self): + # Supporting axis for compatibility, detailed in GH-18589 + ser = Series(range(5)) + ser.rename({}, axis=0) + ser.rename({}, axis="index") + + with pytest.raises(ValueError, match="No axis named 5"): + ser.rename({}, axis=5) + + def test_rename_inplace(self, datetime_series): + renamer = lambda x: x.strftime("%Y%m%d") + expected = renamer(datetime_series.index[0]) + + datetime_series.rename(renamer, inplace=True) + assert datetime_series.index[0] == expected + + def test_rename_with_custom_indexer(self): + # GH 27814 + class MyIndexer: + pass + + ix = MyIndexer() + ser = Series([1, 2, 3]).rename(ix) + assert ser.name is ix + + def test_rename_with_custom_indexer_inplace(self): + # GH 27814 + class MyIndexer: + pass + + ix = MyIndexer() + ser = Series([1, 2, 3]) + ser.rename(ix, inplace=True) + assert ser.name is ix + + def test_rename_callable(self): + # GH 17407 + ser = Series(range(1, 6), index=Index(range(2, 7), name="IntIndex")) + result = ser.rename(str) + expected = ser.rename(lambda i: str(i)) + tm.assert_series_equal(result, expected) + + assert result.name == expected.name + + def test_rename_none(self): + # GH 40977 + ser = Series([1, 2], name="foo") + result = ser.rename(None) + expected = Series([1, 2]) + tm.assert_series_equal(result, expected) + + def test_rename_series_with_multiindex(self): + # issue #43659 + arrays = [ + ["bar", "baz", "baz", "foo", "qux"], + ["one", "one", "two", "two", "one"], + ] + + index = MultiIndex.from_arrays(arrays, names=["first", "second"]) + ser = Series(np.ones(5), index=index) + result = ser.rename(index={"one": "yes"}, level="second", errors="raise") + + arrays_expected = [ + ["bar", "baz", "baz", "foo", "qux"], + ["yes", "yes", "two", "two", "yes"], + ] + + index_expected = MultiIndex.from_arrays( + arrays_expected, names=["first", "second"] + ) + series_expected = Series(np.ones(5), index=index_expected) + + tm.assert_series_equal(result, series_expected) + + def test_rename_series_with_multiindex_keeps_ea_dtypes(self): + # GH21055 + arrays = [ + Index([1, 2, 3], dtype="Int64").astype("category"), + Index([1, 2, 3], dtype="Int64"), + ] + mi = MultiIndex.from_arrays(arrays, names=["A", "B"]) + ser = Series(1, index=mi) + result = ser.rename({1: 4}, level=1) + + arrays_expected = [ + Index([1, 2, 3], dtype="Int64").astype("category"), + Index([4, 2, 3], dtype="Int64"), + ] + mi_expected = MultiIndex.from_arrays(arrays_expected, names=["A", "B"]) + expected = Series(1, index=mi_expected) + + tm.assert_series_equal(result, expected) + + def test_rename_error_arg(self): + # GH 46889 + ser = Series(["foo", "bar"]) + match = re.escape("[2] not found in axis") + with pytest.raises(KeyError, match=match): + ser.rename({2: 9}, errors="raise") + + def test_rename_copy_false(self, using_copy_on_write, warn_copy_on_write): + # GH 46889 + ser = Series(["foo", "bar"]) + ser_orig = ser.copy() + shallow_copy = ser.rename({1: 9}, copy=False) + with tm.assert_cow_warning(warn_copy_on_write): + ser[0] = "foobar" + if using_copy_on_write: + assert ser_orig[0] == shallow_copy[0] + assert ser_orig[1] == shallow_copy[9] + else: + assert ser[0] == shallow_copy[0] + assert ser[1] == shallow_copy[9] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/series/methods/test_rename_axis.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/series/methods/test_rename_axis.py new file mode 100644 index 0000000000000000000000000000000000000000..58c095d697ede213be5c730e305bf1789810ac4a --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/series/methods/test_rename_axis.py @@ -0,0 +1,47 @@ +import pytest + +from pandas import ( + Index, + MultiIndex, + Series, +) +import pandas._testing as tm + + +class TestSeriesRenameAxis: + def test_rename_axis_mapper(self): + # GH 19978 + mi = MultiIndex.from_product([["a", "b", "c"], [1, 2]], names=["ll", "nn"]) + ser = Series(list(range(len(mi))), index=mi) + + result = ser.rename_axis(index={"ll": "foo"}) + assert result.index.names == ["foo", "nn"] + + result = ser.rename_axis(index=str.upper, axis=0) + assert result.index.names == ["LL", "NN"] + + result = ser.rename_axis(index=["foo", "goo"]) + assert result.index.names == ["foo", "goo"] + + with pytest.raises(TypeError, match="unexpected"): + ser.rename_axis(columns="wrong") + + def test_rename_axis_inplace(self, datetime_series): + # GH 15704 + expected = datetime_series.rename_axis("foo") + result = datetime_series + no_return = result.rename_axis("foo", inplace=True) + + assert no_return is None + tm.assert_series_equal(result, expected) + + @pytest.mark.parametrize("kwargs", [{"mapper": None}, {"index": None}, {}]) + def test_rename_axis_none(self, kwargs): + # GH 25034 + index = Index(list("abc"), name="foo") + ser = Series([1, 2, 3], index=index) + + result = ser.rename_axis(**kwargs) + expected_index = index.rename(None) if kwargs else index + expected = Series([1, 2, 3], index=expected_index) + tm.assert_series_equal(result, expected) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/series/methods/test_replace.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/series/methods/test_replace.py new file mode 100644 index 0000000000000000000000000000000000000000..b0f4e233ba5eba4de29b320bb7592c6b2671ebcd --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/series/methods/test_replace.py @@ -0,0 +1,813 @@ +import re + +import numpy as np +import pytest + +from pandas._config import using_pyarrow_string_dtype + +import pandas as pd +import pandas._testing as tm +from pandas.core.arrays import IntervalArray + + +class TestSeriesReplace: + def test_replace_explicit_none(self): + # GH#36984 if the user explicitly passes value=None, give it to them + ser = pd.Series([0, 0, ""], dtype=object) + result = ser.replace("", None) + expected = pd.Series([0, 0, None], dtype=object) + tm.assert_series_equal(result, expected) + + # Cast column 2 to object to avoid implicit cast when setting entry to "" + df = pd.DataFrame(np.zeros((3, 3))).astype({2: object}) + df.iloc[2, 2] = "" + result = df.replace("", None) + expected = pd.DataFrame( + { + 0: np.zeros(3), + 1: np.zeros(3), + 2: np.array([0.0, 0.0, None], dtype=object), + } + ) + assert expected.iloc[2, 2] is None + tm.assert_frame_equal(result, expected) + + # GH#19998 same thing with object dtype + ser = pd.Series([10, 20, 30, "a", "a", "b", "a"]) + result = ser.replace("a", None) + expected = pd.Series([10, 20, 30, None, None, "b", None]) + assert expected.iloc[-1] is None + tm.assert_series_equal(result, expected) + + def test_replace_noop_doesnt_downcast(self): + # GH#44498 + ser = pd.Series([None, None, pd.Timestamp("2021-12-16 17:31")], dtype=object) + res = ser.replace({np.nan: None}) # should be a no-op + tm.assert_series_equal(res, ser) + assert res.dtype == object + + # same thing but different calling convention + res = ser.replace(np.nan, None) + tm.assert_series_equal(res, ser) + assert res.dtype == object + + def test_replace(self): + N = 50 + ser = pd.Series(np.random.default_rng(2).standard_normal(N)) + ser[0:4] = np.nan + ser[6:10] = 0 + + # replace list with a single value + return_value = ser.replace([np.nan], -1, inplace=True) + assert return_value is None + + exp = ser.fillna(-1) + tm.assert_series_equal(ser, exp) + + rs = ser.replace(0.0, np.nan) + ser[ser == 0.0] = np.nan + tm.assert_series_equal(rs, ser) + + ser = pd.Series( + np.fabs(np.random.default_rng(2).standard_normal(N)), + pd.date_range("2020-01-01", periods=N), + dtype=object, + ) + ser[:5] = np.nan + ser[6:10] = "foo" + ser[20:30] = "bar" + + # replace list with a single value + msg = "Downcasting behavior in `replace`" + with tm.assert_produces_warning(FutureWarning, match=msg): + rs = ser.replace([np.nan, "foo", "bar"], -1) + + assert (rs[:5] == -1).all() + assert (rs[6:10] == -1).all() + assert (rs[20:30] == -1).all() + assert (pd.isna(ser[:5])).all() + + # replace with different values + with tm.assert_produces_warning(FutureWarning, match=msg): + rs = ser.replace({np.nan: -1, "foo": -2, "bar": -3}) + + assert (rs[:5] == -1).all() + assert (rs[6:10] == -2).all() + assert (rs[20:30] == -3).all() + assert (pd.isna(ser[:5])).all() + + # replace with different values with 2 lists + with tm.assert_produces_warning(FutureWarning, match=msg): + rs2 = ser.replace([np.nan, "foo", "bar"], [-1, -2, -3]) + tm.assert_series_equal(rs, rs2) + + # replace inplace + with tm.assert_produces_warning(FutureWarning, match=msg): + return_value = ser.replace([np.nan, "foo", "bar"], -1, inplace=True) + assert return_value is None + + assert (ser[:5] == -1).all() + assert (ser[6:10] == -1).all() + assert (ser[20:30] == -1).all() + + def test_replace_nan_with_inf(self): + ser = pd.Series([np.nan, 0, np.inf]) + tm.assert_series_equal(ser.replace(np.nan, 0), ser.fillna(0)) + + ser = pd.Series([np.nan, 0, "foo", "bar", np.inf, None, pd.NaT]) + tm.assert_series_equal(ser.replace(np.nan, 0), ser.fillna(0)) + filled = ser.copy() + filled[4] = 0 + tm.assert_series_equal(ser.replace(np.inf, 0), filled) + + def test_replace_listlike_value_listlike_target(self, datetime_series): + ser = pd.Series(datetime_series.index) + tm.assert_series_equal(ser.replace(np.nan, 0), ser.fillna(0)) + + # malformed + msg = r"Replacement lists must match in length\. Expecting 3 got 2" + with pytest.raises(ValueError, match=msg): + ser.replace([1, 2, 3], [np.nan, 0]) + + # ser is dt64 so can't hold 1 or 2, so this replace is a no-op + result = ser.replace([1, 2], [np.nan, 0]) + tm.assert_series_equal(result, ser) + + ser = pd.Series([0, 1, 2, 3, 4]) + result = ser.replace([0, 1, 2, 3, 4], [4, 3, 2, 1, 0]) + tm.assert_series_equal(result, pd.Series([4, 3, 2, 1, 0])) + + def test_replace_gh5319(self): + # API change from 0.12? + # GH 5319 + ser = pd.Series([0, np.nan, 2, 3, 4]) + expected = ser.ffill() + msg = ( + "Series.replace without 'value' and with non-dict-like " + "'to_replace' is deprecated" + ) + with tm.assert_produces_warning(FutureWarning, match=msg): + result = ser.replace([np.nan]) + tm.assert_series_equal(result, expected) + + ser = pd.Series([0, np.nan, 2, 3, 4]) + expected = ser.ffill() + with tm.assert_produces_warning(FutureWarning, match=msg): + result = ser.replace(np.nan) + tm.assert_series_equal(result, expected) + + def test_replace_datetime64(self): + # GH 5797 + ser = pd.Series(pd.date_range("20130101", periods=5)) + expected = ser.copy() + expected.loc[2] = pd.Timestamp("20120101") + result = ser.replace({pd.Timestamp("20130103"): pd.Timestamp("20120101")}) + tm.assert_series_equal(result, expected) + result = ser.replace(pd.Timestamp("20130103"), pd.Timestamp("20120101")) + tm.assert_series_equal(result, expected) + + def test_replace_nat_with_tz(self): + # GH 11792: Test with replacing NaT in a list with tz data + ts = pd.Timestamp("2015/01/01", tz="UTC") + s = pd.Series([pd.NaT, pd.Timestamp("2015/01/01", tz="UTC")]) + result = s.replace([np.nan, pd.NaT], pd.Timestamp.min) + expected = pd.Series([pd.Timestamp.min, ts], dtype=object) + tm.assert_series_equal(expected, result) + + def test_replace_timedelta_td64(self): + tdi = pd.timedelta_range(0, periods=5) + ser = pd.Series(tdi) + + # Using a single dict argument means we go through replace_list + result = ser.replace({ser[1]: ser[3]}) + + expected = pd.Series([ser[0], ser[3], ser[2], ser[3], ser[4]]) + tm.assert_series_equal(result, expected) + + def test_replace_with_single_list(self): + ser = pd.Series([0, 1, 2, 3, 4]) + msg2 = ( + "Series.replace without 'value' and with non-dict-like " + "'to_replace' is deprecated" + ) + with tm.assert_produces_warning(FutureWarning, match=msg2): + result = ser.replace([1, 2, 3]) + tm.assert_series_equal(result, pd.Series([0, 0, 0, 0, 4])) + + s = ser.copy() + with tm.assert_produces_warning(FutureWarning, match=msg2): + return_value = s.replace([1, 2, 3], inplace=True) + assert return_value is None + tm.assert_series_equal(s, pd.Series([0, 0, 0, 0, 4])) + + # make sure things don't get corrupted when fillna call fails + s = ser.copy() + msg = ( + r"Invalid fill method\. Expecting pad \(ffill\) or backfill " + r"\(bfill\)\. Got crash_cymbal" + ) + msg3 = "The 'method' keyword in Series.replace is deprecated" + with pytest.raises(ValueError, match=msg): + with tm.assert_produces_warning(FutureWarning, match=msg3): + return_value = s.replace([1, 2, 3], inplace=True, method="crash_cymbal") + assert return_value is None + tm.assert_series_equal(s, ser) + + def test_replace_mixed_types(self): + ser = pd.Series(np.arange(5), dtype="int64") + + def check_replace(to_rep, val, expected): + sc = ser.copy() + result = ser.replace(to_rep, val) + return_value = sc.replace(to_rep, val, inplace=True) + assert return_value is None + tm.assert_series_equal(expected, result) + tm.assert_series_equal(expected, sc) + + # 3.0 can still be held in our int64 series, so we do not upcast GH#44940 + tr, v = [3], [3.0] + check_replace(tr, v, ser) + # Note this matches what we get with the scalars 3 and 3.0 + check_replace(tr[0], v[0], ser) + + # MUST upcast to float + e = pd.Series([0, 1, 2, 3.5, 4]) + tr, v = [3], [3.5] + check_replace(tr, v, e) + + # casts to object + e = pd.Series([0, 1, 2, 3.5, "a"]) + tr, v = [3, 4], [3.5, "a"] + check_replace(tr, v, e) + + # again casts to object + e = pd.Series([0, 1, 2, 3.5, pd.Timestamp("20130101")]) + tr, v = [3, 4], [3.5, pd.Timestamp("20130101")] + check_replace(tr, v, e) + + # casts to object + e = pd.Series([0, 1, 2, 3.5, True], dtype="object") + tr, v = [3, 4], [3.5, True] + check_replace(tr, v, e) + + # test an object with dates + floats + integers + strings + dr = pd.Series(pd.date_range("1/1/2001", "1/10/2001", freq="D")) + result = dr.astype(object).replace([dr[0], dr[1], dr[2]], [1.0, 2, "a"]) + expected = pd.Series([1.0, 2, "a"] + dr[3:].tolist(), dtype=object) + tm.assert_series_equal(result, expected) + + def test_replace_bool_with_string_no_op(self): + s = pd.Series([True, False, True]) + result = s.replace("fun", "in-the-sun") + tm.assert_series_equal(s, result) + + def test_replace_bool_with_string(self): + # nonexistent elements + s = pd.Series([True, False, True]) + result = s.replace(True, "2u") + expected = pd.Series(["2u", False, "2u"]) + tm.assert_series_equal(expected, result) + + def test_replace_bool_with_bool(self): + s = pd.Series([True, False, True]) + result = s.replace(True, False) + expected = pd.Series([False] * len(s)) + tm.assert_series_equal(expected, result) + + def test_replace_with_dict_with_bool_keys(self): + s = pd.Series([True, False, True]) + result = s.replace({"asdf": "asdb", True: "yes"}) + expected = pd.Series(["yes", False, "yes"]) + tm.assert_series_equal(result, expected) + + def test_replace_Int_with_na(self, any_int_ea_dtype): + # GH 38267 + result = pd.Series([0, None], dtype=any_int_ea_dtype).replace(0, pd.NA) + expected = pd.Series([pd.NA, pd.NA], dtype=any_int_ea_dtype) + tm.assert_series_equal(result, expected) + result = pd.Series([0, 1], dtype=any_int_ea_dtype).replace(0, pd.NA) + result.replace(1, pd.NA, inplace=True) + tm.assert_series_equal(result, expected) + + def test_replace2(self): + N = 50 + ser = pd.Series( + np.fabs(np.random.default_rng(2).standard_normal(N)), + pd.date_range("2020-01-01", periods=N), + dtype=object, + ) + ser[:5] = np.nan + ser[6:10] = "foo" + ser[20:30] = "bar" + + # replace list with a single value + msg = "Downcasting behavior in `replace`" + with tm.assert_produces_warning(FutureWarning, match=msg): + rs = ser.replace([np.nan, "foo", "bar"], -1) + + assert (rs[:5] == -1).all() + assert (rs[6:10] == -1).all() + assert (rs[20:30] == -1).all() + assert (pd.isna(ser[:5])).all() + + # replace with different values + with tm.assert_produces_warning(FutureWarning, match=msg): + rs = ser.replace({np.nan: -1, "foo": -2, "bar": -3}) + + assert (rs[:5] == -1).all() + assert (rs[6:10] == -2).all() + assert (rs[20:30] == -3).all() + assert (pd.isna(ser[:5])).all() + + # replace with different values with 2 lists + with tm.assert_produces_warning(FutureWarning, match=msg): + rs2 = ser.replace([np.nan, "foo", "bar"], [-1, -2, -3]) + tm.assert_series_equal(rs, rs2) + + # replace inplace + with tm.assert_produces_warning(FutureWarning, match=msg): + return_value = ser.replace([np.nan, "foo", "bar"], -1, inplace=True) + assert return_value is None + assert (ser[:5] == -1).all() + assert (ser[6:10] == -1).all() + assert (ser[20:30] == -1).all() + + @pytest.mark.parametrize("inplace", [True, False]) + def test_replace_cascade(self, inplace): + # Test that replaced values are not replaced again + # GH #50778 + ser = pd.Series([1, 2, 3]) + expected = pd.Series([2, 3, 4]) + + res = ser.replace([1, 2, 3], [2, 3, 4], inplace=inplace) + if inplace: + tm.assert_series_equal(ser, expected) + else: + tm.assert_series_equal(res, expected) + + def test_replace_with_dictlike_and_string_dtype(self, nullable_string_dtype): + # GH 32621, GH#44940 + ser = pd.Series(["one", "two", np.nan], dtype=nullable_string_dtype) + expected = pd.Series(["1", "2", np.nan], dtype=nullable_string_dtype) + result = ser.replace({"one": "1", "two": "2"}) + tm.assert_series_equal(expected, result) + + def test_replace_with_empty_dictlike(self): + # GH 15289 + s = pd.Series(list("abcd")) + tm.assert_series_equal(s, s.replace({})) + + empty_series = pd.Series([]) + tm.assert_series_equal(s, s.replace(empty_series)) + + def test_replace_string_with_number(self): + # GH 15743 + s = pd.Series([1, 2, 3]) + result = s.replace("2", np.nan) + expected = pd.Series([1, 2, 3]) + tm.assert_series_equal(expected, result) + + def test_replace_replacer_equals_replacement(self): + # GH 20656 + # make sure all replacers are matching against original values + s = pd.Series(["a", "b"]) + expected = pd.Series(["b", "a"]) + result = s.replace({"a": "b", "b": "a"}) + tm.assert_series_equal(expected, result) + + def test_replace_unicode_with_number(self): + # GH 15743 + s = pd.Series([1, 2, 3]) + result = s.replace("2", np.nan) + expected = pd.Series([1, 2, 3]) + tm.assert_series_equal(expected, result) + + def test_replace_mixed_types_with_string(self): + # Testing mixed + s = pd.Series([1, 2, 3, "4", 4, 5]) + msg = "Downcasting behavior in `replace`" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = s.replace([2, "4"], np.nan) + expected = pd.Series([1, np.nan, 3, np.nan, 4, 5]) + tm.assert_series_equal(expected, result) + + @pytest.mark.xfail(using_pyarrow_string_dtype(), reason="can't fill 0 in string") + @pytest.mark.parametrize( + "categorical, numeric", + [ + (pd.Categorical(["A"], categories=["A", "B"]), [1]), + (pd.Categorical(["A", "B"], categories=["A", "B"]), [1, 2]), + ], + ) + def test_replace_categorical(self, categorical, numeric): + # GH 24971, GH#23305 + ser = pd.Series(categorical) + msg = "Downcasting behavior in `replace`" + msg = "with CategoricalDtype is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = ser.replace({"A": 1, "B": 2}) + expected = pd.Series(numeric).astype("category") + if 2 not in expected.cat.categories: + # i.e. categories should be [1, 2] even if there are no "B"s present + # GH#44940 + expected = expected.cat.add_categories(2) + tm.assert_series_equal(expected, result) + + @pytest.mark.parametrize( + "data, data_exp", [(["a", "b", "c"], ["b", "b", "c"]), (["a"], ["b"])] + ) + def test_replace_categorical_inplace(self, data, data_exp): + # GH 53358 + result = pd.Series(data, dtype="category") + msg = "with CategoricalDtype is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + result.replace(to_replace="a", value="b", inplace=True) + expected = pd.Series(data_exp, dtype="category") + tm.assert_series_equal(result, expected) + + def test_replace_categorical_single(self): + # GH 26988 + dti = pd.date_range("2016-01-01", periods=3, tz="US/Pacific") + s = pd.Series(dti) + c = s.astype("category") + + expected = c.copy() + expected = expected.cat.add_categories("foo") + expected[2] = "foo" + expected = expected.cat.remove_unused_categories() + assert c[2] != "foo" + + msg = "with CategoricalDtype is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = c.replace(c[2], "foo") + tm.assert_series_equal(expected, result) + assert c[2] != "foo" # ensure non-inplace call does not alter original + + msg = "with CategoricalDtype is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + return_value = c.replace(c[2], "foo", inplace=True) + assert return_value is None + tm.assert_series_equal(expected, c) + + first_value = c[0] + msg = "with CategoricalDtype is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + return_value = c.replace(c[1], c[0], inplace=True) + assert return_value is None + assert c[0] == c[1] == first_value # test replacing with existing value + + def test_replace_with_no_overflowerror(self): + # GH 25616 + # casts to object without Exception from OverflowError + s = pd.Series([0, 1, 2, 3, 4]) + result = s.replace([3], ["100000000000000000000"]) + expected = pd.Series([0, 1, 2, "100000000000000000000", 4]) + tm.assert_series_equal(result, expected) + + s = pd.Series([0, "100000000000000000000", "100000000000000000001"]) + result = s.replace(["100000000000000000000"], [1]) + expected = pd.Series([0, 1, "100000000000000000001"]) + tm.assert_series_equal(result, expected) + + @pytest.mark.parametrize( + "ser, to_replace, exp", + [ + ([1, 2, 3], {1: 2, 2: 3, 3: 4}, [2, 3, 4]), + (["1", "2", "3"], {"1": "2", "2": "3", "3": "4"}, ["2", "3", "4"]), + ], + ) + def test_replace_commutative(self, ser, to_replace, exp): + # GH 16051 + # DataFrame.replace() overwrites when values are non-numeric + + series = pd.Series(ser) + + expected = pd.Series(exp) + result = series.replace(to_replace) + + tm.assert_series_equal(result, expected) + + @pytest.mark.parametrize( + "ser, exp", [([1, 2, 3], [1, True, 3]), (["x", 2, 3], ["x", True, 3])] + ) + def test_replace_no_cast(self, ser, exp): + # GH 9113 + # BUG: replace int64 dtype with bool coerces to int64 + + series = pd.Series(ser) + result = series.replace(2, True) + expected = pd.Series(exp) + + tm.assert_series_equal(result, expected) + + def test_replace_invalid_to_replace(self): + # GH 18634 + # API: replace() should raise an exception if invalid argument is given + series = pd.Series(["a", "b", "c "]) + msg = ( + r"Expecting 'to_replace' to be either a scalar, array-like, " + r"dict or None, got invalid type.*" + ) + msg2 = ( + "Series.replace without 'value' and with non-dict-like " + "'to_replace' is deprecated" + ) + with pytest.raises(TypeError, match=msg): + with tm.assert_produces_warning(FutureWarning, match=msg2): + series.replace(lambda x: x.strip()) + + @pytest.mark.parametrize("frame", [False, True]) + def test_replace_nonbool_regex(self, frame): + obj = pd.Series(["a", "b", "c "]) + if frame: + obj = obj.to_frame() + + msg = "'to_replace' must be 'None' if 'regex' is not a bool" + with pytest.raises(ValueError, match=msg): + obj.replace(to_replace=["a"], regex="foo") + + @pytest.mark.parametrize("frame", [False, True]) + def test_replace_empty_copy(self, frame): + obj = pd.Series([], dtype=np.float64) + if frame: + obj = obj.to_frame() + + res = obj.replace(4, 5, inplace=True) + assert res is None + + res = obj.replace(4, 5, inplace=False) + tm.assert_equal(res, obj) + assert res is not obj + + def test_replace_only_one_dictlike_arg(self, fixed_now_ts): + # GH#33340 + + ser = pd.Series([1, 2, "A", fixed_now_ts, True]) + to_replace = {0: 1, 2: "A"} + value = "foo" + msg = "Series.replace cannot use dict-like to_replace and non-None value" + with pytest.raises(ValueError, match=msg): + ser.replace(to_replace, value) + + to_replace = 1 + value = {0: "foo", 2: "bar"} + msg = "Series.replace cannot use dict-value and non-None to_replace" + with pytest.raises(ValueError, match=msg): + ser.replace(to_replace, value) + + def test_replace_extension_other(self, frame_or_series): + # https://github.com/pandas-dev/pandas/issues/34530 + obj = frame_or_series(pd.array([1, 2, 3], dtype="Int64")) + result = obj.replace("", "") # no exception + # should not have changed dtype + tm.assert_equal(obj, result) + + def _check_replace_with_method(self, ser: pd.Series): + df = ser.to_frame() + + msg1 = "The 'method' keyword in Series.replace is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg1): + res = ser.replace(ser[1], method="pad") + expected = pd.Series([ser[0], ser[0]] + list(ser[2:]), dtype=ser.dtype) + tm.assert_series_equal(res, expected) + + msg2 = "The 'method' keyword in DataFrame.replace is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg2): + res_df = df.replace(ser[1], method="pad") + tm.assert_frame_equal(res_df, expected.to_frame()) + + ser2 = ser.copy() + with tm.assert_produces_warning(FutureWarning, match=msg1): + res2 = ser2.replace(ser[1], method="pad", inplace=True) + assert res2 is None + tm.assert_series_equal(ser2, expected) + + with tm.assert_produces_warning(FutureWarning, match=msg2): + res_df2 = df.replace(ser[1], method="pad", inplace=True) + assert res_df2 is None + tm.assert_frame_equal(df, expected.to_frame()) + + def test_replace_ea_dtype_with_method(self, any_numeric_ea_dtype): + arr = pd.array([1, 2, pd.NA, 4], dtype=any_numeric_ea_dtype) + ser = pd.Series(arr) + + self._check_replace_with_method(ser) + + @pytest.mark.parametrize("as_categorical", [True, False]) + def test_replace_interval_with_method(self, as_categorical): + # in particular interval that can't hold NA + + idx = pd.IntervalIndex.from_breaks(range(4)) + ser = pd.Series(idx) + if as_categorical: + ser = ser.astype("category") + + self._check_replace_with_method(ser) + + @pytest.mark.parametrize("as_period", [True, False]) + @pytest.mark.parametrize("as_categorical", [True, False]) + def test_replace_datetimelike_with_method(self, as_period, as_categorical): + idx = pd.date_range("2016-01-01", periods=5, tz="US/Pacific") + if as_period: + idx = idx.tz_localize(None).to_period("D") + + ser = pd.Series(idx) + ser.iloc[-2] = pd.NaT + if as_categorical: + ser = ser.astype("category") + + self._check_replace_with_method(ser) + + def test_replace_with_compiled_regex(self): + # https://github.com/pandas-dev/pandas/issues/35680 + s = pd.Series(["a", "b", "c"]) + regex = re.compile("^a$") + result = s.replace({regex: "z"}, regex=True) + expected = pd.Series(["z", "b", "c"]) + tm.assert_series_equal(result, expected) + + def test_pandas_replace_na(self): + # GH#43344 + ser = pd.Series(["AA", "BB", "CC", "DD", "EE", "", pd.NA], dtype="string") + regex_mapping = { + "AA": "CC", + "BB": "CC", + "EE": "CC", + "CC": "CC-REPL", + } + result = ser.replace(regex_mapping, regex=True) + exp = pd.Series(["CC", "CC", "CC-REPL", "DD", "CC", "", pd.NA], dtype="string") + tm.assert_series_equal(result, exp) + + @pytest.mark.parametrize( + "dtype, input_data, to_replace, expected_data", + [ + ("bool", [True, False], {True: False}, [False, False]), + ("int64", [1, 2], {1: 10, 2: 20}, [10, 20]), + ("Int64", [1, 2], {1: 10, 2: 20}, [10, 20]), + ("float64", [1.1, 2.2], {1.1: 10.1, 2.2: 20.5}, [10.1, 20.5]), + ("Float64", [1.1, 2.2], {1.1: 10.1, 2.2: 20.5}, [10.1, 20.5]), + ("string", ["one", "two"], {"one": "1", "two": "2"}, ["1", "2"]), + ( + pd.IntervalDtype("int64"), + IntervalArray([pd.Interval(1, 2), pd.Interval(2, 3)]), + {pd.Interval(1, 2): pd.Interval(10, 20)}, + IntervalArray([pd.Interval(10, 20), pd.Interval(2, 3)]), + ), + ( + pd.IntervalDtype("float64"), + IntervalArray([pd.Interval(1.0, 2.7), pd.Interval(2.8, 3.1)]), + {pd.Interval(1.0, 2.7): pd.Interval(10.6, 20.8)}, + IntervalArray([pd.Interval(10.6, 20.8), pd.Interval(2.8, 3.1)]), + ), + ( + pd.PeriodDtype("M"), + [pd.Period("2020-05", freq="M")], + {pd.Period("2020-05", freq="M"): pd.Period("2020-06", freq="M")}, + [pd.Period("2020-06", freq="M")], + ), + ], + ) + def test_replace_dtype(self, dtype, input_data, to_replace, expected_data): + # GH#33484 + ser = pd.Series(input_data, dtype=dtype) + result = ser.replace(to_replace) + expected = pd.Series(expected_data, dtype=dtype) + tm.assert_series_equal(result, expected) + + def test_replace_string_dtype(self): + # GH#40732, GH#44940 + ser = pd.Series(["one", "two", np.nan], dtype="string") + res = ser.replace({"one": "1", "two": "2"}) + expected = pd.Series(["1", "2", np.nan], dtype="string") + tm.assert_series_equal(res, expected) + + # GH#31644 + ser2 = pd.Series(["A", np.nan], dtype="string") + res2 = ser2.replace("A", "B") + expected2 = pd.Series(["B", np.nan], dtype="string") + tm.assert_series_equal(res2, expected2) + + ser3 = pd.Series(["A", "B"], dtype="string") + res3 = ser3.replace("A", pd.NA) + expected3 = pd.Series([pd.NA, "B"], dtype="string") + tm.assert_series_equal(res3, expected3) + + def test_replace_string_dtype_list_to_replace(self): + # GH#41215, GH#44940 + ser = pd.Series(["abc", "def"], dtype="string") + res = ser.replace(["abc", "any other string"], "xyz") + expected = pd.Series(["xyz", "def"], dtype="string") + tm.assert_series_equal(res, expected) + + def test_replace_string_dtype_regex(self): + # GH#31644 + ser = pd.Series(["A", "B"], dtype="string") + res = ser.replace(r".", "C", regex=True) + expected = pd.Series(["C", "C"], dtype="string") + tm.assert_series_equal(res, expected) + + def test_replace_nullable_numeric(self): + # GH#40732, GH#44940 + + floats = pd.Series([1.0, 2.0, 3.999, 4.4], dtype=pd.Float64Dtype()) + assert floats.replace({1.0: 9}).dtype == floats.dtype + assert floats.replace(1.0, 9).dtype == floats.dtype + assert floats.replace({1.0: 9.0}).dtype == floats.dtype + assert floats.replace(1.0, 9.0).dtype == floats.dtype + + res = floats.replace(to_replace=[1.0, 2.0], value=[9.0, 10.0]) + assert res.dtype == floats.dtype + + ints = pd.Series([1, 2, 3, 4], dtype=pd.Int64Dtype()) + assert ints.replace({1: 9}).dtype == ints.dtype + assert ints.replace(1, 9).dtype == ints.dtype + assert ints.replace({1: 9.0}).dtype == ints.dtype + assert ints.replace(1, 9.0).dtype == ints.dtype + + # nullable (for now) raises instead of casting + with pytest.raises(TypeError, match="Invalid value"): + ints.replace({1: 9.5}) + with pytest.raises(TypeError, match="Invalid value"): + ints.replace(1, 9.5) + + @pytest.mark.xfail(using_pyarrow_string_dtype(), reason="can't fill 1 in string") + @pytest.mark.parametrize("regex", [False, True]) + def test_replace_regex_dtype_series(self, regex): + # GH-48644 + series = pd.Series(["0"]) + expected = pd.Series([1]) + msg = "Downcasting behavior in `replace`" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = series.replace(to_replace="0", value=1, regex=regex) + tm.assert_series_equal(result, expected) + + def test_replace_different_int_types(self, any_int_numpy_dtype): + # GH#45311 + labs = pd.Series([1, 1, 1, 0, 0, 2, 2, 2], dtype=any_int_numpy_dtype) + + maps = pd.Series([0, 2, 1], dtype=any_int_numpy_dtype) + map_dict = dict(zip(maps.values, maps.index)) + + result = labs.replace(map_dict) + expected = labs.replace({0: 0, 2: 1, 1: 2}) + tm.assert_series_equal(result, expected) + + @pytest.mark.parametrize("val", [2, np.nan, 2.0]) + def test_replace_value_none_dtype_numeric(self, val): + # GH#48231 + ser = pd.Series([1, val]) + result = ser.replace(val, None) + expected = pd.Series([1, None], dtype=object) + tm.assert_series_equal(result, expected) + + def test_replace_change_dtype_series(self, using_infer_string): + # GH#25797 + df = pd.DataFrame.from_dict({"Test": ["0.5", True, "0.6"]}) + warn = FutureWarning if using_infer_string else None + with tm.assert_produces_warning(warn, match="Downcasting"): + df["Test"] = df["Test"].replace([True], [np.nan]) + expected = pd.DataFrame.from_dict({"Test": ["0.5", np.nan, "0.6"]}) + tm.assert_frame_equal(df, expected) + + df = pd.DataFrame.from_dict({"Test": ["0.5", None, "0.6"]}) + df["Test"] = df["Test"].replace([None], [np.nan]) + tm.assert_frame_equal(df, expected) + + df = pd.DataFrame.from_dict({"Test": ["0.5", None, "0.6"]}) + df["Test"] = df["Test"].fillna(np.nan) + tm.assert_frame_equal(df, expected) + + @pytest.mark.parametrize("dtype", ["object", "Int64"]) + def test_replace_na_in_obj_column(self, dtype): + # GH#47480 + ser = pd.Series([0, 1, pd.NA], dtype=dtype) + expected = pd.Series([0, 2, pd.NA], dtype=dtype) + result = ser.replace(to_replace=1, value=2) + tm.assert_series_equal(result, expected) + + ser.replace(to_replace=1, value=2, inplace=True) + tm.assert_series_equal(ser, expected) + + @pytest.mark.parametrize("val", [0, 0.5]) + def test_replace_numeric_column_with_na(self, val): + # GH#50758 + ser = pd.Series([val, 1]) + expected = pd.Series([val, pd.NA]) + result = ser.replace(to_replace=1, value=pd.NA) + tm.assert_series_equal(result, expected) + + ser.replace(to_replace=1, value=pd.NA, inplace=True) + tm.assert_series_equal(ser, expected) + + def test_replace_ea_float_with_bool(self): + # GH#55398 + ser = pd.Series([0.0], dtype="Float64") + expected = ser.copy() + result = ser.replace(False, 1.0) + tm.assert_series_equal(result, expected) + + ser = pd.Series([False], dtype="boolean") + expected = ser.copy() + result = ser.replace(0.0, True) + tm.assert_series_equal(result, expected) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/series/methods/test_reset_index.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/series/methods/test_reset_index.py new file mode 100644 index 0000000000000000000000000000000000000000..48e2608a1032a0664f7b6b3cb2c354e7f7c531f6 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/series/methods/test_reset_index.py @@ -0,0 +1,225 @@ +from datetime import datetime + +import numpy as np +import pytest + +import pandas as pd +from pandas import ( + DataFrame, + Index, + MultiIndex, + RangeIndex, + Series, + date_range, + option_context, +) +import pandas._testing as tm + + +class TestResetIndex: + def test_reset_index_dti_round_trip(self): + dti = date_range(start="1/1/2001", end="6/1/2001", freq="D")._with_freq(None) + d1 = DataFrame({"v": np.random.default_rng(2).random(len(dti))}, index=dti) + d2 = d1.reset_index() + assert d2.dtypes.iloc[0] == np.dtype("M8[ns]") + d3 = d2.set_index("index") + tm.assert_frame_equal(d1, d3, check_names=False) + + # GH#2329 + stamp = datetime(2012, 11, 22) + df = DataFrame([[stamp, 12.1]], columns=["Date", "Value"]) + df = df.set_index("Date") + + assert df.index[0] == stamp + assert df.reset_index()["Date"].iloc[0] == stamp + + def test_reset_index(self): + df = DataFrame( + 1.1 * np.arange(120).reshape((30, 4)), + columns=Index(list("ABCD"), dtype=object), + index=Index([f"i-{i}" for i in range(30)], dtype=object), + )[:5] + ser = df.stack(future_stack=True) + ser.index.names = ["hash", "category"] + + ser.name = "value" + df = ser.reset_index() + assert "value" in df + + df = ser.reset_index(name="value2") + assert "value2" in df + + # check inplace + s = ser.reset_index(drop=True) + s2 = ser + return_value = s2.reset_index(drop=True, inplace=True) + assert return_value is None + tm.assert_series_equal(s, s2) + + # level + index = MultiIndex( + levels=[["bar"], ["one", "two", "three"], [0, 1]], + codes=[[0, 0, 0, 0, 0, 0], [0, 1, 2, 0, 1, 2], [0, 1, 0, 1, 0, 1]], + ) + s = Series(np.random.default_rng(2).standard_normal(6), index=index) + rs = s.reset_index(level=1) + assert len(rs.columns) == 2 + + rs = s.reset_index(level=[0, 2], drop=True) + tm.assert_index_equal(rs.index, Index(index.get_level_values(1))) + assert isinstance(rs, Series) + + def test_reset_index_name(self): + s = Series([1, 2, 3], index=Index(range(3), name="x")) + assert s.reset_index().index.name is None + assert s.reset_index(drop=True).index.name is None + + def test_reset_index_level(self): + df = DataFrame([[1, 2, 3], [4, 5, 6]], columns=["A", "B", "C"]) + + for levels in ["A", "B"], [0, 1]: + # With MultiIndex + s = df.set_index(["A", "B"])["C"] + + result = s.reset_index(level=levels[0]) + tm.assert_frame_equal(result, df.set_index("B")) + + result = s.reset_index(level=levels[:1]) + tm.assert_frame_equal(result, df.set_index("B")) + + result = s.reset_index(level=levels) + tm.assert_frame_equal(result, df) + + result = df.set_index(["A", "B"]).reset_index(level=levels, drop=True) + tm.assert_frame_equal(result, df[["C"]]) + + with pytest.raises(KeyError, match="Level E "): + s.reset_index(level=["A", "E"]) + + # With single-level Index + s = df.set_index("A")["B"] + + result = s.reset_index(level=levels[0]) + tm.assert_frame_equal(result, df[["A", "B"]]) + + result = s.reset_index(level=levels[:1]) + tm.assert_frame_equal(result, df[["A", "B"]]) + + result = s.reset_index(level=levels[0], drop=True) + tm.assert_series_equal(result, df["B"]) + + with pytest.raises(IndexError, match="Too many levels"): + s.reset_index(level=[0, 1, 2]) + + # Check that .reset_index([],drop=True) doesn't fail + result = Series(range(4)).reset_index([], drop=True) + expected = Series(range(4)) + tm.assert_series_equal(result, expected) + + def test_reset_index_range(self): + # GH 12071 + s = Series(range(2), name="A", dtype="int64") + series_result = s.reset_index() + assert isinstance(series_result.index, RangeIndex) + series_expected = DataFrame( + [[0, 0], [1, 1]], columns=["index", "A"], index=RangeIndex(stop=2) + ) + tm.assert_frame_equal(series_result, series_expected) + + def test_reset_index_drop_errors(self): + # GH 20925 + + # KeyError raised for series index when passed level name is missing + s = Series(range(4)) + with pytest.raises(KeyError, match="does not match index name"): + s.reset_index("wrong", drop=True) + with pytest.raises(KeyError, match="does not match index name"): + s.reset_index("wrong") + + # KeyError raised for series when level to be dropped is missing + s = Series(range(4), index=MultiIndex.from_product([[1, 2]] * 2)) + with pytest.raises(KeyError, match="not found"): + s.reset_index("wrong", drop=True) + + def test_reset_index_with_drop(self): + arrays = [ + ["bar", "bar", "baz", "baz", "qux", "qux", "foo", "foo"], + ["one", "two", "one", "two", "one", "two", "one", "two"], + ] + tuples = zip(*arrays) + index = MultiIndex.from_tuples(tuples) + data = np.random.default_rng(2).standard_normal(8) + ser = Series(data, index=index) + ser.iloc[3] = np.nan + + deleveled = ser.reset_index() + assert isinstance(deleveled, DataFrame) + assert len(deleveled.columns) == len(ser.index.levels) + 1 + assert deleveled.index.name == ser.index.name + + deleveled = ser.reset_index(drop=True) + assert isinstance(deleveled, Series) + assert deleveled.index.name == ser.index.name + + def test_reset_index_inplace_and_drop_ignore_name(self): + # GH#44575 + ser = Series(range(2), name="old") + ser.reset_index(name="new", drop=True, inplace=True) + expected = Series(range(2), name="old") + tm.assert_series_equal(ser, expected) + + def test_reset_index_drop_infer_string(self): + # GH#56160 + pytest.importorskip("pyarrow") + ser = Series(["a", "b", "c"], dtype=object) + with option_context("future.infer_string", True): + result = ser.reset_index(drop=True) + tm.assert_series_equal(result, ser) + + +@pytest.mark.parametrize( + "array, dtype", + [ + (["a", "b"], object), + ( + pd.period_range("12-1-2000", periods=2, freq="Q-DEC"), + pd.PeriodDtype(freq="Q-DEC"), + ), + ], +) +def test_reset_index_dtypes_on_empty_series_with_multiindex( + array, dtype, using_infer_string +): + # GH 19602 - Preserve dtype on empty Series with MultiIndex + idx = MultiIndex.from_product([[0, 1], [0.5, 1.0], array]) + result = Series(dtype=object, index=idx)[:0].reset_index().dtypes + exp = "string" if using_infer_string else object + expected = Series( + { + "level_0": np.int64, + "level_1": np.float64, + "level_2": exp if dtype == object else dtype, + 0: object, + } + ) + tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize( + "names, expected_names", + [ + (["A", "A"], ["A", "A"]), + (["level_1", None], ["level_1", "level_1"]), + ], +) +@pytest.mark.parametrize("allow_duplicates", [False, True]) +def test_column_name_duplicates(names, expected_names, allow_duplicates): + # GH#44755 reset_index with duplicate column labels + s = Series([1], index=MultiIndex.from_arrays([[1], [1]], names=names)) + if allow_duplicates: + result = s.reset_index(allow_duplicates=True) + expected = DataFrame([[1, 1, 1]], columns=expected_names + [0]) + tm.assert_frame_equal(result, expected) + else: + with pytest.raises(ValueError, match="cannot insert"): + s.reset_index() diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/series/methods/test_set_name.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/series/methods/test_set_name.py new file mode 100644 index 0000000000000000000000000000000000000000..cbc8ebde7a8ab79fabe94781123844c856c9c78b --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/series/methods/test_set_name.py @@ -0,0 +1,21 @@ +from datetime import datetime + +from pandas import Series + + +class TestSetName: + def test_set_name(self): + ser = Series([1, 2, 3]) + ser2 = ser._set_name("foo") + assert ser2.name == "foo" + assert ser.name is None + assert ser is not ser2 + + def test_set_name_attribute(self): + ser = Series([1, 2, 3]) + ser2 = Series([1, 2, 3], name="bar") + for name in [7, 7.0, "name", datetime(2001, 1, 1), (1,), "\u05D0"]: + ser.name = name + assert ser.name == name + ser2.name = name + assert ser2.name == name diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/series/methods/test_sort_values.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/series/methods/test_sort_values.py new file mode 100644 index 0000000000000000000000000000000000000000..4808272879071e8530c000daddbe7e9518deaefc --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/series/methods/test_sort_values.py @@ -0,0 +1,246 @@ +import numpy as np +import pytest + +from pandas import ( + Categorical, + DataFrame, + Series, +) +import pandas._testing as tm + + +class TestSeriesSortValues: + def test_sort_values(self, datetime_series, using_copy_on_write): + # check indexes are reordered corresponding with the values + ser = Series([3, 2, 4, 1], ["A", "B", "C", "D"]) + expected = Series([1, 2, 3, 4], ["D", "B", "A", "C"]) + result = ser.sort_values() + tm.assert_series_equal(expected, result) + + ts = datetime_series.copy() + ts[:5] = np.nan + vals = ts.values + + result = ts.sort_values() + assert np.isnan(result[-5:]).all() + tm.assert_numpy_array_equal(result[:-5].values, np.sort(vals[5:])) + + # na_position + result = ts.sort_values(na_position="first") + assert np.isnan(result[:5]).all() + tm.assert_numpy_array_equal(result[5:].values, np.sort(vals[5:])) + + # something object-type + ser = Series(["A", "B"], [1, 2]) + # no failure + ser.sort_values() + + # ascending=False + ordered = ts.sort_values(ascending=False) + expected = np.sort(ts.dropna().values)[::-1] + tm.assert_almost_equal(expected, ordered.dropna().values) + ordered = ts.sort_values(ascending=False, na_position="first") + tm.assert_almost_equal(expected, ordered.dropna().values) + + # ascending=[False] should behave the same as ascending=False + ordered = ts.sort_values(ascending=[False]) + expected = ts.sort_values(ascending=False) + tm.assert_series_equal(expected, ordered) + ordered = ts.sort_values(ascending=[False], na_position="first") + expected = ts.sort_values(ascending=False, na_position="first") + tm.assert_series_equal(expected, ordered) + + msg = 'For argument "ascending" expected type bool, received type NoneType.' + with pytest.raises(ValueError, match=msg): + ts.sort_values(ascending=None) + msg = r"Length of ascending \(0\) must be 1 for Series" + with pytest.raises(ValueError, match=msg): + ts.sort_values(ascending=[]) + msg = r"Length of ascending \(3\) must be 1 for Series" + with pytest.raises(ValueError, match=msg): + ts.sort_values(ascending=[1, 2, 3]) + msg = r"Length of ascending \(2\) must be 1 for Series" + with pytest.raises(ValueError, match=msg): + ts.sort_values(ascending=[False, False]) + msg = 'For argument "ascending" expected type bool, received type str.' + with pytest.raises(ValueError, match=msg): + ts.sort_values(ascending="foobar") + + # inplace=True + ts = datetime_series.copy() + return_value = ts.sort_values(ascending=False, inplace=True) + assert return_value is None + tm.assert_series_equal(ts, datetime_series.sort_values(ascending=False)) + tm.assert_index_equal( + ts.index, datetime_series.sort_values(ascending=False).index + ) + + # GH#5856/5853 + # Series.sort_values operating on a view + df = DataFrame(np.random.default_rng(2).standard_normal((10, 4))) + s = df.iloc[:, 0] + + msg = ( + "This Series is a view of some other array, to sort in-place " + "you must create a copy" + ) + if using_copy_on_write: + s.sort_values(inplace=True) + tm.assert_series_equal(s, df.iloc[:, 0].sort_values()) + else: + with pytest.raises(ValueError, match=msg): + s.sort_values(inplace=True) + + def test_sort_values_categorical(self): + c = Categorical(["a", "b", "b", "a"], ordered=False) + cat = Series(c.copy()) + + # sort in the categories order + expected = Series( + Categorical(["a", "a", "b", "b"], ordered=False), index=[0, 3, 1, 2] + ) + result = cat.sort_values() + tm.assert_series_equal(result, expected) + + cat = Series(Categorical(["a", "c", "b", "d"], ordered=True)) + res = cat.sort_values() + exp = np.array(["a", "b", "c", "d"], dtype=np.object_) + tm.assert_numpy_array_equal(res.__array__(), exp) + + cat = Series( + Categorical( + ["a", "c", "b", "d"], categories=["a", "b", "c", "d"], ordered=True + ) + ) + res = cat.sort_values() + exp = np.array(["a", "b", "c", "d"], dtype=np.object_) + tm.assert_numpy_array_equal(res.__array__(), exp) + + res = cat.sort_values(ascending=False) + exp = np.array(["d", "c", "b", "a"], dtype=np.object_) + tm.assert_numpy_array_equal(res.__array__(), exp) + + raw_cat1 = Categorical( + ["a", "b", "c", "d"], categories=["a", "b", "c", "d"], ordered=False + ) + raw_cat2 = Categorical( + ["a", "b", "c", "d"], categories=["d", "c", "b", "a"], ordered=True + ) + s = ["a", "b", "c", "d"] + df = DataFrame( + {"unsort": raw_cat1, "sort": raw_cat2, "string": s, "values": [1, 2, 3, 4]} + ) + + # Cats must be sorted in a dataframe + res = df.sort_values(by=["string"], ascending=False) + exp = np.array(["d", "c", "b", "a"], dtype=np.object_) + tm.assert_numpy_array_equal(res["sort"].values.__array__(), exp) + assert res["sort"].dtype == "category" + + res = df.sort_values(by=["sort"], ascending=False) + exp = df.sort_values(by=["string"], ascending=True) + tm.assert_series_equal(res["values"], exp["values"]) + assert res["sort"].dtype == "category" + assert res["unsort"].dtype == "category" + + # unordered cat, but we allow this + df.sort_values(by=["unsort"], ascending=False) + + # multi-columns sort + # GH#7848 + df = DataFrame( + {"id": [6, 5, 4, 3, 2, 1], "raw_grade": ["a", "b", "b", "a", "a", "e"]} + ) + df["grade"] = Categorical(df["raw_grade"], ordered=True) + df["grade"] = df["grade"].cat.set_categories(["b", "e", "a"]) + + # sorts 'grade' according to the order of the categories + result = df.sort_values(by=["grade"]) + expected = df.iloc[[1, 2, 5, 0, 3, 4]] + tm.assert_frame_equal(result, expected) + + # multi + result = df.sort_values(by=["grade", "id"]) + expected = df.iloc[[2, 1, 5, 4, 3, 0]] + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize("inplace", [True, False]) + @pytest.mark.parametrize( + "original_list, sorted_list, ignore_index, output_index", + [ + ([2, 3, 6, 1], [6, 3, 2, 1], True, [0, 1, 2, 3]), + ([2, 3, 6, 1], [6, 3, 2, 1], False, [2, 1, 0, 3]), + ], + ) + def test_sort_values_ignore_index( + self, inplace, original_list, sorted_list, ignore_index, output_index + ): + # GH 30114 + ser = Series(original_list) + expected = Series(sorted_list, index=output_index) + kwargs = {"ignore_index": ignore_index, "inplace": inplace} + + if inplace: + result_ser = ser.copy() + result_ser.sort_values(ascending=False, **kwargs) + else: + result_ser = ser.sort_values(ascending=False, **kwargs) + + tm.assert_series_equal(result_ser, expected) + tm.assert_series_equal(ser, Series(original_list)) + + def test_mergesort_descending_stability(self): + # GH 28697 + s = Series([1, 2, 1, 3], ["first", "b", "second", "c"]) + result = s.sort_values(ascending=False, kind="mergesort") + expected = Series([3, 2, 1, 1], ["c", "b", "first", "second"]) + tm.assert_series_equal(result, expected) + + def test_sort_values_validate_ascending_for_value_error(self): + # GH41634 + ser = Series([23, 7, 21]) + + msg = 'For argument "ascending" expected type bool, received type str.' + with pytest.raises(ValueError, match=msg): + ser.sort_values(ascending="False") + + @pytest.mark.parametrize("ascending", [False, 0, 1, True]) + def test_sort_values_validate_ascending_functional(self, ascending): + # GH41634 + ser = Series([23, 7, 21]) + expected = np.sort(ser.values) + + sorted_ser = ser.sort_values(ascending=ascending) + if not ascending: + expected = expected[::-1] + + result = sorted_ser.values + tm.assert_numpy_array_equal(result, expected) + + +class TestSeriesSortingKey: + def test_sort_values_key(self): + series = Series(np.array(["Hello", "goodbye"])) + + result = series.sort_values(axis=0) + expected = series + tm.assert_series_equal(result, expected) + + result = series.sort_values(axis=0, key=lambda x: x.str.lower()) + expected = series[::-1] + tm.assert_series_equal(result, expected) + + def test_sort_values_key_nan(self): + series = Series(np.array([0, 5, np.nan, 3, 2, np.nan])) + + result = series.sort_values(axis=0) + expected = series.iloc[[0, 4, 3, 1, 2, 5]] + tm.assert_series_equal(result, expected) + + result = series.sort_values(axis=0, key=lambda x: x + 5) + expected = series.iloc[[0, 4, 3, 1, 2, 5]] + tm.assert_series_equal(result, expected) + + result = series.sort_values(axis=0, key=lambda x: -x, ascending=False) + expected = series.iloc[[0, 4, 3, 1, 2, 5]] + tm.assert_series_equal(result, expected) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/series/methods/test_to_dict.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/series/methods/test_to_dict.py new file mode 100644 index 0000000000000000000000000000000000000000..41c01f4537f23fdb90110685d2e774427916aee1 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/series/methods/test_to_dict.py @@ -0,0 +1,38 @@ +import collections + +import numpy as np +import pytest + +from pandas import Series +import pandas._testing as tm + + +class TestSeriesToDict: + @pytest.mark.parametrize( + "mapping", (dict, collections.defaultdict(list), collections.OrderedDict) + ) + def test_to_dict(self, mapping, datetime_series): + # GH#16122 + result = Series(datetime_series.to_dict(into=mapping), name="ts") + expected = datetime_series.copy() + expected.index = expected.index._with_freq(None) + tm.assert_series_equal(result, expected) + + from_method = Series(datetime_series.to_dict(into=collections.Counter)) + from_constructor = Series(collections.Counter(datetime_series.items())) + tm.assert_series_equal(from_method, from_constructor) + + @pytest.mark.parametrize( + "input", + ( + {"a": np.int64(64), "b": 10}, + {"a": np.int64(64), "b": 10, "c": "ABC"}, + {"a": np.uint64(64), "b": 10, "c": "ABC"}, + ), + ) + def test_to_dict_return_types(self, input): + # GH25969 + + d = Series(input).to_dict() + assert isinstance(d["a"], int) + assert isinstance(d["b"], int) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/series/methods/test_to_frame.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/series/methods/test_to_frame.py new file mode 100644 index 0000000000000000000000000000000000000000..0eadf696b34cc034d2be76ce5daba2cff679da74 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/series/methods/test_to_frame.py @@ -0,0 +1,63 @@ +import pytest + +from pandas import ( + DataFrame, + Index, + Series, +) +import pandas._testing as tm + + +class TestToFrame: + def test_to_frame_respects_name_none(self): + # GH#44212 if we explicitly pass name=None, then that should be respected, + # not changed to 0 + # GH-45448 this is first deprecated & enforced in 2.0 + ser = Series(range(3)) + result = ser.to_frame(None) + + exp_index = Index([None], dtype=object) + tm.assert_index_equal(result.columns, exp_index) + + result = ser.rename("foo").to_frame(None) + exp_index = Index([None], dtype=object) + tm.assert_index_equal(result.columns, exp_index) + + def test_to_frame(self, datetime_series): + datetime_series.name = None + rs = datetime_series.to_frame() + xp = DataFrame(datetime_series.values, index=datetime_series.index) + tm.assert_frame_equal(rs, xp) + + datetime_series.name = "testname" + rs = datetime_series.to_frame() + xp = DataFrame( + {"testname": datetime_series.values}, index=datetime_series.index + ) + tm.assert_frame_equal(rs, xp) + + rs = datetime_series.to_frame(name="testdifferent") + xp = DataFrame( + {"testdifferent": datetime_series.values}, index=datetime_series.index + ) + tm.assert_frame_equal(rs, xp) + + @pytest.mark.filterwarnings( + "ignore:Passing a BlockManager|Passing a SingleBlockManager:DeprecationWarning" + ) + def test_to_frame_expanddim(self): + # GH#9762 + + class SubclassedSeries(Series): + @property + def _constructor_expanddim(self): + return SubclassedFrame + + class SubclassedFrame(DataFrame): + pass + + ser = SubclassedSeries([1, 2, 3], name="X") + result = ser.to_frame() + assert isinstance(result, SubclassedFrame) + expected = SubclassedFrame({"X": [1, 2, 3]}) + tm.assert_frame_equal(result, expected) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/series/methods/test_unique.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/series/methods/test_unique.py new file mode 100644 index 0000000000000000000000000000000000000000..14e247332503253ede23e94fe1335e712a2675e7 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/series/methods/test_unique.py @@ -0,0 +1,76 @@ +import numpy as np + +from pandas import ( + Categorical, + IntervalIndex, + Series, + date_range, +) +import pandas._testing as tm + + +class TestUnique: + def test_unique_uint64(self): + ser = Series([1, 2, 2**63, 2**63], dtype=np.uint64) + res = ser.unique() + exp = np.array([1, 2, 2**63], dtype=np.uint64) + tm.assert_numpy_array_equal(res, exp) + + def test_unique_data_ownership(self): + # it works! GH#1807 + Series(Series(["a", "c", "b"]).unique()).sort_values() + + def test_unique(self): + # GH#714 also, dtype=float + ser = Series([1.2345] * 100) + ser[::2] = np.nan + result = ser.unique() + assert len(result) == 2 + + # explicit f4 dtype + ser = Series([1.2345] * 100, dtype="f4") + ser[::2] = np.nan + result = ser.unique() + assert len(result) == 2 + + def test_unique_nan_object_dtype(self): + # NAs in object arrays GH#714 + ser = Series(["foo"] * 100, dtype="O") + ser[::2] = np.nan + result = ser.unique() + assert len(result) == 2 + + def test_unique_none(self): + # decision about None + ser = Series([1, 2, 3, None, None, None], dtype=object) + result = ser.unique() + expected = np.array([1, 2, 3, None], dtype=object) + tm.assert_numpy_array_equal(result, expected) + + def test_unique_categorical(self): + # GH#18051 + cat = Categorical([]) + ser = Series(cat) + result = ser.unique() + tm.assert_categorical_equal(result, cat) + + cat = Categorical([np.nan]) + ser = Series(cat) + result = ser.unique() + tm.assert_categorical_equal(result, cat) + + def test_tz_unique(self): + # GH 46128 + dti1 = date_range("2016-01-01", periods=3) + ii1 = IntervalIndex.from_breaks(dti1) + ser1 = Series(ii1) + uni1 = ser1.unique() + tm.assert_interval_array_equal(ser1.array, uni1) + + dti2 = date_range("2016-01-01", periods=3, tz="US/Eastern") + ii2 = IntervalIndex.from_breaks(dti2) + ser2 = Series(ii2) + uni2 = ser2.unique() + tm.assert_interval_array_equal(ser2.array, uni2) + + assert uni1.dtype != uni2.dtype diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/series/methods/test_unstack.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/series/methods/test_unstack.py new file mode 100644 index 0000000000000000000000000000000000000000..3c70e839c8e206c0b0e07a1046132666643329aa --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/series/methods/test_unstack.py @@ -0,0 +1,169 @@ +import numpy as np +import pytest + +import pandas as pd +from pandas import ( + DataFrame, + Index, + MultiIndex, + Series, + date_range, +) +import pandas._testing as tm + + +def test_unstack_preserves_object(): + mi = MultiIndex.from_product([["bar", "foo"], ["one", "two"]]) + + ser = Series(np.arange(4.0), index=mi, dtype=object) + + res1 = ser.unstack() + assert (res1.dtypes == object).all() + + res2 = ser.unstack(level=0) + assert (res2.dtypes == object).all() + + +def test_unstack(): + index = MultiIndex( + levels=[["bar", "foo"], ["one", "three", "two"]], + codes=[[1, 1, 0, 0], [0, 1, 0, 2]], + ) + + s = Series(np.arange(4.0), index=index) + unstacked = s.unstack() + + expected = DataFrame( + [[2.0, np.nan, 3.0], [0.0, 1.0, np.nan]], + index=["bar", "foo"], + columns=["one", "three", "two"], + ) + + tm.assert_frame_equal(unstacked, expected) + + unstacked = s.unstack(level=0) + tm.assert_frame_equal(unstacked, expected.T) + + index = MultiIndex( + levels=[["bar"], ["one", "two", "three"], [0, 1]], + codes=[[0, 0, 0, 0, 0, 0], [0, 1, 2, 0, 1, 2], [0, 1, 0, 1, 0, 1]], + ) + s = Series(np.random.default_rng(2).standard_normal(6), index=index) + exp_index = MultiIndex( + levels=[["one", "two", "three"], [0, 1]], + codes=[[0, 1, 2, 0, 1, 2], [0, 1, 0, 1, 0, 1]], + ) + expected = DataFrame({"bar": s.values}, index=exp_index).sort_index(level=0) + unstacked = s.unstack(0).sort_index() + tm.assert_frame_equal(unstacked, expected) + + # GH5873 + idx = MultiIndex.from_arrays([[101, 102], [3.5, np.nan]]) + ts = Series([1, 2], index=idx) + left = ts.unstack() + right = DataFrame( + [[np.nan, 1], [2, np.nan]], index=[101, 102], columns=[np.nan, 3.5] + ) + tm.assert_frame_equal(left, right) + + idx = MultiIndex.from_arrays( + [ + ["cat", "cat", "cat", "dog", "dog"], + ["a", "a", "b", "a", "b"], + [1, 2, 1, 1, np.nan], + ] + ) + ts = Series([1.0, 1.1, 1.2, 1.3, 1.4], index=idx) + right = DataFrame( + [[1.0, 1.3], [1.1, np.nan], [np.nan, 1.4], [1.2, np.nan]], + columns=["cat", "dog"], + ) + tpls = [("a", 1), ("a", 2), ("b", np.nan), ("b", 1)] + right.index = MultiIndex.from_tuples(tpls) + tm.assert_frame_equal(ts.unstack(level=0), right) + + +def test_unstack_tuplename_in_multiindex(): + # GH 19966 + idx = MultiIndex.from_product( + [["a", "b", "c"], [1, 2, 3]], names=[("A", "a"), ("B", "b")] + ) + ser = Series(1, index=idx) + result = ser.unstack(("A", "a")) + + expected = DataFrame( + [[1, 1, 1], [1, 1, 1], [1, 1, 1]], + columns=MultiIndex.from_tuples([("a",), ("b",), ("c",)], names=[("A", "a")]), + index=Index([1, 2, 3], name=("B", "b")), + ) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize( + "unstack_idx, expected_values, expected_index, expected_columns", + [ + ( + ("A", "a"), + [[1, 1], [1, 1], [1, 1], [1, 1]], + MultiIndex.from_tuples([(1, 3), (1, 4), (2, 3), (2, 4)], names=["B", "C"]), + MultiIndex.from_tuples([("a",), ("b",)], names=[("A", "a")]), + ), + ( + (("A", "a"), "B"), + [[1, 1, 1, 1], [1, 1, 1, 1]], + Index([3, 4], name="C"), + MultiIndex.from_tuples( + [("a", 1), ("a", 2), ("b", 1), ("b", 2)], names=[("A", "a"), "B"] + ), + ), + ], +) +def test_unstack_mixed_type_name_in_multiindex( + unstack_idx, expected_values, expected_index, expected_columns +): + # GH 19966 + idx = MultiIndex.from_product( + [["a", "b"], [1, 2], [3, 4]], names=[("A", "a"), "B", "C"] + ) + ser = Series(1, index=idx) + result = ser.unstack(unstack_idx) + + expected = DataFrame( + expected_values, columns=expected_columns, index=expected_index + ) + tm.assert_frame_equal(result, expected) + + +def test_unstack_multi_index_categorical_values(): + df = DataFrame( + np.random.default_rng(2).standard_normal((10, 4)), + columns=Index(list("ABCD"), dtype=object), + index=date_range("2000-01-01", periods=10, freq="B"), + ) + mi = df.stack(future_stack=True).index.rename(["major", "minor"]) + ser = Series(["foo"] * len(mi), index=mi, name="category", dtype="category") + + result = ser.unstack() + + dti = ser.index.levels[0] + c = pd.Categorical(["foo"] * len(dti)) + expected = DataFrame( + {"A": c.copy(), "B": c.copy(), "C": c.copy(), "D": c.copy()}, + columns=Index(list("ABCD"), name="minor"), + index=dti.rename("major"), + ) + tm.assert_frame_equal(result, expected) + + +def test_unstack_mixed_level_names(): + # GH#48763 + arrays = [["a", "a"], [1, 2], ["red", "blue"]] + idx = MultiIndex.from_arrays(arrays, names=("x", 0, "y")) + ser = Series([1, 2], index=idx) + result = ser.unstack("x") + expected = DataFrame( + [[1], [2]], + columns=Index(["a"], name="x"), + index=MultiIndex.from_tuples([(1, "red"), (2, "blue")], names=[0, "y"]), + ) + tm.assert_frame_equal(result, expected) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/series/methods/test_view.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/series/methods/test_view.py new file mode 100644 index 0000000000000000000000000000000000000000..7e0ac372cd443cf9a468b469df1c22818f10aad2 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/series/methods/test_view.py @@ -0,0 +1,61 @@ +import numpy as np +import pytest + +from pandas import ( + Index, + Series, + array, + date_range, +) +import pandas._testing as tm + +pytestmark = pytest.mark.filterwarnings( + "ignore:Series.view is deprecated and will be removed in a future version.:FutureWarning" # noqa: E501 +) + + +class TestView: + def test_view_i8_to_datetimelike(self): + dti = date_range("2000", periods=4, tz="US/Central") + ser = Series(dti.asi8) + + result = ser.view(dti.dtype) + tm.assert_datetime_array_equal(result._values, dti._data._with_freq(None)) + + pi = dti.tz_localize(None).to_period("D") + ser = Series(pi.asi8) + result = ser.view(pi.dtype) + tm.assert_period_array_equal(result._values, pi._data) + + def test_view_tz(self): + # GH#24024 + ser = Series(date_range("2000", periods=4, tz="US/Central")) + result = ser.view("i8") + expected = Series( + [ + 946706400000000000, + 946792800000000000, + 946879200000000000, + 946965600000000000, + ] + ) + tm.assert_series_equal(result, expected) + + @pytest.mark.parametrize( + "first", ["m8[ns]", "M8[ns]", "M8[ns, US/Central]", "period[D]"] + ) + @pytest.mark.parametrize( + "second", ["m8[ns]", "M8[ns]", "M8[ns, US/Central]", "period[D]"] + ) + @pytest.mark.parametrize("box", [Series, Index, array]) + def test_view_between_datetimelike(self, first, second, box): + dti = date_range("2016-01-01", periods=3) + + orig = box(dti) + obj = orig.view(first) + assert obj.dtype == first + tm.assert_numpy_array_equal(np.asarray(obj.view("i8")), dti.asi8) + + res = obj.view(second) + assert res.dtype == second + tm.assert_numpy_array_equal(np.asarray(obj.view("i8")), dti.asi8) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/strings/__pycache__/test_get_dummies.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/strings/__pycache__/test_get_dummies.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..290d1b551a2fc1b37466e6428ac64ed92e7bbf23 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/strings/__pycache__/test_get_dummies.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/tslibs/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/tslibs/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..347f88e6052c84c0ccad3b79d97775ec4f6aa96a Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/tslibs/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/tslibs/__pycache__/test_api.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/tslibs/__pycache__/test_api.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..da956cc072613a768f191124a2632c745a174cc3 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/tslibs/__pycache__/test_api.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/tslibs/__pycache__/test_array_to_datetime.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/tslibs/__pycache__/test_array_to_datetime.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..5504077bb0ee9829bb4a2c35efca7eab22e859ac Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/tslibs/__pycache__/test_array_to_datetime.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/tslibs/__pycache__/test_ccalendar.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/tslibs/__pycache__/test_ccalendar.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d2ce2ba8011196de7b3e60888743b9d189878402 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/tslibs/__pycache__/test_ccalendar.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/tslibs/__pycache__/test_conversion.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/tslibs/__pycache__/test_conversion.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..84a4c6300832cd39b60444d3df7ad5abb58c3012 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/tslibs/__pycache__/test_conversion.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/tslibs/__pycache__/test_fields.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/tslibs/__pycache__/test_fields.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d0e1621c71849edb4ed04ab4666a3bc2bb89a986 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/tslibs/__pycache__/test_fields.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/tslibs/__pycache__/test_libfrequencies.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/tslibs/__pycache__/test_libfrequencies.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..2b250ecbcf1ff846118daa50e72e579434909d6a Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/tslibs/__pycache__/test_libfrequencies.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/tslibs/__pycache__/test_liboffsets.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/tslibs/__pycache__/test_liboffsets.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..55a1d9279d93dd9ea0ef76b7caeb5620ccda4c10 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/tslibs/__pycache__/test_liboffsets.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/tslibs/__pycache__/test_np_datetime.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/tslibs/__pycache__/test_np_datetime.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..11254613c33b1dbef3f17c520aa2ab5d0c1076da Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/tslibs/__pycache__/test_np_datetime.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/tslibs/__pycache__/test_npy_units.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/tslibs/__pycache__/test_npy_units.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..198cf0a4fcba31d3aab856117bada24200ace0f5 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/tslibs/__pycache__/test_npy_units.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/tslibs/__pycache__/test_parse_iso8601.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/tslibs/__pycache__/test_parse_iso8601.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..3f2e1802669473a0bf3130bcae6dc9eb93f9dcdd Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/tslibs/__pycache__/test_parse_iso8601.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/tslibs/__pycache__/test_parsing.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/tslibs/__pycache__/test_parsing.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..af40c9324de1b1d49e3bb23ca01f52ee19721ab8 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/tslibs/__pycache__/test_parsing.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/tslibs/__pycache__/test_period.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/tslibs/__pycache__/test_period.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e69abd0c429091de73cfcf54904ed5334a1486d5 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/tslibs/__pycache__/test_period.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/tslibs/__pycache__/test_resolution.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/tslibs/__pycache__/test_resolution.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..287052cf5814480559f9d83f89f6321be8bfe799 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/tslibs/__pycache__/test_resolution.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/tslibs/__pycache__/test_strptime.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/tslibs/__pycache__/test_strptime.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..29d78f4fe75cd2b9b30910f114e5046aff7399d2 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/tslibs/__pycache__/test_strptime.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/tslibs/__pycache__/test_timedeltas.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/tslibs/__pycache__/test_timedeltas.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..630d9d30fec2fcae2c7fde939e82fc2675e0ad1c Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/tslibs/__pycache__/test_timedeltas.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/tslibs/__pycache__/test_timezones.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/tslibs/__pycache__/test_timezones.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a16588bcd0bb9a27fef189f0a25f4f3dc0b29fca Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/tslibs/__pycache__/test_timezones.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/tslibs/__pycache__/test_to_offset.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/tslibs/__pycache__/test_to_offset.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..20dc074ca2ff5a827e12fa1830fed6f03eb7e3c8 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/tslibs/__pycache__/test_to_offset.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/tslibs/__pycache__/test_tzconversion.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/tslibs/__pycache__/test_tzconversion.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b24db7b96a9ddd5439967f009f52987e960c1daa Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tests/tslibs/__pycache__/test_tzconversion.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tseries/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tseries/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..380e17afc3aecb54bde4ef7edb6a5e511bf1cd9c Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tseries/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tseries/__pycache__/api.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tseries/__pycache__/api.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b3b83bc3044b203f3829d8a355ddf44f8c66f4dc Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tseries/__pycache__/api.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tseries/__pycache__/frequencies.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tseries/__pycache__/frequencies.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ec054b9eff65fc54f2baf4a6951a2b5a9b33ad84 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tseries/__pycache__/frequencies.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tseries/__pycache__/holiday.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tseries/__pycache__/holiday.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1eaccb807558b30ae0a25912f695ee421679f622 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tseries/__pycache__/holiday.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tseries/__pycache__/offsets.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tseries/__pycache__/offsets.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c3e4043dad556bd37a357f44c0f555d4e110895c Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pandas/tseries/__pycache__/offsets.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pathvalidate/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pathvalidate/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..450c79ba47f7159e4b0018e144ba422ce274c073 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pathvalidate/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pathvalidate/__pycache__/__version__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pathvalidate/__pycache__/__version__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..2f9d1150960aff21f35f50d2184351095b3a00e4 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pathvalidate/__pycache__/__version__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pathvalidate/__pycache__/_base.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pathvalidate/__pycache__/_base.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..309e575d26af208a0e245dbe24d312bf48833a90 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pathvalidate/__pycache__/_base.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pathvalidate/__pycache__/_common.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pathvalidate/__pycache__/_common.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..2ad695939de4895a7626f6c6858a947b1a817955 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pathvalidate/__pycache__/_common.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pathvalidate/__pycache__/_const.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pathvalidate/__pycache__/_const.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..2c4768dbb1105145b42422ca6cf95c59ab8c8476 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pathvalidate/__pycache__/_const.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pathvalidate/__pycache__/_filename.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pathvalidate/__pycache__/_filename.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..392e9f00ed697bcfd73f1bb84639be0e6efb19dc Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pathvalidate/__pycache__/_filename.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pathvalidate/__pycache__/_filepath.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pathvalidate/__pycache__/_filepath.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..21f7e8f326ccd5aa8f46000f3fbf61be466de297 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pathvalidate/__pycache__/_filepath.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pathvalidate/__pycache__/_ltsv.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pathvalidate/__pycache__/_ltsv.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ac9e16d1aafb5dd972cc07c4d100011f6ef09b2f Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pathvalidate/__pycache__/_ltsv.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pathvalidate/__pycache__/_symbol.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pathvalidate/__pycache__/_symbol.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a9d295cec8d655a3ab1726f6108a097d51808535 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pathvalidate/__pycache__/_symbol.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pathvalidate/__pycache__/_types.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pathvalidate/__pycache__/_types.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..aab66dd1ac20bdd0829c2206783cb772b7f9878e Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pathvalidate/__pycache__/_types.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pathvalidate/__pycache__/argparse.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pathvalidate/__pycache__/argparse.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b4bbba119bf5c0fadc3b896a8619622dd005b74b Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pathvalidate/__pycache__/argparse.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pathvalidate/__pycache__/click.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pathvalidate/__pycache__/click.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..edb9fc8d5e83858da8ad71deb9ee783b1fea534c Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pathvalidate/__pycache__/click.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pathvalidate/__pycache__/error.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pathvalidate/__pycache__/error.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a38da1f0ea0a175e080976851ce8e5322d57403f Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pathvalidate/__pycache__/error.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pathvalidate/__pycache__/handler.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pathvalidate/__pycache__/handler.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f77afd51ab0233703f10124be17ed9af32827787 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/pathvalidate/__pycache__/handler.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/setuptools/_distutils/__init__.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/setuptools/_distutils/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e374d5c5604d0eeed555e68b7523bfac72fd9101 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/setuptools/_distutils/__init__.py @@ -0,0 +1,14 @@ +import importlib +import sys + +__version__, _, _ = sys.version.partition(' ') + + +try: + # Allow Debian and pkgsrc (only) to customize system + # behavior. Ref pypa/distutils#2 and pypa/distutils#16. + # This hook is deprecated and no other environments + # should use it. + importlib.import_module('_distutils_system_mod') +except ImportError: + pass diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/setuptools/_distutils/_log.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/setuptools/_distutils/_log.py new file mode 100644 index 0000000000000000000000000000000000000000..0148f157ff3bf8bd728c82f49cd3d1cd9cb5a43e --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/setuptools/_distutils/_log.py @@ -0,0 +1,3 @@ +import logging + +log = logging.getLogger() diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/setuptools/_distutils/_macos_compat.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/setuptools/_distutils/_macos_compat.py new file mode 100644 index 0000000000000000000000000000000000000000..76ecb96abe478313ce7d09342b2643bf4a765331 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/setuptools/_distutils/_macos_compat.py @@ -0,0 +1,12 @@ +import importlib +import sys + + +def bypass_compiler_fixup(cmd, args): + return cmd + + +if sys.platform == 'darwin': + compiler_fixup = importlib.import_module('_osx_support').compiler_fixup +else: + compiler_fixup = bypass_compiler_fixup diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/setuptools/_distutils/_modified.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/setuptools/_distutils/_modified.py new file mode 100644 index 0000000000000000000000000000000000000000..7cdca9398f487bf405e0db0272f8aa9672d30b5e --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/setuptools/_distutils/_modified.py @@ -0,0 +1,73 @@ +"""Timestamp comparison of files and groups of files.""" + +import functools +import os.path + +from jaraco.functools import splat + +from .compat.py39 import zip_strict +from .errors import DistutilsFileError + + +def _newer(source, target): + return not os.path.exists(target) or ( + os.path.getmtime(source) > os.path.getmtime(target) + ) + + +def newer(source, target): + """ + Is source modified more recently than target. + + Returns True if 'source' is modified more recently than + 'target' or if 'target' does not exist. + + Raises DistutilsFileError if 'source' does not exist. + """ + if not os.path.exists(source): + raise DistutilsFileError(f"file '{os.path.abspath(source)}' does not exist") + + return _newer(source, target) + + +def newer_pairwise(sources, targets, newer=newer): + """ + Filter filenames where sources are newer than targets. + + Walk two filename iterables in parallel, testing if each source is newer + than its corresponding target. Returns a pair of lists (sources, + targets) where source is newer than target, according to the semantics + of 'newer()'. + """ + newer_pairs = filter(splat(newer), zip_strict(sources, targets)) + return tuple(map(list, zip(*newer_pairs))) or ([], []) + + +def newer_group(sources, target, missing='error'): + """ + Is target out-of-date with respect to any file in sources. + + Return True if 'target' is out-of-date with respect to any file + listed in 'sources'. In other words, if 'target' exists and is newer + than every file in 'sources', return False; otherwise return True. + ``missing`` controls how to handle a missing source file: + + - error (default): allow the ``stat()`` call to fail. + - ignore: silently disregard any missing source files. + - newer: treat missing source files as "target out of date". This + mode is handy in "dry-run" mode: it will pretend to carry out + commands that wouldn't work because inputs are missing, but + that doesn't matter because dry-run won't run the commands. + """ + + def missing_as_newer(source): + return missing == 'newer' and not os.path.exists(source) + + ignored = os.path.exists if missing == 'ignore' else None + return not os.path.exists(target) or any( + missing_as_newer(source) or _newer(source, target) + for source in filter(ignored, sources) + ) + + +newer_pairwise_group = functools.partial(newer_pairwise, newer=newer_group) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/setuptools/_distutils/_msvccompiler.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/setuptools/_distutils/_msvccompiler.py new file mode 100644 index 0000000000000000000000000000000000000000..97b067c68611694e2c291eb77c156c016285ec64 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/setuptools/_distutils/_msvccompiler.py @@ -0,0 +1,604 @@ +"""distutils._msvccompiler + +Contains MSVCCompiler, an implementation of the abstract CCompiler class +for Microsoft Visual Studio 2015. + +This module requires VS 2015 or later. +""" + +# Written by Perry Stoll +# hacked by Robin Becker and Thomas Heller to do a better job of +# finding DevStudio (through the registry) +# ported to VS 2005 and VS 2008 by Christian Heimes +# ported to VS 2015 by Steve Dower + +import contextlib +import os +import subprocess +import unittest.mock as mock +import warnings + +with contextlib.suppress(ImportError): + import winreg + +from itertools import count + +from ._log import log +from .ccompiler import CCompiler, gen_lib_options +from .errors import ( + CompileError, + DistutilsExecError, + DistutilsPlatformError, + LibError, + LinkError, +) +from .util import get_host_platform, get_platform + + +def _find_vc2015(): + try: + key = winreg.OpenKeyEx( + winreg.HKEY_LOCAL_MACHINE, + r"Software\Microsoft\VisualStudio\SxS\VC7", + access=winreg.KEY_READ | winreg.KEY_WOW64_32KEY, + ) + except OSError: + log.debug("Visual C++ is not registered") + return None, None + + best_version = 0 + best_dir = None + with key: + for i in count(): + try: + v, vc_dir, vt = winreg.EnumValue(key, i) + except OSError: + break + if v and vt == winreg.REG_SZ and os.path.isdir(vc_dir): + try: + version = int(float(v)) + except (ValueError, TypeError): + continue + if version >= 14 and version > best_version: + best_version, best_dir = version, vc_dir + return best_version, best_dir + + +def _find_vc2017(): + """Returns "15, path" based on the result of invoking vswhere.exe + If no install is found, returns "None, None" + + The version is returned to avoid unnecessarily changing the function + result. It may be ignored when the path is not None. + + If vswhere.exe is not available, by definition, VS 2017 is not + installed. + """ + root = os.environ.get("ProgramFiles(x86)") or os.environ.get("ProgramFiles") + if not root: + return None, None + + variant = 'arm64' if get_platform() == 'win-arm64' else 'x86.x64' + suitable_components = ( + f"Microsoft.VisualStudio.Component.VC.Tools.{variant}", + "Microsoft.VisualStudio.Workload.WDExpress", + ) + + for component in suitable_components: + # Workaround for `-requiresAny` (only available on VS 2017 > 15.6) + with contextlib.suppress( + subprocess.CalledProcessError, OSError, UnicodeDecodeError + ): + path = ( + subprocess.check_output([ + os.path.join( + root, "Microsoft Visual Studio", "Installer", "vswhere.exe" + ), + "-latest", + "-prerelease", + "-requires", + component, + "-property", + "installationPath", + "-products", + "*", + ]) + .decode(encoding="mbcs", errors="strict") + .strip() + ) + + path = os.path.join(path, "VC", "Auxiliary", "Build") + if os.path.isdir(path): + return 15, path + + return None, None # no suitable component found + + +PLAT_SPEC_TO_RUNTIME = { + 'x86': 'x86', + 'x86_amd64': 'x64', + 'x86_arm': 'arm', + 'x86_arm64': 'arm64', +} + + +def _find_vcvarsall(plat_spec): + # bpo-38597: Removed vcruntime return value + _, best_dir = _find_vc2017() + + if not best_dir: + best_version, best_dir = _find_vc2015() + + if not best_dir: + log.debug("No suitable Visual C++ version found") + return None, None + + vcvarsall = os.path.join(best_dir, "vcvarsall.bat") + if not os.path.isfile(vcvarsall): + log.debug("%s cannot be found", vcvarsall) + return None, None + + return vcvarsall, None + + +def _get_vc_env(plat_spec): + if os.getenv("DISTUTILS_USE_SDK"): + return {key.lower(): value for key, value in os.environ.items()} + + vcvarsall, _ = _find_vcvarsall(plat_spec) + if not vcvarsall: + raise DistutilsPlatformError( + 'Microsoft Visual C++ 14.0 or greater is required. ' + 'Get it with "Microsoft C++ Build Tools": ' + 'https://visualstudio.microsoft.com/visual-cpp-build-tools/' + ) + + try: + out = subprocess.check_output( + f'cmd /u /c "{vcvarsall}" {plat_spec} && set', + stderr=subprocess.STDOUT, + ).decode('utf-16le', errors='replace') + except subprocess.CalledProcessError as exc: + log.error(exc.output) + raise DistutilsPlatformError(f"Error executing {exc.cmd}") + + env = { + key.lower(): value + for key, _, value in (line.partition('=') for line in out.splitlines()) + if key and value + } + + return env + + +def _find_exe(exe, paths=None): + """Return path to an MSVC executable program. + + Tries to find the program in several places: first, one of the + MSVC program search paths from the registry; next, the directories + in the PATH environment variable. If any of those work, return an + absolute path that is known to exist. If none of them work, just + return the original program name, 'exe'. + """ + if not paths: + paths = os.getenv('path').split(os.pathsep) + for p in paths: + fn = os.path.join(os.path.abspath(p), exe) + if os.path.isfile(fn): + return fn + return exe + + +_vcvars_names = { + 'win32': 'x86', + 'win-amd64': 'amd64', + 'win-arm32': 'arm', + 'win-arm64': 'arm64', +} + + +def _get_vcvars_spec(host_platform, platform): + """ + Given a host platform and platform, determine the spec for vcvarsall. + + Uses the native MSVC host if the host platform would need expensive + emulation for x86. + + >>> _get_vcvars_spec('win-arm64', 'win32') + 'arm64_x86' + >>> _get_vcvars_spec('win-arm64', 'win-amd64') + 'arm64_amd64' + + Otherwise, always cross-compile from x86 to work with the + lighter-weight MSVC installs that do not include native 64-bit tools. + + >>> _get_vcvars_spec('win32', 'win32') + 'x86' + >>> _get_vcvars_spec('win-arm32', 'win-arm32') + 'x86_arm' + >>> _get_vcvars_spec('win-amd64', 'win-arm64') + 'x86_arm64' + """ + if host_platform != 'win-arm64': + host_platform = 'win32' + vc_hp = _vcvars_names[host_platform] + vc_plat = _vcvars_names[platform] + return vc_hp if vc_hp == vc_plat else f'{vc_hp}_{vc_plat}' + + +class MSVCCompiler(CCompiler): + """Concrete class that implements an interface to Microsoft Visual C++, + as defined by the CCompiler abstract class.""" + + compiler_type = 'msvc' + + # Just set this so CCompiler's constructor doesn't barf. We currently + # don't use the 'set_executables()' bureaucracy provided by CCompiler, + # as it really isn't necessary for this sort of single-compiler class. + # Would be nice to have a consistent interface with UnixCCompiler, + # though, so it's worth thinking about. + executables = {} + + # Private class data (need to distinguish C from C++ source for compiler) + _c_extensions = ['.c'] + _cpp_extensions = ['.cc', '.cpp', '.cxx'] + _rc_extensions = ['.rc'] + _mc_extensions = ['.mc'] + + # Needed for the filename generation methods provided by the + # base class, CCompiler. + src_extensions = _c_extensions + _cpp_extensions + _rc_extensions + _mc_extensions + res_extension = '.res' + obj_extension = '.obj' + static_lib_extension = '.lib' + shared_lib_extension = '.dll' + static_lib_format = shared_lib_format = '%s%s' + exe_extension = '.exe' + + def __init__(self, verbose=False, dry_run=False, force=False): + super().__init__(verbose, dry_run, force) + # target platform (.plat_name is consistent with 'bdist') + self.plat_name = None + self.initialized = False + + @classmethod + def _configure(cls, vc_env): + """ + Set class-level include/lib dirs. + """ + cls.include_dirs = cls._parse_path(vc_env.get('include', '')) + cls.library_dirs = cls._parse_path(vc_env.get('lib', '')) + + @staticmethod + def _parse_path(val): + return [dir.rstrip(os.sep) for dir in val.split(os.pathsep) if dir] + + def initialize(self, plat_name=None): + # multi-init means we would need to check platform same each time... + assert not self.initialized, "don't init multiple times" + if plat_name is None: + plat_name = get_platform() + # sanity check for platforms to prevent obscure errors later. + if plat_name not in _vcvars_names: + raise DistutilsPlatformError( + f"--plat-name must be one of {tuple(_vcvars_names)}" + ) + + plat_spec = _get_vcvars_spec(get_host_platform(), plat_name) + + vc_env = _get_vc_env(plat_spec) + if not vc_env: + raise DistutilsPlatformError( + "Unable to find a compatible Visual Studio installation." + ) + self._configure(vc_env) + + self._paths = vc_env.get('path', '') + paths = self._paths.split(os.pathsep) + self.cc = _find_exe("cl.exe", paths) + self.linker = _find_exe("link.exe", paths) + self.lib = _find_exe("lib.exe", paths) + self.rc = _find_exe("rc.exe", paths) # resource compiler + self.mc = _find_exe("mc.exe", paths) # message compiler + self.mt = _find_exe("mt.exe", paths) # message compiler + + self.preprocess_options = None + # bpo-38597: Always compile with dynamic linking + # Future releases of Python 3.x will include all past + # versions of vcruntime*.dll for compatibility. + self.compile_options = ['/nologo', '/O2', '/W3', '/GL', '/DNDEBUG', '/MD'] + + self.compile_options_debug = [ + '/nologo', + '/Od', + '/MDd', + '/Zi', + '/W3', + '/D_DEBUG', + ] + + ldflags = ['/nologo', '/INCREMENTAL:NO', '/LTCG'] + + ldflags_debug = ['/nologo', '/INCREMENTAL:NO', '/LTCG', '/DEBUG:FULL'] + + self.ldflags_exe = [*ldflags, '/MANIFEST:EMBED,ID=1'] + self.ldflags_exe_debug = [*ldflags_debug, '/MANIFEST:EMBED,ID=1'] + self.ldflags_shared = [ + *ldflags, + '/DLL', + '/MANIFEST:EMBED,ID=2', + '/MANIFESTUAC:NO', + ] + self.ldflags_shared_debug = [ + *ldflags_debug, + '/DLL', + '/MANIFEST:EMBED,ID=2', + '/MANIFESTUAC:NO', + ] + self.ldflags_static = [*ldflags] + self.ldflags_static_debug = [*ldflags_debug] + + self._ldflags = { + (CCompiler.EXECUTABLE, None): self.ldflags_exe, + (CCompiler.EXECUTABLE, False): self.ldflags_exe, + (CCompiler.EXECUTABLE, True): self.ldflags_exe_debug, + (CCompiler.SHARED_OBJECT, None): self.ldflags_shared, + (CCompiler.SHARED_OBJECT, False): self.ldflags_shared, + (CCompiler.SHARED_OBJECT, True): self.ldflags_shared_debug, + (CCompiler.SHARED_LIBRARY, None): self.ldflags_static, + (CCompiler.SHARED_LIBRARY, False): self.ldflags_static, + (CCompiler.SHARED_LIBRARY, True): self.ldflags_static_debug, + } + + self.initialized = True + + # -- Worker methods ------------------------------------------------ + + @property + def out_extensions(self): + return { + **super().out_extensions, + **{ + ext: self.res_extension + for ext in self._rc_extensions + self._mc_extensions + }, + } + + def compile( # noqa: C901 + self, + sources, + output_dir=None, + macros=None, + include_dirs=None, + debug=False, + extra_preargs=None, + extra_postargs=None, + depends=None, + ): + if not self.initialized: + self.initialize() + compile_info = self._setup_compile( + output_dir, macros, include_dirs, sources, depends, extra_postargs + ) + macros, objects, extra_postargs, pp_opts, build = compile_info + + compile_opts = extra_preargs or [] + compile_opts.append('/c') + if debug: + compile_opts.extend(self.compile_options_debug) + else: + compile_opts.extend(self.compile_options) + + add_cpp_opts = False + + for obj in objects: + try: + src, ext = build[obj] + except KeyError: + continue + if debug: + # pass the full pathname to MSVC in debug mode, + # this allows the debugger to find the source file + # without asking the user to browse for it + src = os.path.abspath(src) + + if ext in self._c_extensions: + input_opt = "/Tc" + src + elif ext in self._cpp_extensions: + input_opt = "/Tp" + src + add_cpp_opts = True + elif ext in self._rc_extensions: + # compile .RC to .RES file + input_opt = src + output_opt = "/fo" + obj + try: + self.spawn([self.rc] + pp_opts + [output_opt, input_opt]) + except DistutilsExecError as msg: + raise CompileError(msg) + continue + elif ext in self._mc_extensions: + # Compile .MC to .RC file to .RES file. + # * '-h dir' specifies the directory for the + # generated include file + # * '-r dir' specifies the target directory of the + # generated RC file and the binary message resource + # it includes + # + # For now (since there are no options to change this), + # we use the source-directory for the include file and + # the build directory for the RC file and message + # resources. This works at least for win32all. + h_dir = os.path.dirname(src) + rc_dir = os.path.dirname(obj) + try: + # first compile .MC to .RC and .H file + self.spawn([self.mc, '-h', h_dir, '-r', rc_dir, src]) + base, _ = os.path.splitext(os.path.basename(src)) + rc_file = os.path.join(rc_dir, base + '.rc') + # then compile .RC to .RES file + self.spawn([self.rc, "/fo" + obj, rc_file]) + + except DistutilsExecError as msg: + raise CompileError(msg) + continue + else: + # how to handle this file? + raise CompileError(f"Don't know how to compile {src} to {obj}") + + args = [self.cc] + compile_opts + pp_opts + if add_cpp_opts: + args.append('/EHsc') + args.extend((input_opt, "/Fo" + obj)) + args.extend(extra_postargs) + + try: + self.spawn(args) + except DistutilsExecError as msg: + raise CompileError(msg) + + return objects + + def create_static_lib( + self, objects, output_libname, output_dir=None, debug=False, target_lang=None + ): + if not self.initialized: + self.initialize() + objects, output_dir = self._fix_object_args(objects, output_dir) + output_filename = self.library_filename(output_libname, output_dir=output_dir) + + if self._need_link(objects, output_filename): + lib_args = objects + ['/OUT:' + output_filename] + if debug: + pass # XXX what goes here? + try: + log.debug('Executing "%s" %s', self.lib, ' '.join(lib_args)) + self.spawn([self.lib] + lib_args) + except DistutilsExecError as msg: + raise LibError(msg) + else: + log.debug("skipping %s (up-to-date)", output_filename) + + def link( + self, + target_desc, + objects, + output_filename, + output_dir=None, + libraries=None, + library_dirs=None, + runtime_library_dirs=None, + export_symbols=None, + debug=False, + extra_preargs=None, + extra_postargs=None, + build_temp=None, + target_lang=None, + ): + if not self.initialized: + self.initialize() + objects, output_dir = self._fix_object_args(objects, output_dir) + fixed_args = self._fix_lib_args(libraries, library_dirs, runtime_library_dirs) + libraries, library_dirs, runtime_library_dirs = fixed_args + + if runtime_library_dirs: + self.warn( + "I don't know what to do with 'runtime_library_dirs': " + + str(runtime_library_dirs) + ) + + lib_opts = gen_lib_options(self, library_dirs, runtime_library_dirs, libraries) + if output_dir is not None: + output_filename = os.path.join(output_dir, output_filename) + + if self._need_link(objects, output_filename): + ldflags = self._ldflags[target_desc, debug] + + export_opts = ["/EXPORT:" + sym for sym in (export_symbols or [])] + + ld_args = ( + ldflags + lib_opts + export_opts + objects + ['/OUT:' + output_filename] + ) + + # The MSVC linker generates .lib and .exp files, which cannot be + # suppressed by any linker switches. The .lib files may even be + # needed! Make sure they are generated in the temporary build + # directory. Since they have different names for debug and release + # builds, they can go into the same directory. + build_temp = os.path.dirname(objects[0]) + if export_symbols is not None: + (dll_name, dll_ext) = os.path.splitext( + os.path.basename(output_filename) + ) + implib_file = os.path.join(build_temp, self.library_filename(dll_name)) + ld_args.append('/IMPLIB:' + implib_file) + + if extra_preargs: + ld_args[:0] = extra_preargs + if extra_postargs: + ld_args.extend(extra_postargs) + + output_dir = os.path.dirname(os.path.abspath(output_filename)) + self.mkpath(output_dir) + try: + log.debug('Executing "%s" %s', self.linker, ' '.join(ld_args)) + self.spawn([self.linker] + ld_args) + except DistutilsExecError as msg: + raise LinkError(msg) + else: + log.debug("skipping %s (up-to-date)", output_filename) + + def spawn(self, cmd): + env = dict(os.environ, PATH=self._paths) + with self._fallback_spawn(cmd, env) as fallback: + return super().spawn(cmd, env=env) + return fallback.value + + @contextlib.contextmanager + def _fallback_spawn(self, cmd, env): + """ + Discovered in pypa/distutils#15, some tools monkeypatch the compiler, + so the 'env' kwarg causes a TypeError. Detect this condition and + restore the legacy, unsafe behavior. + """ + bag = type('Bag', (), {})() + try: + yield bag + except TypeError as exc: + if "unexpected keyword argument 'env'" not in str(exc): + raise + else: + return + warnings.warn("Fallback spawn triggered. Please update distutils monkeypatch.") + with mock.patch.dict('os.environ', env): + bag.value = super().spawn(cmd) + + # -- Miscellaneous methods ----------------------------------------- + # These are all used by the 'gen_lib_options() function, in + # ccompiler.py. + + def library_dir_option(self, dir): + return "/LIBPATH:" + dir + + def runtime_library_dir_option(self, dir): + raise DistutilsPlatformError( + "don't know how to set runtime library search path for MSVC" + ) + + def library_option(self, lib): + return self.library_filename(lib) + + def find_library_file(self, dirs, lib, debug=False): + # Prefer a debugging library if found (and requested), but deal + # with it if we don't have one. + if debug: + try_names = [lib + "_d", lib] + else: + try_names = [lib] + for dir in dirs: + for name in try_names: + libfile = os.path.join(dir, self.library_filename(name)) + if os.path.isfile(libfile): + return libfile + else: + # Oops, didn't find it in *any* of 'dirs' + return None diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/setuptools/_distutils/archive_util.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/setuptools/_distutils/archive_util.py new file mode 100644 index 0000000000000000000000000000000000000000..5bb6df763de0bd7f242e3bb786f6a9cd68bb78fe --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/setuptools/_distutils/archive_util.py @@ -0,0 +1,264 @@ +"""distutils.archive_util + +Utility functions for creating archive files (tarballs, zip files, +that sort of thing).""" + +import os + +try: + import zipfile +except ImportError: + zipfile = None + + +from ._log import log +from .dir_util import mkpath +from .errors import DistutilsExecError +from .spawn import spawn + +try: + from pwd import getpwnam +except ImportError: + getpwnam = None + +try: + from grp import getgrnam +except ImportError: + getgrnam = None + + +def _get_gid(name): + """Returns a gid, given a group name.""" + if getgrnam is None or name is None: + return None + try: + result = getgrnam(name) + except KeyError: + result = None + if result is not None: + return result[2] + return None + + +def _get_uid(name): + """Returns an uid, given a user name.""" + if getpwnam is None or name is None: + return None + try: + result = getpwnam(name) + except KeyError: + result = None + if result is not None: + return result[2] + return None + + +def make_tarball( + base_name, + base_dir, + compress="gzip", + verbose=False, + dry_run=False, + owner=None, + group=None, +): + """Create a (possibly compressed) tar file from all the files under + 'base_dir'. + + 'compress' must be "gzip" (the default), "bzip2", "xz", or None. + + 'owner' and 'group' can be used to define an owner and a group for the + archive that is being built. If not provided, the current owner and group + will be used. + + The output tar file will be named 'base_dir' + ".tar", possibly plus + the appropriate compression extension (".gz", ".bz2", ".xz" or ".Z"). + + Returns the output filename. + """ + tar_compression = { + 'gzip': 'gz', + 'bzip2': 'bz2', + 'xz': 'xz', + None: '', + } + compress_ext = {'gzip': '.gz', 'bzip2': '.bz2', 'xz': '.xz'} + + # flags for compression program, each element of list will be an argument + if compress is not None and compress not in compress_ext.keys(): + raise ValueError( + "bad value for 'compress': must be None, 'gzip', 'bzip2', 'xz'" + ) + + archive_name = base_name + '.tar' + archive_name += compress_ext.get(compress, '') + + mkpath(os.path.dirname(archive_name), dry_run=dry_run) + + # creating the tarball + import tarfile # late import so Python build itself doesn't break + + log.info('Creating tar archive') + + uid = _get_uid(owner) + gid = _get_gid(group) + + def _set_uid_gid(tarinfo): + if gid is not None: + tarinfo.gid = gid + tarinfo.gname = group + if uid is not None: + tarinfo.uid = uid + tarinfo.uname = owner + return tarinfo + + if not dry_run: + tar = tarfile.open(archive_name, f'w|{tar_compression[compress]}') + try: + tar.add(base_dir, filter=_set_uid_gid) + finally: + tar.close() + + return archive_name + + +def make_zipfile(base_name, base_dir, verbose=False, dry_run=False): # noqa: C901 + """Create a zip file from all the files under 'base_dir'. + + The output zip file will be named 'base_name' + ".zip". Uses either the + "zipfile" Python module (if available) or the InfoZIP "zip" utility + (if installed and found on the default search path). If neither tool is + available, raises DistutilsExecError. Returns the name of the output zip + file. + """ + zip_filename = base_name + ".zip" + mkpath(os.path.dirname(zip_filename), dry_run=dry_run) + + # If zipfile module is not available, try spawning an external + # 'zip' command. + if zipfile is None: + if verbose: + zipoptions = "-r" + else: + zipoptions = "-rq" + + try: + spawn(["zip", zipoptions, zip_filename, base_dir], dry_run=dry_run) + except DistutilsExecError: + # XXX really should distinguish between "couldn't find + # external 'zip' command" and "zip failed". + raise DistutilsExecError( + f"unable to create zip file '{zip_filename}': " + "could neither import the 'zipfile' module nor " + "find a standalone zip utility" + ) + + else: + log.info("creating '%s' and adding '%s' to it", zip_filename, base_dir) + + if not dry_run: + try: + zip = zipfile.ZipFile( + zip_filename, "w", compression=zipfile.ZIP_DEFLATED + ) + except RuntimeError: + zip = zipfile.ZipFile(zip_filename, "w", compression=zipfile.ZIP_STORED) + + with zip: + if base_dir != os.curdir: + path = os.path.normpath(os.path.join(base_dir, '')) + zip.write(path, path) + log.info("adding '%s'", path) + for dirpath, dirnames, filenames in os.walk(base_dir): + for name in dirnames: + path = os.path.normpath(os.path.join(dirpath, name, '')) + zip.write(path, path) + log.info("adding '%s'", path) + for name in filenames: + path = os.path.normpath(os.path.join(dirpath, name)) + if os.path.isfile(path): + zip.write(path, path) + log.info("adding '%s'", path) + + return zip_filename + + +ARCHIVE_FORMATS = { + 'gztar': (make_tarball, [('compress', 'gzip')], "gzip'ed tar-file"), + 'bztar': (make_tarball, [('compress', 'bzip2')], "bzip2'ed tar-file"), + 'xztar': (make_tarball, [('compress', 'xz')], "xz'ed tar-file"), + 'ztar': (make_tarball, [('compress', 'compress')], "compressed tar file"), + 'tar': (make_tarball, [('compress', None)], "uncompressed tar file"), + 'zip': (make_zipfile, [], "ZIP file"), +} + + +def check_archive_formats(formats): + """Returns the first format from the 'format' list that is unknown. + + If all formats are known, returns None + """ + for format in formats: + if format not in ARCHIVE_FORMATS: + return format + return None + + +def make_archive( + base_name, + format, + root_dir=None, + base_dir=None, + verbose=False, + dry_run=False, + owner=None, + group=None, +): + """Create an archive file (eg. zip or tar). + + 'base_name' is the name of the file to create, minus any format-specific + extension; 'format' is the archive format: one of "zip", "tar", "gztar", + "bztar", "xztar", or "ztar". + + 'root_dir' is a directory that will be the root directory of the + archive; ie. we typically chdir into 'root_dir' before creating the + archive. 'base_dir' is the directory where we start archiving from; + ie. 'base_dir' will be the common prefix of all files and + directories in the archive. 'root_dir' and 'base_dir' both default + to the current directory. Returns the name of the archive file. + + 'owner' and 'group' are used when creating a tar archive. By default, + uses the current owner and group. + """ + save_cwd = os.getcwd() + if root_dir is not None: + log.debug("changing into '%s'", root_dir) + base_name = os.path.abspath(base_name) + if not dry_run: + os.chdir(root_dir) + + if base_dir is None: + base_dir = os.curdir + + kwargs = {'dry_run': dry_run} + + try: + format_info = ARCHIVE_FORMATS[format] + except KeyError: + raise ValueError(f"unknown archive format '{format}'") + + func = format_info[0] + kwargs.update(format_info[1]) + + if format != 'zip': + kwargs['owner'] = owner + kwargs['group'] = group + + try: + filename = func(base_name, base_dir, **kwargs) + finally: + if root_dir is not None: + log.debug("changing back to '%s'", save_cwd) + os.chdir(save_cwd) + + return filename diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/setuptools/_distutils/ccompiler.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/setuptools/_distutils/ccompiler.py new file mode 100644 index 0000000000000000000000000000000000000000..714f13d8d30060542139ccf94c798e34134c0d98 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/setuptools/_distutils/ccompiler.py @@ -0,0 +1,1263 @@ +"""distutils.ccompiler + +Contains CCompiler, an abstract base class that defines the interface +for the Distutils compiler abstraction model.""" + +import os +import pathlib +import re +import sys +import types +import warnings + +from more_itertools import always_iterable + +from ._log import log +from ._modified import newer_group +from .dir_util import mkpath +from .errors import ( + CompileError, + DistutilsModuleError, + DistutilsPlatformError, + LinkError, + UnknownFileError, +) +from .file_util import move_file +from .spawn import spawn +from .util import execute, is_mingw, split_quoted + + +class CCompiler: + """Abstract base class to define the interface that must be implemented + by real compiler classes. Also has some utility methods used by + several compiler classes. + + The basic idea behind a compiler abstraction class is that each + instance can be used for all the compile/link steps in building a + single project. Thus, attributes common to all of those compile and + link steps -- include directories, macros to define, libraries to link + against, etc. -- are attributes of the compiler instance. To allow for + variability in how individual files are treated, most of those + attributes may be varied on a per-compilation or per-link basis. + """ + + # 'compiler_type' is a class attribute that identifies this class. It + # keeps code that wants to know what kind of compiler it's dealing with + # from having to import all possible compiler classes just to do an + # 'isinstance'. In concrete CCompiler subclasses, 'compiler_type' + # should really, really be one of the keys of the 'compiler_class' + # dictionary (see below -- used by the 'new_compiler()' factory + # function) -- authors of new compiler interface classes are + # responsible for updating 'compiler_class'! + compiler_type = None + + # XXX things not handled by this compiler abstraction model: + # * client can't provide additional options for a compiler, + # e.g. warning, optimization, debugging flags. Perhaps this + # should be the domain of concrete compiler abstraction classes + # (UnixCCompiler, MSVCCompiler, etc.) -- or perhaps the base + # class should have methods for the common ones. + # * can't completely override the include or library searchg + # path, ie. no "cc -I -Idir1 -Idir2" or "cc -L -Ldir1 -Ldir2". + # I'm not sure how widely supported this is even by Unix + # compilers, much less on other platforms. And I'm even less + # sure how useful it is; maybe for cross-compiling, but + # support for that is a ways off. (And anyways, cross + # compilers probably have a dedicated binary with the + # right paths compiled in. I hope.) + # * can't do really freaky things with the library list/library + # dirs, e.g. "-Ldir1 -lfoo -Ldir2 -lfoo" to link against + # different versions of libfoo.a in different locations. I + # think this is useless without the ability to null out the + # library search path anyways. + + # Subclasses that rely on the standard filename generation methods + # implemented below should override these; see the comment near + # those methods ('object_filenames()' et. al.) for details: + src_extensions = None # list of strings + obj_extension = None # string + static_lib_extension = None + shared_lib_extension = None # string + static_lib_format = None # format string + shared_lib_format = None # prob. same as static_lib_format + exe_extension = None # string + + # Default language settings. language_map is used to detect a source + # file or Extension target language, checking source filenames. + # language_order is used to detect the language precedence, when deciding + # what language to use when mixing source types. For example, if some + # extension has two files with ".c" extension, and one with ".cpp", it + # is still linked as c++. + language_map = { + ".c": "c", + ".cc": "c++", + ".cpp": "c++", + ".cxx": "c++", + ".m": "objc", + } + language_order = ["c++", "objc", "c"] + + include_dirs = [] + """ + include dirs specific to this compiler class + """ + + library_dirs = [] + """ + library dirs specific to this compiler class + """ + + def __init__(self, verbose=False, dry_run=False, force=False): + self.dry_run = dry_run + self.force = force + self.verbose = verbose + + # 'output_dir': a common output directory for object, library, + # shared object, and shared library files + self.output_dir = None + + # 'macros': a list of macro definitions (or undefinitions). A + # macro definition is a 2-tuple (name, value), where the value is + # either a string or None (no explicit value). A macro + # undefinition is a 1-tuple (name,). + self.macros = [] + + # 'include_dirs': a list of directories to search for include files + self.include_dirs = [] + + # 'libraries': a list of libraries to include in any link + # (library names, not filenames: eg. "foo" not "libfoo.a") + self.libraries = [] + + # 'library_dirs': a list of directories to search for libraries + self.library_dirs = [] + + # 'runtime_library_dirs': a list of directories to search for + # shared libraries/objects at runtime + self.runtime_library_dirs = [] + + # 'objects': a list of object files (or similar, such as explicitly + # named library files) to include on any link + self.objects = [] + + for key in self.executables.keys(): + self.set_executable(key, self.executables[key]) + + def set_executables(self, **kwargs): + """Define the executables (and options for them) that will be run + to perform the various stages of compilation. The exact set of + executables that may be specified here depends on the compiler + class (via the 'executables' class attribute), but most will have: + compiler the C/C++ compiler + linker_so linker used to create shared objects and libraries + linker_exe linker used to create binary executables + archiver static library creator + + On platforms with a command-line (Unix, DOS/Windows), each of these + is a string that will be split into executable name and (optional) + list of arguments. (Splitting the string is done similarly to how + Unix shells operate: words are delimited by spaces, but quotes and + backslashes can override this. See + 'distutils.util.split_quoted()'.) + """ + + # Note that some CCompiler implementation classes will define class + # attributes 'cpp', 'cc', etc. with hard-coded executable names; + # this is appropriate when a compiler class is for exactly one + # compiler/OS combination (eg. MSVCCompiler). Other compiler + # classes (UnixCCompiler, in particular) are driven by information + # discovered at run-time, since there are many different ways to do + # basically the same things with Unix C compilers. + + for key in kwargs: + if key not in self.executables: + raise ValueError( + f"unknown executable '{key}' for class {self.__class__.__name__}" + ) + self.set_executable(key, kwargs[key]) + + def set_executable(self, key, value): + if isinstance(value, str): + setattr(self, key, split_quoted(value)) + else: + setattr(self, key, value) + + def _find_macro(self, name): + i = 0 + for defn in self.macros: + if defn[0] == name: + return i + i += 1 + return None + + def _check_macro_definitions(self, definitions): + """Ensure that every element of 'definitions' is valid.""" + for defn in definitions: + self._check_macro_definition(*defn) + + def _check_macro_definition(self, defn): + """ + Raise a TypeError if defn is not valid. + + A valid definition is either a (name, value) 2-tuple or a (name,) tuple. + """ + if not isinstance(defn, tuple) or not self._is_valid_macro(*defn): + raise TypeError( + f"invalid macro definition '{defn}': " + "must be tuple (string,), (string, string), or (string, None)" + ) + + @staticmethod + def _is_valid_macro(name, value=None): + """ + A valid macro is a ``name : str`` and a ``value : str | None``. + """ + return isinstance(name, str) and isinstance(value, (str, types.NoneType)) + + # -- Bookkeeping methods ------------------------------------------- + + def define_macro(self, name, value=None): + """Define a preprocessor macro for all compilations driven by this + compiler object. The optional parameter 'value' should be a + string; if it is not supplied, then the macro will be defined + without an explicit value and the exact outcome depends on the + compiler used (XXX true? does ANSI say anything about this?) + """ + # Delete from the list of macro definitions/undefinitions if + # already there (so that this one will take precedence). + i = self._find_macro(name) + if i is not None: + del self.macros[i] + + self.macros.append((name, value)) + + def undefine_macro(self, name): + """Undefine a preprocessor macro for all compilations driven by + this compiler object. If the same macro is defined by + 'define_macro()' and undefined by 'undefine_macro()' the last call + takes precedence (including multiple redefinitions or + undefinitions). If the macro is redefined/undefined on a + per-compilation basis (ie. in the call to 'compile()'), then that + takes precedence. + """ + # Delete from the list of macro definitions/undefinitions if + # already there (so that this one will take precedence). + i = self._find_macro(name) + if i is not None: + del self.macros[i] + + undefn = (name,) + self.macros.append(undefn) + + def add_include_dir(self, dir): + """Add 'dir' to the list of directories that will be searched for + header files. The compiler is instructed to search directories in + the order in which they are supplied by successive calls to + 'add_include_dir()'. + """ + self.include_dirs.append(dir) + + def set_include_dirs(self, dirs): + """Set the list of directories that will be searched to 'dirs' (a + list of strings). Overrides any preceding calls to + 'add_include_dir()'; subsequence calls to 'add_include_dir()' add + to the list passed to 'set_include_dirs()'. This does not affect + any list of standard include directories that the compiler may + search by default. + """ + self.include_dirs = dirs[:] + + def add_library(self, libname): + """Add 'libname' to the list of libraries that will be included in + all links driven by this compiler object. Note that 'libname' + should *not* be the name of a file containing a library, but the + name of the library itself: the actual filename will be inferred by + the linker, the compiler, or the compiler class (depending on the + platform). + + The linker will be instructed to link against libraries in the + order they were supplied to 'add_library()' and/or + 'set_libraries()'. It is perfectly valid to duplicate library + names; the linker will be instructed to link against libraries as + many times as they are mentioned. + """ + self.libraries.append(libname) + + def set_libraries(self, libnames): + """Set the list of libraries to be included in all links driven by + this compiler object to 'libnames' (a list of strings). This does + not affect any standard system libraries that the linker may + include by default. + """ + self.libraries = libnames[:] + + def add_library_dir(self, dir): + """Add 'dir' to the list of directories that will be searched for + libraries specified to 'add_library()' and 'set_libraries()'. The + linker will be instructed to search for libraries in the order they + are supplied to 'add_library_dir()' and/or 'set_library_dirs()'. + """ + self.library_dirs.append(dir) + + def set_library_dirs(self, dirs): + """Set the list of library search directories to 'dirs' (a list of + strings). This does not affect any standard library search path + that the linker may search by default. + """ + self.library_dirs = dirs[:] + + def add_runtime_library_dir(self, dir): + """Add 'dir' to the list of directories that will be searched for + shared libraries at runtime. + """ + self.runtime_library_dirs.append(dir) + + def set_runtime_library_dirs(self, dirs): + """Set the list of directories to search for shared libraries at + runtime to 'dirs' (a list of strings). This does not affect any + standard search path that the runtime linker may search by + default. + """ + self.runtime_library_dirs = dirs[:] + + def add_link_object(self, object): + """Add 'object' to the list of object files (or analogues, such as + explicitly named library files or the output of "resource + compilers") to be included in every link driven by this compiler + object. + """ + self.objects.append(object) + + def set_link_objects(self, objects): + """Set the list of object files (or analogues) to be included in + every link to 'objects'. This does not affect any standard object + files that the linker may include by default (such as system + libraries). + """ + self.objects = objects[:] + + # -- Private utility methods -------------------------------------- + # (here for the convenience of subclasses) + + # Helper method to prep compiler in subclass compile() methods + + def _setup_compile(self, outdir, macros, incdirs, sources, depends, extra): + """Process arguments and decide which source files to compile.""" + outdir, macros, incdirs = self._fix_compile_args(outdir, macros, incdirs) + + if extra is None: + extra = [] + + # Get the list of expected output (object) files + objects = self.object_filenames(sources, strip_dir=False, output_dir=outdir) + assert len(objects) == len(sources) + + pp_opts = gen_preprocess_options(macros, incdirs) + + build = {} + for i in range(len(sources)): + src = sources[i] + obj = objects[i] + ext = os.path.splitext(src)[1] + self.mkpath(os.path.dirname(obj)) + build[obj] = (src, ext) + + return macros, objects, extra, pp_opts, build + + def _get_cc_args(self, pp_opts, debug, before): + # works for unixccompiler, cygwinccompiler + cc_args = pp_opts + ['-c'] + if debug: + cc_args[:0] = ['-g'] + if before: + cc_args[:0] = before + return cc_args + + def _fix_compile_args(self, output_dir, macros, include_dirs): + """Typecheck and fix-up some of the arguments to the 'compile()' + method, and return fixed-up values. Specifically: if 'output_dir' + is None, replaces it with 'self.output_dir'; ensures that 'macros' + is a list, and augments it with 'self.macros'; ensures that + 'include_dirs' is a list, and augments it with 'self.include_dirs'. + Guarantees that the returned values are of the correct type, + i.e. for 'output_dir' either string or None, and for 'macros' and + 'include_dirs' either list or None. + """ + if output_dir is None: + output_dir = self.output_dir + elif not isinstance(output_dir, str): + raise TypeError("'output_dir' must be a string or None") + + if macros is None: + macros = list(self.macros) + elif isinstance(macros, list): + macros = macros + (self.macros or []) + else: + raise TypeError("'macros' (if supplied) must be a list of tuples") + + if include_dirs is None: + include_dirs = list(self.include_dirs) + elif isinstance(include_dirs, (list, tuple)): + include_dirs = list(include_dirs) + (self.include_dirs or []) + else: + raise TypeError("'include_dirs' (if supplied) must be a list of strings") + + # add include dirs for class + include_dirs += self.__class__.include_dirs + + return output_dir, macros, include_dirs + + def _prep_compile(self, sources, output_dir, depends=None): + """Decide which source files must be recompiled. + + Determine the list of object files corresponding to 'sources', + and figure out which ones really need to be recompiled. + Return a list of all object files and a dictionary telling + which source files can be skipped. + """ + # Get the list of expected output (object) files + objects = self.object_filenames(sources, output_dir=output_dir) + assert len(objects) == len(sources) + + # Return an empty dict for the "which source files can be skipped" + # return value to preserve API compatibility. + return objects, {} + + def _fix_object_args(self, objects, output_dir): + """Typecheck and fix up some arguments supplied to various methods. + Specifically: ensure that 'objects' is a list; if output_dir is + None, replace with self.output_dir. Return fixed versions of + 'objects' and 'output_dir'. + """ + if not isinstance(objects, (list, tuple)): + raise TypeError("'objects' must be a list or tuple of strings") + objects = list(objects) + + if output_dir is None: + output_dir = self.output_dir + elif not isinstance(output_dir, str): + raise TypeError("'output_dir' must be a string or None") + + return (objects, output_dir) + + def _fix_lib_args(self, libraries, library_dirs, runtime_library_dirs): + """Typecheck and fix up some of the arguments supplied to the + 'link_*' methods. Specifically: ensure that all arguments are + lists, and augment them with their permanent versions + (eg. 'self.libraries' augments 'libraries'). Return a tuple with + fixed versions of all arguments. + """ + if libraries is None: + libraries = list(self.libraries) + elif isinstance(libraries, (list, tuple)): + libraries = list(libraries) + (self.libraries or []) + else: + raise TypeError("'libraries' (if supplied) must be a list of strings") + + if library_dirs is None: + library_dirs = list(self.library_dirs) + elif isinstance(library_dirs, (list, tuple)): + library_dirs = list(library_dirs) + (self.library_dirs or []) + else: + raise TypeError("'library_dirs' (if supplied) must be a list of strings") + + # add library dirs for class + library_dirs += self.__class__.library_dirs + + if runtime_library_dirs is None: + runtime_library_dirs = list(self.runtime_library_dirs) + elif isinstance(runtime_library_dirs, (list, tuple)): + runtime_library_dirs = list(runtime_library_dirs) + ( + self.runtime_library_dirs or [] + ) + else: + raise TypeError( + "'runtime_library_dirs' (if supplied) must be a list of strings" + ) + + return (libraries, library_dirs, runtime_library_dirs) + + def _need_link(self, objects, output_file): + """Return true if we need to relink the files listed in 'objects' + to recreate 'output_file'. + """ + if self.force: + return True + else: + if self.dry_run: + newer = newer_group(objects, output_file, missing='newer') + else: + newer = newer_group(objects, output_file) + return newer + + def detect_language(self, sources): + """Detect the language of a given file, or list of files. Uses + language_map, and language_order to do the job. + """ + if not isinstance(sources, list): + sources = [sources] + lang = None + index = len(self.language_order) + for source in sources: + base, ext = os.path.splitext(source) + extlang = self.language_map.get(ext) + try: + extindex = self.language_order.index(extlang) + if extindex < index: + lang = extlang + index = extindex + except ValueError: + pass + return lang + + # -- Worker methods ------------------------------------------------ + # (must be implemented by subclasses) + + def preprocess( + self, + source, + output_file=None, + macros=None, + include_dirs=None, + extra_preargs=None, + extra_postargs=None, + ): + """Preprocess a single C/C++ source file, named in 'source'. + Output will be written to file named 'output_file', or stdout if + 'output_file' not supplied. 'macros' is a list of macro + definitions as for 'compile()', which will augment the macros set + with 'define_macro()' and 'undefine_macro()'. 'include_dirs' is a + list of directory names that will be added to the default list. + + Raises PreprocessError on failure. + """ + pass + + def compile( + self, + sources, + output_dir=None, + macros=None, + include_dirs=None, + debug=False, + extra_preargs=None, + extra_postargs=None, + depends=None, + ): + """Compile one or more source files. + + 'sources' must be a list of filenames, most likely C/C++ + files, but in reality anything that can be handled by a + particular compiler and compiler class (eg. MSVCCompiler can + handle resource files in 'sources'). Return a list of object + filenames, one per source filename in 'sources'. Depending on + the implementation, not all source files will necessarily be + compiled, but all corresponding object filenames will be + returned. + + If 'output_dir' is given, object files will be put under it, while + retaining their original path component. That is, "foo/bar.c" + normally compiles to "foo/bar.o" (for a Unix implementation); if + 'output_dir' is "build", then it would compile to + "build/foo/bar.o". + + 'macros', if given, must be a list of macro definitions. A macro + definition is either a (name, value) 2-tuple or a (name,) 1-tuple. + The former defines a macro; if the value is None, the macro is + defined without an explicit value. The 1-tuple case undefines a + macro. Later definitions/redefinitions/ undefinitions take + precedence. + + 'include_dirs', if given, must be a list of strings, the + directories to add to the default include file search path for this + compilation only. + + 'debug' is a boolean; if true, the compiler will be instructed to + output debug symbols in (or alongside) the object file(s). + + 'extra_preargs' and 'extra_postargs' are implementation- dependent. + On platforms that have the notion of a command-line (e.g. Unix, + DOS/Windows), they are most likely lists of strings: extra + command-line arguments to prepend/append to the compiler command + line. On other platforms, consult the implementation class + documentation. In any event, they are intended as an escape hatch + for those occasions when the abstract compiler framework doesn't + cut the mustard. + + 'depends', if given, is a list of filenames that all targets + depend on. If a source file is older than any file in + depends, then the source file will be recompiled. This + supports dependency tracking, but only at a coarse + granularity. + + Raises CompileError on failure. + """ + # A concrete compiler class can either override this method + # entirely or implement _compile(). + macros, objects, extra_postargs, pp_opts, build = self._setup_compile( + output_dir, macros, include_dirs, sources, depends, extra_postargs + ) + cc_args = self._get_cc_args(pp_opts, debug, extra_preargs) + + for obj in objects: + try: + src, ext = build[obj] + except KeyError: + continue + self._compile(obj, src, ext, cc_args, extra_postargs, pp_opts) + + # Return *all* object filenames, not just the ones we just built. + return objects + + def _compile(self, obj, src, ext, cc_args, extra_postargs, pp_opts): + """Compile 'src' to product 'obj'.""" + # A concrete compiler class that does not override compile() + # should implement _compile(). + pass + + def create_static_lib( + self, objects, output_libname, output_dir=None, debug=False, target_lang=None + ): + """Link a bunch of stuff together to create a static library file. + The "bunch of stuff" consists of the list of object files supplied + as 'objects', the extra object files supplied to + 'add_link_object()' and/or 'set_link_objects()', the libraries + supplied to 'add_library()' and/or 'set_libraries()', and the + libraries supplied as 'libraries' (if any). + + 'output_libname' should be a library name, not a filename; the + filename will be inferred from the library name. 'output_dir' is + the directory where the library file will be put. + + 'debug' is a boolean; if true, debugging information will be + included in the library (note that on most platforms, it is the + compile step where this matters: the 'debug' flag is included here + just for consistency). + + 'target_lang' is the target language for which the given objects + are being compiled. This allows specific linkage time treatment of + certain languages. + + Raises LibError on failure. + """ + pass + + # values for target_desc parameter in link() + SHARED_OBJECT = "shared_object" + SHARED_LIBRARY = "shared_library" + EXECUTABLE = "executable" + + def link( + self, + target_desc, + objects, + output_filename, + output_dir=None, + libraries=None, + library_dirs=None, + runtime_library_dirs=None, + export_symbols=None, + debug=False, + extra_preargs=None, + extra_postargs=None, + build_temp=None, + target_lang=None, + ): + """Link a bunch of stuff together to create an executable or + shared library file. + + The "bunch of stuff" consists of the list of object files supplied + as 'objects'. 'output_filename' should be a filename. If + 'output_dir' is supplied, 'output_filename' is relative to it + (i.e. 'output_filename' can provide directory components if + needed). + + 'libraries' is a list of libraries to link against. These are + library names, not filenames, since they're translated into + filenames in a platform-specific way (eg. "foo" becomes "libfoo.a" + on Unix and "foo.lib" on DOS/Windows). However, they can include a + directory component, which means the linker will look in that + specific directory rather than searching all the normal locations. + + 'library_dirs', if supplied, should be a list of directories to + search for libraries that were specified as bare library names + (ie. no directory component). These are on top of the system + default and those supplied to 'add_library_dir()' and/or + 'set_library_dirs()'. 'runtime_library_dirs' is a list of + directories that will be embedded into the shared library and used + to search for other shared libraries that *it* depends on at + run-time. (This may only be relevant on Unix.) + + 'export_symbols' is a list of symbols that the shared library will + export. (This appears to be relevant only on Windows.) + + 'debug' is as for 'compile()' and 'create_static_lib()', with the + slight distinction that it actually matters on most platforms (as + opposed to 'create_static_lib()', which includes a 'debug' flag + mostly for form's sake). + + 'extra_preargs' and 'extra_postargs' are as for 'compile()' (except + of course that they supply command-line arguments for the + particular linker being used). + + 'target_lang' is the target language for which the given objects + are being compiled. This allows specific linkage time treatment of + certain languages. + + Raises LinkError on failure. + """ + raise NotImplementedError + + # Old 'link_*()' methods, rewritten to use the new 'link()' method. + + def link_shared_lib( + self, + objects, + output_libname, + output_dir=None, + libraries=None, + library_dirs=None, + runtime_library_dirs=None, + export_symbols=None, + debug=False, + extra_preargs=None, + extra_postargs=None, + build_temp=None, + target_lang=None, + ): + self.link( + CCompiler.SHARED_LIBRARY, + objects, + self.library_filename(output_libname, lib_type='shared'), + output_dir, + libraries, + library_dirs, + runtime_library_dirs, + export_symbols, + debug, + extra_preargs, + extra_postargs, + build_temp, + target_lang, + ) + + def link_shared_object( + self, + objects, + output_filename, + output_dir=None, + libraries=None, + library_dirs=None, + runtime_library_dirs=None, + export_symbols=None, + debug=False, + extra_preargs=None, + extra_postargs=None, + build_temp=None, + target_lang=None, + ): + self.link( + CCompiler.SHARED_OBJECT, + objects, + output_filename, + output_dir, + libraries, + library_dirs, + runtime_library_dirs, + export_symbols, + debug, + extra_preargs, + extra_postargs, + build_temp, + target_lang, + ) + + def link_executable( + self, + objects, + output_progname, + output_dir=None, + libraries=None, + library_dirs=None, + runtime_library_dirs=None, + debug=False, + extra_preargs=None, + extra_postargs=None, + target_lang=None, + ): + self.link( + CCompiler.EXECUTABLE, + objects, + self.executable_filename(output_progname), + output_dir, + libraries, + library_dirs, + runtime_library_dirs, + None, + debug, + extra_preargs, + extra_postargs, + None, + target_lang, + ) + + # -- Miscellaneous methods ----------------------------------------- + # These are all used by the 'gen_lib_options() function; there is + # no appropriate default implementation so subclasses should + # implement all of these. + + def library_dir_option(self, dir): + """Return the compiler option to add 'dir' to the list of + directories searched for libraries. + """ + raise NotImplementedError + + def runtime_library_dir_option(self, dir): + """Return the compiler option to add 'dir' to the list of + directories searched for runtime libraries. + """ + raise NotImplementedError + + def library_option(self, lib): + """Return the compiler option to add 'lib' to the list of libraries + linked into the shared library or executable. + """ + raise NotImplementedError + + def has_function( # noqa: C901 + self, + funcname, + includes=None, + include_dirs=None, + libraries=None, + library_dirs=None, + ): + """Return a boolean indicating whether funcname is provided as + a symbol on the current platform. The optional arguments can + be used to augment the compilation environment. + + The libraries argument is a list of flags to be passed to the + linker to make additional symbol definitions available for + linking. + + The includes and include_dirs arguments are deprecated. + Usually, supplying include files with function declarations + will cause function detection to fail even in cases where the + symbol is available for linking. + + """ + # this can't be included at module scope because it tries to + # import math which might not be available at that point - maybe + # the necessary logic should just be inlined? + import tempfile + + if includes is None: + includes = [] + else: + warnings.warn("includes is deprecated", DeprecationWarning) + if include_dirs is None: + include_dirs = [] + else: + warnings.warn("include_dirs is deprecated", DeprecationWarning) + if libraries is None: + libraries = [] + if library_dirs is None: + library_dirs = [] + fd, fname = tempfile.mkstemp(".c", funcname, text=True) + with os.fdopen(fd, "w", encoding='utf-8') as f: + for incl in includes: + f.write(f"""#include "{incl}"\n""") + if not includes: + # Use "char func(void);" as the prototype to follow + # what autoconf does. This prototype does not match + # any well-known function the compiler might recognize + # as a builtin, so this ends up as a true link test. + # Without a fake prototype, the test would need to + # know the exact argument types, and the has_function + # interface does not provide that level of information. + f.write( + f"""\ +#ifdef __cplusplus +extern "C" +#endif +char {funcname}(void); +""" + ) + f.write( + f"""\ +int main (int argc, char **argv) {{ + {funcname}(); + return 0; +}} +""" + ) + + try: + objects = self.compile([fname], include_dirs=include_dirs) + except CompileError: + return False + finally: + os.remove(fname) + + try: + self.link_executable( + objects, "a.out", libraries=libraries, library_dirs=library_dirs + ) + except (LinkError, TypeError): + return False + else: + os.remove( + self.executable_filename("a.out", output_dir=self.output_dir or '') + ) + finally: + for fn in objects: + os.remove(fn) + return True + + def find_library_file(self, dirs, lib, debug=False): + """Search the specified list of directories for a static or shared + library file 'lib' and return the full path to that file. If + 'debug' true, look for a debugging version (if that makes sense on + the current platform). Return None if 'lib' wasn't found in any of + the specified directories. + """ + raise NotImplementedError + + # -- Filename generation methods ----------------------------------- + + # The default implementation of the filename generating methods are + # prejudiced towards the Unix/DOS/Windows view of the world: + # * object files are named by replacing the source file extension + # (eg. .c/.cpp -> .o/.obj) + # * library files (shared or static) are named by plugging the + # library name and extension into a format string, eg. + # "lib%s.%s" % (lib_name, ".a") for Unix static libraries + # * executables are named by appending an extension (possibly + # empty) to the program name: eg. progname + ".exe" for + # Windows + # + # To reduce redundant code, these methods expect to find + # several attributes in the current object (presumably defined + # as class attributes): + # * src_extensions - + # list of C/C++ source file extensions, eg. ['.c', '.cpp'] + # * obj_extension - + # object file extension, eg. '.o' or '.obj' + # * static_lib_extension - + # extension for static library files, eg. '.a' or '.lib' + # * shared_lib_extension - + # extension for shared library/object files, eg. '.so', '.dll' + # * static_lib_format - + # format string for generating static library filenames, + # eg. 'lib%s.%s' or '%s.%s' + # * shared_lib_format + # format string for generating shared library filenames + # (probably same as static_lib_format, since the extension + # is one of the intended parameters to the format string) + # * exe_extension - + # extension for executable files, eg. '' or '.exe' + + def object_filenames(self, source_filenames, strip_dir=False, output_dir=''): + if output_dir is None: + output_dir = '' + return list( + self._make_out_path(output_dir, strip_dir, src_name) + for src_name in source_filenames + ) + + @property + def out_extensions(self): + return dict.fromkeys(self.src_extensions, self.obj_extension) + + def _make_out_path(self, output_dir, strip_dir, src_name): + return self._make_out_path_exts( + output_dir, strip_dir, src_name, self.out_extensions + ) + + @classmethod + def _make_out_path_exts(cls, output_dir, strip_dir, src_name, extensions): + r""" + >>> exts = {'.c': '.o'} + >>> CCompiler._make_out_path_exts('.', False, '/foo/bar.c', exts).replace('\\', '/') + './foo/bar.o' + >>> CCompiler._make_out_path_exts('.', True, '/foo/bar.c', exts).replace('\\', '/') + './bar.o' + """ + src = pathlib.PurePath(src_name) + # Ensure base is relative to honor output_dir (python/cpython#37775). + base = cls._make_relative(src) + try: + new_ext = extensions[src.suffix] + except LookupError: + raise UnknownFileError(f"unknown file type '{src.suffix}' (from '{src}')") + if strip_dir: + base = pathlib.PurePath(base.name) + return os.path.join(output_dir, base.with_suffix(new_ext)) + + @staticmethod + def _make_relative(base: pathlib.Path): + return base.relative_to(base.anchor) + + def shared_object_filename(self, basename, strip_dir=False, output_dir=''): + assert output_dir is not None + if strip_dir: + basename = os.path.basename(basename) + return os.path.join(output_dir, basename + self.shared_lib_extension) + + def executable_filename(self, basename, strip_dir=False, output_dir=''): + assert output_dir is not None + if strip_dir: + basename = os.path.basename(basename) + return os.path.join(output_dir, basename + (self.exe_extension or '')) + + def library_filename( + self, + libname, + lib_type='static', + strip_dir=False, + output_dir='', # or 'shared' + ): + assert output_dir is not None + expected = '"static", "shared", "dylib", "xcode_stub"' + if lib_type not in eval(expected): + raise ValueError(f"'lib_type' must be {expected}") + fmt = getattr(self, lib_type + "_lib_format") + ext = getattr(self, lib_type + "_lib_extension") + + dir, base = os.path.split(libname) + filename = fmt % (base, ext) + if strip_dir: + dir = '' + + return os.path.join(output_dir, dir, filename) + + # -- Utility methods ----------------------------------------------- + + def announce(self, msg, level=1): + log.debug(msg) + + def debug_print(self, msg): + from distutils.debug import DEBUG + + if DEBUG: + print(msg) + + def warn(self, msg): + sys.stderr.write(f"warning: {msg}\n") + + def execute(self, func, args, msg=None, level=1): + execute(func, args, msg, self.dry_run) + + def spawn(self, cmd, **kwargs): + spawn(cmd, dry_run=self.dry_run, **kwargs) + + def move_file(self, src, dst): + return move_file(src, dst, dry_run=self.dry_run) + + def mkpath(self, name, mode=0o777): + mkpath(name, mode, dry_run=self.dry_run) + + +# Map a sys.platform/os.name ('posix', 'nt') to the default compiler +# type for that platform. Keys are interpreted as re match +# patterns. Order is important; platform mappings are preferred over +# OS names. +_default_compilers = ( + # Platform string mappings + # on a cygwin built python we can use gcc like an ordinary UNIXish + # compiler + ('cygwin.*', 'unix'), + ('zos', 'zos'), + # OS name mappings + ('posix', 'unix'), + ('nt', 'msvc'), +) + + +def get_default_compiler(osname=None, platform=None): + """Determine the default compiler to use for the given platform. + + osname should be one of the standard Python OS names (i.e. the + ones returned by os.name) and platform the common value + returned by sys.platform for the platform in question. + + The default values are os.name and sys.platform in case the + parameters are not given. + """ + if osname is None: + osname = os.name + if platform is None: + platform = sys.platform + # Mingw is a special case where sys.platform is 'win32' but we + # want to use the 'mingw32' compiler, so check it first + if is_mingw(): + return 'mingw32' + for pattern, compiler in _default_compilers: + if ( + re.match(pattern, platform) is not None + or re.match(pattern, osname) is not None + ): + return compiler + # Default to Unix compiler + return 'unix' + + +# Map compiler types to (module_name, class_name) pairs -- ie. where to +# find the code that implements an interface to this compiler. (The module +# is assumed to be in the 'distutils' package.) +compiler_class = { + 'unix': ('unixccompiler', 'UnixCCompiler', "standard UNIX-style compiler"), + 'msvc': ('_msvccompiler', 'MSVCCompiler', "Microsoft Visual C++"), + 'cygwin': ( + 'cygwinccompiler', + 'CygwinCCompiler', + "Cygwin port of GNU C Compiler for Win32", + ), + 'mingw32': ( + 'cygwinccompiler', + 'Mingw32CCompiler', + "Mingw32 port of GNU C Compiler for Win32", + ), + 'bcpp': ('bcppcompiler', 'BCPPCompiler', "Borland C++ Compiler"), + 'zos': ('zosccompiler', 'zOSCCompiler', 'IBM XL C/C++ Compilers'), +} + + +def show_compilers(): + """Print list of available compilers (used by the "--help-compiler" + options to "build", "build_ext", "build_clib"). + """ + # XXX this "knows" that the compiler option it's describing is + # "--compiler", which just happens to be the case for the three + # commands that use it. + from distutils.fancy_getopt import FancyGetopt + + compilers = sorted( + ("compiler=" + compiler, None, compiler_class[compiler][2]) + for compiler in compiler_class.keys() + ) + pretty_printer = FancyGetopt(compilers) + pretty_printer.print_help("List of available compilers:") + + +def new_compiler(plat=None, compiler=None, verbose=False, dry_run=False, force=False): + """Generate an instance of some CCompiler subclass for the supplied + platform/compiler combination. 'plat' defaults to 'os.name' + (eg. 'posix', 'nt'), and 'compiler' defaults to the default compiler + for that platform. Currently only 'posix' and 'nt' are supported, and + the default compilers are "traditional Unix interface" (UnixCCompiler + class) and Visual C++ (MSVCCompiler class). Note that it's perfectly + possible to ask for a Unix compiler object under Windows, and a + Microsoft compiler object under Unix -- if you supply a value for + 'compiler', 'plat' is ignored. + """ + if plat is None: + plat = os.name + + try: + if compiler is None: + compiler = get_default_compiler(plat) + + (module_name, class_name, long_description) = compiler_class[compiler] + except KeyError: + msg = f"don't know how to compile C/C++ code on platform '{plat}'" + if compiler is not None: + msg = msg + f" with '{compiler}' compiler" + raise DistutilsPlatformError(msg) + + try: + module_name = "distutils." + module_name + __import__(module_name) + module = sys.modules[module_name] + klass = vars(module)[class_name] + except ImportError: + raise DistutilsModuleError( + f"can't compile C/C++ code: unable to load module '{module_name}'" + ) + except KeyError: + raise DistutilsModuleError( + f"can't compile C/C++ code: unable to find class '{class_name}' " + f"in module '{module_name}'" + ) + + # XXX The None is necessary to preserve backwards compatibility + # with classes that expect verbose to be the first positional + # argument. + return klass(None, dry_run, force) + + +def gen_preprocess_options(macros, include_dirs): + """Generate C pre-processor options (-D, -U, -I) as used by at least + two types of compilers: the typical Unix compiler and Visual C++. + 'macros' is the usual thing, a list of 1- or 2-tuples, where (name,) + means undefine (-U) macro 'name', and (name,value) means define (-D) + macro 'name' to 'value'. 'include_dirs' is just a list of directory + names to be added to the header file search path (-I). Returns a list + of command-line options suitable for either Unix compilers or Visual + C++. + """ + # XXX it would be nice (mainly aesthetic, and so we don't generate + # stupid-looking command lines) to go over 'macros' and eliminate + # redundant definitions/undefinitions (ie. ensure that only the + # latest mention of a particular macro winds up on the command + # line). I don't think it's essential, though, since most (all?) + # Unix C compilers only pay attention to the latest -D or -U + # mention of a macro on their command line. Similar situation for + # 'include_dirs'. I'm punting on both for now. Anyways, weeding out + # redundancies like this should probably be the province of + # CCompiler, since the data structures used are inherited from it + # and therefore common to all CCompiler classes. + pp_opts = [] + for macro in macros: + if not (isinstance(macro, tuple) and 1 <= len(macro) <= 2): + raise TypeError( + f"bad macro definition '{macro}': " + "each element of 'macros' list must be a 1- or 2-tuple" + ) + + if len(macro) == 1: # undefine this macro + pp_opts.append(f"-U{macro[0]}") + elif len(macro) == 2: + if macro[1] is None: # define with no explicit value + pp_opts.append(f"-D{macro[0]}") + else: + # XXX *don't* need to be clever about quoting the + # macro value here, because we're going to avoid the + # shell at all costs when we spawn the command! + pp_opts.append("-D{}={}".format(*macro)) + + pp_opts.extend(f"-I{dir}" for dir in include_dirs) + return pp_opts + + +def gen_lib_options(compiler, library_dirs, runtime_library_dirs, libraries): + """Generate linker options for searching library directories and + linking with specific libraries. 'libraries' and 'library_dirs' are, + respectively, lists of library names (not filenames!) and search + directories. Returns a list of command-line options suitable for use + with some compiler (depending on the two format strings passed in). + """ + lib_opts = [compiler.library_dir_option(dir) for dir in library_dirs] + + for dir in runtime_library_dirs: + lib_opts.extend(always_iterable(compiler.runtime_library_dir_option(dir))) + + # XXX it's important that we *not* remove redundant library mentions! + # sometimes you really do have to say "-lfoo -lbar -lfoo" in order to + # resolve all symbols. I just hope we never have to say "-lfoo obj.o + # -lbar" to get things to work -- that's certainly a possibility, but a + # pretty nasty way to arrange your C code. + + for lib in libraries: + (lib_dir, lib_name) = os.path.split(lib) + if lib_dir: + lib_file = compiler.find_library_file([lib_dir], lib_name) + if lib_file: + lib_opts.append(lib_file) + else: + compiler.warn( + f"no library file corresponding to '{lib}' found (skipping)" + ) + else: + lib_opts.append(compiler.library_option(lib)) + return lib_opts diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/setuptools/_distutils/cmd.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/setuptools/_distutils/cmd.py new file mode 100644 index 0000000000000000000000000000000000000000..9c6fa6566ce911d4bb97f386e0d42c90bf1e4036 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/setuptools/_distutils/cmd.py @@ -0,0 +1,462 @@ +"""distutils.cmd + +Provides the Command class, the base class for the command classes +in the distutils.command package. +""" + +from __future__ import annotations + +import logging +import os +import re +import sys +from collections.abc import Callable +from typing import Any, ClassVar, TypeVar, overload + +from . import _modified, archive_util, dir_util, file_util, util +from ._log import log +from .errors import DistutilsOptionError + +_CommandT = TypeVar("_CommandT", bound="Command") + + +class Command: + """Abstract base class for defining command classes, the "worker bees" + of the Distutils. A useful analogy for command classes is to think of + them as subroutines with local variables called "options". The options + are "declared" in 'initialize_options()' and "defined" (given their + final values, aka "finalized") in 'finalize_options()', both of which + must be defined by every command class. The distinction between the + two is necessary because option values might come from the outside + world (command line, config file, ...), and any options dependent on + other options must be computed *after* these outside influences have + been processed -- hence 'finalize_options()'. The "body" of the + subroutine, where it does all its work based on the values of its + options, is the 'run()' method, which must also be implemented by every + command class. + """ + + # 'sub_commands' formalizes the notion of a "family" of commands, + # eg. "install" as the parent with sub-commands "install_lib", + # "install_headers", etc. The parent of a family of commands + # defines 'sub_commands' as a class attribute; it's a list of + # (command_name : string, predicate : unbound_method | string | None) + # tuples, where 'predicate' is a method of the parent command that + # determines whether the corresponding command is applicable in the + # current situation. (Eg. we "install_headers" is only applicable if + # we have any C header files to install.) If 'predicate' is None, + # that command is always applicable. + # + # 'sub_commands' is usually defined at the *end* of a class, because + # predicates can be unbound methods, so they must already have been + # defined. The canonical example is the "install" command. + sub_commands: ClassVar[ # Any to work around variance issues + list[tuple[str, Callable[[Any], bool] | None]] + ] = [] + + user_options: ClassVar[ + # Specifying both because list is invariant. Avoids mypy override assignment issues + list[tuple[str, str, str]] | list[tuple[str, str | None, str]] + ] = [] + + # -- Creation/initialization methods ------------------------------- + + def __init__(self, dist): + """Create and initialize a new Command object. Most importantly, + invokes the 'initialize_options()' method, which is the real + initializer and depends on the actual command being + instantiated. + """ + # late import because of mutual dependence between these classes + from distutils.dist import Distribution + + if not isinstance(dist, Distribution): + raise TypeError("dist must be a Distribution instance") + if self.__class__ is Command: + raise RuntimeError("Command is an abstract class") + + self.distribution = dist + self.initialize_options() + + # Per-command versions of the global flags, so that the user can + # customize Distutils' behaviour command-by-command and let some + # commands fall back on the Distribution's behaviour. None means + # "not defined, check self.distribution's copy", while 0 or 1 mean + # false and true (duh). Note that this means figuring out the real + # value of each flag is a touch complicated -- hence "self._dry_run" + # will be handled by __getattr__, below. + # XXX This needs to be fixed. + self._dry_run = None + + # verbose is largely ignored, but needs to be set for + # backwards compatibility (I think)? + self.verbose = dist.verbose + + # Some commands define a 'self.force' option to ignore file + # timestamps, but methods defined *here* assume that + # 'self.force' exists for all commands. So define it here + # just to be safe. + self.force = None + + # The 'help' flag is just used for command-line parsing, so + # none of that complicated bureaucracy is needed. + self.help = False + + # 'finalized' records whether or not 'finalize_options()' has been + # called. 'finalize_options()' itself should not pay attention to + # this flag: it is the business of 'ensure_finalized()', which + # always calls 'finalize_options()', to respect/update it. + self.finalized = False + + # XXX A more explicit way to customize dry_run would be better. + def __getattr__(self, attr): + if attr == 'dry_run': + myval = getattr(self, "_" + attr) + if myval is None: + return getattr(self.distribution, attr) + else: + return myval + else: + raise AttributeError(attr) + + def ensure_finalized(self): + if not self.finalized: + self.finalize_options() + self.finalized = True + + # Subclasses must define: + # initialize_options() + # provide default values for all options; may be customized by + # setup script, by options from config file(s), or by command-line + # options + # finalize_options() + # decide on the final values for all options; this is called + # after all possible intervention from the outside world + # (command-line, option file, etc.) has been processed + # run() + # run the command: do whatever it is we're here to do, + # controlled by the command's various option values + + def initialize_options(self): + """Set default values for all the options that this command + supports. Note that these defaults may be overridden by other + commands, by the setup script, by config files, or by the + command-line. Thus, this is not the place to code dependencies + between options; generally, 'initialize_options()' implementations + are just a bunch of "self.foo = None" assignments. + + This method must be implemented by all command classes. + """ + raise RuntimeError( + f"abstract method -- subclass {self.__class__} must override" + ) + + def finalize_options(self): + """Set final values for all the options that this command supports. + This is always called as late as possible, ie. after any option + assignments from the command-line or from other commands have been + done. Thus, this is the place to code option dependencies: if + 'foo' depends on 'bar', then it is safe to set 'foo' from 'bar' as + long as 'foo' still has the same value it was assigned in + 'initialize_options()'. + + This method must be implemented by all command classes. + """ + raise RuntimeError( + f"abstract method -- subclass {self.__class__} must override" + ) + + def dump_options(self, header=None, indent=""): + from distutils.fancy_getopt import longopt_xlate + + if header is None: + header = f"command options for '{self.get_command_name()}':" + self.announce(indent + header, level=logging.INFO) + indent = indent + " " + for option, _, _ in self.user_options: + option = option.translate(longopt_xlate) + if option[-1] == "=": + option = option[:-1] + value = getattr(self, option) + self.announce(indent + f"{option} = {value}", level=logging.INFO) + + def run(self): + """A command's raison d'etre: carry out the action it exists to + perform, controlled by the options initialized in + 'initialize_options()', customized by other commands, the setup + script, the command-line, and config files, and finalized in + 'finalize_options()'. All terminal output and filesystem + interaction should be done by 'run()'. + + This method must be implemented by all command classes. + """ + raise RuntimeError( + f"abstract method -- subclass {self.__class__} must override" + ) + + def announce(self, msg, level=logging.DEBUG): + log.log(level, msg) + + def debug_print(self, msg): + """Print 'msg' to stdout if the global DEBUG (taken from the + DISTUTILS_DEBUG environment variable) flag is true. + """ + from distutils.debug import DEBUG + + if DEBUG: + print(msg) + sys.stdout.flush() + + # -- Option validation methods ------------------------------------- + # (these are very handy in writing the 'finalize_options()' method) + # + # NB. the general philosophy here is to ensure that a particular option + # value meets certain type and value constraints. If not, we try to + # force it into conformance (eg. if we expect a list but have a string, + # split the string on comma and/or whitespace). If we can't force the + # option into conformance, raise DistutilsOptionError. Thus, command + # classes need do nothing more than (eg.) + # self.ensure_string_list('foo') + # and they can be guaranteed that thereafter, self.foo will be + # a list of strings. + + def _ensure_stringlike(self, option, what, default=None): + val = getattr(self, option) + if val is None: + setattr(self, option, default) + return default + elif not isinstance(val, str): + raise DistutilsOptionError(f"'{option}' must be a {what} (got `{val}`)") + return val + + def ensure_string(self, option, default=None): + """Ensure that 'option' is a string; if not defined, set it to + 'default'. + """ + self._ensure_stringlike(option, "string", default) + + def ensure_string_list(self, option): + r"""Ensure that 'option' is a list of strings. If 'option' is + currently a string, we split it either on /,\s*/ or /\s+/, so + "foo bar baz", "foo,bar,baz", and "foo, bar baz" all become + ["foo", "bar", "baz"]. + """ + val = getattr(self, option) + if val is None: + return + elif isinstance(val, str): + setattr(self, option, re.split(r',\s*|\s+', val)) + else: + if isinstance(val, list): + ok = all(isinstance(v, str) for v in val) + else: + ok = False + if not ok: + raise DistutilsOptionError( + f"'{option}' must be a list of strings (got {val!r})" + ) + + def _ensure_tested_string(self, option, tester, what, error_fmt, default=None): + val = self._ensure_stringlike(option, what, default) + if val is not None and not tester(val): + raise DistutilsOptionError( + ("error in '%s' option: " + error_fmt) % (option, val) + ) + + def ensure_filename(self, option): + """Ensure that 'option' is the name of an existing file.""" + self._ensure_tested_string( + option, os.path.isfile, "filename", "'%s' does not exist or is not a file" + ) + + def ensure_dirname(self, option): + self._ensure_tested_string( + option, + os.path.isdir, + "directory name", + "'%s' does not exist or is not a directory", + ) + + # -- Convenience methods for commands ------------------------------ + + def get_command_name(self): + if hasattr(self, 'command_name'): + return self.command_name + else: + return self.__class__.__name__ + + def set_undefined_options(self, src_cmd, *option_pairs): + """Set the values of any "undefined" options from corresponding + option values in some other command object. "Undefined" here means + "is None", which is the convention used to indicate that an option + has not been changed between 'initialize_options()' and + 'finalize_options()'. Usually called from 'finalize_options()' for + options that depend on some other command rather than another + option of the same command. 'src_cmd' is the other command from + which option values will be taken (a command object will be created + for it if necessary); the remaining arguments are + '(src_option,dst_option)' tuples which mean "take the value of + 'src_option' in the 'src_cmd' command object, and copy it to + 'dst_option' in the current command object". + """ + # Option_pairs: list of (src_option, dst_option) tuples + src_cmd_obj = self.distribution.get_command_obj(src_cmd) + src_cmd_obj.ensure_finalized() + for src_option, dst_option in option_pairs: + if getattr(self, dst_option) is None: + setattr(self, dst_option, getattr(src_cmd_obj, src_option)) + + def get_finalized_command(self, command, create=True): + """Wrapper around Distribution's 'get_command_obj()' method: find + (create if necessary and 'create' is true) the command object for + 'command', call its 'ensure_finalized()' method, and return the + finalized command object. + """ + cmd_obj = self.distribution.get_command_obj(command, create) + cmd_obj.ensure_finalized() + return cmd_obj + + # XXX rename to 'get_reinitialized_command()'? (should do the + # same in dist.py, if so) + @overload + def reinitialize_command( + self, command: str, reinit_subcommands: bool = False + ) -> Command: ... + @overload + def reinitialize_command( + self, command: _CommandT, reinit_subcommands: bool = False + ) -> _CommandT: ... + def reinitialize_command( + self, command: str | Command, reinit_subcommands=False + ) -> Command: + return self.distribution.reinitialize_command(command, reinit_subcommands) + + def run_command(self, command): + """Run some other command: uses the 'run_command()' method of + Distribution, which creates and finalizes the command object if + necessary and then invokes its 'run()' method. + """ + self.distribution.run_command(command) + + def get_sub_commands(self): + """Determine the sub-commands that are relevant in the current + distribution (ie., that need to be run). This is based on the + 'sub_commands' class attribute: each tuple in that list may include + a method that we call to determine if the subcommand needs to be + run for the current distribution. Return a list of command names. + """ + commands = [] + for cmd_name, method in self.sub_commands: + if method is None or method(self): + commands.append(cmd_name) + return commands + + # -- External world manipulation ----------------------------------- + + def warn(self, msg): + log.warning("warning: %s: %s\n", self.get_command_name(), msg) + + def execute(self, func, args, msg=None, level=1): + util.execute(func, args, msg, dry_run=self.dry_run) + + def mkpath(self, name, mode=0o777): + dir_util.mkpath(name, mode, dry_run=self.dry_run) + + def copy_file( + self, + infile, + outfile, + preserve_mode=True, + preserve_times=True, + link=None, + level=1, + ): + """Copy a file respecting verbose, dry-run and force flags. (The + former two default to whatever is in the Distribution object, and + the latter defaults to false for commands that don't define it.)""" + return file_util.copy_file( + infile, + outfile, + preserve_mode, + preserve_times, + not self.force, + link, + dry_run=self.dry_run, + ) + + def copy_tree( + self, + infile, + outfile, + preserve_mode=True, + preserve_times=True, + preserve_symlinks=False, + level=1, + ): + """Copy an entire directory tree respecting verbose, dry-run, + and force flags. + """ + return dir_util.copy_tree( + infile, + outfile, + preserve_mode, + preserve_times, + preserve_symlinks, + not self.force, + dry_run=self.dry_run, + ) + + def move_file(self, src, dst, level=1): + """Move a file respecting dry-run flag.""" + return file_util.move_file(src, dst, dry_run=self.dry_run) + + def spawn(self, cmd, search_path=True, level=1): + """Spawn an external command respecting dry-run flag.""" + from distutils.spawn import spawn + + spawn(cmd, search_path, dry_run=self.dry_run) + + def make_archive( + self, base_name, format, root_dir=None, base_dir=None, owner=None, group=None + ): + return archive_util.make_archive( + base_name, + format, + root_dir, + base_dir, + dry_run=self.dry_run, + owner=owner, + group=group, + ) + + def make_file( + self, infiles, outfile, func, args, exec_msg=None, skip_msg=None, level=1 + ): + """Special case of 'execute()' for operations that process one or + more input files and generate one output file. Works just like + 'execute()', except the operation is skipped and a different + message printed if 'outfile' already exists and is newer than all + files listed in 'infiles'. If the command defined 'self.force', + and it is true, then the command is unconditionally run -- does no + timestamp checks. + """ + if skip_msg is None: + skip_msg = f"skipping {outfile} (inputs unchanged)" + + # Allow 'infiles' to be a single string + if isinstance(infiles, str): + infiles = (infiles,) + elif not isinstance(infiles, (list, tuple)): + raise TypeError("'infiles' must be a string, or a list or tuple of strings") + + if exec_msg is None: + exec_msg = "generating {} from {}".format(outfile, ', '.join(infiles)) + + # If 'outfile' must be regenerated (either because it doesn't + # exist, is out-of-date, or the 'force' flag is true) then + # perform the action that presumably regenerates it + if self.force or _modified.newer_group(infiles, outfile): + self.execute(func, args, exec_msg, level) + # Otherwise, print the "skip" message + else: + log.debug(skip_msg) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/setuptools/_distutils/core.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/setuptools/_distutils/core.py new file mode 100644 index 0000000000000000000000000000000000000000..bd62546bdd9d35dcc46e0b46a46b0c61a43fd0ba --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/setuptools/_distutils/core.py @@ -0,0 +1,289 @@ +"""distutils.core + +The only module that needs to be imported to use the Distutils; provides +the 'setup' function (which is to be called from the setup script). Also +indirectly provides the Distribution and Command classes, although they are +really defined in distutils.dist and distutils.cmd. +""" + +from __future__ import annotations + +import os +import sys +import tokenize +from collections.abc import Iterable + +from .cmd import Command +from .debug import DEBUG + +# Mainly import these so setup scripts can "from distutils.core import" them. +from .dist import Distribution +from .errors import ( + CCompilerError, + DistutilsArgError, + DistutilsError, + DistutilsSetupError, +) +from .extension import Extension + +__all__ = ['Distribution', 'Command', 'Extension', 'setup'] + +# This is a barebones help message generated displayed when the user +# runs the setup script with no arguments at all. More useful help +# is generated with various --help options: global help, list commands, +# and per-command help. +USAGE = """\ +usage: %(script)s [global_opts] cmd1 [cmd1_opts] [cmd2 [cmd2_opts] ...] + or: %(script)s --help [cmd1 cmd2 ...] + or: %(script)s --help-commands + or: %(script)s cmd --help +""" + + +def gen_usage(script_name): + script = os.path.basename(script_name) + return USAGE % locals() + + +# Some mild magic to control the behaviour of 'setup()' from 'run_setup()'. +_setup_stop_after = None +_setup_distribution = None + +# Legal keyword arguments for the setup() function +setup_keywords = ( + 'distclass', + 'script_name', + 'script_args', + 'options', + 'name', + 'version', + 'author', + 'author_email', + 'maintainer', + 'maintainer_email', + 'url', + 'license', + 'description', + 'long_description', + 'keywords', + 'platforms', + 'classifiers', + 'download_url', + 'requires', + 'provides', + 'obsoletes', +) + +# Legal keyword arguments for the Extension constructor +extension_keywords = ( + 'name', + 'sources', + 'include_dirs', + 'define_macros', + 'undef_macros', + 'library_dirs', + 'libraries', + 'runtime_library_dirs', + 'extra_objects', + 'extra_compile_args', + 'extra_link_args', + 'swig_opts', + 'export_symbols', + 'depends', + 'language', +) + + +def setup(**attrs): # noqa: C901 + """The gateway to the Distutils: do everything your setup script needs + to do, in a highly flexible and user-driven way. Briefly: create a + Distribution instance; find and parse config files; parse the command + line; run each Distutils command found there, customized by the options + supplied to 'setup()' (as keyword arguments), in config files, and on + the command line. + + The Distribution instance might be an instance of a class supplied via + the 'distclass' keyword argument to 'setup'; if no such class is + supplied, then the Distribution class (in dist.py) is instantiated. + All other arguments to 'setup' (except for 'cmdclass') are used to set + attributes of the Distribution instance. + + The 'cmdclass' argument, if supplied, is a dictionary mapping command + names to command classes. Each command encountered on the command line + will be turned into a command class, which is in turn instantiated; any + class found in 'cmdclass' is used in place of the default, which is + (for command 'foo_bar') class 'foo_bar' in module + 'distutils.command.foo_bar'. The command class must provide a + 'user_options' attribute which is a list of option specifiers for + 'distutils.fancy_getopt'. Any command-line options between the current + and the next command are used to set attributes of the current command + object. + + When the entire command-line has been successfully parsed, calls the + 'run()' method on each command object in turn. This method will be + driven entirely by the Distribution object (which each command object + has a reference to, thanks to its constructor), and the + command-specific options that became attributes of each command + object. + """ + + global _setup_stop_after, _setup_distribution + + # Determine the distribution class -- either caller-supplied or + # our Distribution (see below). + klass = attrs.get('distclass') + if klass: + attrs.pop('distclass') + else: + klass = Distribution + + if 'script_name' not in attrs: + attrs['script_name'] = os.path.basename(sys.argv[0]) + if 'script_args' not in attrs: + attrs['script_args'] = sys.argv[1:] + + # Create the Distribution instance, using the remaining arguments + # (ie. everything except distclass) to initialize it + try: + _setup_distribution = dist = klass(attrs) + except DistutilsSetupError as msg: + if 'name' not in attrs: + raise SystemExit(f"error in setup command: {msg}") + else: + raise SystemExit("error in {} setup command: {}".format(attrs['name'], msg)) + + if _setup_stop_after == "init": + return dist + + # Find and parse the config file(s): they will override options from + # the setup script, but be overridden by the command line. + dist.parse_config_files() + + if DEBUG: + print("options (after parsing config files):") + dist.dump_option_dicts() + + if _setup_stop_after == "config": + return dist + + # Parse the command line and override config files; any + # command-line errors are the end user's fault, so turn them into + # SystemExit to suppress tracebacks. + try: + ok = dist.parse_command_line() + except DistutilsArgError as msg: + raise SystemExit(gen_usage(dist.script_name) + f"\nerror: {msg}") + + if DEBUG: + print("options (after parsing command line):") + dist.dump_option_dicts() + + if _setup_stop_after == "commandline": + return dist + + # And finally, run all the commands found on the command line. + if ok: + return run_commands(dist) + + return dist + + +# setup () + + +def run_commands(dist): + """Given a Distribution object run all the commands, + raising ``SystemExit`` errors in the case of failure. + + This function assumes that either ``sys.argv`` or ``dist.script_args`` + is already set accordingly. + """ + try: + dist.run_commands() + except KeyboardInterrupt: + raise SystemExit("interrupted") + except OSError as exc: + if DEBUG: + sys.stderr.write(f"error: {exc}\n") + raise + else: + raise SystemExit(f"error: {exc}") + + except (DistutilsError, CCompilerError) as msg: + if DEBUG: + raise + else: + raise SystemExit("error: " + str(msg)) + + return dist + + +def run_setup(script_name, script_args: Iterable[str] | None = None, stop_after="run"): + """Run a setup script in a somewhat controlled environment, and + return the Distribution instance that drives things. This is useful + if you need to find out the distribution meta-data (passed as + keyword args from 'script' to 'setup()', or the contents of the + config files or command-line. + + 'script_name' is a file that will be read and run with 'exec()'; + 'sys.argv[0]' will be replaced with 'script' for the duration of the + call. 'script_args' is a list of strings; if supplied, + 'sys.argv[1:]' will be replaced by 'script_args' for the duration of + the call. + + 'stop_after' tells 'setup()' when to stop processing; possible + values: + init + stop after the Distribution instance has been created and + populated with the keyword arguments to 'setup()' + config + stop after config files have been parsed (and their data + stored in the Distribution instance) + commandline + stop after the command-line ('sys.argv[1:]' or 'script_args') + have been parsed (and the data stored in the Distribution) + run [default] + stop after all commands have been run (the same as if 'setup()' + had been called in the usual way + + Returns the Distribution instance, which provides all information + used to drive the Distutils. + """ + if stop_after not in ('init', 'config', 'commandline', 'run'): + raise ValueError(f"invalid value for 'stop_after': {stop_after!r}") + + global _setup_stop_after, _setup_distribution + _setup_stop_after = stop_after + + save_argv = sys.argv.copy() + g = {'__file__': script_name, '__name__': '__main__'} + try: + try: + sys.argv[0] = script_name + if script_args is not None: + sys.argv[1:] = script_args + # tokenize.open supports automatic encoding detection + with tokenize.open(script_name) as f: + code = f.read().replace(r'\r\n', r'\n') + exec(code, g) + finally: + sys.argv = save_argv + _setup_stop_after = None + except SystemExit: + # Hmm, should we do something if exiting with a non-zero code + # (ie. error)? + pass + + if _setup_distribution is None: + raise RuntimeError( + "'distutils.core.setup()' was never called -- " + f"perhaps '{script_name}' is not a Distutils setup script?" + ) + + # I wonder if the setup script's namespace -- g and l -- would be of + # any interest to callers? + # print "_setup_distribution:", _setup_distribution + return _setup_distribution + + +# run_setup () diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/setuptools/_distutils/cygwinccompiler.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/setuptools/_distutils/cygwinccompiler.py new file mode 100644 index 0000000000000000000000000000000000000000..3c67524e6d969c529d2fd87f2777225378eca39f --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/setuptools/_distutils/cygwinccompiler.py @@ -0,0 +1,339 @@ +"""distutils.cygwinccompiler + +Provides the CygwinCCompiler class, a subclass of UnixCCompiler that +handles the Cygwin port of the GNU C compiler to Windows. It also contains +the Mingw32CCompiler class which handles the mingw32 port of GCC (same as +cygwin in no-cygwin mode). +""" + +import copy +import os +import pathlib +import shlex +import sys +import warnings +from subprocess import check_output + +from .errors import ( + CCompilerError, + CompileError, + DistutilsExecError, + DistutilsPlatformError, +) +from .file_util import write_file +from .sysconfig import get_config_vars +from .unixccompiler import UnixCCompiler +from .version import LooseVersion, suppress_known_deprecation + + +def get_msvcr(): + """No longer needed, but kept for backward compatibility.""" + return [] + + +_runtime_library_dirs_msg = ( + "Unable to set runtime library search path on Windows, " + "usually indicated by `runtime_library_dirs` parameter to Extension" +) + + +class CygwinCCompiler(UnixCCompiler): + """Handles the Cygwin port of the GNU C compiler to Windows.""" + + compiler_type = 'cygwin' + obj_extension = ".o" + static_lib_extension = ".a" + shared_lib_extension = ".dll.a" + dylib_lib_extension = ".dll" + static_lib_format = "lib%s%s" + shared_lib_format = "lib%s%s" + dylib_lib_format = "cyg%s%s" + exe_extension = ".exe" + + def __init__(self, verbose=False, dry_run=False, force=False): + super().__init__(verbose, dry_run, force) + + status, details = check_config_h() + self.debug_print(f"Python's GCC status: {status} (details: {details})") + if status is not CONFIG_H_OK: + self.warn( + "Python's pyconfig.h doesn't seem to support your compiler. " + f"Reason: {details}. " + "Compiling may fail because of undefined preprocessor macros." + ) + + self.cc, self.cxx = get_config_vars('CC', 'CXX') + + # Override 'CC' and 'CXX' environment variables for + # building using MINGW compiler for MSVC python. + self.cc = os.environ.get('CC', self.cc or 'gcc') + self.cxx = os.environ.get('CXX', self.cxx or 'g++') + + self.linker_dll = self.cc + self.linker_dll_cxx = self.cxx + shared_option = "-shared" + + self.set_executables( + compiler=f'{self.cc} -mcygwin -O -Wall', + compiler_so=f'{self.cc} -mcygwin -mdll -O -Wall', + compiler_cxx=f'{self.cxx} -mcygwin -O -Wall', + compiler_so_cxx=f'{self.cxx} -mcygwin -mdll -O -Wall', + linker_exe=f'{self.cc} -mcygwin', + linker_so=f'{self.linker_dll} -mcygwin {shared_option}', + linker_exe_cxx=f'{self.cxx} -mcygwin', + linker_so_cxx=f'{self.linker_dll_cxx} -mcygwin {shared_option}', + ) + + self.dll_libraries = get_msvcr() + + @property + def gcc_version(self): + # Older numpy depended on this existing to check for ancient + # gcc versions. This doesn't make much sense with clang etc so + # just hardcode to something recent. + # https://github.com/numpy/numpy/pull/20333 + warnings.warn( + "gcc_version attribute of CygwinCCompiler is deprecated. " + "Instead of returning actual gcc version a fixed value 11.2.0 is returned.", + DeprecationWarning, + stacklevel=2, + ) + with suppress_known_deprecation(): + return LooseVersion("11.2.0") + + def _compile(self, obj, src, ext, cc_args, extra_postargs, pp_opts): + """Compiles the source by spawning GCC and windres if needed.""" + if ext in ('.rc', '.res'): + # gcc needs '.res' and '.rc' compiled to object files !!! + try: + self.spawn(["windres", "-i", src, "-o", obj]) + except DistutilsExecError as msg: + raise CompileError(msg) + else: # for other files use the C-compiler + try: + if self.detect_language(src) == 'c++': + self.spawn( + self.compiler_so_cxx + + cc_args + + [src, '-o', obj] + + extra_postargs + ) + else: + self.spawn( + self.compiler_so + cc_args + [src, '-o', obj] + extra_postargs + ) + except DistutilsExecError as msg: + raise CompileError(msg) + + def link( + self, + target_desc, + objects, + output_filename, + output_dir=None, + libraries=None, + library_dirs=None, + runtime_library_dirs=None, + export_symbols=None, + debug=False, + extra_preargs=None, + extra_postargs=None, + build_temp=None, + target_lang=None, + ): + """Link the objects.""" + # use separate copies, so we can modify the lists + extra_preargs = copy.copy(extra_preargs or []) + libraries = copy.copy(libraries or []) + objects = copy.copy(objects or []) + + if runtime_library_dirs: + self.warn(_runtime_library_dirs_msg) + + # Additional libraries + libraries.extend(self.dll_libraries) + + # handle export symbols by creating a def-file + # with executables this only works with gcc/ld as linker + if (export_symbols is not None) and ( + target_desc != self.EXECUTABLE or self.linker_dll == "gcc" + ): + # (The linker doesn't do anything if output is up-to-date. + # So it would probably better to check if we really need this, + # but for this we had to insert some unchanged parts of + # UnixCCompiler, and this is not what we want.) + + # we want to put some files in the same directory as the + # object files are, build_temp doesn't help much + # where are the object files + temp_dir = os.path.dirname(objects[0]) + # name of dll to give the helper files the same base name + (dll_name, dll_extension) = os.path.splitext( + os.path.basename(output_filename) + ) + + # generate the filenames for these files + def_file = os.path.join(temp_dir, dll_name + ".def") + + # Generate .def file + contents = [f"LIBRARY {os.path.basename(output_filename)}", "EXPORTS"] + contents.extend(export_symbols) + self.execute(write_file, (def_file, contents), f"writing {def_file}") + + # next add options for def-file + + # for gcc/ld the def-file is specified as any object files + objects.append(def_file) + + # end: if ((export_symbols is not None) and + # (target_desc != self.EXECUTABLE or self.linker_dll == "gcc")): + + # who wants symbols and a many times larger output file + # should explicitly switch the debug mode on + # otherwise we let ld strip the output file + # (On my machine: 10KiB < stripped_file < ??100KiB + # unstripped_file = stripped_file + XXX KiB + # ( XXX=254 for a typical python extension)) + if not debug: + extra_preargs.append("-s") + + UnixCCompiler.link( + self, + target_desc, + objects, + output_filename, + output_dir, + libraries, + library_dirs, + runtime_library_dirs, + None, # export_symbols, we do this in our def-file + debug, + extra_preargs, + extra_postargs, + build_temp, + target_lang, + ) + + def runtime_library_dir_option(self, dir): + # cygwin doesn't support rpath. While in theory we could error + # out like MSVC does, code might expect it to work like on Unix, so + # just warn and hope for the best. + self.warn(_runtime_library_dirs_msg) + return [] + + # -- Miscellaneous methods ----------------------------------------- + + def _make_out_path(self, output_dir, strip_dir, src_name): + # use normcase to make sure '.rc' is really '.rc' and not '.RC' + norm_src_name = os.path.normcase(src_name) + return super()._make_out_path(output_dir, strip_dir, norm_src_name) + + @property + def out_extensions(self): + """ + Add support for rc and res files. + """ + return { + **super().out_extensions, + **{ext: ext + self.obj_extension for ext in ('.res', '.rc')}, + } + + +# the same as cygwin plus some additional parameters +class Mingw32CCompiler(CygwinCCompiler): + """Handles the Mingw32 port of the GNU C compiler to Windows.""" + + compiler_type = 'mingw32' + + def __init__(self, verbose=False, dry_run=False, force=False): + super().__init__(verbose, dry_run, force) + + shared_option = "-shared" + + if is_cygwincc(self.cc): + raise CCompilerError('Cygwin gcc cannot be used with --compiler=mingw32') + + self.set_executables( + compiler=f'{self.cc} -O -Wall', + compiler_so=f'{self.cc} -shared -O -Wall', + compiler_so_cxx=f'{self.cxx} -shared -O -Wall', + compiler_cxx=f'{self.cxx} -O -Wall', + linker_exe=f'{self.cc}', + linker_so=f'{self.linker_dll} {shared_option}', + linker_exe_cxx=f'{self.cxx}', + linker_so_cxx=f'{self.linker_dll_cxx} {shared_option}', + ) + + def runtime_library_dir_option(self, dir): + raise DistutilsPlatformError(_runtime_library_dirs_msg) + + +# Because these compilers aren't configured in Python's pyconfig.h file by +# default, we should at least warn the user if he is using an unmodified +# version. + +CONFIG_H_OK = "ok" +CONFIG_H_NOTOK = "not ok" +CONFIG_H_UNCERTAIN = "uncertain" + + +def check_config_h(): + """Check if the current Python installation appears amenable to building + extensions with GCC. + + Returns a tuple (status, details), where 'status' is one of the following + constants: + + - CONFIG_H_OK: all is well, go ahead and compile + - CONFIG_H_NOTOK: doesn't look good + - CONFIG_H_UNCERTAIN: not sure -- unable to read pyconfig.h + + 'details' is a human-readable string explaining the situation. + + Note there are two ways to conclude "OK": either 'sys.version' contains + the string "GCC" (implying that this Python was built with GCC), or the + installed "pyconfig.h" contains the string "__GNUC__". + """ + + # XXX since this function also checks sys.version, it's not strictly a + # "pyconfig.h" check -- should probably be renamed... + + from distutils import sysconfig + + # if sys.version contains GCC then python was compiled with GCC, and the + # pyconfig.h file should be OK + if "GCC" in sys.version: + return CONFIG_H_OK, "sys.version mentions 'GCC'" + + # Clang would also work + if "Clang" in sys.version: + return CONFIG_H_OK, "sys.version mentions 'Clang'" + + # let's see if __GNUC__ is mentioned in python.h + fn = sysconfig.get_config_h_filename() + try: + config_h = pathlib.Path(fn).read_text(encoding='utf-8') + except OSError as exc: + return (CONFIG_H_UNCERTAIN, f"couldn't read '{fn}': {exc.strerror}") + else: + substring = '__GNUC__' + if substring in config_h: + code = CONFIG_H_OK + mention_inflected = 'mentions' + else: + code = CONFIG_H_NOTOK + mention_inflected = 'does not mention' + return code, f"{fn!r} {mention_inflected} {substring!r}" + + +def is_cygwincc(cc): + """Try to determine if the compiler that would be used is from cygwin.""" + out_string = check_output(shlex.split(cc) + ['-dumpmachine']) + return out_string.strip().endswith(b'cygwin') + + +get_versions = None +""" +A stand-in for the previous get_versions() function to prevent failures +when monkeypatched. See pypa/setuptools#2969. +""" diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/setuptools/_distutils/debug.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/setuptools/_distutils/debug.py new file mode 100644 index 0000000000000000000000000000000000000000..daf1660f0d821143e388d37532a39ddfd2ca0347 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/setuptools/_distutils/debug.py @@ -0,0 +1,5 @@ +import os + +# If DISTUTILS_DEBUG is anything other than the empty string, we run in +# debug mode. +DEBUG = os.environ.get('DISTUTILS_DEBUG') diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/setuptools/_distutils/dep_util.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/setuptools/_distutils/dep_util.py new file mode 100644 index 0000000000000000000000000000000000000000..09a8a2e126c8bb009dbbe61c5c4bd5e358744996 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/setuptools/_distutils/dep_util.py @@ -0,0 +1,14 @@ +import warnings + +from . import _modified + + +def __getattr__(name): + if name not in ['newer', 'newer_group', 'newer_pairwise']: + raise AttributeError(name) + warnings.warn( + "dep_util is Deprecated. Use functions from setuptools instead.", + DeprecationWarning, + stacklevel=2, + ) + return getattr(_modified, name) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/setuptools/_distutils/dir_util.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/setuptools/_distutils/dir_util.py new file mode 100644 index 0000000000000000000000000000000000000000..d9782602cfe78d4033fb43846965347de8ad9cdf --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/setuptools/_distutils/dir_util.py @@ -0,0 +1,244 @@ +"""distutils.dir_util + +Utility functions for manipulating directories and directory trees.""" + +import functools +import itertools +import os +import pathlib + +from . import file_util +from ._log import log +from .errors import DistutilsFileError, DistutilsInternalError + + +class SkipRepeatAbsolutePaths(set): + """ + Cache for mkpath. + + In addition to cheapening redundant calls, eliminates redundant + "creating /foo/bar/baz" messages in dry-run mode. + """ + + def __init__(self): + SkipRepeatAbsolutePaths.instance = self + + @classmethod + def clear(cls): + super(cls, cls.instance).clear() + + def wrap(self, func): + @functools.wraps(func) + def wrapper(path, *args, **kwargs): + if path.absolute() in self: + return + result = func(path, *args, **kwargs) + self.add(path.absolute()) + return result + + return wrapper + + +# Python 3.8 compatibility +wrapper = SkipRepeatAbsolutePaths().wrap + + +@functools.singledispatch +@wrapper +def mkpath(name: pathlib.Path, mode=0o777, verbose=True, dry_run=False) -> None: + """Create a directory and any missing ancestor directories. + + If the directory already exists (or if 'name' is the empty string, which + means the current directory, which of course exists), then do nothing. + Raise DistutilsFileError if unable to create some directory along the way + (eg. some sub-path exists, but is a file rather than a directory). + If 'verbose' is true, log the directory created. + """ + if verbose and not name.is_dir(): + log.info("creating %s", name) + + try: + dry_run or name.mkdir(mode=mode, parents=True, exist_ok=True) + except OSError as exc: + raise DistutilsFileError(f"could not create '{name}': {exc.args[-1]}") + + +@mkpath.register +def _(name: str, *args, **kwargs): + return mkpath(pathlib.Path(name), *args, **kwargs) + + +@mkpath.register +def _(name: None, *args, **kwargs): + """ + Detect a common bug -- name is None. + """ + raise DistutilsInternalError(f"mkpath: 'name' must be a string (got {name!r})") + + +def create_tree(base_dir, files, mode=0o777, verbose=True, dry_run=False): + """Create all the empty directories under 'base_dir' needed to put 'files' + there. + + 'base_dir' is just the name of a directory which doesn't necessarily + exist yet; 'files' is a list of filenames to be interpreted relative to + 'base_dir'. 'base_dir' + the directory portion of every file in 'files' + will be created if it doesn't already exist. 'mode', 'verbose' and + 'dry_run' flags are as for 'mkpath()'. + """ + # First get the list of directories to create + need_dir = set(os.path.join(base_dir, os.path.dirname(file)) for file in files) + + # Now create them + for dir in sorted(need_dir): + mkpath(dir, mode, verbose=verbose, dry_run=dry_run) + + +def copy_tree( + src, + dst, + preserve_mode=True, + preserve_times=True, + preserve_symlinks=False, + update=False, + verbose=True, + dry_run=False, +): + """Copy an entire directory tree 'src' to a new location 'dst'. + + Both 'src' and 'dst' must be directory names. If 'src' is not a + directory, raise DistutilsFileError. If 'dst' does not exist, it is + created with 'mkpath()'. The end result of the copy is that every + file in 'src' is copied to 'dst', and directories under 'src' are + recursively copied to 'dst'. Return the list of files that were + copied or might have been copied, using their output name. The + return value is unaffected by 'update' or 'dry_run': it is simply + the list of all files under 'src', with the names changed to be + under 'dst'. + + 'preserve_mode' and 'preserve_times' are the same as for + 'copy_file'; note that they only apply to regular files, not to + directories. If 'preserve_symlinks' is true, symlinks will be + copied as symlinks (on platforms that support them!); otherwise + (the default), the destination of the symlink will be copied. + 'update' and 'verbose' are the same as for 'copy_file'. + """ + if not dry_run and not os.path.isdir(src): + raise DistutilsFileError(f"cannot copy tree '{src}': not a directory") + try: + names = os.listdir(src) + except OSError as e: + if dry_run: + names = [] + else: + raise DistutilsFileError(f"error listing files in '{src}': {e.strerror}") + + if not dry_run: + mkpath(dst, verbose=verbose) + + copy_one = functools.partial( + _copy_one, + src=src, + dst=dst, + preserve_symlinks=preserve_symlinks, + verbose=verbose, + dry_run=dry_run, + preserve_mode=preserve_mode, + preserve_times=preserve_times, + update=update, + ) + return list(itertools.chain.from_iterable(map(copy_one, names))) + + +def _copy_one( + name, + *, + src, + dst, + preserve_symlinks, + verbose, + dry_run, + preserve_mode, + preserve_times, + update, +): + src_name = os.path.join(src, name) + dst_name = os.path.join(dst, name) + + if name.startswith('.nfs'): + # skip NFS rename files + return + + if preserve_symlinks and os.path.islink(src_name): + link_dest = os.readlink(src_name) + if verbose >= 1: + log.info("linking %s -> %s", dst_name, link_dest) + if not dry_run: + os.symlink(link_dest, dst_name) + yield dst_name + + elif os.path.isdir(src_name): + yield from copy_tree( + src_name, + dst_name, + preserve_mode, + preserve_times, + preserve_symlinks, + update, + verbose=verbose, + dry_run=dry_run, + ) + else: + file_util.copy_file( + src_name, + dst_name, + preserve_mode, + preserve_times, + update, + verbose=verbose, + dry_run=dry_run, + ) + yield dst_name + + +def _build_cmdtuple(path, cmdtuples): + """Helper for remove_tree().""" + for f in os.listdir(path): + real_f = os.path.join(path, f) + if os.path.isdir(real_f) and not os.path.islink(real_f): + _build_cmdtuple(real_f, cmdtuples) + else: + cmdtuples.append((os.remove, real_f)) + cmdtuples.append((os.rmdir, path)) + + +def remove_tree(directory, verbose=True, dry_run=False): + """Recursively remove an entire directory tree. + + Any errors are ignored (apart from being reported to stdout if 'verbose' + is true). + """ + if verbose >= 1: + log.info("removing '%s' (and everything under it)", directory) + if dry_run: + return + cmdtuples = [] + _build_cmdtuple(directory, cmdtuples) + for cmd in cmdtuples: + try: + cmd[0](cmd[1]) + # Clear the cache + SkipRepeatAbsolutePaths.clear() + except OSError as exc: + log.warning("error removing %s: %s", directory, exc) + + +def ensure_relative(path): + """Take the full path 'path', and make it a relative path. + + This is useful to make 'path' the second argument to os.path.join(). + """ + drive, path = os.path.splitdrive(path) + if path[0:1] == os.sep: + path = drive + path[1:] + return path diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/setuptools/_distutils/dist.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/setuptools/_distutils/dist.py new file mode 100644 index 0000000000000000000000000000000000000000..33ed8ebd7a0e60cc1949274b1c9854c6fd72ffa5 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/setuptools/_distutils/dist.py @@ -0,0 +1,1317 @@ +"""distutils.dist + +Provides the Distribution class, which represents the module distribution +being built/installed/distributed. +""" + +from __future__ import annotations + +import contextlib +import logging +import os +import pathlib +import re +import sys +import warnings +from collections.abc import Iterable +from email import message_from_file +from typing import TYPE_CHECKING, Literal, TypeVar, overload + +from packaging.utils import canonicalize_name, canonicalize_version + +from ._log import log +from .debug import DEBUG +from .errors import ( + DistutilsArgError, + DistutilsClassError, + DistutilsModuleError, + DistutilsOptionError, +) +from .fancy_getopt import FancyGetopt, translate_longopt +from .util import check_environ, rfc822_escape, strtobool + +if TYPE_CHECKING: + # type-only import because of mutual dependence between these modules + from .cmd import Command + +_CommandT = TypeVar("_CommandT", bound="Command") + +# Regex to define acceptable Distutils command names. This is not *quite* +# the same as a Python NAME -- I don't allow leading underscores. The fact +# that they're very similar is no coincidence; the default naming scheme is +# to look for a Python module named after the command. +command_re = re.compile(r'^[a-zA-Z]([a-zA-Z0-9_]*)$') + + +def _ensure_list(value, fieldname): + if isinstance(value, str): + # a string containing comma separated values is okay. It will + # be converted to a list by Distribution.finalize_options(). + pass + elif not isinstance(value, list): + # passing a tuple or an iterator perhaps, warn and convert + typename = type(value).__name__ + msg = "Warning: '{fieldname}' should be a list, got type '{typename}'" + msg = msg.format(**locals()) + log.warning(msg) + value = list(value) + return value + + +class Distribution: + """The core of the Distutils. Most of the work hiding behind 'setup' + is really done within a Distribution instance, which farms the work out + to the Distutils commands specified on the command line. + + Setup scripts will almost never instantiate Distribution directly, + unless the 'setup()' function is totally inadequate to their needs. + However, it is conceivable that a setup script might wish to subclass + Distribution for some specialized purpose, and then pass the subclass + to 'setup()' as the 'distclass' keyword argument. If so, it is + necessary to respect the expectations that 'setup' has of Distribution. + See the code for 'setup()', in core.py, for details. + """ + + # 'global_options' describes the command-line options that may be + # supplied to the setup script prior to any actual commands. + # Eg. "./setup.py -n" or "./setup.py --quiet" both take advantage of + # these global options. This list should be kept to a bare minimum, + # since every global option is also valid as a command option -- and we + # don't want to pollute the commands with too many options that they + # have minimal control over. + # The fourth entry for verbose means that it can be repeated. + global_options = [ + ('verbose', 'v', "run verbosely (default)", 1), + ('quiet', 'q', "run quietly (turns verbosity off)"), + ('dry-run', 'n', "don't actually do anything"), + ('help', 'h', "show detailed help message"), + ('no-user-cfg', None, 'ignore pydistutils.cfg in your home directory'), + ] + + # 'common_usage' is a short (2-3 line) string describing the common + # usage of the setup script. + common_usage = """\ +Common commands: (see '--help-commands' for more) + + setup.py build will build the package underneath 'build/' + setup.py install will install the package +""" + + # options that are not propagated to the commands + display_options = [ + ('help-commands', None, "list all available commands"), + ('name', None, "print package name"), + ('version', 'V', "print package version"), + ('fullname', None, "print -"), + ('author', None, "print the author's name"), + ('author-email', None, "print the author's email address"), + ('maintainer', None, "print the maintainer's name"), + ('maintainer-email', None, "print the maintainer's email address"), + ('contact', None, "print the maintainer's name if known, else the author's"), + ( + 'contact-email', + None, + "print the maintainer's email address if known, else the author's", + ), + ('url', None, "print the URL for this package"), + ('license', None, "print the license of the package"), + ('licence', None, "alias for --license"), + ('description', None, "print the package description"), + ('long-description', None, "print the long package description"), + ('platforms', None, "print the list of platforms"), + ('classifiers', None, "print the list of classifiers"), + ('keywords', None, "print the list of keywords"), + ('provides', None, "print the list of packages/modules provided"), + ('requires', None, "print the list of packages/modules required"), + ('obsoletes', None, "print the list of packages/modules made obsolete"), + ] + display_option_names = [translate_longopt(x[0]) for x in display_options] + + # negative options are options that exclude other options + negative_opt = {'quiet': 'verbose'} + + # -- Creation/initialization methods ------------------------------- + + def __init__(self, attrs=None): # noqa: C901 + """Construct a new Distribution instance: initialize all the + attributes of a Distribution, and then use 'attrs' (a dictionary + mapping attribute names to values) to assign some of those + attributes their "real" values. (Any attributes not mentioned in + 'attrs' will be assigned to some null value: 0, None, an empty list + or dictionary, etc.) Most importantly, initialize the + 'command_obj' attribute to the empty dictionary; this will be + filled in with real command objects by 'parse_command_line()'. + """ + + # Default values for our command-line options + self.verbose = True + self.dry_run = False + self.help = False + for attr in self.display_option_names: + setattr(self, attr, False) + + # Store the distribution meta-data (name, version, author, and so + # forth) in a separate object -- we're getting to have enough + # information here (and enough command-line options) that it's + # worth it. Also delegate 'get_XXX()' methods to the 'metadata' + # object in a sneaky and underhanded (but efficient!) way. + self.metadata = DistributionMetadata() + for basename in self.metadata._METHOD_BASENAMES: + method_name = "get_" + basename + setattr(self, method_name, getattr(self.metadata, method_name)) + + # 'cmdclass' maps command names to class objects, so we + # can 1) quickly figure out which class to instantiate when + # we need to create a new command object, and 2) have a way + # for the setup script to override command classes + self.cmdclass = {} + + # 'command_packages' is a list of packages in which commands + # are searched for. The factory for command 'foo' is expected + # to be named 'foo' in the module 'foo' in one of the packages + # named here. This list is searched from the left; an error + # is raised if no named package provides the command being + # searched for. (Always access using get_command_packages().) + self.command_packages = None + + # 'script_name' and 'script_args' are usually set to sys.argv[0] + # and sys.argv[1:], but they can be overridden when the caller is + # not necessarily a setup script run from the command-line. + self.script_name = None + self.script_args: list[str] | None = None + + # 'command_options' is where we store command options between + # parsing them (from config files, the command-line, etc.) and when + # they are actually needed -- ie. when the command in question is + # instantiated. It is a dictionary of dictionaries of 2-tuples: + # command_options = { command_name : { option : (source, value) } } + self.command_options = {} + + # 'dist_files' is the list of (command, pyversion, file) that + # have been created by any dist commands run so far. This is + # filled regardless of whether the run is dry or not. pyversion + # gives sysconfig.get_python_version() if the dist file is + # specific to a Python version, 'any' if it is good for all + # Python versions on the target platform, and '' for a source + # file. pyversion should not be used to specify minimum or + # maximum required Python versions; use the metainfo for that + # instead. + self.dist_files = [] + + # These options are really the business of various commands, rather + # than of the Distribution itself. We provide aliases for them in + # Distribution as a convenience to the developer. + self.packages = None + self.package_data = {} + self.package_dir = None + self.py_modules = None + self.libraries = None + self.headers = None + self.ext_modules = None + self.ext_package = None + self.include_dirs = None + self.extra_path = None + self.scripts = None + self.data_files = None + self.password = '' + + # And now initialize bookkeeping stuff that can't be supplied by + # the caller at all. 'command_obj' maps command names to + # Command instances -- that's how we enforce that every command + # class is a singleton. + self.command_obj = {} + + # 'have_run' maps command names to boolean values; it keeps track + # of whether we have actually run a particular command, to make it + # cheap to "run" a command whenever we think we might need to -- if + # it's already been done, no need for expensive filesystem + # operations, we just check the 'have_run' dictionary and carry on. + # It's only safe to query 'have_run' for a command class that has + # been instantiated -- a false value will be inserted when the + # command object is created, and replaced with a true value when + # the command is successfully run. Thus it's probably best to use + # '.get()' rather than a straight lookup. + self.have_run = {} + + # Now we'll use the attrs dictionary (ultimately, keyword args from + # the setup script) to possibly override any or all of these + # distribution options. + + if attrs: + # Pull out the set of command options and work on them + # specifically. Note that this order guarantees that aliased + # command options will override any supplied redundantly + # through the general options dictionary. + options = attrs.get('options') + if options is not None: + del attrs['options'] + for command, cmd_options in options.items(): + opt_dict = self.get_option_dict(command) + for opt, val in cmd_options.items(): + opt_dict[opt] = ("setup script", val) + + if 'licence' in attrs: + attrs['license'] = attrs['licence'] + del attrs['licence'] + msg = "'licence' distribution option is deprecated; use 'license'" + warnings.warn(msg) + + # Now work on the rest of the attributes. Any attribute that's + # not already defined is invalid! + for key, val in attrs.items(): + if hasattr(self.metadata, "set_" + key): + getattr(self.metadata, "set_" + key)(val) + elif hasattr(self.metadata, key): + setattr(self.metadata, key, val) + elif hasattr(self, key): + setattr(self, key, val) + else: + msg = f"Unknown distribution option: {key!r}" + warnings.warn(msg) + + # no-user-cfg is handled before other command line args + # because other args override the config files, and this + # one is needed before we can load the config files. + # If attrs['script_args'] wasn't passed, assume false. + # + # This also make sure we just look at the global options + self.want_user_cfg = True + + if self.script_args is not None: + # Coerce any possible iterable from attrs into a list + self.script_args = list(self.script_args) + for arg in self.script_args: + if not arg.startswith('-'): + break + if arg == '--no-user-cfg': + self.want_user_cfg = False + break + + self.finalize_options() + + def get_option_dict(self, command): + """Get the option dictionary for a given command. If that + command's option dictionary hasn't been created yet, then create it + and return the new dictionary; otherwise, return the existing + option dictionary. + """ + dict = self.command_options.get(command) + if dict is None: + dict = self.command_options[command] = {} + return dict + + def dump_option_dicts(self, header=None, commands=None, indent=""): + from pprint import pformat + + if commands is None: # dump all command option dicts + commands = sorted(self.command_options.keys()) + + if header is not None: + self.announce(indent + header) + indent = indent + " " + + if not commands: + self.announce(indent + "no commands known yet") + return + + for cmd_name in commands: + opt_dict = self.command_options.get(cmd_name) + if opt_dict is None: + self.announce(indent + f"no option dict for '{cmd_name}' command") + else: + self.announce(indent + f"option dict for '{cmd_name}' command:") + out = pformat(opt_dict) + for line in out.split('\n'): + self.announce(indent + " " + line) + + # -- Config file finding/parsing methods --------------------------- + + def find_config_files(self): + """Find as many configuration files as should be processed for this + platform, and return a list of filenames in the order in which they + should be parsed. The filenames returned are guaranteed to exist + (modulo nasty race conditions). + + There are multiple possible config files: + - distutils.cfg in the Distutils installation directory (i.e. + where the top-level Distutils __inst__.py file lives) + - a file in the user's home directory named .pydistutils.cfg + on Unix and pydistutils.cfg on Windows/Mac; may be disabled + with the ``--no-user-cfg`` option + - setup.cfg in the current directory + - a file named by an environment variable + """ + check_environ() + files = [str(path) for path in self._gen_paths() if os.path.isfile(path)] + + if DEBUG: + self.announce("using config files: {}".format(', '.join(files))) + + return files + + def _gen_paths(self): + # The system-wide Distutils config file + sys_dir = pathlib.Path(sys.modules['distutils'].__file__).parent + yield sys_dir / "distutils.cfg" + + # The per-user config file + prefix = '.' * (os.name == 'posix') + filename = prefix + 'pydistutils.cfg' + if self.want_user_cfg: + with contextlib.suppress(RuntimeError): + yield pathlib.Path('~').expanduser() / filename + + # All platforms support local setup.cfg + yield pathlib.Path('setup.cfg') + + # Additional config indicated in the environment + with contextlib.suppress(TypeError): + yield pathlib.Path(os.getenv("DIST_EXTRA_CONFIG")) + + def parse_config_files(self, filenames=None): # noqa: C901 + from configparser import ConfigParser + + # Ignore install directory options if we have a venv + if sys.prefix != sys.base_prefix: + ignore_options = [ + 'install-base', + 'install-platbase', + 'install-lib', + 'install-platlib', + 'install-purelib', + 'install-headers', + 'install-scripts', + 'install-data', + 'prefix', + 'exec-prefix', + 'home', + 'user', + 'root', + ] + else: + ignore_options = [] + + ignore_options = frozenset(ignore_options) + + if filenames is None: + filenames = self.find_config_files() + + if DEBUG: + self.announce("Distribution.parse_config_files():") + + parser = ConfigParser() + for filename in filenames: + if DEBUG: + self.announce(f" reading {filename}") + parser.read(filename, encoding='utf-8') + for section in parser.sections(): + options = parser.options(section) + opt_dict = self.get_option_dict(section) + + for opt in options: + if opt != '__name__' and opt not in ignore_options: + val = parser.get(section, opt) + opt = opt.replace('-', '_') + opt_dict[opt] = (filename, val) + + # Make the ConfigParser forget everything (so we retain + # the original filenames that options come from) + parser.__init__() + + # If there was a "global" section in the config file, use it + # to set Distribution options. + + if 'global' in self.command_options: + for opt, (_src, val) in self.command_options['global'].items(): + alias = self.negative_opt.get(opt) + try: + if alias: + setattr(self, alias, not strtobool(val)) + elif opt in ('verbose', 'dry_run'): # ugh! + setattr(self, opt, strtobool(val)) + else: + setattr(self, opt, val) + except ValueError as msg: + raise DistutilsOptionError(msg) + + # -- Command-line parsing methods ---------------------------------- + + def parse_command_line(self): + """Parse the setup script's command line, taken from the + 'script_args' instance attribute (which defaults to 'sys.argv[1:]' + -- see 'setup()' in core.py). This list is first processed for + "global options" -- options that set attributes of the Distribution + instance. Then, it is alternately scanned for Distutils commands + and options for that command. Each new command terminates the + options for the previous command. The allowed options for a + command are determined by the 'user_options' attribute of the + command class -- thus, we have to be able to load command classes + in order to parse the command line. Any error in that 'options' + attribute raises DistutilsGetoptError; any error on the + command-line raises DistutilsArgError. If no Distutils commands + were found on the command line, raises DistutilsArgError. Return + true if command-line was successfully parsed and we should carry + on with executing commands; false if no errors but we shouldn't + execute commands (currently, this only happens if user asks for + help). + """ + # + # We now have enough information to show the Macintosh dialog + # that allows the user to interactively specify the "command line". + # + toplevel_options = self._get_toplevel_options() + + # We have to parse the command line a bit at a time -- global + # options, then the first command, then its options, and so on -- + # because each command will be handled by a different class, and + # the options that are valid for a particular class aren't known + # until we have loaded the command class, which doesn't happen + # until we know what the command is. + + self.commands = [] + parser = FancyGetopt(toplevel_options + self.display_options) + parser.set_negative_aliases(self.negative_opt) + parser.set_aliases({'licence': 'license'}) + args = parser.getopt(args=self.script_args, object=self) + option_order = parser.get_option_order() + logging.getLogger().setLevel(logging.WARN - 10 * self.verbose) + + # for display options we return immediately + if self.handle_display_options(option_order): + return + while args: + args = self._parse_command_opts(parser, args) + if args is None: # user asked for help (and got it) + return + + # Handle the cases of --help as a "global" option, ie. + # "setup.py --help" and "setup.py --help command ...". For the + # former, we show global options (--verbose, --dry-run, etc.) + # and display-only options (--name, --version, etc.); for the + # latter, we omit the display-only options and show help for + # each command listed on the command line. + if self.help: + self._show_help( + parser, display_options=len(self.commands) == 0, commands=self.commands + ) + return + + # Oops, no commands found -- an end-user error + if not self.commands: + raise DistutilsArgError("no commands supplied") + + # All is well: return true + return True + + def _get_toplevel_options(self): + """Return the non-display options recognized at the top level. + + This includes options that are recognized *only* at the top + level as well as options recognized for commands. + """ + return self.global_options + [ + ( + "command-packages=", + None, + "list of packages that provide distutils commands", + ), + ] + + def _parse_command_opts(self, parser, args): # noqa: C901 + """Parse the command-line options for a single command. + 'parser' must be a FancyGetopt instance; 'args' must be the list + of arguments, starting with the current command (whose options + we are about to parse). Returns a new version of 'args' with + the next command at the front of the list; will be the empty + list if there are no more commands on the command line. Returns + None if the user asked for help on this command. + """ + # late import because of mutual dependence between these modules + from distutils.cmd import Command + + # Pull the current command from the head of the command line + command = args[0] + if not command_re.match(command): + raise SystemExit(f"invalid command name '{command}'") + self.commands.append(command) + + # Dig up the command class that implements this command, so we + # 1) know that it's a valid command, and 2) know which options + # it takes. + try: + cmd_class = self.get_command_class(command) + except DistutilsModuleError as msg: + raise DistutilsArgError(msg) + + # Require that the command class be derived from Command -- want + # to be sure that the basic "command" interface is implemented. + if not issubclass(cmd_class, Command): + raise DistutilsClassError( + f"command class {cmd_class} must subclass Command" + ) + + # Also make sure that the command object provides a list of its + # known options. + if not ( + hasattr(cmd_class, 'user_options') + and isinstance(cmd_class.user_options, list) + ): + msg = ( + "command class %s must provide " + "'user_options' attribute (a list of tuples)" + ) + raise DistutilsClassError(msg % cmd_class) + + # If the command class has a list of negative alias options, + # merge it in with the global negative aliases. + negative_opt = self.negative_opt + if hasattr(cmd_class, 'negative_opt'): + negative_opt = negative_opt.copy() + negative_opt.update(cmd_class.negative_opt) + + # Check for help_options in command class. They have a different + # format (tuple of four) so we need to preprocess them here. + if hasattr(cmd_class, 'help_options') and isinstance( + cmd_class.help_options, list + ): + help_options = fix_help_options(cmd_class.help_options) + else: + help_options = [] + + # All commands support the global options too, just by adding + # in 'global_options'. + parser.set_option_table( + self.global_options + cmd_class.user_options + help_options + ) + parser.set_negative_aliases(negative_opt) + (args, opts) = parser.getopt(args[1:]) + if hasattr(opts, 'help') and opts.help: + self._show_help(parser, display_options=False, commands=[cmd_class]) + return + + if hasattr(cmd_class, 'help_options') and isinstance( + cmd_class.help_options, list + ): + help_option_found = 0 + for help_option, _short, _desc, func in cmd_class.help_options: + if hasattr(opts, parser.get_attr_name(help_option)): + help_option_found = 1 + if callable(func): + func() + else: + raise DistutilsClassError( + f"invalid help function {func!r} for help option '{help_option}': " + "must be a callable object (function, etc.)" + ) + + if help_option_found: + return + + # Put the options from the command-line into their official + # holding pen, the 'command_options' dictionary. + opt_dict = self.get_option_dict(command) + for name, value in vars(opts).items(): + opt_dict[name] = ("command line", value) + + return args + + def finalize_options(self): + """Set final values for all the options on the Distribution + instance, analogous to the .finalize_options() method of Command + objects. + """ + for attr in ('keywords', 'platforms'): + value = getattr(self.metadata, attr) + if value is None: + continue + if isinstance(value, str): + value = [elm.strip() for elm in value.split(',')] + setattr(self.metadata, attr, value) + + def _show_help( + self, parser, global_options=True, display_options=True, commands: Iterable = () + ): + """Show help for the setup script command-line in the form of + several lists of command-line options. 'parser' should be a + FancyGetopt instance; do not expect it to be returned in the + same state, as its option table will be reset to make it + generate the correct help text. + + If 'global_options' is true, lists the global options: + --verbose, --dry-run, etc. If 'display_options' is true, lists + the "display-only" options: --name, --version, etc. Finally, + lists per-command help for every command name or command class + in 'commands'. + """ + # late import because of mutual dependence between these modules + from distutils.cmd import Command + from distutils.core import gen_usage + + if global_options: + if display_options: + options = self._get_toplevel_options() + else: + options = self.global_options + parser.set_option_table(options) + parser.print_help(self.common_usage + "\nGlobal options:") + print() + + if display_options: + parser.set_option_table(self.display_options) + parser.print_help( + "Information display options (just display information, ignore any commands)" + ) + print() + + for command in commands: + if isinstance(command, type) and issubclass(command, Command): + klass = command + else: + klass = self.get_command_class(command) + if hasattr(klass, 'help_options') and isinstance(klass.help_options, list): + parser.set_option_table( + klass.user_options + fix_help_options(klass.help_options) + ) + else: + parser.set_option_table(klass.user_options) + parser.print_help(f"Options for '{klass.__name__}' command:") + print() + + print(gen_usage(self.script_name)) + + def handle_display_options(self, option_order): + """If there were any non-global "display-only" options + (--help-commands or the metadata display options) on the command + line, display the requested info and return true; else return + false. + """ + from distutils.core import gen_usage + + # User just wants a list of commands -- we'll print it out and stop + # processing now (ie. if they ran "setup --help-commands foo bar", + # we ignore "foo bar"). + if self.help_commands: + self.print_commands() + print() + print(gen_usage(self.script_name)) + return 1 + + # If user supplied any of the "display metadata" options, then + # display that metadata in the order in which the user supplied the + # metadata options. + any_display_options = 0 + is_display_option = set() + for option in self.display_options: + is_display_option.add(option[0]) + + for opt, val in option_order: + if val and opt in is_display_option: + opt = translate_longopt(opt) + value = getattr(self.metadata, "get_" + opt)() + if opt in ('keywords', 'platforms'): + print(','.join(value)) + elif opt in ('classifiers', 'provides', 'requires', 'obsoletes'): + print('\n'.join(value)) + else: + print(value) + any_display_options = 1 + + return any_display_options + + def print_command_list(self, commands, header, max_length): + """Print a subset of the list of all commands -- used by + 'print_commands()'. + """ + print(header + ":") + + for cmd in commands: + klass = self.cmdclass.get(cmd) + if not klass: + klass = self.get_command_class(cmd) + try: + description = klass.description + except AttributeError: + description = "(no description available)" + + print(f" {cmd:<{max_length}} {description}") + + def print_commands(self): + """Print out a help message listing all available commands with a + description of each. The list is divided into "standard commands" + (listed in distutils.command.__all__) and "extra commands" + (mentioned in self.cmdclass, but not a standard command). The + descriptions come from the command class attribute + 'description'. + """ + import distutils.command + + std_commands = distutils.command.__all__ + is_std = set(std_commands) + + extra_commands = [cmd for cmd in self.cmdclass.keys() if cmd not in is_std] + + max_length = 0 + for cmd in std_commands + extra_commands: + if len(cmd) > max_length: + max_length = len(cmd) + + self.print_command_list(std_commands, "Standard commands", max_length) + if extra_commands: + print() + self.print_command_list(extra_commands, "Extra commands", max_length) + + def get_command_list(self): + """Get a list of (command, description) tuples. + The list is divided into "standard commands" (listed in + distutils.command.__all__) and "extra commands" (mentioned in + self.cmdclass, but not a standard command). The descriptions come + from the command class attribute 'description'. + """ + # Currently this is only used on Mac OS, for the Mac-only GUI + # Distutils interface (by Jack Jansen) + import distutils.command + + std_commands = distutils.command.__all__ + is_std = set(std_commands) + + extra_commands = [cmd for cmd in self.cmdclass.keys() if cmd not in is_std] + + rv = [] + for cmd in std_commands + extra_commands: + klass = self.cmdclass.get(cmd) + if not klass: + klass = self.get_command_class(cmd) + try: + description = klass.description + except AttributeError: + description = "(no description available)" + rv.append((cmd, description)) + return rv + + # -- Command class/object methods ---------------------------------- + + def get_command_packages(self): + """Return a list of packages from which commands are loaded.""" + pkgs = self.command_packages + if not isinstance(pkgs, list): + if pkgs is None: + pkgs = '' + pkgs = [pkg.strip() for pkg in pkgs.split(',') if pkg != ''] + if "distutils.command" not in pkgs: + pkgs.insert(0, "distutils.command") + self.command_packages = pkgs + return pkgs + + def get_command_class(self, command): + """Return the class that implements the Distutils command named by + 'command'. First we check the 'cmdclass' dictionary; if the + command is mentioned there, we fetch the class object from the + dictionary and return it. Otherwise we load the command module + ("distutils.command." + command) and fetch the command class from + the module. The loaded class is also stored in 'cmdclass' + to speed future calls to 'get_command_class()'. + + Raises DistutilsModuleError if the expected module could not be + found, or if that module does not define the expected class. + """ + klass = self.cmdclass.get(command) + if klass: + return klass + + for pkgname in self.get_command_packages(): + module_name = f"{pkgname}.{command}" + klass_name = command + + try: + __import__(module_name) + module = sys.modules[module_name] + except ImportError: + continue + + try: + klass = getattr(module, klass_name) + except AttributeError: + raise DistutilsModuleError( + f"invalid command '{command}' (no class '{klass_name}' in module '{module_name}')" + ) + + self.cmdclass[command] = klass + return klass + + raise DistutilsModuleError(f"invalid command '{command}'") + + @overload + def get_command_obj( + self, command: str, create: Literal[True] = True + ) -> Command: ... + @overload + def get_command_obj( + self, command: str, create: Literal[False] + ) -> Command | None: ... + def get_command_obj(self, command: str, create: bool = True) -> Command | None: + """Return the command object for 'command'. Normally this object + is cached on a previous call to 'get_command_obj()'; if no command + object for 'command' is in the cache, then we either create and + return it (if 'create' is true) or return None. + """ + cmd_obj = self.command_obj.get(command) + if not cmd_obj and create: + if DEBUG: + self.announce( + "Distribution.get_command_obj(): " + f"creating '{command}' command object" + ) + + klass = self.get_command_class(command) + cmd_obj = self.command_obj[command] = klass(self) + self.have_run[command] = False + + # Set any options that were supplied in config files + # or on the command line. (NB. support for error + # reporting is lame here: any errors aren't reported + # until 'finalize_options()' is called, which means + # we won't report the source of the error.) + options = self.command_options.get(command) + if options: + self._set_command_options(cmd_obj, options) + + return cmd_obj + + def _set_command_options(self, command_obj, option_dict=None): # noqa: C901 + """Set the options for 'command_obj' from 'option_dict'. Basically + this means copying elements of a dictionary ('option_dict') to + attributes of an instance ('command'). + + 'command_obj' must be a Command instance. If 'option_dict' is not + supplied, uses the standard option dictionary for this command + (from 'self.command_options'). + """ + command_name = command_obj.get_command_name() + if option_dict is None: + option_dict = self.get_option_dict(command_name) + + if DEBUG: + self.announce(f" setting options for '{command_name}' command:") + for option, (source, value) in option_dict.items(): + if DEBUG: + self.announce(f" {option} = {value} (from {source})") + try: + bool_opts = [translate_longopt(o) for o in command_obj.boolean_options] + except AttributeError: + bool_opts = [] + try: + neg_opt = command_obj.negative_opt + except AttributeError: + neg_opt = {} + + try: + is_string = isinstance(value, str) + if option in neg_opt and is_string: + setattr(command_obj, neg_opt[option], not strtobool(value)) + elif option in bool_opts and is_string: + setattr(command_obj, option, strtobool(value)) + elif hasattr(command_obj, option): + setattr(command_obj, option, value) + else: + raise DistutilsOptionError( + f"error in {source}: command '{command_name}' has no such option '{option}'" + ) + except ValueError as msg: + raise DistutilsOptionError(msg) + + @overload + def reinitialize_command( + self, command: str, reinit_subcommands: bool = False + ) -> Command: ... + @overload + def reinitialize_command( + self, command: _CommandT, reinit_subcommands: bool = False + ) -> _CommandT: ... + def reinitialize_command( + self, command: str | Command, reinit_subcommands=False + ) -> Command: + """Reinitializes a command to the state it was in when first + returned by 'get_command_obj()': ie., initialized but not yet + finalized. This provides the opportunity to sneak option + values in programmatically, overriding or supplementing + user-supplied values from the config files and command line. + You'll have to re-finalize the command object (by calling + 'finalize_options()' or 'ensure_finalized()') before using it for + real. + + 'command' should be a command name (string) or command object. If + 'reinit_subcommands' is true, also reinitializes the command's + sub-commands, as declared by the 'sub_commands' class attribute (if + it has one). See the "install" command for an example. Only + reinitializes the sub-commands that actually matter, ie. those + whose test predicates return true. + + Returns the reinitialized command object. + """ + from distutils.cmd import Command + + if not isinstance(command, Command): + command_name = command + command = self.get_command_obj(command_name) + else: + command_name = command.get_command_name() + + if not command.finalized: + return command + command.initialize_options() + command.finalized = False + self.have_run[command_name] = False + self._set_command_options(command) + + if reinit_subcommands: + for sub in command.get_sub_commands(): + self.reinitialize_command(sub, reinit_subcommands) + + return command + + # -- Methods that operate on the Distribution ---------------------- + + def announce(self, msg, level=logging.INFO): + log.log(level, msg) + + def run_commands(self): + """Run each command that was seen on the setup script command line. + Uses the list of commands found and cache of command objects + created by 'get_command_obj()'. + """ + for cmd in self.commands: + self.run_command(cmd) + + # -- Methods that operate on its Commands -------------------------- + + def run_command(self, command): + """Do whatever it takes to run a command (including nothing at all, + if the command has already been run). Specifically: if we have + already created and run the command named by 'command', return + silently without doing anything. If the command named by 'command' + doesn't even have a command object yet, create one. Then invoke + 'run()' on that command object (or an existing one). + """ + # Already been here, done that? then return silently. + if self.have_run.get(command): + return + + log.info("running %s", command) + cmd_obj = self.get_command_obj(command) + cmd_obj.ensure_finalized() + cmd_obj.run() + self.have_run[command] = True + + # -- Distribution query methods ------------------------------------ + + def has_pure_modules(self): + return len(self.packages or self.py_modules or []) > 0 + + def has_ext_modules(self): + return self.ext_modules and len(self.ext_modules) > 0 + + def has_c_libraries(self): + return self.libraries and len(self.libraries) > 0 + + def has_modules(self): + return self.has_pure_modules() or self.has_ext_modules() + + def has_headers(self): + return self.headers and len(self.headers) > 0 + + def has_scripts(self): + return self.scripts and len(self.scripts) > 0 + + def has_data_files(self): + return self.data_files and len(self.data_files) > 0 + + def is_pure(self): + return ( + self.has_pure_modules() + and not self.has_ext_modules() + and not self.has_c_libraries() + ) + + # -- Metadata query methods ---------------------------------------- + + # If you're looking for 'get_name()', 'get_version()', and so forth, + # they are defined in a sneaky way: the constructor binds self.get_XXX + # to self.metadata.get_XXX. The actual code is in the + # DistributionMetadata class, below. + + +class DistributionMetadata: + """Dummy class to hold the distribution meta-data: name, version, + author, and so forth. + """ + + _METHOD_BASENAMES = ( + "name", + "version", + "author", + "author_email", + "maintainer", + "maintainer_email", + "url", + "license", + "description", + "long_description", + "keywords", + "platforms", + "fullname", + "contact", + "contact_email", + "classifiers", + "download_url", + # PEP 314 + "provides", + "requires", + "obsoletes", + ) + + def __init__(self, path=None): + if path is not None: + self.read_pkg_file(open(path)) + else: + self.name = None + self.version = None + self.author = None + self.author_email = None + self.maintainer = None + self.maintainer_email = None + self.url = None + self.license = None + self.description = None + self.long_description = None + self.keywords = None + self.platforms = None + self.classifiers = None + self.download_url = None + # PEP 314 + self.provides = None + self.requires = None + self.obsoletes = None + + def read_pkg_file(self, file): + """Reads the metadata values from a file object.""" + msg = message_from_file(file) + + def _read_field(name): + value = msg[name] + if value and value != "UNKNOWN": + return value + + def _read_list(name): + values = msg.get_all(name, None) + if values == []: + return None + return values + + metadata_version = msg['metadata-version'] + self.name = _read_field('name') + self.version = _read_field('version') + self.description = _read_field('summary') + # we are filling author only. + self.author = _read_field('author') + self.maintainer = None + self.author_email = _read_field('author-email') + self.maintainer_email = None + self.url = _read_field('home-page') + self.license = _read_field('license') + + if 'download-url' in msg: + self.download_url = _read_field('download-url') + else: + self.download_url = None + + self.long_description = _read_field('description') + self.description = _read_field('summary') + + if 'keywords' in msg: + self.keywords = _read_field('keywords').split(',') + + self.platforms = _read_list('platform') + self.classifiers = _read_list('classifier') + + # PEP 314 - these fields only exist in 1.1 + if metadata_version == '1.1': + self.requires = _read_list('requires') + self.provides = _read_list('provides') + self.obsoletes = _read_list('obsoletes') + else: + self.requires = None + self.provides = None + self.obsoletes = None + + def write_pkg_info(self, base_dir): + """Write the PKG-INFO file into the release tree.""" + with open( + os.path.join(base_dir, 'PKG-INFO'), 'w', encoding='UTF-8' + ) as pkg_info: + self.write_pkg_file(pkg_info) + + def write_pkg_file(self, file): + """Write the PKG-INFO format data to a file object.""" + version = '1.0' + if ( + self.provides + or self.requires + or self.obsoletes + or self.classifiers + or self.download_url + ): + version = '1.1' + + # required fields + file.write(f'Metadata-Version: {version}\n') + file.write(f'Name: {self.get_name()}\n') + file.write(f'Version: {self.get_version()}\n') + + def maybe_write(header, val): + if val: + file.write(f"{header}: {val}\n") + + # optional fields + maybe_write("Summary", self.get_description()) + maybe_write("Home-page", self.get_url()) + maybe_write("Author", self.get_contact()) + maybe_write("Author-email", self.get_contact_email()) + maybe_write("License", self.get_license()) + maybe_write("Download-URL", self.download_url) + maybe_write("Description", rfc822_escape(self.get_long_description() or "")) + maybe_write("Keywords", ",".join(self.get_keywords())) + + self._write_list(file, 'Platform', self.get_platforms()) + self._write_list(file, 'Classifier', self.get_classifiers()) + + # PEP 314 + self._write_list(file, 'Requires', self.get_requires()) + self._write_list(file, 'Provides', self.get_provides()) + self._write_list(file, 'Obsoletes', self.get_obsoletes()) + + def _write_list(self, file, name, values): + values = values or [] + for value in values: + file.write(f'{name}: {value}\n') + + # -- Metadata query methods ---------------------------------------- + + def get_name(self): + return self.name or "UNKNOWN" + + def get_version(self): + return self.version or "0.0.0" + + def get_fullname(self): + return self._fullname(self.get_name(), self.get_version()) + + @staticmethod + def _fullname(name: str, version: str) -> str: + """ + >>> DistributionMetadata._fullname('setup.tools', '1.0-2') + 'setup_tools-1.0.post2' + >>> DistributionMetadata._fullname('setup-tools', '1.2post2') + 'setup_tools-1.2.post2' + >>> DistributionMetadata._fullname('setup-tools', '1.0-r2') + 'setup_tools-1.0.post2' + >>> DistributionMetadata._fullname('setup.tools', '1.0.post') + 'setup_tools-1.0.post0' + >>> DistributionMetadata._fullname('setup.tools', '1.0+ubuntu-1') + 'setup_tools-1.0+ubuntu.1' + """ + return "{}-{}".format( + canonicalize_name(name).replace('-', '_'), + canonicalize_version(version, strip_trailing_zero=False), + ) + + def get_author(self): + return self.author + + def get_author_email(self): + return self.author_email + + def get_maintainer(self): + return self.maintainer + + def get_maintainer_email(self): + return self.maintainer_email + + def get_contact(self): + return self.maintainer or self.author + + def get_contact_email(self): + return self.maintainer_email or self.author_email + + def get_url(self): + return self.url + + def get_license(self): + return self.license + + get_licence = get_license + + def get_description(self): + return self.description + + def get_long_description(self): + return self.long_description + + def get_keywords(self): + return self.keywords or [] + + def set_keywords(self, value): + self.keywords = _ensure_list(value, 'keywords') + + def get_platforms(self): + return self.platforms + + def set_platforms(self, value): + self.platforms = _ensure_list(value, 'platforms') + + def get_classifiers(self): + return self.classifiers or [] + + def set_classifiers(self, value): + self.classifiers = _ensure_list(value, 'classifiers') + + def get_download_url(self): + return self.download_url + + # PEP 314 + def get_requires(self): + return self.requires or [] + + def set_requires(self, value): + import distutils.versionpredicate + + for v in value: + distutils.versionpredicate.VersionPredicate(v) + self.requires = list(value) + + def get_provides(self): + return self.provides or [] + + def set_provides(self, value): + value = [v.strip() for v in value] + for v in value: + import distutils.versionpredicate + + distutils.versionpredicate.split_provision(v) + self.provides = value + + def get_obsoletes(self): + return self.obsoletes or [] + + def set_obsoletes(self, value): + import distutils.versionpredicate + + for v in value: + distutils.versionpredicate.VersionPredicate(v) + self.obsoletes = list(value) + + +def fix_help_options(options): + """Convert a 4-tuple 'help_options' list as found in various command + classes to the 3-tuple form required by FancyGetopt. + """ + return [opt[0:3] for opt in options] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/setuptools/_distutils/errors.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/setuptools/_distutils/errors.py new file mode 100644 index 0000000000000000000000000000000000000000..3196a4f0972bbbb08556398e45dd1cef73a69f88 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/setuptools/_distutils/errors.py @@ -0,0 +1,124 @@ +""" +Exceptions used by the Distutils modules. + +Distutils modules may raise these or standard exceptions, +including :exc:`SystemExit`. +""" + + +class DistutilsError(Exception): + """The root of all Distutils evil.""" + + pass + + +class DistutilsModuleError(DistutilsError): + """Unable to load an expected module, or to find an expected class + within some module (in particular, command modules and classes).""" + + pass + + +class DistutilsClassError(DistutilsError): + """Some command class (or possibly distribution class, if anyone + feels a need to subclass Distribution) is found not to be holding + up its end of the bargain, ie. implementing some part of the + "command "interface.""" + + pass + + +class DistutilsGetoptError(DistutilsError): + """The option table provided to 'fancy_getopt()' is bogus.""" + + pass + + +class DistutilsArgError(DistutilsError): + """Raised by fancy_getopt in response to getopt.error -- ie. an + error in the command line usage.""" + + pass + + +class DistutilsFileError(DistutilsError): + """Any problems in the filesystem: expected file not found, etc. + Typically this is for problems that we detect before OSError + could be raised.""" + + pass + + +class DistutilsOptionError(DistutilsError): + """Syntactic/semantic errors in command options, such as use of + mutually conflicting options, or inconsistent options, + badly-spelled values, etc. No distinction is made between option + values originating in the setup script, the command line, config + files, or what-have-you -- but if we *know* something originated in + the setup script, we'll raise DistutilsSetupError instead.""" + + pass + + +class DistutilsSetupError(DistutilsError): + """For errors that can be definitely blamed on the setup script, + such as invalid keyword arguments to 'setup()'.""" + + pass + + +class DistutilsPlatformError(DistutilsError): + """We don't know how to do something on the current platform (but + we do know how to do it on some platform) -- eg. trying to compile + C files on a platform not supported by a CCompiler subclass.""" + + pass + + +class DistutilsExecError(DistutilsError): + """Any problems executing an external program (such as the C + compiler, when compiling C files).""" + + pass + + +class DistutilsInternalError(DistutilsError): + """Internal inconsistencies or impossibilities (obviously, this + should never be seen if the code is working!).""" + + pass + + +class DistutilsTemplateError(DistutilsError): + """Syntax error in a file list template.""" + + +class DistutilsByteCompileError(DistutilsError): + """Byte compile error.""" + + +# Exception classes used by the CCompiler implementation classes +class CCompilerError(Exception): + """Some compile/link operation failed.""" + + +class PreprocessError(CCompilerError): + """Failure to preprocess one or more C/C++ files.""" + + +class CompileError(CCompilerError): + """Failure to compile one or more C/C++ source files.""" + + +class LibError(CCompilerError): + """Failure to create a static library from one or more C/C++ object + files.""" + + +class LinkError(CCompilerError): + """Failure to link one or more C/C++ object files into an executable + or shared library file.""" + + +class UnknownFileError(CCompilerError): + """Attempt to process an unknown file type.""" diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/setuptools/_distutils/extension.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/setuptools/_distutils/extension.py new file mode 100644 index 0000000000000000000000000000000000000000..e0532734362846053e40d14fea59b5f83ceadaa4 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/setuptools/_distutils/extension.py @@ -0,0 +1,255 @@ +"""distutils.extension + +Provides the Extension class, used to describe C/C++ extension +modules in setup scripts.""" + +import os +import warnings + +# This class is really only used by the "build_ext" command, so it might +# make sense to put it in distutils.command.build_ext. However, that +# module is already big enough, and I want to make this class a bit more +# complex to simplify some common cases ("foo" module in "foo.c") and do +# better error-checking ("foo.c" actually exists). +# +# Also, putting this in build_ext.py means every setup script would have to +# import that large-ish module (indirectly, through distutils.core) in +# order to do anything. + + +class Extension: + """Just a collection of attributes that describes an extension + module and everything needed to build it (hopefully in a portable + way, but there are hooks that let you be as unportable as you need). + + Instance attributes: + name : string + the full name of the extension, including any packages -- ie. + *not* a filename or pathname, but Python dotted name + sources : Iterable[string | os.PathLike] + iterable of source filenames (except strings, which could be misinterpreted + as a single filename), relative to the distribution root (where the setup + script lives), in Unix form (slash-separated) for portability. Can be any + non-string iterable (list, tuple, set, etc.) containing strings or + PathLike objects. Source files may be C, C++, SWIG (.i), platform-specific + resource files, or whatever else is recognized by the "build_ext" command + as source for a Python extension. + include_dirs : [string] + list of directories to search for C/C++ header files (in Unix + form for portability) + define_macros : [(name : string, value : string|None)] + list of macros to define; each macro is defined using a 2-tuple, + where 'value' is either the string to define it to or None to + define it without a particular value (equivalent of "#define + FOO" in source or -DFOO on Unix C compiler command line) + undef_macros : [string] + list of macros to undefine explicitly + library_dirs : [string] + list of directories to search for C/C++ libraries at link time + libraries : [string] + list of library names (not filenames or paths) to link against + runtime_library_dirs : [string] + list of directories to search for C/C++ libraries at run time + (for shared extensions, this is when the extension is loaded) + extra_objects : [string] + list of extra files to link with (eg. object files not implied + by 'sources', static library that must be explicitly specified, + binary resource files, etc.) + extra_compile_args : [string] + any extra platform- and compiler-specific information to use + when compiling the source files in 'sources'. For platforms and + compilers where "command line" makes sense, this is typically a + list of command-line arguments, but for other platforms it could + be anything. + extra_link_args : [string] + any extra platform- and compiler-specific information to use + when linking object files together to create the extension (or + to create a new static Python interpreter). Similar + interpretation as for 'extra_compile_args'. + export_symbols : [string] + list of symbols to be exported from a shared extension. Not + used on all platforms, and not generally necessary for Python + extensions, which typically export exactly one symbol: "init" + + extension_name. + swig_opts : [string] + any extra options to pass to SWIG if a source file has the .i + extension. + depends : [string] + list of files that the extension depends on + language : string + extension language (i.e. "c", "c++", "objc"). Will be detected + from the source extensions if not provided. + optional : boolean + specifies that a build failure in the extension should not abort the + build process, but simply not install the failing extension. + """ + + # When adding arguments to this constructor, be sure to update + # setup_keywords in core.py. + def __init__( + self, + name, + sources, + include_dirs=None, + define_macros=None, + undef_macros=None, + library_dirs=None, + libraries=None, + runtime_library_dirs=None, + extra_objects=None, + extra_compile_args=None, + extra_link_args=None, + export_symbols=None, + swig_opts=None, + depends=None, + language=None, + optional=None, + **kw, # To catch unknown keywords + ): + if not isinstance(name, str): + raise TypeError("'name' must be a string") + + # handle the string case first; since strings are iterable, disallow them + if isinstance(sources, str): + raise TypeError( + "'sources' must be an iterable of strings or PathLike objects, not a string" + ) + + # now we check if it's iterable and contains valid types + try: + self.sources = list(map(os.fspath, sources)) + except TypeError: + raise TypeError( + "'sources' must be an iterable of strings or PathLike objects" + ) + + self.name = name + self.include_dirs = include_dirs or [] + self.define_macros = define_macros or [] + self.undef_macros = undef_macros or [] + self.library_dirs = library_dirs or [] + self.libraries = libraries or [] + self.runtime_library_dirs = runtime_library_dirs or [] + self.extra_objects = extra_objects or [] + self.extra_compile_args = extra_compile_args or [] + self.extra_link_args = extra_link_args or [] + self.export_symbols = export_symbols or [] + self.swig_opts = swig_opts or [] + self.depends = depends or [] + self.language = language + self.optional = optional + + # If there are unknown keyword options, warn about them + if len(kw) > 0: + options = [repr(option) for option in kw] + options = ', '.join(sorted(options)) + msg = f"Unknown Extension options: {options}" + warnings.warn(msg) + + def __repr__(self): + return f'<{self.__class__.__module__}.{self.__class__.__qualname__}({self.name!r}) at {id(self):#x}>' + + +def read_setup_file(filename): # noqa: C901 + """Reads a Setup file and returns Extension instances.""" + from distutils.sysconfig import _variable_rx, expand_makefile_vars, parse_makefile + from distutils.text_file import TextFile + from distutils.util import split_quoted + + # First pass over the file to gather "VAR = VALUE" assignments. + vars = parse_makefile(filename) + + # Second pass to gobble up the real content: lines of the form + # ... [ ...] [ ...] [ ...] + file = TextFile( + filename, + strip_comments=True, + skip_blanks=True, + join_lines=True, + lstrip_ws=True, + rstrip_ws=True, + ) + try: + extensions = [] + + while True: + line = file.readline() + if line is None: # eof + break + if _variable_rx.match(line): # VAR=VALUE, handled in first pass + continue + + if line[0] == line[-1] == "*": + file.warn(f"'{line}' lines not handled yet") + continue + + line = expand_makefile_vars(line, vars) + words = split_quoted(line) + + # NB. this parses a slightly different syntax than the old + # makesetup script: here, there must be exactly one extension per + # line, and it must be the first word of the line. I have no idea + # why the old syntax supported multiple extensions per line, as + # they all wind up being the same. + + module = words[0] + ext = Extension(module, []) + append_next_word = None + + for word in words[1:]: + if append_next_word is not None: + append_next_word.append(word) + append_next_word = None + continue + + suffix = os.path.splitext(word)[1] + switch = word[0:2] + value = word[2:] + + if suffix in (".c", ".cc", ".cpp", ".cxx", ".c++", ".m", ".mm"): + # hmm, should we do something about C vs. C++ sources? + # or leave it up to the CCompiler implementation to + # worry about? + ext.sources.append(word) + elif switch == "-I": + ext.include_dirs.append(value) + elif switch == "-D": + equals = value.find("=") + if equals == -1: # bare "-DFOO" -- no value + ext.define_macros.append((value, None)) + else: # "-DFOO=blah" + ext.define_macros.append((value[0:equals], value[equals + 2 :])) + elif switch == "-U": + ext.undef_macros.append(value) + elif switch == "-C": # only here 'cause makesetup has it! + ext.extra_compile_args.append(word) + elif switch == "-l": + ext.libraries.append(value) + elif switch == "-L": + ext.library_dirs.append(value) + elif switch == "-R": + ext.runtime_library_dirs.append(value) + elif word == "-rpath": + append_next_word = ext.runtime_library_dirs + elif word == "-Xlinker": + append_next_word = ext.extra_link_args + elif word == "-Xcompiler": + append_next_word = ext.extra_compile_args + elif switch == "-u": + ext.extra_link_args.append(word) + if not value: + append_next_word = ext.extra_link_args + elif suffix in (".a", ".so", ".sl", ".o", ".dylib"): + # NB. a really faithful emulation of makesetup would + # append a .o file to extra_objects only if it + # had a slash in it; otherwise, it would s/.o/.c/ + # and append it to sources. Hmmmm. + ext.extra_objects.append(word) + else: + file.warn(f"unrecognized argument '{word}'") + + extensions.append(ext) + finally: + file.close() + + return extensions diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/setuptools/_distutils/fancy_getopt.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/setuptools/_distutils/fancy_getopt.py new file mode 100644 index 0000000000000000000000000000000000000000..1a1d3a05da544b98ff84d03ecbee1185b2eb45f7 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/setuptools/_distutils/fancy_getopt.py @@ -0,0 +1,471 @@ +"""distutils.fancy_getopt + +Wrapper around the standard getopt module that provides the following +additional features: + * short and long options are tied together + * options have help strings, so fancy_getopt could potentially + create a complete usage summary + * options set attributes of a passed-in object +""" + +from __future__ import annotations + +import getopt +import re +import string +import sys +from collections.abc import Sequence +from typing import Any + +from .errors import DistutilsArgError, DistutilsGetoptError + +# Much like command_re in distutils.core, this is close to but not quite +# the same as a Python NAME -- except, in the spirit of most GNU +# utilities, we use '-' in place of '_'. (The spirit of LISP lives on!) +# The similarities to NAME are again not a coincidence... +longopt_pat = r'[a-zA-Z](?:[a-zA-Z0-9-]*)' +longopt_re = re.compile(rf'^{longopt_pat}$') + +# For recognizing "negative alias" options, eg. "quiet=!verbose" +neg_alias_re = re.compile(f"^({longopt_pat})=!({longopt_pat})$") + +# This is used to translate long options to legitimate Python identifiers +# (for use as attributes of some object). +longopt_xlate = str.maketrans('-', '_') + + +class FancyGetopt: + """Wrapper around the standard 'getopt()' module that provides some + handy extra functionality: + * short and long options are tied together + * options have help strings, and help text can be assembled + from them + * options set attributes of a passed-in object + * boolean options can have "negative aliases" -- eg. if + --quiet is the "negative alias" of --verbose, then "--quiet" + on the command line sets 'verbose' to false + """ + + def __init__(self, option_table=None): + # The option table is (currently) a list of tuples. The + # tuples may have 3 or four values: + # (long_option, short_option, help_string [, repeatable]) + # if an option takes an argument, its long_option should have '=' + # appended; short_option should just be a single character, no ':' + # in any case. If a long_option doesn't have a corresponding + # short_option, short_option should be None. All option tuples + # must have long options. + self.option_table = option_table + + # 'option_index' maps long option names to entries in the option + # table (ie. those 3-tuples). + self.option_index = {} + if self.option_table: + self._build_index() + + # 'alias' records (duh) alias options; {'foo': 'bar'} means + # --foo is an alias for --bar + self.alias = {} + + # 'negative_alias' keeps track of options that are the boolean + # opposite of some other option + self.negative_alias = {} + + # These keep track of the information in the option table. We + # don't actually populate these structures until we're ready to + # parse the command-line, since the 'option_table' passed in here + # isn't necessarily the final word. + self.short_opts = [] + self.long_opts = [] + self.short2long = {} + self.attr_name = {} + self.takes_arg = {} + + # And 'option_order' is filled up in 'getopt()'; it records the + # original order of options (and their values) on the command-line, + # but expands short options, converts aliases, etc. + self.option_order = [] + + def _build_index(self): + self.option_index.clear() + for option in self.option_table: + self.option_index[option[0]] = option + + def set_option_table(self, option_table): + self.option_table = option_table + self._build_index() + + def add_option(self, long_option, short_option=None, help_string=None): + if long_option in self.option_index: + raise DistutilsGetoptError( + f"option conflict: already an option '{long_option}'" + ) + else: + option = (long_option, short_option, help_string) + self.option_table.append(option) + self.option_index[long_option] = option + + def has_option(self, long_option): + """Return true if the option table for this parser has an + option with long name 'long_option'.""" + return long_option in self.option_index + + def get_attr_name(self, long_option): + """Translate long option name 'long_option' to the form it + has as an attribute of some object: ie., translate hyphens + to underscores.""" + return long_option.translate(longopt_xlate) + + def _check_alias_dict(self, aliases, what): + assert isinstance(aliases, dict) + for alias, opt in aliases.items(): + if alias not in self.option_index: + raise DistutilsGetoptError( + f"invalid {what} '{alias}': option '{alias}' not defined" + ) + if opt not in self.option_index: + raise DistutilsGetoptError( + f"invalid {what} '{alias}': aliased option '{opt}' not defined" + ) + + def set_aliases(self, alias): + """Set the aliases for this option parser.""" + self._check_alias_dict(alias, "alias") + self.alias = alias + + def set_negative_aliases(self, negative_alias): + """Set the negative aliases for this option parser. + 'negative_alias' should be a dictionary mapping option names to + option names, both the key and value must already be defined + in the option table.""" + self._check_alias_dict(negative_alias, "negative alias") + self.negative_alias = negative_alias + + def _grok_option_table(self): # noqa: C901 + """Populate the various data structures that keep tabs on the + option table. Called by 'getopt()' before it can do anything + worthwhile. + """ + self.long_opts = [] + self.short_opts = [] + self.short2long.clear() + self.repeat = {} + + for option in self.option_table: + if len(option) == 3: + long, short, help = option + repeat = 0 + elif len(option) == 4: + long, short, help, repeat = option + else: + # the option table is part of the code, so simply + # assert that it is correct + raise ValueError(f"invalid option tuple: {option!r}") + + # Type- and value-check the option names + if not isinstance(long, str) or len(long) < 2: + raise DistutilsGetoptError( + f"invalid long option '{long}': must be a string of length >= 2" + ) + + if not ((short is None) or (isinstance(short, str) and len(short) == 1)): + raise DistutilsGetoptError( + f"invalid short option '{short}': must a single character or None" + ) + + self.repeat[long] = repeat + self.long_opts.append(long) + + if long[-1] == '=': # option takes an argument? + if short: + short = short + ':' + long = long[0:-1] + self.takes_arg[long] = True + else: + # Is option is a "negative alias" for some other option (eg. + # "quiet" == "!verbose")? + alias_to = self.negative_alias.get(long) + if alias_to is not None: + if self.takes_arg[alias_to]: + raise DistutilsGetoptError( + f"invalid negative alias '{long}': " + f"aliased option '{alias_to}' takes a value" + ) + + self.long_opts[-1] = long # XXX redundant?! + self.takes_arg[long] = False + + # If this is an alias option, make sure its "takes arg" flag is + # the same as the option it's aliased to. + alias_to = self.alias.get(long) + if alias_to is not None: + if self.takes_arg[long] != self.takes_arg[alias_to]: + raise DistutilsGetoptError( + f"invalid alias '{long}': inconsistent with " + f"aliased option '{alias_to}' (one of them takes a value, " + "the other doesn't" + ) + + # Now enforce some bondage on the long option name, so we can + # later translate it to an attribute name on some object. Have + # to do this a bit late to make sure we've removed any trailing + # '='. + if not longopt_re.match(long): + raise DistutilsGetoptError( + f"invalid long option name '{long}' " + "(must be letters, numbers, hyphens only" + ) + + self.attr_name[long] = self.get_attr_name(long) + if short: + self.short_opts.append(short) + self.short2long[short[0]] = long + + def getopt(self, args: Sequence[str] | None = None, object=None): # noqa: C901 + """Parse command-line options in args. Store as attributes on object. + + If 'args' is None or not supplied, uses 'sys.argv[1:]'. If + 'object' is None or not supplied, creates a new OptionDummy + object, stores option values there, and returns a tuple (args, + object). If 'object' is supplied, it is modified in place and + 'getopt()' just returns 'args'; in both cases, the returned + 'args' is a modified copy of the passed-in 'args' list, which + is left untouched. + """ + if args is None: + args = sys.argv[1:] + if object is None: + object = OptionDummy() + created_object = True + else: + created_object = False + + self._grok_option_table() + + short_opts = ' '.join(self.short_opts) + try: + opts, args = getopt.getopt(args, short_opts, self.long_opts) + except getopt.error as msg: + raise DistutilsArgError(msg) + + for opt, val in opts: + if len(opt) == 2 and opt[0] == '-': # it's a short option + opt = self.short2long[opt[1]] + else: + assert len(opt) > 2 and opt[:2] == '--' + opt = opt[2:] + + alias = self.alias.get(opt) + if alias: + opt = alias + + if not self.takes_arg[opt]: # boolean option? + assert val == '', "boolean option can't have value" + alias = self.negative_alias.get(opt) + if alias: + opt = alias + val = 0 + else: + val = 1 + + attr = self.attr_name[opt] + # The only repeating option at the moment is 'verbose'. + # It has a negative option -q quiet, which should set verbose = False. + if val and self.repeat.get(attr) is not None: + val = getattr(object, attr, 0) + 1 + setattr(object, attr, val) + self.option_order.append((opt, val)) + + # for opts + if created_object: + return args, object + else: + return args + + def get_option_order(self): + """Returns the list of (option, value) tuples processed by the + previous run of 'getopt()'. Raises RuntimeError if + 'getopt()' hasn't been called yet. + """ + if self.option_order is None: + raise RuntimeError("'getopt()' hasn't been called yet") + else: + return self.option_order + + def generate_help(self, header=None): # noqa: C901 + """Generate help text (a list of strings, one per suggested line of + output) from the option table for this FancyGetopt object. + """ + # Blithely assume the option table is good: probably wouldn't call + # 'generate_help()' unless you've already called 'getopt()'. + + # First pass: determine maximum length of long option names + max_opt = 0 + for option in self.option_table: + long = option[0] + short = option[1] + ell = len(long) + if long[-1] == '=': + ell = ell - 1 + if short is not None: + ell = ell + 5 # " (-x)" where short == 'x' + if ell > max_opt: + max_opt = ell + + opt_width = max_opt + 2 + 2 + 2 # room for indent + dashes + gutter + + # Typical help block looks like this: + # --foo controls foonabulation + # Help block for longest option looks like this: + # --flimflam set the flim-flam level + # and with wrapped text: + # --flimflam set the flim-flam level (must be between + # 0 and 100, except on Tuesdays) + # Options with short names will have the short name shown (but + # it doesn't contribute to max_opt): + # --foo (-f) controls foonabulation + # If adding the short option would make the left column too wide, + # we push the explanation off to the next line + # --flimflam (-l) + # set the flim-flam level + # Important parameters: + # - 2 spaces before option block start lines + # - 2 dashes for each long option name + # - min. 2 spaces between option and explanation (gutter) + # - 5 characters (incl. space) for short option name + + # Now generate lines of help text. (If 80 columns were good enough + # for Jesus, then 78 columns are good enough for me!) + line_width = 78 + text_width = line_width - opt_width + big_indent = ' ' * opt_width + if header: + lines = [header] + else: + lines = ['Option summary:'] + + for option in self.option_table: + long, short, help = option[:3] + text = wrap_text(help, text_width) + if long[-1] == '=': + long = long[0:-1] + + # Case 1: no short option at all (makes life easy) + if short is None: + if text: + lines.append(f" --{long:<{max_opt}} {text[0]}") + else: + lines.append(f" --{long:<{max_opt}}") + + # Case 2: we have a short option, so we have to include it + # just after the long option + else: + opt_names = f"{long} (-{short})" + if text: + lines.append(f" --{opt_names:<{max_opt}} {text[0]}") + else: + lines.append(f" --{opt_names:<{max_opt}}") + + for ell in text[1:]: + lines.append(big_indent + ell) + return lines + + def print_help(self, header=None, file=None): + if file is None: + file = sys.stdout + for line in self.generate_help(header): + file.write(line + "\n") + + +def fancy_getopt(options, negative_opt, object, args: Sequence[str] | None): + parser = FancyGetopt(options) + parser.set_negative_aliases(negative_opt) + return parser.getopt(args, object) + + +WS_TRANS = {ord(_wschar): ' ' for _wschar in string.whitespace} + + +def wrap_text(text, width): + """wrap_text(text : string, width : int) -> [string] + + Split 'text' into multiple lines of no more than 'width' characters + each, and return the list of strings that results. + """ + if text is None: + return [] + if len(text) <= width: + return [text] + + text = text.expandtabs() + text = text.translate(WS_TRANS) + chunks = re.split(r'( +|-+)', text) + chunks = [ch for ch in chunks if ch] # ' - ' results in empty strings + lines = [] + + while chunks: + cur_line = [] # list of chunks (to-be-joined) + cur_len = 0 # length of current line + + while chunks: + ell = len(chunks[0]) + if cur_len + ell <= width: # can squeeze (at least) this chunk in + cur_line.append(chunks[0]) + del chunks[0] + cur_len = cur_len + ell + else: # this line is full + # drop last chunk if all space + if cur_line and cur_line[-1][0] == ' ': + del cur_line[-1] + break + + if chunks: # any chunks left to process? + # if the current line is still empty, then we had a single + # chunk that's too big too fit on a line -- so we break + # down and break it up at the line width + if cur_len == 0: + cur_line.append(chunks[0][0:width]) + chunks[0] = chunks[0][width:] + + # all-whitespace chunks at the end of a line can be discarded + # (and we know from the re.split above that if a chunk has + # *any* whitespace, it is *all* whitespace) + if chunks[0][0] == ' ': + del chunks[0] + + # and store this line in the list-of-all-lines -- as a single + # string, of course! + lines.append(''.join(cur_line)) + + return lines + + +def translate_longopt(opt): + """Convert a long option name to a valid Python identifier by + changing "-" to "_". + """ + return opt.translate(longopt_xlate) + + +class OptionDummy: + """Dummy class just used as a place to hold command-line option + values as instance attributes.""" + + def __init__(self, options: Sequence[Any] = []): + """Create a new OptionDummy instance. The attributes listed in + 'options' will be initialized to None.""" + for opt in options: + setattr(self, opt, None) + + +if __name__ == "__main__": + text = """\ +Tra-la-la, supercalifragilisticexpialidocious. +How *do* you spell that odd word, anyways? +(Someone ask Mary -- she'll know [or she'll +say, "How should I know?"].)""" + + for w in (10, 20, 30, 40): + print(f"width: {w}") + print("\n".join(wrap_text(text, w))) + print() diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/setuptools/_distutils/file_util.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/setuptools/_distutils/file_util.py new file mode 100644 index 0000000000000000000000000000000000000000..0acc8cb84bd4d28363c4e33e7ff18d691bebfa48 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/setuptools/_distutils/file_util.py @@ -0,0 +1,236 @@ +"""distutils.file_util + +Utility functions for operating on single files. +""" + +import os + +from ._log import log +from .errors import DistutilsFileError + +# for generating verbose output in 'copy_file()' +_copy_action = {None: 'copying', 'hard': 'hard linking', 'sym': 'symbolically linking'} + + +def _copy_file_contents(src, dst, buffer_size=16 * 1024): # noqa: C901 + """Copy the file 'src' to 'dst'; both must be filenames. Any error + opening either file, reading from 'src', or writing to 'dst', raises + DistutilsFileError. Data is read/written in chunks of 'buffer_size' + bytes (default 16k). No attempt is made to handle anything apart from + regular files. + """ + # Stolen from shutil module in the standard library, but with + # custom error-handling added. + fsrc = None + fdst = None + try: + try: + fsrc = open(src, 'rb') + except OSError as e: + raise DistutilsFileError(f"could not open '{src}': {e.strerror}") + + if os.path.exists(dst): + try: + os.unlink(dst) + except OSError as e: + raise DistutilsFileError(f"could not delete '{dst}': {e.strerror}") + + try: + fdst = open(dst, 'wb') + except OSError as e: + raise DistutilsFileError(f"could not create '{dst}': {e.strerror}") + + while True: + try: + buf = fsrc.read(buffer_size) + except OSError as e: + raise DistutilsFileError(f"could not read from '{src}': {e.strerror}") + + if not buf: + break + + try: + fdst.write(buf) + except OSError as e: + raise DistutilsFileError(f"could not write to '{dst}': {e.strerror}") + finally: + if fdst: + fdst.close() + if fsrc: + fsrc.close() + + +def copy_file( # noqa: C901 + src, + dst, + preserve_mode=True, + preserve_times=True, + update=False, + link=None, + verbose=True, + dry_run=False, +): + """Copy a file 'src' to 'dst'. If 'dst' is a directory, then 'src' is + copied there with the same name; otherwise, it must be a filename. (If + the file exists, it will be ruthlessly clobbered.) If 'preserve_mode' + is true (the default), the file's mode (type and permission bits, or + whatever is analogous on the current platform) is copied. If + 'preserve_times' is true (the default), the last-modified and + last-access times are copied as well. If 'update' is true, 'src' will + only be copied if 'dst' does not exist, or if 'dst' does exist but is + older than 'src'. + + 'link' allows you to make hard links (os.link) or symbolic links + (os.symlink) instead of copying: set it to "hard" or "sym"; if it is + None (the default), files are copied. Don't set 'link' on systems that + don't support it: 'copy_file()' doesn't check if hard or symbolic + linking is available. If hardlink fails, falls back to + _copy_file_contents(). + + Under Mac OS, uses the native file copy function in macostools; on + other systems, uses '_copy_file_contents()' to copy file contents. + + Return a tuple (dest_name, copied): 'dest_name' is the actual name of + the output file, and 'copied' is true if the file was copied (or would + have been copied, if 'dry_run' true). + """ + # XXX if the destination file already exists, we clobber it if + # copying, but blow up if linking. Hmmm. And I don't know what + # macostools.copyfile() does. Should definitely be consistent, and + # should probably blow up if destination exists and we would be + # changing it (ie. it's not already a hard/soft link to src OR + # (not update) and (src newer than dst). + + from distutils._modified import newer + from stat import S_IMODE, ST_ATIME, ST_MODE, ST_MTIME + + if not os.path.isfile(src): + raise DistutilsFileError( + f"can't copy '{src}': doesn't exist or not a regular file" + ) + + if os.path.isdir(dst): + dir = dst + dst = os.path.join(dst, os.path.basename(src)) + else: + dir = os.path.dirname(dst) + + if update and not newer(src, dst): + if verbose >= 1: + log.debug("not copying %s (output up-to-date)", src) + return (dst, False) + + try: + action = _copy_action[link] + except KeyError: + raise ValueError(f"invalid value '{link}' for 'link' argument") + + if verbose >= 1: + if os.path.basename(dst) == os.path.basename(src): + log.info("%s %s -> %s", action, src, dir) + else: + log.info("%s %s -> %s", action, src, dst) + + if dry_run: + return (dst, True) + + # If linking (hard or symbolic), use the appropriate system call + # (Unix only, of course, but that's the caller's responsibility) + elif link == 'hard': + if not (os.path.exists(dst) and os.path.samefile(src, dst)): + try: + os.link(src, dst) + except OSError: + # If hard linking fails, fall back on copying file + # (some special filesystems don't support hard linking + # even under Unix, see issue #8876). + pass + else: + return (dst, True) + elif link == 'sym': + if not (os.path.exists(dst) and os.path.samefile(src, dst)): + os.symlink(src, dst) + return (dst, True) + + # Otherwise (non-Mac, not linking), copy the file contents and + # (optionally) copy the times and mode. + _copy_file_contents(src, dst) + if preserve_mode or preserve_times: + st = os.stat(src) + + # According to David Ascher , utime() should be done + # before chmod() (at least under NT). + if preserve_times: + os.utime(dst, (st[ST_ATIME], st[ST_MTIME])) + if preserve_mode: + os.chmod(dst, S_IMODE(st[ST_MODE])) + + return (dst, True) + + +# XXX I suspect this is Unix-specific -- need porting help! +def move_file(src, dst, verbose=True, dry_run=False): # noqa: C901 + """Move a file 'src' to 'dst'. If 'dst' is a directory, the file will + be moved into it with the same name; otherwise, 'src' is just renamed + to 'dst'. Return the new full name of the file. + + Handles cross-device moves on Unix using 'copy_file()'. What about + other systems??? + """ + import errno + from os.path import basename, dirname, exists, isdir, isfile + + if verbose >= 1: + log.info("moving %s -> %s", src, dst) + + if dry_run: + return dst + + if not isfile(src): + raise DistutilsFileError(f"can't move '{src}': not a regular file") + + if isdir(dst): + dst = os.path.join(dst, basename(src)) + elif exists(dst): + raise DistutilsFileError( + f"can't move '{src}': destination '{dst}' already exists" + ) + + if not isdir(dirname(dst)): + raise DistutilsFileError( + f"can't move '{src}': destination '{dst}' not a valid path" + ) + + copy_it = False + try: + os.rename(src, dst) + except OSError as e: + (num, msg) = e.args + if num == errno.EXDEV: + copy_it = True + else: + raise DistutilsFileError(f"couldn't move '{src}' to '{dst}': {msg}") + + if copy_it: + copy_file(src, dst, verbose=verbose) + try: + os.unlink(src) + except OSError as e: + (num, msg) = e.args + try: + os.unlink(dst) + except OSError: + pass + raise DistutilsFileError( + f"couldn't move '{src}' to '{dst}' by copy/delete: " + f"delete '{src}' failed: {msg}" + ) + return dst + + +def write_file(filename, contents): + """Create a file with the specified name and write 'contents' (a + sequence of strings without line terminators) to it. + """ + with open(filename, 'w', encoding='utf-8') as f: + f.writelines(line + '\n' for line in contents) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/setuptools/_distutils/filelist.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/setuptools/_distutils/filelist.py new file mode 100644 index 0000000000000000000000000000000000000000..9857b195493cb5a772db5bcd7ae0657f583cae9d --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/setuptools/_distutils/filelist.py @@ -0,0 +1,366 @@ +"""distutils.filelist + +Provides the FileList class, used for poking about the filesystem +and building lists of files. +""" + +import fnmatch +import functools +import os +import re + +from ._log import log +from .errors import DistutilsInternalError, DistutilsTemplateError +from .util import convert_path + + +class FileList: + """A list of files built by on exploring the filesystem and filtered by + applying various patterns to what we find there. + + Instance attributes: + dir + directory from which files will be taken -- only used if + 'allfiles' not supplied to constructor + files + list of filenames currently being built/filtered/manipulated + allfiles + complete list of files under consideration (ie. without any + filtering applied) + """ + + def __init__(self, warn=None, debug_print=None): + # ignore argument to FileList, but keep them for backwards + # compatibility + self.allfiles = None + self.files = [] + + def set_allfiles(self, allfiles): + self.allfiles = allfiles + + def findall(self, dir=os.curdir): + self.allfiles = findall(dir) + + def debug_print(self, msg): + """Print 'msg' to stdout if the global DEBUG (taken from the + DISTUTILS_DEBUG environment variable) flag is true. + """ + from distutils.debug import DEBUG + + if DEBUG: + print(msg) + + # Collection methods + + def append(self, item): + self.files.append(item) + + def extend(self, items): + self.files.extend(items) + + def sort(self): + # Not a strict lexical sort! + sortable_files = sorted(map(os.path.split, self.files)) + self.files = [] + for sort_tuple in sortable_files: + self.files.append(os.path.join(*sort_tuple)) + + # Other miscellaneous utility methods + + def remove_duplicates(self): + # Assumes list has been sorted! + for i in range(len(self.files) - 1, 0, -1): + if self.files[i] == self.files[i - 1]: + del self.files[i] + + # "File template" methods + + def _parse_template_line(self, line): + words = line.split() + action = words[0] + + patterns = dir = dir_pattern = None + + if action in ('include', 'exclude', 'global-include', 'global-exclude'): + if len(words) < 2: + raise DistutilsTemplateError( + f"'{action}' expects ..." + ) + patterns = [convert_path(w) for w in words[1:]] + elif action in ('recursive-include', 'recursive-exclude'): + if len(words) < 3: + raise DistutilsTemplateError( + f"'{action}' expects ..." + ) + dir = convert_path(words[1]) + patterns = [convert_path(w) for w in words[2:]] + elif action in ('graft', 'prune'): + if len(words) != 2: + raise DistutilsTemplateError( + f"'{action}' expects a single " + ) + dir_pattern = convert_path(words[1]) + else: + raise DistutilsTemplateError(f"unknown action '{action}'") + + return (action, patterns, dir, dir_pattern) + + def process_template_line(self, line): # noqa: C901 + # Parse the line: split it up, make sure the right number of words + # is there, and return the relevant words. 'action' is always + # defined: it's the first word of the line. Which of the other + # three are defined depends on the action; it'll be either + # patterns, (dir and patterns), or (dir_pattern). + (action, patterns, dir, dir_pattern) = self._parse_template_line(line) + + # OK, now we know that the action is valid and we have the + # right number of words on the line for that action -- so we + # can proceed with minimal error-checking. + if action == 'include': + self.debug_print("include " + ' '.join(patterns)) + for pattern in patterns: + if not self.include_pattern(pattern, anchor=True): + log.warning("warning: no files found matching '%s'", pattern) + + elif action == 'exclude': + self.debug_print("exclude " + ' '.join(patterns)) + for pattern in patterns: + if not self.exclude_pattern(pattern, anchor=True): + log.warning( + "warning: no previously-included files found matching '%s'", + pattern, + ) + + elif action == 'global-include': + self.debug_print("global-include " + ' '.join(patterns)) + for pattern in patterns: + if not self.include_pattern(pattern, anchor=False): + log.warning( + ( + "warning: no files found matching '%s' " + "anywhere in distribution" + ), + pattern, + ) + + elif action == 'global-exclude': + self.debug_print("global-exclude " + ' '.join(patterns)) + for pattern in patterns: + if not self.exclude_pattern(pattern, anchor=False): + log.warning( + ( + "warning: no previously-included files matching " + "'%s' found anywhere in distribution" + ), + pattern, + ) + + elif action == 'recursive-include': + self.debug_print("recursive-include {} {}".format(dir, ' '.join(patterns))) + for pattern in patterns: + if not self.include_pattern(pattern, prefix=dir): + msg = "warning: no files found matching '%s' under directory '%s'" + log.warning(msg, pattern, dir) + + elif action == 'recursive-exclude': + self.debug_print("recursive-exclude {} {}".format(dir, ' '.join(patterns))) + for pattern in patterns: + if not self.exclude_pattern(pattern, prefix=dir): + log.warning( + ( + "warning: no previously-included files matching " + "'%s' found under directory '%s'" + ), + pattern, + dir, + ) + + elif action == 'graft': + self.debug_print("graft " + dir_pattern) + if not self.include_pattern(None, prefix=dir_pattern): + log.warning("warning: no directories found matching '%s'", dir_pattern) + + elif action == 'prune': + self.debug_print("prune " + dir_pattern) + if not self.exclude_pattern(None, prefix=dir_pattern): + log.warning( + ("no previously-included directories found matching '%s'"), + dir_pattern, + ) + else: + raise DistutilsInternalError( + f"this cannot happen: invalid action '{action}'" + ) + + # Filtering/selection methods + + def include_pattern(self, pattern, anchor=True, prefix=None, is_regex=False): + """Select strings (presumably filenames) from 'self.files' that + match 'pattern', a Unix-style wildcard (glob) pattern. Patterns + are not quite the same as implemented by the 'fnmatch' module: '*' + and '?' match non-special characters, where "special" is platform- + dependent: slash on Unix; colon, slash, and backslash on + DOS/Windows; and colon on Mac OS. + + If 'anchor' is true (the default), then the pattern match is more + stringent: "*.py" will match "foo.py" but not "foo/bar.py". If + 'anchor' is false, both of these will match. + + If 'prefix' is supplied, then only filenames starting with 'prefix' + (itself a pattern) and ending with 'pattern', with anything in between + them, will match. 'anchor' is ignored in this case. + + If 'is_regex' is true, 'anchor' and 'prefix' are ignored, and + 'pattern' is assumed to be either a string containing a regex or a + regex object -- no translation is done, the regex is just compiled + and used as-is. + + Selected strings will be added to self.files. + + Return True if files are found, False otherwise. + """ + # XXX docstring lying about what the special chars are? + files_found = False + pattern_re = translate_pattern(pattern, anchor, prefix, is_regex) + self.debug_print(f"include_pattern: applying regex r'{pattern_re.pattern}'") + + # delayed loading of allfiles list + if self.allfiles is None: + self.findall() + + for name in self.allfiles: + if pattern_re.search(name): + self.debug_print(" adding " + name) + self.files.append(name) + files_found = True + return files_found + + def exclude_pattern(self, pattern, anchor=True, prefix=None, is_regex=False): + """Remove strings (presumably filenames) from 'files' that match + 'pattern'. Other parameters are the same as for + 'include_pattern()', above. + The list 'self.files' is modified in place. + Return True if files are found, False otherwise. + """ + files_found = False + pattern_re = translate_pattern(pattern, anchor, prefix, is_regex) + self.debug_print(f"exclude_pattern: applying regex r'{pattern_re.pattern}'") + for i in range(len(self.files) - 1, -1, -1): + if pattern_re.search(self.files[i]): + self.debug_print(" removing " + self.files[i]) + del self.files[i] + files_found = True + return files_found + + +# Utility functions + + +def _find_all_simple(path): + """ + Find all files under 'path' + """ + all_unique = _UniqueDirs.filter(os.walk(path, followlinks=True)) + results = ( + os.path.join(base, file) for base, dirs, files in all_unique for file in files + ) + return filter(os.path.isfile, results) + + +class _UniqueDirs(set): + """ + Exclude previously-seen dirs from walk results, + avoiding infinite recursion. + Ref https://bugs.python.org/issue44497. + """ + + def __call__(self, walk_item): + """ + Given an item from an os.walk result, determine + if the item represents a unique dir for this instance + and if not, prevent further traversal. + """ + base, dirs, files = walk_item + stat = os.stat(base) + candidate = stat.st_dev, stat.st_ino + found = candidate in self + if found: + del dirs[:] + self.add(candidate) + return not found + + @classmethod + def filter(cls, items): + return filter(cls(), items) + + +def findall(dir=os.curdir): + """ + Find all files under 'dir' and return the list of full filenames. + Unless dir is '.', return full filenames with dir prepended. + """ + files = _find_all_simple(dir) + if dir == os.curdir: + make_rel = functools.partial(os.path.relpath, start=dir) + files = map(make_rel, files) + return list(files) + + +def glob_to_re(pattern): + """Translate a shell-like glob pattern to a regular expression; return + a string containing the regex. Differs from 'fnmatch.translate()' in + that '*' does not match "special characters" (which are + platform-specific). + """ + pattern_re = fnmatch.translate(pattern) + + # '?' and '*' in the glob pattern become '.' and '.*' in the RE, which + # IMHO is wrong -- '?' and '*' aren't supposed to match slash in Unix, + # and by extension they shouldn't match such "special characters" under + # any OS. So change all non-escaped dots in the RE to match any + # character except the special characters (currently: just os.sep). + sep = os.sep + if os.sep == '\\': + # we're using a regex to manipulate a regex, so we need + # to escape the backslash twice + sep = r'\\\\' + escaped = rf'\1[^{sep}]' + pattern_re = re.sub(r'((?= 2: + set_threshold(logging.DEBUG) + + +class Log(logging.Logger): + """distutils.log.Log is deprecated, please use an alternative from `logging`.""" + + def __init__(self, threshold=WARN): + warnings.warn(Log.__doc__) # avoid DeprecationWarning to ensure warn is shown + super().__init__(__name__, level=threshold) + + @property + def threshold(self): + return self.level + + @threshold.setter + def threshold(self, level): + self.setLevel(level) + + warn = logging.Logger.warning diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/setuptools/_distutils/spawn.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/setuptools/_distutils/spawn.py new file mode 100644 index 0000000000000000000000000000000000000000..ba280334d1975de4f6b383bd50667da71d45b4f2 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/setuptools/_distutils/spawn.py @@ -0,0 +1,117 @@ +"""distutils.spawn + +Provides the 'spawn()' function, a front-end to various platform- +specific functions for launching another program in a sub-process. +""" + +from __future__ import annotations + +import os +import platform +import shutil +import subprocess +import sys +import warnings +from collections.abc import Mapping + +from ._log import log +from .debug import DEBUG +from .errors import DistutilsExecError + + +def _debug(cmd): + """ + Render a subprocess command differently depending on DEBUG. + """ + return cmd if DEBUG else cmd[0] + + +def _inject_macos_ver(env: Mapping[str:str] | None) -> Mapping[str:str] | None: + if platform.system() != 'Darwin': + return env + + from .util import MACOSX_VERSION_VAR, get_macosx_target_ver + + target_ver = get_macosx_target_ver() + update = {MACOSX_VERSION_VAR: target_ver} if target_ver else {} + return {**_resolve(env), **update} + + +def _resolve(env: Mapping[str:str] | None) -> Mapping[str:str]: + return os.environ if env is None else env + + +def spawn(cmd, search_path=True, verbose=False, dry_run=False, env=None): + """Run another program, specified as a command list 'cmd', in a new process. + + 'cmd' is just the argument list for the new process, ie. + cmd[0] is the program to run and cmd[1:] are the rest of its arguments. + There is no way to run a program with a name different from that of its + executable. + + If 'search_path' is true (the default), the system's executable + search path will be used to find the program; otherwise, cmd[0] + must be the exact path to the executable. If 'dry_run' is true, + the command will not actually be run. + + Raise DistutilsExecError if running the program fails in any way; just + return on success. + """ + log.info(subprocess.list2cmdline(cmd)) + if dry_run: + return + + if search_path: + executable = shutil.which(cmd[0]) + if executable is not None: + cmd[0] = executable + + try: + subprocess.check_call(cmd, env=_inject_macos_ver(env)) + except OSError as exc: + raise DistutilsExecError( + f"command {_debug(cmd)!r} failed: {exc.args[-1]}" + ) from exc + except subprocess.CalledProcessError as err: + raise DistutilsExecError( + f"command {_debug(cmd)!r} failed with exit code {err.returncode}" + ) from err + + +def find_executable(executable, path=None): + """Tries to find 'executable' in the directories listed in 'path'. + + A string listing directories separated by 'os.pathsep'; defaults to + os.environ['PATH']. Returns the complete filename or None if not found. + """ + warnings.warn( + 'Use shutil.which instead of find_executable', DeprecationWarning, stacklevel=2 + ) + _, ext = os.path.splitext(executable) + if (sys.platform == 'win32') and (ext != '.exe'): + executable = executable + '.exe' + + if os.path.isfile(executable): + return executable + + if path is None: + path = os.environ.get('PATH', None) + # bpo-35755: Don't fall through if PATH is the empty string + if path is None: + try: + path = os.confstr("CS_PATH") + except (AttributeError, ValueError): + # os.confstr() or CS_PATH is not available + path = os.defpath + + # PATH='' doesn't match, whereas PATH=':' looks in the current directory + if not path: + return None + + paths = path.split(os.pathsep) + for p in paths: + f = os.path.join(p, executable) + if os.path.isfile(f): + # the file exists, we have a shot at spawn working + return f + return None diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/setuptools/_distutils/sysconfig.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/setuptools/_distutils/sysconfig.py new file mode 100644 index 0000000000000000000000000000000000000000..ef3def83eb6c02fb31d642c8e8026f9cbb6b4670 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/setuptools/_distutils/sysconfig.py @@ -0,0 +1,583 @@ +"""Provide access to Python's configuration information. The specific +configuration variables available depend heavily on the platform and +configuration. The values may be retrieved using +get_config_var(name), and the list of variables is available via +get_config_vars().keys(). Additional convenience functions are also +available. + +Written by: Fred L. Drake, Jr. +Email: +""" + +import functools +import os +import pathlib +import re +import sys +import sysconfig + +from jaraco.functools import pass_none + +from .compat import py39 +from .errors import DistutilsPlatformError +from .util import is_mingw + +IS_PYPY = '__pypy__' in sys.builtin_module_names + +# These are needed in a couple of spots, so just compute them once. +PREFIX = os.path.normpath(sys.prefix) +EXEC_PREFIX = os.path.normpath(sys.exec_prefix) +BASE_PREFIX = os.path.normpath(sys.base_prefix) +BASE_EXEC_PREFIX = os.path.normpath(sys.base_exec_prefix) + +# Path to the base directory of the project. On Windows the binary may +# live in project/PCbuild/win32 or project/PCbuild/amd64. +# set for cross builds +if "_PYTHON_PROJECT_BASE" in os.environ: + project_base = os.path.abspath(os.environ["_PYTHON_PROJECT_BASE"]) +else: + if sys.executable: + project_base = os.path.dirname(os.path.abspath(sys.executable)) + else: + # sys.executable can be empty if argv[0] has been changed and Python is + # unable to retrieve the real program name + project_base = os.getcwd() + + +def _is_python_source_dir(d): + """ + Return True if the target directory appears to point to an + un-installed Python. + """ + modules = pathlib.Path(d).joinpath('Modules') + return any(modules.joinpath(fn).is_file() for fn in ('Setup', 'Setup.local')) + + +_sys_home = getattr(sys, '_home', None) + + +def _is_parent(dir_a, dir_b): + """ + Return True if a is a parent of b. + """ + return os.path.normcase(dir_a).startswith(os.path.normcase(dir_b)) + + +if os.name == 'nt': + + @pass_none + def _fix_pcbuild(d): + # In a venv, sys._home will be inside BASE_PREFIX rather than PREFIX. + prefixes = PREFIX, BASE_PREFIX + matched = ( + prefix + for prefix in prefixes + if _is_parent(d, os.path.join(prefix, "PCbuild")) + ) + return next(matched, d) + + project_base = _fix_pcbuild(project_base) + _sys_home = _fix_pcbuild(_sys_home) + + +def _python_build(): + if _sys_home: + return _is_python_source_dir(_sys_home) + return _is_python_source_dir(project_base) + + +python_build = _python_build() + + +# Calculate the build qualifier flags if they are defined. Adding the flags +# to the include and lib directories only makes sense for an installation, not +# an in-source build. +build_flags = '' +try: + if not python_build: + build_flags = sys.abiflags +except AttributeError: + # It's not a configure-based build, so the sys module doesn't have + # this attribute, which is fine. + pass + + +def get_python_version(): + """Return a string containing the major and minor Python version, + leaving off the patchlevel. Sample return values could be '1.5' + or '2.2'. + """ + return f'{sys.version_info.major}.{sys.version_info.minor}' + + +def get_python_inc(plat_specific=False, prefix=None): + """Return the directory containing installed Python header files. + + If 'plat_specific' is false (the default), this is the path to the + non-platform-specific header files, i.e. Python.h and so on; + otherwise, this is the path to platform-specific header files + (namely pyconfig.h). + + If 'prefix' is supplied, use it instead of sys.base_prefix or + sys.base_exec_prefix -- i.e., ignore 'plat_specific'. + """ + default_prefix = BASE_EXEC_PREFIX if plat_specific else BASE_PREFIX + resolved_prefix = prefix if prefix is not None else default_prefix + # MinGW imitates posix like layout, but os.name != posix + os_name = "posix" if is_mingw() else os.name + try: + getter = globals()[f'_get_python_inc_{os_name}'] + except KeyError: + raise DistutilsPlatformError( + "I don't know where Python installs its C header files " + f"on platform '{os.name}'" + ) + return getter(resolved_prefix, prefix, plat_specific) + + +@pass_none +def _extant(path): + """ + Replace path with None if it doesn't exist. + """ + return path if os.path.exists(path) else None + + +def _get_python_inc_posix(prefix, spec_prefix, plat_specific): + if IS_PYPY and sys.version_info < (3, 8): + return os.path.join(prefix, 'include') + return ( + _get_python_inc_posix_python(plat_specific) + or _extant(_get_python_inc_from_config(plat_specific, spec_prefix)) + or _get_python_inc_posix_prefix(prefix) + ) + + +def _get_python_inc_posix_python(plat_specific): + """ + Assume the executable is in the build directory. The + pyconfig.h file should be in the same directory. Since + the build directory may not be the source directory, + use "srcdir" from the makefile to find the "Include" + directory. + """ + if not python_build: + return + if plat_specific: + return _sys_home or project_base + incdir = os.path.join(get_config_var('srcdir'), 'Include') + return os.path.normpath(incdir) + + +def _get_python_inc_from_config(plat_specific, spec_prefix): + """ + If no prefix was explicitly specified, provide the include + directory from the config vars. Useful when + cross-compiling, since the config vars may come from + the host + platform Python installation, while the current Python + executable is from the build platform installation. + + >>> monkeypatch = getfixture('monkeypatch') + >>> gpifc = _get_python_inc_from_config + >>> monkeypatch.setitem(gpifc.__globals__, 'get_config_var', str.lower) + >>> gpifc(False, '/usr/bin/') + >>> gpifc(False, '') + >>> gpifc(False, None) + 'includepy' + >>> gpifc(True, None) + 'confincludepy' + """ + if spec_prefix is None: + return get_config_var('CONF' * plat_specific + 'INCLUDEPY') + + +def _get_python_inc_posix_prefix(prefix): + implementation = 'pypy' if IS_PYPY else 'python' + python_dir = implementation + get_python_version() + build_flags + return os.path.join(prefix, "include", python_dir) + + +def _get_python_inc_nt(prefix, spec_prefix, plat_specific): + if python_build: + # Include both include dirs to ensure we can find pyconfig.h + return ( + os.path.join(prefix, "include") + + os.path.pathsep + + os.path.dirname(sysconfig.get_config_h_filename()) + ) + return os.path.join(prefix, "include") + + +# allow this behavior to be monkey-patched. Ref pypa/distutils#2. +def _posix_lib(standard_lib, libpython, early_prefix, prefix): + if standard_lib: + return libpython + else: + return os.path.join(libpython, "site-packages") + + +def get_python_lib(plat_specific=False, standard_lib=False, prefix=None): + """Return the directory containing the Python library (standard or + site additions). + + If 'plat_specific' is true, return the directory containing + platform-specific modules, i.e. any module from a non-pure-Python + module distribution; otherwise, return the platform-shared library + directory. If 'standard_lib' is true, return the directory + containing standard Python library modules; otherwise, return the + directory for site-specific modules. + + If 'prefix' is supplied, use it instead of sys.base_prefix or + sys.base_exec_prefix -- i.e., ignore 'plat_specific'. + """ + + if IS_PYPY and sys.version_info < (3, 8): + # PyPy-specific schema + if prefix is None: + prefix = PREFIX + if standard_lib: + return os.path.join(prefix, "lib-python", sys.version_info.major) + return os.path.join(prefix, 'site-packages') + + early_prefix = prefix + + if prefix is None: + if standard_lib: + prefix = plat_specific and BASE_EXEC_PREFIX or BASE_PREFIX + else: + prefix = plat_specific and EXEC_PREFIX or PREFIX + + if os.name == "posix" or is_mingw(): + if plat_specific or standard_lib: + # Platform-specific modules (any module from a non-pure-Python + # module distribution) or standard Python library modules. + libdir = getattr(sys, "platlibdir", "lib") + else: + # Pure Python + libdir = "lib" + implementation = 'pypy' if IS_PYPY else 'python' + libpython = os.path.join(prefix, libdir, implementation + get_python_version()) + return _posix_lib(standard_lib, libpython, early_prefix, prefix) + elif os.name == "nt": + if standard_lib: + return os.path.join(prefix, "Lib") + else: + return os.path.join(prefix, "Lib", "site-packages") + else: + raise DistutilsPlatformError( + f"I don't know where Python installs its library on platform '{os.name}'" + ) + + +@functools.lru_cache +def _customize_macos(): + """ + Perform first-time customization of compiler-related + config vars on macOS. Use after a compiler is known + to be needed. This customization exists primarily to support Pythons + from binary installers. The kind and paths to build tools on + the user system may vary significantly from the system + that Python itself was built on. Also the user OS + version and build tools may not support the same set + of CPU architectures for universal builds. + """ + + sys.platform == "darwin" and __import__('_osx_support').customize_compiler( + get_config_vars() + ) + + +def customize_compiler(compiler): + """Do any platform-specific customization of a CCompiler instance. + + Mainly needed on Unix, so we can plug in the information that + varies across Unices and is stored in Python's Makefile. + """ + if compiler.compiler_type in ["unix", "cygwin"] or ( + compiler.compiler_type == "mingw32" and is_mingw() + ): + _customize_macos() + + ( + cc, + cxx, + cflags, + ccshared, + ldshared, + ldcxxshared, + shlib_suffix, + ar, + ar_flags, + ) = get_config_vars( + 'CC', + 'CXX', + 'CFLAGS', + 'CCSHARED', + 'LDSHARED', + 'LDCXXSHARED', + 'SHLIB_SUFFIX', + 'AR', + 'ARFLAGS', + ) + + cxxflags = cflags + + if 'CC' in os.environ: + newcc = os.environ['CC'] + if 'LDSHARED' not in os.environ and ldshared.startswith(cc): + # If CC is overridden, use that as the default + # command for LDSHARED as well + ldshared = newcc + ldshared[len(cc) :] + cc = newcc + cxx = os.environ.get('CXX', cxx) + ldshared = os.environ.get('LDSHARED', ldshared) + ldcxxshared = os.environ.get('LDCXXSHARED', ldcxxshared) + cpp = os.environ.get( + 'CPP', + cc + " -E", # not always + ) + + ldshared = _add_flags(ldshared, 'LD') + ldcxxshared = _add_flags(ldcxxshared, 'LD') + cflags = os.environ.get('CFLAGS', cflags) + ldshared = _add_flags(ldshared, 'C') + cxxflags = os.environ.get('CXXFLAGS', cxxflags) + ldcxxshared = _add_flags(ldcxxshared, 'CXX') + cpp = _add_flags(cpp, 'CPP') + cflags = _add_flags(cflags, 'CPP') + cxxflags = _add_flags(cxxflags, 'CPP') + ldshared = _add_flags(ldshared, 'CPP') + ldcxxshared = _add_flags(ldcxxshared, 'CPP') + + ar = os.environ.get('AR', ar) + + archiver = ar + ' ' + os.environ.get('ARFLAGS', ar_flags) + cc_cmd = cc + ' ' + cflags + cxx_cmd = cxx + ' ' + cxxflags + + compiler.set_executables( + preprocessor=cpp, + compiler=cc_cmd, + compiler_so=cc_cmd + ' ' + ccshared, + compiler_cxx=cxx_cmd, + compiler_so_cxx=cxx_cmd + ' ' + ccshared, + linker_so=ldshared, + linker_so_cxx=ldcxxshared, + linker_exe=cc, + linker_exe_cxx=cxx, + archiver=archiver, + ) + + if 'RANLIB' in os.environ and compiler.executables.get('ranlib', None): + compiler.set_executables(ranlib=os.environ['RANLIB']) + + compiler.shared_lib_extension = shlib_suffix + + +def get_config_h_filename(): + """Return full pathname of installed pyconfig.h file.""" + return sysconfig.get_config_h_filename() + + +def get_makefile_filename(): + """Return full pathname of installed Makefile from the Python build.""" + return sysconfig.get_makefile_filename() + + +def parse_config_h(fp, g=None): + """Parse a config.h-style file. + + A dictionary containing name/value pairs is returned. If an + optional dictionary is passed in as the second argument, it is + used instead of a new dictionary. + """ + return sysconfig.parse_config_h(fp, vars=g) + + +# Regexes needed for parsing Makefile (and similar syntaxes, +# like old-style Setup files). +_variable_rx = re.compile(r"([a-zA-Z][a-zA-Z0-9_]+)\s*=\s*(.*)") +_findvar1_rx = re.compile(r"\$\(([A-Za-z][A-Za-z0-9_]*)\)") +_findvar2_rx = re.compile(r"\${([A-Za-z][A-Za-z0-9_]*)}") + + +def parse_makefile(fn, g=None): # noqa: C901 + """Parse a Makefile-style file. + + A dictionary containing name/value pairs is returned. If an + optional dictionary is passed in as the second argument, it is + used instead of a new dictionary. + """ + from distutils.text_file import TextFile + + fp = TextFile( + fn, + strip_comments=True, + skip_blanks=True, + join_lines=True, + errors="surrogateescape", + ) + + if g is None: + g = {} + done = {} + notdone = {} + + while True: + line = fp.readline() + if line is None: # eof + break + m = _variable_rx.match(line) + if m: + n, v = m.group(1, 2) + v = v.strip() + # `$$' is a literal `$' in make + tmpv = v.replace('$$', '') + + if "$" in tmpv: + notdone[n] = v + else: + try: + v = int(v) + except ValueError: + # insert literal `$' + done[n] = v.replace('$$', '$') + else: + done[n] = v + + # Variables with a 'PY_' prefix in the makefile. These need to + # be made available without that prefix through sysconfig. + # Special care is needed to ensure that variable expansion works, even + # if the expansion uses the name without a prefix. + renamed_variables = ('CFLAGS', 'LDFLAGS', 'CPPFLAGS') + + # do variable interpolation here + while notdone: + for name in list(notdone): + value = notdone[name] + m = _findvar1_rx.search(value) or _findvar2_rx.search(value) + if m: + n = m.group(1) + found = True + if n in done: + item = str(done[n]) + elif n in notdone: + # get it on a subsequent round + found = False + elif n in os.environ: + # do it like make: fall back to environment + item = os.environ[n] + + elif n in renamed_variables: + if name.startswith('PY_') and name[3:] in renamed_variables: + item = "" + + elif 'PY_' + n in notdone: + found = False + + else: + item = str(done['PY_' + n]) + else: + done[n] = item = "" + if found: + after = value[m.end() :] + value = value[: m.start()] + item + after + if "$" in after: + notdone[name] = value + else: + try: + value = int(value) + except ValueError: + done[name] = value.strip() + else: + done[name] = value + del notdone[name] + + if name.startswith('PY_') and name[3:] in renamed_variables: + name = name[3:] + if name not in done: + done[name] = value + else: + # bogus variable reference; just drop it since we can't deal + del notdone[name] + + fp.close() + + # strip spurious spaces + for k, v in done.items(): + if isinstance(v, str): + done[k] = v.strip() + + # save the results in the global dictionary + g.update(done) + return g + + +def expand_makefile_vars(s, vars): + """Expand Makefile-style variables -- "${foo}" or "$(foo)" -- in + 'string' according to 'vars' (a dictionary mapping variable names to + values). Variables not present in 'vars' are silently expanded to the + empty string. The variable values in 'vars' should not contain further + variable expansions; if 'vars' is the output of 'parse_makefile()', + you're fine. Returns a variable-expanded version of 's'. + """ + + # This algorithm does multiple expansion, so if vars['foo'] contains + # "${bar}", it will expand ${foo} to ${bar}, and then expand + # ${bar}... and so forth. This is fine as long as 'vars' comes from + # 'parse_makefile()', which takes care of such expansions eagerly, + # according to make's variable expansion semantics. + + while True: + m = _findvar1_rx.search(s) or _findvar2_rx.search(s) + if m: + (beg, end) = m.span() + s = s[0:beg] + vars.get(m.group(1)) + s[end:] + else: + break + return s + + +_config_vars = None + + +def get_config_vars(*args): + """With no arguments, return a dictionary of all configuration + variables relevant for the current platform. Generally this includes + everything needed to build extensions and install both pure modules and + extensions. On Unix, this means every variable defined in Python's + installed Makefile; on Windows it's a much smaller set. + + With arguments, return a list of values that result from looking up + each argument in the configuration variable dictionary. + """ + global _config_vars + if _config_vars is None: + _config_vars = sysconfig.get_config_vars().copy() + py39.add_ext_suffix(_config_vars) + + return [_config_vars.get(name) for name in args] if args else _config_vars + + +def get_config_var(name): + """Return the value of a single variable using the dictionary + returned by 'get_config_vars()'. Equivalent to + get_config_vars().get(name) + """ + if name == 'SO': + import warnings + + warnings.warn('SO is deprecated, use EXT_SUFFIX', DeprecationWarning, 2) + return get_config_vars().get(name) + + +@pass_none +def _add_flags(value: str, type: str) -> str: + """ + Add any flags from the environment for the given type. + + type is the prefix to FLAGS in the environment key (e.g. "C" for "CFLAGS"). + """ + flags = os.environ.get(f'{type}FLAGS') + return f'{value} {flags}' if flags else value diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/setuptools/_distutils/text_file.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/setuptools/_distutils/text_file.py new file mode 100644 index 0000000000000000000000000000000000000000..89d9048d5902950516c3541ad212bff7b6cb2045 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/setuptools/_distutils/text_file.py @@ -0,0 +1,286 @@ +"""text_file + +provides the TextFile class, which gives an interface to text files +that (optionally) takes care of stripping comments, ignoring blank +lines, and joining lines with backslashes.""" + +import sys + + +class TextFile: + """Provides a file-like object that takes care of all the things you + commonly want to do when processing a text file that has some + line-by-line syntax: strip comments (as long as "#" is your + comment character), skip blank lines, join adjacent lines by + escaping the newline (ie. backslash at end of line), strip + leading and/or trailing whitespace. All of these are optional + and independently controllable. + + Provides a 'warn()' method so you can generate warning messages that + report physical line number, even if the logical line in question + spans multiple physical lines. Also provides 'unreadline()' for + implementing line-at-a-time lookahead. + + Constructor is called as: + + TextFile (filename=None, file=None, **options) + + It bombs (RuntimeError) if both 'filename' and 'file' are None; + 'filename' should be a string, and 'file' a file object (or + something that provides 'readline()' and 'close()' methods). It is + recommended that you supply at least 'filename', so that TextFile + can include it in warning messages. If 'file' is not supplied, + TextFile creates its own using 'io.open()'. + + The options are all boolean, and affect the value returned by + 'readline()': + strip_comments [default: true] + strip from "#" to end-of-line, as well as any whitespace + leading up to the "#" -- unless it is escaped by a backslash + lstrip_ws [default: false] + strip leading whitespace from each line before returning it + rstrip_ws [default: true] + strip trailing whitespace (including line terminator!) from + each line before returning it + skip_blanks [default: true} + skip lines that are empty *after* stripping comments and + whitespace. (If both lstrip_ws and rstrip_ws are false, + then some lines may consist of solely whitespace: these will + *not* be skipped, even if 'skip_blanks' is true.) + join_lines [default: false] + if a backslash is the last non-newline character on a line + after stripping comments and whitespace, join the following line + to it to form one "logical line"; if N consecutive lines end + with a backslash, then N+1 physical lines will be joined to + form one logical line. + collapse_join [default: false] + strip leading whitespace from lines that are joined to their + predecessor; only matters if (join_lines and not lstrip_ws) + errors [default: 'strict'] + error handler used to decode the file content + + Note that since 'rstrip_ws' can strip the trailing newline, the + semantics of 'readline()' must differ from those of the builtin file + object's 'readline()' method! In particular, 'readline()' returns + None for end-of-file: an empty string might just be a blank line (or + an all-whitespace line), if 'rstrip_ws' is true but 'skip_blanks' is + not.""" + + default_options = { + 'strip_comments': 1, + 'skip_blanks': 1, + 'lstrip_ws': 0, + 'rstrip_ws': 1, + 'join_lines': 0, + 'collapse_join': 0, + 'errors': 'strict', + } + + def __init__(self, filename=None, file=None, **options): + """Construct a new TextFile object. At least one of 'filename' + (a string) and 'file' (a file-like object) must be supplied. + They keyword argument options are described above and affect + the values returned by 'readline()'.""" + if filename is None and file is None: + raise RuntimeError( + "you must supply either or both of 'filename' and 'file'" + ) + + # set values for all options -- either from client option hash + # or fallback to default_options + for opt in self.default_options.keys(): + if opt in options: + setattr(self, opt, options[opt]) + else: + setattr(self, opt, self.default_options[opt]) + + # sanity check client option hash + for opt in options.keys(): + if opt not in self.default_options: + raise KeyError(f"invalid TextFile option '{opt}'") + + if file is None: + self.open(filename) + else: + self.filename = filename + self.file = file + self.current_line = 0 # assuming that file is at BOF! + + # 'linebuf' is a stack of lines that will be emptied before we + # actually read from the file; it's only populated by an + # 'unreadline()' operation + self.linebuf = [] + + def open(self, filename): + """Open a new file named 'filename'. This overrides both the + 'filename' and 'file' arguments to the constructor.""" + self.filename = filename + self.file = open(self.filename, errors=self.errors, encoding='utf-8') + self.current_line = 0 + + def close(self): + """Close the current file and forget everything we know about it + (filename, current line number).""" + file = self.file + self.file = None + self.filename = None + self.current_line = None + file.close() + + def gen_error(self, msg, line=None): + outmsg = [] + if line is None: + line = self.current_line + outmsg.append(self.filename + ", ") + if isinstance(line, (list, tuple)): + outmsg.append("lines {}-{}: ".format(*line)) + else: + outmsg.append(f"line {int(line)}: ") + outmsg.append(str(msg)) + return "".join(outmsg) + + def error(self, msg, line=None): + raise ValueError("error: " + self.gen_error(msg, line)) + + def warn(self, msg, line=None): + """Print (to stderr) a warning message tied to the current logical + line in the current file. If the current logical line in the + file spans multiple physical lines, the warning refers to the + whole range, eg. "lines 3-5". If 'line' supplied, it overrides + the current line number; it may be a list or tuple to indicate a + range of physical lines, or an integer for a single physical + line.""" + sys.stderr.write("warning: " + self.gen_error(msg, line) + "\n") + + def readline(self): # noqa: C901 + """Read and return a single logical line from the current file (or + from an internal buffer if lines have previously been "unread" + with 'unreadline()'). If the 'join_lines' option is true, this + may involve reading multiple physical lines concatenated into a + single string. Updates the current line number, so calling + 'warn()' after 'readline()' emits a warning about the physical + line(s) just read. Returns None on end-of-file, since the empty + string can occur if 'rstrip_ws' is true but 'strip_blanks' is + not.""" + # If any "unread" lines waiting in 'linebuf', return the top + # one. (We don't actually buffer read-ahead data -- lines only + # get put in 'linebuf' if the client explicitly does an + # 'unreadline()'. + if self.linebuf: + line = self.linebuf[-1] + del self.linebuf[-1] + return line + + buildup_line = '' + + while True: + # read the line, make it None if EOF + line = self.file.readline() + if line == '': + line = None + + if self.strip_comments and line: + # Look for the first "#" in the line. If none, never + # mind. If we find one and it's the first character, or + # is not preceded by "\", then it starts a comment -- + # strip the comment, strip whitespace before it, and + # carry on. Otherwise, it's just an escaped "#", so + # unescape it (and any other escaped "#"'s that might be + # lurking in there) and otherwise leave the line alone. + + pos = line.find("#") + if pos == -1: # no "#" -- no comments + pass + + # It's definitely a comment -- either "#" is the first + # character, or it's elsewhere and unescaped. + elif pos == 0 or line[pos - 1] != "\\": + # Have to preserve the trailing newline, because it's + # the job of a later step (rstrip_ws) to remove it -- + # and if rstrip_ws is false, we'd better preserve it! + # (NB. this means that if the final line is all comment + # and has no trailing newline, we will think that it's + # EOF; I think that's OK.) + eol = (line[-1] == '\n') and '\n' or '' + line = line[0:pos] + eol + + # If all that's left is whitespace, then skip line + # *now*, before we try to join it to 'buildup_line' -- + # that way constructs like + # hello \\ + # # comment that should be ignored + # there + # result in "hello there". + if line.strip() == "": + continue + else: # it's an escaped "#" + line = line.replace("\\#", "#") + + # did previous line end with a backslash? then accumulate + if self.join_lines and buildup_line: + # oops: end of file + if line is None: + self.warn("continuation line immediately precedes end-of-file") + return buildup_line + + if self.collapse_join: + line = line.lstrip() + line = buildup_line + line + + # careful: pay attention to line number when incrementing it + if isinstance(self.current_line, list): + self.current_line[1] = self.current_line[1] + 1 + else: + self.current_line = [self.current_line, self.current_line + 1] + # just an ordinary line, read it as usual + else: + if line is None: # eof + return None + + # still have to be careful about incrementing the line number! + if isinstance(self.current_line, list): + self.current_line = self.current_line[1] + 1 + else: + self.current_line = self.current_line + 1 + + # strip whitespace however the client wants (leading and + # trailing, or one or the other, or neither) + if self.lstrip_ws and self.rstrip_ws: + line = line.strip() + elif self.lstrip_ws: + line = line.lstrip() + elif self.rstrip_ws: + line = line.rstrip() + + # blank line (whether we rstrip'ed or not)? skip to next line + # if appropriate + if line in ('', '\n') and self.skip_blanks: + continue + + if self.join_lines: + if line[-1] == '\\': + buildup_line = line[:-1] + continue + + if line[-2:] == '\\\n': + buildup_line = line[0:-2] + '\n' + continue + + # well, I guess there's some actual content there: return it + return line + + def readlines(self): + """Read and return the list of all logical lines remaining in the + current file.""" + lines = [] + while True: + line = self.readline() + if line is None: + return lines + lines.append(line) + + def unreadline(self, line): + """Push 'line' (a string) onto an internal buffer that will be + checked by future 'readline()' calls. Handy for implementing + a parser with line-at-a-time lookahead.""" + self.linebuf.append(line) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/setuptools/_distutils/unixccompiler.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/setuptools/_distutils/unixccompiler.py new file mode 100644 index 0000000000000000000000000000000000000000..6c1116ae8f7f9d54831dc8a9e815c2f212686e3f --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/setuptools/_distutils/unixccompiler.py @@ -0,0 +1,402 @@ +"""distutils.unixccompiler + +Contains the UnixCCompiler class, a subclass of CCompiler that handles +the "typical" Unix-style command-line C compiler: + * macros defined with -Dname[=value] + * macros undefined with -Uname + * include search directories specified with -Idir + * libraries specified with -lllib + * library search directories specified with -Ldir + * compile handled by 'cc' (or similar) executable with -c option: + compiles .c to .o + * link static library handled by 'ar' command (possibly with 'ranlib') + * link shared library handled by 'cc -shared' +""" + +from __future__ import annotations + +import itertools +import os +import re +import shlex +import sys + +from . import sysconfig +from ._log import log +from ._macos_compat import compiler_fixup +from ._modified import newer +from .ccompiler import CCompiler, gen_lib_options, gen_preprocess_options +from .compat import consolidate_linker_args +from .errors import CompileError, DistutilsExecError, LibError, LinkError + +# XXX Things not currently handled: +# * optimization/debug/warning flags; we just use whatever's in Python's +# Makefile and live with it. Is this adequate? If not, we might +# have to have a bunch of subclasses GNUCCompiler, SGICCompiler, +# SunCCompiler, and I suspect down that road lies madness. +# * even if we don't know a warning flag from an optimization flag, +# we need some way for outsiders to feed preprocessor/compiler/linker +# flags in to us -- eg. a sysadmin might want to mandate certain flags +# via a site config file, or a user might want to set something for +# compiling this module distribution only via the setup.py command +# line, whatever. As long as these options come from something on the +# current system, they can be as system-dependent as they like, and we +# should just happily stuff them into the preprocessor/compiler/linker +# options and carry on. + + +def _split_env(cmd): + """ + For macOS, split command into 'env' portion (if any) + and the rest of the linker command. + + >>> _split_env(['a', 'b', 'c']) + ([], ['a', 'b', 'c']) + >>> _split_env(['/usr/bin/env', 'A=3', 'gcc']) + (['/usr/bin/env', 'A=3'], ['gcc']) + """ + pivot = 0 + if os.path.basename(cmd[0]) == "env": + pivot = 1 + while '=' in cmd[pivot]: + pivot += 1 + return cmd[:pivot], cmd[pivot:] + + +def _split_aix(cmd): + """ + AIX platforms prefix the compiler with the ld_so_aix + script, so split that from the linker command. + + >>> _split_aix(['a', 'b', 'c']) + ([], ['a', 'b', 'c']) + >>> _split_aix(['/bin/foo/ld_so_aix', 'gcc']) + (['/bin/foo/ld_so_aix'], ['gcc']) + """ + pivot = os.path.basename(cmd[0]) == 'ld_so_aix' + return cmd[:pivot], cmd[pivot:] + + +def _linker_params(linker_cmd, compiler_cmd): + """ + The linker command usually begins with the compiler + command (possibly multiple elements), followed by zero or more + params for shared library building. + + If the LDSHARED env variable overrides the linker command, + however, the commands may not match. + + Return the best guess of the linker parameters by stripping + the linker command. If the compiler command does not + match the linker command, assume the linker command is + just the first element. + + >>> _linker_params('gcc foo bar'.split(), ['gcc']) + ['foo', 'bar'] + >>> _linker_params('gcc foo bar'.split(), ['other']) + ['foo', 'bar'] + >>> _linker_params('ccache gcc foo bar'.split(), 'ccache gcc'.split()) + ['foo', 'bar'] + >>> _linker_params(['gcc'], ['gcc']) + [] + """ + c_len = len(compiler_cmd) + pivot = c_len if linker_cmd[:c_len] == compiler_cmd else 1 + return linker_cmd[pivot:] + + +class UnixCCompiler(CCompiler): + compiler_type = 'unix' + + # These are used by CCompiler in two places: the constructor sets + # instance attributes 'preprocessor', 'compiler', etc. from them, and + # 'set_executable()' allows any of these to be set. The defaults here + # are pretty generic; they will probably have to be set by an outsider + # (eg. using information discovered by the sysconfig about building + # Python extensions). + executables = { + 'preprocessor': None, + 'compiler': ["cc"], + 'compiler_so': ["cc"], + 'compiler_cxx': ["c++"], + 'compiler_so_cxx': ["c++"], + 'linker_so': ["cc", "-shared"], + 'linker_so_cxx': ["c++", "-shared"], + 'linker_exe': ["cc"], + 'linker_exe_cxx': ["c++", "-shared"], + 'archiver': ["ar", "-cr"], + 'ranlib': None, + } + + if sys.platform[:6] == "darwin": + executables['ranlib'] = ["ranlib"] + + # Needed for the filename generation methods provided by the base + # class, CCompiler. NB. whoever instantiates/uses a particular + # UnixCCompiler instance should set 'shared_lib_ext' -- we set a + # reasonable common default here, but it's not necessarily used on all + # Unices! + + src_extensions = [".c", ".C", ".cc", ".cxx", ".cpp", ".m"] + obj_extension = ".o" + static_lib_extension = ".a" + shared_lib_extension = ".so" + dylib_lib_extension = ".dylib" + xcode_stub_lib_extension = ".tbd" + static_lib_format = shared_lib_format = dylib_lib_format = "lib%s%s" + xcode_stub_lib_format = dylib_lib_format + if sys.platform == "cygwin": + exe_extension = ".exe" + shared_lib_extension = ".dll.a" + dylib_lib_extension = ".dll" + dylib_lib_format = "cyg%s%s" + + def preprocess( + self, + source, + output_file=None, + macros=None, + include_dirs=None, + extra_preargs=None, + extra_postargs=None, + ): + fixed_args = self._fix_compile_args(None, macros, include_dirs) + ignore, macros, include_dirs = fixed_args + pp_opts = gen_preprocess_options(macros, include_dirs) + pp_args = self.preprocessor + pp_opts + if output_file: + pp_args.extend(['-o', output_file]) + if extra_preargs: + pp_args[:0] = extra_preargs + if extra_postargs: + pp_args.extend(extra_postargs) + pp_args.append(source) + + # reasons to preprocess: + # - force is indicated + # - output is directed to stdout + # - source file is newer than the target + preprocess = self.force or output_file is None or newer(source, output_file) + if not preprocess: + return + + if output_file: + self.mkpath(os.path.dirname(output_file)) + + try: + self.spawn(pp_args) + except DistutilsExecError as msg: + raise CompileError(msg) + + def _compile(self, obj, src, ext, cc_args, extra_postargs, pp_opts): + compiler_so = compiler_fixup(self.compiler_so, cc_args + extra_postargs) + compiler_so_cxx = compiler_fixup(self.compiler_so_cxx, cc_args + extra_postargs) + try: + if self.detect_language(src) == 'c++': + self.spawn( + compiler_so_cxx + cc_args + [src, '-o', obj] + extra_postargs + ) + else: + self.spawn(compiler_so + cc_args + [src, '-o', obj] + extra_postargs) + except DistutilsExecError as msg: + raise CompileError(msg) + + def create_static_lib( + self, objects, output_libname, output_dir=None, debug=False, target_lang=None + ): + objects, output_dir = self._fix_object_args(objects, output_dir) + + output_filename = self.library_filename(output_libname, output_dir=output_dir) + + if self._need_link(objects, output_filename): + self.mkpath(os.path.dirname(output_filename)) + self.spawn(self.archiver + [output_filename] + objects + self.objects) + + # Not many Unices required ranlib anymore -- SunOS 4.x is, I + # think the only major Unix that does. Maybe we need some + # platform intelligence here to skip ranlib if it's not + # needed -- or maybe Python's configure script took care of + # it for us, hence the check for leading colon. + if self.ranlib: + try: + self.spawn(self.ranlib + [output_filename]) + except DistutilsExecError as msg: + raise LibError(msg) + else: + log.debug("skipping %s (up-to-date)", output_filename) + + def link( + self, + target_desc, + objects, + output_filename, + output_dir=None, + libraries=None, + library_dirs=None, + runtime_library_dirs=None, + export_symbols=None, + debug=False, + extra_preargs=None, + extra_postargs=None, + build_temp=None, + target_lang=None, + ): + objects, output_dir = self._fix_object_args(objects, output_dir) + fixed_args = self._fix_lib_args(libraries, library_dirs, runtime_library_dirs) + libraries, library_dirs, runtime_library_dirs = fixed_args + + lib_opts = gen_lib_options(self, library_dirs, runtime_library_dirs, libraries) + if not isinstance(output_dir, (str, type(None))): + raise TypeError("'output_dir' must be a string or None") + if output_dir is not None: + output_filename = os.path.join(output_dir, output_filename) + + if self._need_link(objects, output_filename): + ld_args = objects + self.objects + lib_opts + ['-o', output_filename] + if debug: + ld_args[:0] = ['-g'] + if extra_preargs: + ld_args[:0] = extra_preargs + if extra_postargs: + ld_args.extend(extra_postargs) + self.mkpath(os.path.dirname(output_filename)) + try: + # Select a linker based on context: linker_exe when + # building an executable or linker_so (with shared options) + # when building a shared library. + building_exe = target_desc == CCompiler.EXECUTABLE + linker = ( + self.linker_exe + if building_exe + else ( + self.linker_so_cxx if target_lang == "c++" else self.linker_so + ) + )[:] + + if target_lang == "c++" and self.compiler_cxx: + env, linker_ne = _split_env(linker) + aix, linker_na = _split_aix(linker_ne) + _, compiler_cxx_ne = _split_env(self.compiler_cxx) + _, linker_exe_ne = _split_env(self.linker_exe) + + params = _linker_params(linker_na, linker_exe_ne) + linker = env + aix + compiler_cxx_ne + params + + linker = compiler_fixup(linker, ld_args) + + self.spawn(linker + ld_args) + except DistutilsExecError as msg: + raise LinkError(msg) + else: + log.debug("skipping %s (up-to-date)", output_filename) + + # -- Miscellaneous methods ----------------------------------------- + # These are all used by the 'gen_lib_options() function, in + # ccompiler.py. + + def library_dir_option(self, dir): + return "-L" + dir + + def _is_gcc(self): + cc_var = sysconfig.get_config_var("CC") + compiler = os.path.basename(shlex.split(cc_var)[0]) + return "gcc" in compiler or "g++" in compiler + + def runtime_library_dir_option(self, dir: str) -> str | list[str]: + # XXX Hackish, at the very least. See Python bug #445902: + # https://bugs.python.org/issue445902 + # Linkers on different platforms need different options to + # specify that directories need to be added to the list of + # directories searched for dependencies when a dynamic library + # is sought. GCC on GNU systems (Linux, FreeBSD, ...) has to + # be told to pass the -R option through to the linker, whereas + # other compilers and gcc on other systems just know this. + # Other compilers may need something slightly different. At + # this time, there's no way to determine this information from + # the configuration data stored in the Python installation, so + # we use this hack. + if sys.platform[:6] == "darwin": + from distutils.util import get_macosx_target_ver, split_version + + macosx_target_ver = get_macosx_target_ver() + if macosx_target_ver and split_version(macosx_target_ver) >= [10, 5]: + return "-Wl,-rpath," + dir + else: # no support for -rpath on earlier macOS versions + return "-L" + dir + elif sys.platform[:7] == "freebsd": + return "-Wl,-rpath=" + dir + elif sys.platform[:5] == "hp-ux": + return [ + "-Wl,+s" if self._is_gcc() else "+s", + "-L" + dir, + ] + + # For all compilers, `-Wl` is the presumed way to pass a + # compiler option to the linker + if sysconfig.get_config_var("GNULD") == "yes": + return consolidate_linker_args([ + # Force RUNPATH instead of RPATH + "-Wl,--enable-new-dtags", + "-Wl,-rpath," + dir, + ]) + else: + return "-Wl,-R" + dir + + def library_option(self, lib): + return "-l" + lib + + @staticmethod + def _library_root(dir): + """ + macOS users can specify an alternate SDK using'-isysroot'. + Calculate the SDK root if it is specified. + + Note that, as of Xcode 7, Apple SDKs may contain textual stub + libraries with .tbd extensions rather than the normal .dylib + shared libraries installed in /. The Apple compiler tool + chain handles this transparently but it can cause problems + for programs that are being built with an SDK and searching + for specific libraries. Callers of find_library_file need to + keep in mind that the base filename of the returned SDK library + file might have a different extension from that of the library + file installed on the running system, for example: + /Applications/Xcode.app/Contents/Developer/Platforms/ + MacOSX.platform/Developer/SDKs/MacOSX10.11.sdk/ + usr/lib/libedit.tbd + vs + /usr/lib/libedit.dylib + """ + cflags = sysconfig.get_config_var('CFLAGS') + match = re.search(r'-isysroot\s*(\S+)', cflags) + + apply_root = ( + sys.platform == 'darwin' + and match + and ( + dir.startswith('/System/') + or (dir.startswith('/usr/') and not dir.startswith('/usr/local/')) + ) + ) + + return os.path.join(match.group(1), dir[1:]) if apply_root else dir + + def find_library_file(self, dirs, lib, debug=False): + """ + Second-guess the linker with not much hard + data to go on: GCC seems to prefer the shared library, so + assume that *all* Unix C compilers do, + ignoring even GCC's "-static" option. + """ + lib_names = ( + self.library_filename(lib, lib_type=type) + for type in 'dylib xcode_stub shared static'.split() + ) + + roots = map(self._library_root, dirs) + + searched = itertools.starmap(os.path.join, itertools.product(roots, lib_names)) + + found = filter(os.path.exists, searched) + + # Return None if it could not be found in any dir. + return next(found, None) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/setuptools/_distutils/util.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/setuptools/_distutils/util.py new file mode 100644 index 0000000000000000000000000000000000000000..83ad39e958de67e323b4f92406be623f4aa2da8b --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/setuptools/_distutils/util.py @@ -0,0 +1,502 @@ +"""distutils.util + +Miscellaneous utility functions -- anything that doesn't fit into +one of the other *util.py modules. +""" + +from __future__ import annotations + +import functools +import importlib.util +import os +import pathlib +import re +import string +import subprocess +import sys +import sysconfig +import tempfile + +from jaraco.functools import pass_none + +from ._log import log +from ._modified import newer +from .errors import DistutilsByteCompileError, DistutilsPlatformError +from .spawn import spawn + + +def get_host_platform() -> str: + """ + Return a string that identifies the current platform. Use this + function to distinguish platform-specific build directories and + platform-specific built distributions. + """ + + # This function initially exposed platforms as defined in Python 3.9 + # even with older Python versions when distutils was split out. + # Now it delegates to stdlib sysconfig. + + return sysconfig.get_platform() + + +def get_platform(): + if os.name == 'nt': + TARGET_TO_PLAT = { + 'x86': 'win32', + 'x64': 'win-amd64', + 'arm': 'win-arm32', + 'arm64': 'win-arm64', + } + target = os.environ.get('VSCMD_ARG_TGT_ARCH') + return TARGET_TO_PLAT.get(target) or get_host_platform() + return get_host_platform() + + +if sys.platform == 'darwin': + _syscfg_macosx_ver = None # cache the version pulled from sysconfig +MACOSX_VERSION_VAR = 'MACOSX_DEPLOYMENT_TARGET' + + +def _clear_cached_macosx_ver(): + """For testing only. Do not call.""" + global _syscfg_macosx_ver + _syscfg_macosx_ver = None + + +def get_macosx_target_ver_from_syscfg(): + """Get the version of macOS latched in the Python interpreter configuration. + Returns the version as a string or None if can't obtain one. Cached.""" + global _syscfg_macosx_ver + if _syscfg_macosx_ver is None: + from distutils import sysconfig + + ver = sysconfig.get_config_var(MACOSX_VERSION_VAR) or '' + if ver: + _syscfg_macosx_ver = ver + return _syscfg_macosx_ver + + +def get_macosx_target_ver(): + """Return the version of macOS for which we are building. + + The target version defaults to the version in sysconfig latched at time + the Python interpreter was built, unless overridden by an environment + variable. If neither source has a value, then None is returned""" + + syscfg_ver = get_macosx_target_ver_from_syscfg() + env_ver = os.environ.get(MACOSX_VERSION_VAR) + + if env_ver: + # Validate overridden version against sysconfig version, if have both. + # Ensure that the deployment target of the build process is not less + # than 10.3 if the interpreter was built for 10.3 or later. This + # ensures extension modules are built with correct compatibility + # values, specifically LDSHARED which can use + # '-undefined dynamic_lookup' which only works on >= 10.3. + if ( + syscfg_ver + and split_version(syscfg_ver) >= [10, 3] + and split_version(env_ver) < [10, 3] + ): + my_msg = ( + '$' + MACOSX_VERSION_VAR + ' mismatch: ' + f'now "{env_ver}" but "{syscfg_ver}" during configure; ' + 'must use 10.3 or later' + ) + raise DistutilsPlatformError(my_msg) + return env_ver + return syscfg_ver + + +def split_version(s): + """Convert a dot-separated string into a list of numbers for comparisons""" + return [int(n) for n in s.split('.')] + + +@pass_none +def convert_path(pathname: str | os.PathLike) -> str: + r""" + Allow for pathlib.Path inputs, coax to a native path string. + + If None is passed, will just pass it through as + Setuptools relies on this behavior. + + >>> convert_path(None) is None + True + + Removes empty paths. + + >>> convert_path('foo/./bar').replace('\\', '/') + 'foo/bar' + """ + return os.fspath(pathlib.PurePath(pathname)) + + +def change_root(new_root, pathname): + """Return 'pathname' with 'new_root' prepended. If 'pathname' is + relative, this is equivalent to "os.path.join(new_root,pathname)". + Otherwise, it requires making 'pathname' relative and then joining the + two, which is tricky on DOS/Windows and Mac OS. + """ + if os.name == 'posix': + if not os.path.isabs(pathname): + return os.path.join(new_root, pathname) + else: + return os.path.join(new_root, pathname[1:]) + + elif os.name == 'nt': + (drive, path) = os.path.splitdrive(pathname) + if path[0] == os.sep: + path = path[1:] + return os.path.join(new_root, path) + + raise DistutilsPlatformError(f"nothing known about platform '{os.name}'") + + +@functools.lru_cache +def check_environ(): + """Ensure that 'os.environ' has all the environment variables we + guarantee that users can use in config files, command-line options, + etc. Currently this includes: + HOME - user's home directory (Unix only) + PLAT - description of the current platform, including hardware + and OS (see 'get_platform()') + """ + if os.name == 'posix' and 'HOME' not in os.environ: + try: + import pwd + + os.environ['HOME'] = pwd.getpwuid(os.getuid())[5] + except (ImportError, KeyError): + # bpo-10496: if the current user identifier doesn't exist in the + # password database, do nothing + pass + + if 'PLAT' not in os.environ: + os.environ['PLAT'] = get_platform() + + +def subst_vars(s, local_vars): + """ + Perform variable substitution on 'string'. + Variables are indicated by format-style braces ("{var}"). + Variable is substituted by the value found in the 'local_vars' + dictionary or in 'os.environ' if it's not in 'local_vars'. + 'os.environ' is first checked/augmented to guarantee that it contains + certain values: see 'check_environ()'. Raise ValueError for any + variables not found in either 'local_vars' or 'os.environ'. + """ + check_environ() + lookup = dict(os.environ) + lookup.update((name, str(value)) for name, value in local_vars.items()) + try: + return _subst_compat(s).format_map(lookup) + except KeyError as var: + raise ValueError(f"invalid variable {var}") + + +def _subst_compat(s): + """ + Replace shell/Perl-style variable substitution with + format-style. For compatibility. + """ + + def _subst(match): + return f'{{{match.group(1)}}}' + + repl = re.sub(r'\$([a-zA-Z_][a-zA-Z_0-9]*)', _subst, s) + if repl != s: + import warnings + + warnings.warn( + "shell/Perl-style substitutions are deprecated", + DeprecationWarning, + ) + return repl + + +def grok_environment_error(exc, prefix="error: "): + # Function kept for backward compatibility. + # Used to try clever things with EnvironmentErrors, + # but nowadays str(exception) produces good messages. + return prefix + str(exc) + + +# Needed by 'split_quoted()' +_wordchars_re = _squote_re = _dquote_re = None + + +def _init_regex(): + global _wordchars_re, _squote_re, _dquote_re + _wordchars_re = re.compile(rf'[^\\\'\"{string.whitespace} ]*') + _squote_re = re.compile(r"'(?:[^'\\]|\\.)*'") + _dquote_re = re.compile(r'"(?:[^"\\]|\\.)*"') + + +def split_quoted(s): + """Split a string up according to Unix shell-like rules for quotes and + backslashes. In short: words are delimited by spaces, as long as those + spaces are not escaped by a backslash, or inside a quoted string. + Single and double quotes are equivalent, and the quote characters can + be backslash-escaped. The backslash is stripped from any two-character + escape sequence, leaving only the escaped character. The quote + characters are stripped from any quoted string. Returns a list of + words. + """ + + # This is a nice algorithm for splitting up a single string, since it + # doesn't require character-by-character examination. It was a little + # bit of a brain-bender to get it working right, though... + if _wordchars_re is None: + _init_regex() + + s = s.strip() + words = [] + pos = 0 + + while s: + m = _wordchars_re.match(s, pos) + end = m.end() + if end == len(s): + words.append(s[:end]) + break + + if s[end] in string.whitespace: + # unescaped, unquoted whitespace: now + # we definitely have a word delimiter + words.append(s[:end]) + s = s[end:].lstrip() + pos = 0 + + elif s[end] == '\\': + # preserve whatever is being escaped; + # will become part of the current word + s = s[:end] + s[end + 1 :] + pos = end + 1 + + else: + if s[end] == "'": # slurp singly-quoted string + m = _squote_re.match(s, end) + elif s[end] == '"': # slurp doubly-quoted string + m = _dquote_re.match(s, end) + else: + raise RuntimeError(f"this can't happen (bad char '{s[end]}')") + + if m is None: + raise ValueError(f"bad string (mismatched {s[end]} quotes?)") + + (beg, end) = m.span() + s = s[:beg] + s[beg + 1 : end - 1] + s[end:] + pos = m.end() - 2 + + if pos >= len(s): + words.append(s) + break + + return words + + +# split_quoted () + + +def execute(func, args, msg=None, verbose=False, dry_run=False): + """Perform some action that affects the outside world (eg. by + writing to the filesystem). Such actions are special because they + are disabled by the 'dry_run' flag. This method takes care of all + that bureaucracy for you; all you have to do is supply the + function to call and an argument tuple for it (to embody the + "external action" being performed), and an optional message to + print. + """ + if msg is None: + msg = f"{func.__name__}{args!r}" + if msg[-2:] == ',)': # correct for singleton tuple + msg = msg[0:-2] + ')' + + log.info(msg) + if not dry_run: + func(*args) + + +def strtobool(val): + """Convert a string representation of truth to true (1) or false (0). + + True values are 'y', 'yes', 't', 'true', 'on', and '1'; false values + are 'n', 'no', 'f', 'false', 'off', and '0'. Raises ValueError if + 'val' is anything else. + """ + val = val.lower() + if val in ('y', 'yes', 't', 'true', 'on', '1'): + return 1 + elif val in ('n', 'no', 'f', 'false', 'off', '0'): + return 0 + else: + raise ValueError(f"invalid truth value {val!r}") + + +def byte_compile( # noqa: C901 + py_files, + optimize=0, + force=False, + prefix=None, + base_dir=None, + verbose=True, + dry_run=False, + direct=None, +): + """Byte-compile a collection of Python source files to .pyc + files in a __pycache__ subdirectory. 'py_files' is a list + of files to compile; any files that don't end in ".py" are silently + skipped. 'optimize' must be one of the following: + 0 - don't optimize + 1 - normal optimization (like "python -O") + 2 - extra optimization (like "python -OO") + If 'force' is true, all files are recompiled regardless of + timestamps. + + The source filename encoded in each bytecode file defaults to the + filenames listed in 'py_files'; you can modify these with 'prefix' and + 'basedir'. 'prefix' is a string that will be stripped off of each + source filename, and 'base_dir' is a directory name that will be + prepended (after 'prefix' is stripped). You can supply either or both + (or neither) of 'prefix' and 'base_dir', as you wish. + + If 'dry_run' is true, doesn't actually do anything that would + affect the filesystem. + + Byte-compilation is either done directly in this interpreter process + with the standard py_compile module, or indirectly by writing a + temporary script and executing it. Normally, you should let + 'byte_compile()' figure out to use direct compilation or not (see + the source for details). The 'direct' flag is used by the script + generated in indirect mode; unless you know what you're doing, leave + it set to None. + """ + + # nothing is done if sys.dont_write_bytecode is True + if sys.dont_write_bytecode: + raise DistutilsByteCompileError('byte-compiling is disabled.') + + # First, if the caller didn't force us into direct or indirect mode, + # figure out which mode we should be in. We take a conservative + # approach: choose direct mode *only* if the current interpreter is + # in debug mode and optimize is 0. If we're not in debug mode (-O + # or -OO), we don't know which level of optimization this + # interpreter is running with, so we can't do direct + # byte-compilation and be certain that it's the right thing. Thus, + # always compile indirectly if the current interpreter is in either + # optimize mode, or if either optimization level was requested by + # the caller. + if direct is None: + direct = __debug__ and optimize == 0 + + # "Indirect" byte-compilation: write a temporary script and then + # run it with the appropriate flags. + if not direct: + (script_fd, script_name) = tempfile.mkstemp(".py") + log.info("writing byte-compilation script '%s'", script_name) + if not dry_run: + script = os.fdopen(script_fd, "w", encoding='utf-8') + + with script: + script.write( + """\ +from distutils.util import byte_compile +files = [ +""" + ) + + # XXX would be nice to write absolute filenames, just for + # safety's sake (script should be more robust in the face of + # chdir'ing before running it). But this requires abspath'ing + # 'prefix' as well, and that breaks the hack in build_lib's + # 'byte_compile()' method that carefully tacks on a trailing + # slash (os.sep really) to make sure the prefix here is "just + # right". This whole prefix business is rather delicate -- the + # problem is that it's really a directory, but I'm treating it + # as a dumb string, so trailing slashes and so forth matter. + + script.write(",\n".join(map(repr, py_files)) + "]\n") + script.write( + f""" +byte_compile(files, optimize={optimize!r}, force={force!r}, + prefix={prefix!r}, base_dir={base_dir!r}, + verbose={verbose!r}, dry_run=False, + direct=True) +""" + ) + + cmd = [sys.executable] + cmd.extend(subprocess._optim_args_from_interpreter_flags()) + cmd.append(script_name) + spawn(cmd, dry_run=dry_run) + execute(os.remove, (script_name,), f"removing {script_name}", dry_run=dry_run) + + # "Direct" byte-compilation: use the py_compile module to compile + # right here, right now. Note that the script generated in indirect + # mode simply calls 'byte_compile()' in direct mode, a weird sort of + # cross-process recursion. Hey, it works! + else: + from py_compile import compile + + for file in py_files: + if file[-3:] != ".py": + # This lets us be lazy and not filter filenames in + # the "install_lib" command. + continue + + # Terminology from the py_compile module: + # cfile - byte-compiled file + # dfile - purported source filename (same as 'file' by default) + if optimize >= 0: + opt = '' if optimize == 0 else optimize + cfile = importlib.util.cache_from_source(file, optimization=opt) + else: + cfile = importlib.util.cache_from_source(file) + dfile = file + if prefix: + if file[: len(prefix)] != prefix: + raise ValueError( + f"invalid prefix: filename {file!r} doesn't start with {prefix!r}" + ) + dfile = dfile[len(prefix) :] + if base_dir: + dfile = os.path.join(base_dir, dfile) + + cfile_base = os.path.basename(cfile) + if direct: + if force or newer(file, cfile): + log.info("byte-compiling %s to %s", file, cfile_base) + if not dry_run: + compile(file, cfile, dfile) + else: + log.debug("skipping byte-compilation of %s to %s", file, cfile_base) + + +def rfc822_escape(header): + """Return a version of the string escaped for inclusion in an + RFC-822 header, by ensuring there are 8 spaces space after each newline. + """ + indent = 8 * " " + lines = header.splitlines(keepends=True) + + # Emulate the behaviour of `str.split` + # (the terminal line break in `splitlines` does not result in an extra line): + ends_in_newline = lines and lines[-1].splitlines()[0] != lines[-1] + suffix = indent if ends_in_newline else "" + + return indent.join(lines) + suffix + + +def is_mingw(): + """Returns True if the current platform is mingw. + + Python compiled with Mingw-w64 has sys.platform == 'win32' and + get_platform() starts with 'mingw'. + """ + return sys.platform == 'win32' and get_platform().startswith('mingw') + + +def is_freethreaded(): + """Return True if the Python interpreter is built with free threading support.""" + return bool(sysconfig.get_config_var('Py_GIL_DISABLED')) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/setuptools/_distutils/version.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/setuptools/_distutils/version.py new file mode 100644 index 0000000000000000000000000000000000000000..2223ee9c8cab2495034a57d4585625feb81a8330 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/setuptools/_distutils/version.py @@ -0,0 +1,348 @@ +# +# distutils/version.py +# +# Implements multiple version numbering conventions for the +# Python Module Distribution Utilities. +# +# $Id$ +# + +"""Provides classes to represent module version numbers (one class for +each style of version numbering). There are currently two such classes +implemented: StrictVersion and LooseVersion. + +Every version number class implements the following interface: + * the 'parse' method takes a string and parses it to some internal + representation; if the string is an invalid version number, + 'parse' raises a ValueError exception + * the class constructor takes an optional string argument which, + if supplied, is passed to 'parse' + * __str__ reconstructs the string that was passed to 'parse' (or + an equivalent string -- ie. one that will generate an equivalent + version number instance) + * __repr__ generates Python code to recreate the version number instance + * _cmp compares the current instance with either another instance + of the same class or a string (which will be parsed to an instance + of the same class, thus must follow the same rules) +""" + +import contextlib +import re +import warnings + + +@contextlib.contextmanager +def suppress_known_deprecation(): + with warnings.catch_warnings(record=True) as ctx: + warnings.filterwarnings( + action='default', + category=DeprecationWarning, + message="distutils Version classes are deprecated.", + ) + yield ctx + + +class Version: + """Abstract base class for version numbering classes. Just provides + constructor (__init__) and reproducer (__repr__), because those + seem to be the same for all version numbering classes; and route + rich comparisons to _cmp. + """ + + def __init__(self, vstring=None): + if vstring: + self.parse(vstring) + warnings.warn( + "distutils Version classes are deprecated. Use packaging.version instead.", + DeprecationWarning, + stacklevel=2, + ) + + def __repr__(self): + return f"{self.__class__.__name__} ('{self}')" + + def __eq__(self, other): + c = self._cmp(other) + if c is NotImplemented: + return c + return c == 0 + + def __lt__(self, other): + c = self._cmp(other) + if c is NotImplemented: + return c + return c < 0 + + def __le__(self, other): + c = self._cmp(other) + if c is NotImplemented: + return c + return c <= 0 + + def __gt__(self, other): + c = self._cmp(other) + if c is NotImplemented: + return c + return c > 0 + + def __ge__(self, other): + c = self._cmp(other) + if c is NotImplemented: + return c + return c >= 0 + + +# Interface for version-number classes -- must be implemented +# by the following classes (the concrete ones -- Version should +# be treated as an abstract class). +# __init__ (string) - create and take same action as 'parse' +# (string parameter is optional) +# parse (string) - convert a string representation to whatever +# internal representation is appropriate for +# this style of version numbering +# __str__ (self) - convert back to a string; should be very similar +# (if not identical to) the string supplied to parse +# __repr__ (self) - generate Python code to recreate +# the instance +# _cmp (self, other) - compare two version numbers ('other' may +# be an unparsed version string, or another +# instance of your version class) + + +class StrictVersion(Version): + """Version numbering for anal retentives and software idealists. + Implements the standard interface for version number classes as + described above. A version number consists of two or three + dot-separated numeric components, with an optional "pre-release" tag + on the end. The pre-release tag consists of the letter 'a' or 'b' + followed by a number. If the numeric components of two version + numbers are equal, then one with a pre-release tag will always + be deemed earlier (lesser) than one without. + + The following are valid version numbers (shown in the order that + would be obtained by sorting according to the supplied cmp function): + + 0.4 0.4.0 (these two are equivalent) + 0.4.1 + 0.5a1 + 0.5b3 + 0.5 + 0.9.6 + 1.0 + 1.0.4a3 + 1.0.4b1 + 1.0.4 + + The following are examples of invalid version numbers: + + 1 + 2.7.2.2 + 1.3.a4 + 1.3pl1 + 1.3c4 + + The rationale for this version numbering system will be explained + in the distutils documentation. + """ + + version_re = re.compile( + r'^(\d+) \. (\d+) (\. (\d+))? ([ab](\d+))?$', re.VERBOSE | re.ASCII + ) + + def parse(self, vstring): + match = self.version_re.match(vstring) + if not match: + raise ValueError(f"invalid version number '{vstring}'") + + (major, minor, patch, prerelease, prerelease_num) = match.group(1, 2, 4, 5, 6) + + if patch: + self.version = tuple(map(int, [major, minor, patch])) + else: + self.version = tuple(map(int, [major, minor])) + (0,) + + if prerelease: + self.prerelease = (prerelease[0], int(prerelease_num)) + else: + self.prerelease = None + + def __str__(self): + if self.version[2] == 0: + vstring = '.'.join(map(str, self.version[0:2])) + else: + vstring = '.'.join(map(str, self.version)) + + if self.prerelease: + vstring = vstring + self.prerelease[0] + str(self.prerelease[1]) + + return vstring + + def _cmp(self, other): + if isinstance(other, str): + with suppress_known_deprecation(): + other = StrictVersion(other) + elif not isinstance(other, StrictVersion): + return NotImplemented + + if self.version == other.version: + # versions match; pre-release drives the comparison + return self._cmp_prerelease(other) + + return -1 if self.version < other.version else 1 + + def _cmp_prerelease(self, other): + """ + case 1: self has prerelease, other doesn't; other is greater + case 2: self doesn't have prerelease, other does: self is greater + case 3: both or neither have prerelease: compare them! + """ + if self.prerelease and not other.prerelease: + return -1 + elif not self.prerelease and other.prerelease: + return 1 + + if self.prerelease == other.prerelease: + return 0 + elif self.prerelease < other.prerelease: + return -1 + else: + return 1 + + +# end class StrictVersion + + +# The rules according to Greg Stein: +# 1) a version number has 1 or more numbers separated by a period or by +# sequences of letters. If only periods, then these are compared +# left-to-right to determine an ordering. +# 2) sequences of letters are part of the tuple for comparison and are +# compared lexicographically +# 3) recognize the numeric components may have leading zeroes +# +# The LooseVersion class below implements these rules: a version number +# string is split up into a tuple of integer and string components, and +# comparison is a simple tuple comparison. This means that version +# numbers behave in a predictable and obvious way, but a way that might +# not necessarily be how people *want* version numbers to behave. There +# wouldn't be a problem if people could stick to purely numeric version +# numbers: just split on period and compare the numbers as tuples. +# However, people insist on putting letters into their version numbers; +# the most common purpose seems to be: +# - indicating a "pre-release" version +# ('alpha', 'beta', 'a', 'b', 'pre', 'p') +# - indicating a post-release patch ('p', 'pl', 'patch') +# but of course this can't cover all version number schemes, and there's +# no way to know what a programmer means without asking him. +# +# The problem is what to do with letters (and other non-numeric +# characters) in a version number. The current implementation does the +# obvious and predictable thing: keep them as strings and compare +# lexically within a tuple comparison. This has the desired effect if +# an appended letter sequence implies something "post-release": +# eg. "0.99" < "0.99pl14" < "1.0", and "5.001" < "5.001m" < "5.002". +# +# However, if letters in a version number imply a pre-release version, +# the "obvious" thing isn't correct. Eg. you would expect that +# "1.5.1" < "1.5.2a2" < "1.5.2", but under the tuple/lexical comparison +# implemented here, this just isn't so. +# +# Two possible solutions come to mind. The first is to tie the +# comparison algorithm to a particular set of semantic rules, as has +# been done in the StrictVersion class above. This works great as long +# as everyone can go along with bondage and discipline. Hopefully a +# (large) subset of Python module programmers will agree that the +# particular flavour of bondage and discipline provided by StrictVersion +# provides enough benefit to be worth using, and will submit their +# version numbering scheme to its domination. The free-thinking +# anarchists in the lot will never give in, though, and something needs +# to be done to accommodate them. +# +# Perhaps a "moderately strict" version class could be implemented that +# lets almost anything slide (syntactically), and makes some heuristic +# assumptions about non-digits in version number strings. This could +# sink into special-case-hell, though; if I was as talented and +# idiosyncratic as Larry Wall, I'd go ahead and implement a class that +# somehow knows that "1.2.1" < "1.2.2a2" < "1.2.2" < "1.2.2pl3", and is +# just as happy dealing with things like "2g6" and "1.13++". I don't +# think I'm smart enough to do it right though. +# +# In any case, I've coded the test suite for this module (see +# ../test/test_version.py) specifically to fail on things like comparing +# "1.2a2" and "1.2". That's not because the *code* is doing anything +# wrong, it's because the simple, obvious design doesn't match my +# complicated, hairy expectations for real-world version numbers. It +# would be a snap to fix the test suite to say, "Yep, LooseVersion does +# the Right Thing" (ie. the code matches the conception). But I'd rather +# have a conception that matches common notions about version numbers. + + +class LooseVersion(Version): + """Version numbering for anarchists and software realists. + Implements the standard interface for version number classes as + described above. A version number consists of a series of numbers, + separated by either periods or strings of letters. When comparing + version numbers, the numeric components will be compared + numerically, and the alphabetic components lexically. The following + are all valid version numbers, in no particular order: + + 1.5.1 + 1.5.2b2 + 161 + 3.10a + 8.02 + 3.4j + 1996.07.12 + 3.2.pl0 + 3.1.1.6 + 2g6 + 11g + 0.960923 + 2.2beta29 + 1.13++ + 5.5.kw + 2.0b1pl0 + + In fact, there is no such thing as an invalid version number under + this scheme; the rules for comparison are simple and predictable, + but may not always give the results you want (for some definition + of "want"). + """ + + component_re = re.compile(r'(\d+ | [a-z]+ | \.)', re.VERBOSE) + + def parse(self, vstring): + # I've given up on thinking I can reconstruct the version string + # from the parsed tuple -- so I just store the string here for + # use by __str__ + self.vstring = vstring + components = [x for x in self.component_re.split(vstring) if x and x != '.'] + for i, obj in enumerate(components): + try: + components[i] = int(obj) + except ValueError: + pass + + self.version = components + + def __str__(self): + return self.vstring + + def __repr__(self): + return f"LooseVersion ('{self}')" + + def _cmp(self, other): + if isinstance(other, str): + other = LooseVersion(other) + elif not isinstance(other, LooseVersion): + return NotImplemented + + if self.version == other.version: + return 0 + if self.version < other.version: + return -1 + if self.version > other.version: + return 1 + + +# end class LooseVersion diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/setuptools/_distutils/versionpredicate.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/setuptools/_distutils/versionpredicate.py new file mode 100644 index 0000000000000000000000000000000000000000..fe31b0ed8ef843080ea64df9f58a398d317434ad --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/setuptools/_distutils/versionpredicate.py @@ -0,0 +1,175 @@ +"""Module for parsing and testing package version predicate strings.""" + +import operator +import re + +from . import version + +re_validPackage = re.compile(r"(?i)^\s*([a-z_]\w*(?:\.[a-z_]\w*)*)(.*)", re.ASCII) +# (package) (rest) + +re_paren = re.compile(r"^\s*\((.*)\)\s*$") # (list) inside of parentheses +re_splitComparison = re.compile(r"^\s*(<=|>=|<|>|!=|==)\s*([^\s,]+)\s*$") +# (comp) (version) + + +def splitUp(pred): + """Parse a single version comparison. + + Return (comparison string, StrictVersion) + """ + res = re_splitComparison.match(pred) + if not res: + raise ValueError(f"bad package restriction syntax: {pred!r}") + comp, verStr = res.groups() + with version.suppress_known_deprecation(): + other = version.StrictVersion(verStr) + return (comp, other) + + +compmap = { + "<": operator.lt, + "<=": operator.le, + "==": operator.eq, + ">": operator.gt, + ">=": operator.ge, + "!=": operator.ne, +} + + +class VersionPredicate: + """Parse and test package version predicates. + + >>> v = VersionPredicate('pyepat.abc (>1.0, <3333.3a1, !=1555.1b3)') + + The `name` attribute provides the full dotted name that is given:: + + >>> v.name + 'pyepat.abc' + + The str() of a `VersionPredicate` provides a normalized + human-readable version of the expression:: + + >>> print(v) + pyepat.abc (> 1.0, < 3333.3a1, != 1555.1b3) + + The `satisfied_by()` method can be used to determine with a given + version number is included in the set described by the version + restrictions:: + + >>> v.satisfied_by('1.1') + True + >>> v.satisfied_by('1.4') + True + >>> v.satisfied_by('1.0') + False + >>> v.satisfied_by('4444.4') + False + >>> v.satisfied_by('1555.1b3') + False + + `VersionPredicate` is flexible in accepting extra whitespace:: + + >>> v = VersionPredicate(' pat( == 0.1 ) ') + >>> v.name + 'pat' + >>> v.satisfied_by('0.1') + True + >>> v.satisfied_by('0.2') + False + + If any version numbers passed in do not conform to the + restrictions of `StrictVersion`, a `ValueError` is raised:: + + >>> v = VersionPredicate('p1.p2.p3.p4(>=1.0, <=1.3a1, !=1.2zb3)') + Traceback (most recent call last): + ... + ValueError: invalid version number '1.2zb3' + + It the module or package name given does not conform to what's + allowed as a legal module or package name, `ValueError` is + raised:: + + >>> v = VersionPredicate('foo-bar') + Traceback (most recent call last): + ... + ValueError: expected parenthesized list: '-bar' + + >>> v = VersionPredicate('foo bar (12.21)') + Traceback (most recent call last): + ... + ValueError: expected parenthesized list: 'bar (12.21)' + + """ + + def __init__(self, versionPredicateStr): + """Parse a version predicate string.""" + # Fields: + # name: package name + # pred: list of (comparison string, StrictVersion) + + versionPredicateStr = versionPredicateStr.strip() + if not versionPredicateStr: + raise ValueError("empty package restriction") + match = re_validPackage.match(versionPredicateStr) + if not match: + raise ValueError(f"bad package name in {versionPredicateStr!r}") + self.name, paren = match.groups() + paren = paren.strip() + if paren: + match = re_paren.match(paren) + if not match: + raise ValueError(f"expected parenthesized list: {paren!r}") + str = match.groups()[0] + self.pred = [splitUp(aPred) for aPred in str.split(",")] + if not self.pred: + raise ValueError(f"empty parenthesized list in {versionPredicateStr!r}") + else: + self.pred = [] + + def __str__(self): + if self.pred: + seq = [cond + " " + str(ver) for cond, ver in self.pred] + return self.name + " (" + ", ".join(seq) + ")" + else: + return self.name + + def satisfied_by(self, version): + """True if version is compatible with all the predicates in self. + The parameter version must be acceptable to the StrictVersion + constructor. It may be either a string or StrictVersion. + """ + for cond, ver in self.pred: + if not compmap[cond](version, ver): + return False + return True + + +_provision_rx = None + + +def split_provision(value): + """Return the name and optional version number of a provision. + + The version number, if given, will be returned as a `StrictVersion` + instance, otherwise it will be `None`. + + >>> split_provision('mypkg') + ('mypkg', None) + >>> split_provision(' mypkg( 1.2 ) ') + ('mypkg', StrictVersion ('1.2')) + """ + global _provision_rx + if _provision_rx is None: + _provision_rx = re.compile( + r"([a-zA-Z_]\w*(?:\.[a-zA-Z_]\w*)*)(?:\s*\(\s*([^)\s]+)\s*\))?$", re.ASCII + ) + value = value.strip() + m = _provision_rx.match(value) + if not m: + raise ValueError(f"illegal provides specification: {value!r}") + ver = m.group(2) or None + if ver: + with version.suppress_known_deprecation(): + ver = version.StrictVersion(ver) + return m.group(1), ver diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/setuptools/_distutils/zosccompiler.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/setuptools/_distutils/zosccompiler.py new file mode 100644 index 0000000000000000000000000000000000000000..af1e7fa5cced7b7ac2f29f3d50c10abaeb111dc0 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/setuptools/_distutils/zosccompiler.py @@ -0,0 +1,229 @@ +"""distutils.zosccompiler + +Contains the selection of the c & c++ compilers on z/OS. There are several +different c compilers on z/OS, all of them are optional, so the correct +one needs to be chosen based on the users input. This is compatible with +the following compilers: + +IBM C/C++ For Open Enterprise Languages on z/OS 2.0 +IBM Open XL C/C++ 1.1 for z/OS +IBM XL C/C++ V2.4.1 for z/OS 2.4 and 2.5 +IBM z/OS XL C/C++ +""" + +import os + +from . import sysconfig +from .errors import CompileError, DistutilsExecError +from .unixccompiler import UnixCCompiler + +_cc_args = { + 'ibm-openxl': [ + '-m64', + '-fvisibility=default', + '-fzos-le-char-mode=ascii', + '-fno-short-enums', + ], + 'ibm-xlclang': [ + '-q64', + '-qexportall', + '-qascii', + '-qstrict', + '-qnocsect', + '-Wa,asa,goff', + '-Wa,xplink', + '-qgonumber', + '-qenum=int', + '-Wc,DLL', + ], + 'ibm-xlc': [ + '-q64', + '-qexportall', + '-qascii', + '-qstrict', + '-qnocsect', + '-Wa,asa,goff', + '-Wa,xplink', + '-qgonumber', + '-qenum=int', + '-Wc,DLL', + '-qlanglvl=extc99', + ], +} + +_cxx_args = { + 'ibm-openxl': [ + '-m64', + '-fvisibility=default', + '-fzos-le-char-mode=ascii', + '-fno-short-enums', + ], + 'ibm-xlclang': [ + '-q64', + '-qexportall', + '-qascii', + '-qstrict', + '-qnocsect', + '-Wa,asa,goff', + '-Wa,xplink', + '-qgonumber', + '-qenum=int', + '-Wc,DLL', + ], + 'ibm-xlc': [ + '-q64', + '-qexportall', + '-qascii', + '-qstrict', + '-qnocsect', + '-Wa,asa,goff', + '-Wa,xplink', + '-qgonumber', + '-qenum=int', + '-Wc,DLL', + '-qlanglvl=extended0x', + ], +} + +_asm_args = { + 'ibm-openxl': ['-fasm', '-fno-integrated-as', '-Wa,--ASA', '-Wa,--GOFF'], + 'ibm-xlclang': [], + 'ibm-xlc': [], +} + +_ld_args = { + 'ibm-openxl': [], + 'ibm-xlclang': ['-Wl,dll', '-q64'], + 'ibm-xlc': ['-Wl,dll', '-q64'], +} + + +# Python on z/OS is built with no compiler specific options in it's CFLAGS. +# But each compiler requires it's own specific options to build successfully, +# though some of the options are common between them +class zOSCCompiler(UnixCCompiler): + src_extensions = ['.c', '.C', '.cc', '.cxx', '.cpp', '.m', '.s'] + _cpp_extensions = ['.cc', '.cpp', '.cxx', '.C'] + _asm_extensions = ['.s'] + + def _get_zos_compiler_name(self): + zos_compiler_names = [ + os.path.basename(binary) + for envvar in ('CC', 'CXX', 'LDSHARED') + if (binary := os.environ.get(envvar, None)) + ] + if len(zos_compiler_names) == 0: + return 'ibm-openxl' + + zos_compilers = {} + for compiler in ( + 'ibm-clang', + 'ibm-clang64', + 'ibm-clang++', + 'ibm-clang++64', + 'clang', + 'clang++', + 'clang-14', + ): + zos_compilers[compiler] = 'ibm-openxl' + + for compiler in ('xlclang', 'xlclang++', 'njsc', 'njsc++'): + zos_compilers[compiler] = 'ibm-xlclang' + + for compiler in ('xlc', 'xlC', 'xlc++'): + zos_compilers[compiler] = 'ibm-xlc' + + return zos_compilers.get(zos_compiler_names[0], 'ibm-openxl') + + def __init__(self, verbose=False, dry_run=False, force=False): + super().__init__(verbose, dry_run, force) + self.zos_compiler = self._get_zos_compiler_name() + sysconfig.customize_compiler(self) + + def _compile(self, obj, src, ext, cc_args, extra_postargs, pp_opts): + local_args = [] + if ext in self._cpp_extensions: + compiler = self.compiler_cxx + local_args.extend(_cxx_args[self.zos_compiler]) + elif ext in self._asm_extensions: + compiler = self.compiler_so + local_args.extend(_cc_args[self.zos_compiler]) + local_args.extend(_asm_args[self.zos_compiler]) + else: + compiler = self.compiler_so + local_args.extend(_cc_args[self.zos_compiler]) + local_args.extend(cc_args) + + try: + self.spawn(compiler + local_args + [src, '-o', obj] + extra_postargs) + except DistutilsExecError as msg: + raise CompileError(msg) + + def runtime_library_dir_option(self, dir): + return '-L' + dir + + def link( + self, + target_desc, + objects, + output_filename, + output_dir=None, + libraries=None, + library_dirs=None, + runtime_library_dirs=None, + export_symbols=None, + debug=False, + extra_preargs=None, + extra_postargs=None, + build_temp=None, + target_lang=None, + ): + # For a built module to use functions from cpython, it needs to use Pythons + # side deck file. The side deck is located beside the libpython3.xx.so + ldversion = sysconfig.get_config_var('LDVERSION') + if sysconfig.python_build: + side_deck_path = os.path.join( + sysconfig.get_config_var('abs_builddir'), + f'libpython{ldversion}.x', + ) + else: + side_deck_path = os.path.join( + sysconfig.get_config_var('installed_base'), + sysconfig.get_config_var('platlibdir'), + f'libpython{ldversion}.x', + ) + + if os.path.exists(side_deck_path): + if extra_postargs: + extra_postargs.append(side_deck_path) + else: + extra_postargs = [side_deck_path] + + # Check and replace libraries included side deck files + if runtime_library_dirs: + for dir in runtime_library_dirs: + for library in libraries[:]: + library_side_deck = os.path.join(dir, f'{library}.x') + if os.path.exists(library_side_deck): + libraries.remove(library) + extra_postargs.append(library_side_deck) + break + + # Any required ld args for the given compiler + extra_postargs.extend(_ld_args[self.zos_compiler]) + + super().link( + target_desc, + objects, + output_filename, + output_dir, + libraries, + library_dirs, + runtime_library_dirs, + export_symbols, + debug, + extra_preargs, + extra_postargs, + build_temp, + target_lang, + ) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/setuptools/_vendor/typing_extensions.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/setuptools/_vendor/typing_extensions.py new file mode 100644 index 0000000000000000000000000000000000000000..dec429ca8723950097217d2140dd8daccb064e2d --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/setuptools/_vendor/typing_extensions.py @@ -0,0 +1,3641 @@ +import abc +import collections +import collections.abc +import contextlib +import functools +import inspect +import operator +import sys +import types as _types +import typing +import warnings + +__all__ = [ + # Super-special typing primitives. + 'Any', + 'ClassVar', + 'Concatenate', + 'Final', + 'LiteralString', + 'ParamSpec', + 'ParamSpecArgs', + 'ParamSpecKwargs', + 'Self', + 'Type', + 'TypeVar', + 'TypeVarTuple', + 'Unpack', + + # ABCs (from collections.abc). + 'Awaitable', + 'AsyncIterator', + 'AsyncIterable', + 'Coroutine', + 'AsyncGenerator', + 'AsyncContextManager', + 'Buffer', + 'ChainMap', + + # Concrete collection types. + 'ContextManager', + 'Counter', + 'Deque', + 'DefaultDict', + 'NamedTuple', + 'OrderedDict', + 'TypedDict', + + # Structural checks, a.k.a. protocols. + 'SupportsAbs', + 'SupportsBytes', + 'SupportsComplex', + 'SupportsFloat', + 'SupportsIndex', + 'SupportsInt', + 'SupportsRound', + + # One-off things. + 'Annotated', + 'assert_never', + 'assert_type', + 'clear_overloads', + 'dataclass_transform', + 'deprecated', + 'Doc', + 'get_overloads', + 'final', + 'get_args', + 'get_origin', + 'get_original_bases', + 'get_protocol_members', + 'get_type_hints', + 'IntVar', + 'is_protocol', + 'is_typeddict', + 'Literal', + 'NewType', + 'overload', + 'override', + 'Protocol', + 'reveal_type', + 'runtime', + 'runtime_checkable', + 'Text', + 'TypeAlias', + 'TypeAliasType', + 'TypeGuard', + 'TypeIs', + 'TYPE_CHECKING', + 'Never', + 'NoReturn', + 'ReadOnly', + 'Required', + 'NotRequired', + + # Pure aliases, have always been in typing + 'AbstractSet', + 'AnyStr', + 'BinaryIO', + 'Callable', + 'Collection', + 'Container', + 'Dict', + 'ForwardRef', + 'FrozenSet', + 'Generator', + 'Generic', + 'Hashable', + 'IO', + 'ItemsView', + 'Iterable', + 'Iterator', + 'KeysView', + 'List', + 'Mapping', + 'MappingView', + 'Match', + 'MutableMapping', + 'MutableSequence', + 'MutableSet', + 'NoDefault', + 'Optional', + 'Pattern', + 'Reversible', + 'Sequence', + 'Set', + 'Sized', + 'TextIO', + 'Tuple', + 'Union', + 'ValuesView', + 'cast', + 'no_type_check', + 'no_type_check_decorator', +] + +# for backward compatibility +PEP_560 = True +GenericMeta = type +_PEP_696_IMPLEMENTED = sys.version_info >= (3, 13, 0, "beta") + +# The functions below are modified copies of typing internal helpers. +# They are needed by _ProtocolMeta and they provide support for PEP 646. + + +class _Sentinel: + def __repr__(self): + return "" + + +_marker = _Sentinel() + + +if sys.version_info >= (3, 10): + def _should_collect_from_parameters(t): + return isinstance( + t, (typing._GenericAlias, _types.GenericAlias, _types.UnionType) + ) +elif sys.version_info >= (3, 9): + def _should_collect_from_parameters(t): + return isinstance(t, (typing._GenericAlias, _types.GenericAlias)) +else: + def _should_collect_from_parameters(t): + return isinstance(t, typing._GenericAlias) and not t._special + + +NoReturn = typing.NoReturn + +# Some unconstrained type variables. These are used by the container types. +# (These are not for export.) +T = typing.TypeVar('T') # Any type. +KT = typing.TypeVar('KT') # Key type. +VT = typing.TypeVar('VT') # Value type. +T_co = typing.TypeVar('T_co', covariant=True) # Any type covariant containers. +T_contra = typing.TypeVar('T_contra', contravariant=True) # Ditto contravariant. + + +if sys.version_info >= (3, 11): + from typing import Any +else: + + class _AnyMeta(type): + def __instancecheck__(self, obj): + if self is Any: + raise TypeError("typing_extensions.Any cannot be used with isinstance()") + return super().__instancecheck__(obj) + + def __repr__(self): + if self is Any: + return "typing_extensions.Any" + return super().__repr__() + + class Any(metaclass=_AnyMeta): + """Special type indicating an unconstrained type. + - Any is compatible with every type. + - Any assumed to have all methods. + - All values assumed to be instances of Any. + Note that all the above statements are true from the point of view of + static type checkers. At runtime, Any should not be used with instance + checks. + """ + def __new__(cls, *args, **kwargs): + if cls is Any: + raise TypeError("Any cannot be instantiated") + return super().__new__(cls, *args, **kwargs) + + +ClassVar = typing.ClassVar + + +class _ExtensionsSpecialForm(typing._SpecialForm, _root=True): + def __repr__(self): + return 'typing_extensions.' + self._name + + +Final = typing.Final + +if sys.version_info >= (3, 11): + final = typing.final +else: + # @final exists in 3.8+, but we backport it for all versions + # before 3.11 to keep support for the __final__ attribute. + # See https://bugs.python.org/issue46342 + def final(f): + """This decorator can be used to indicate to type checkers that + the decorated method cannot be overridden, and decorated class + cannot be subclassed. For example: + + class Base: + @final + def done(self) -> None: + ... + class Sub(Base): + def done(self) -> None: # Error reported by type checker + ... + @final + class Leaf: + ... + class Other(Leaf): # Error reported by type checker + ... + + There is no runtime checking of these properties. The decorator + sets the ``__final__`` attribute to ``True`` on the decorated object + to allow runtime introspection. + """ + try: + f.__final__ = True + except (AttributeError, TypeError): + # Skip the attribute silently if it is not writable. + # AttributeError happens if the object has __slots__ or a + # read-only property, TypeError if it's a builtin class. + pass + return f + + +def IntVar(name): + return typing.TypeVar(name) + + +# A Literal bug was fixed in 3.11.0, 3.10.1 and 3.9.8 +if sys.version_info >= (3, 10, 1): + Literal = typing.Literal +else: + def _flatten_literal_params(parameters): + """An internal helper for Literal creation: flatten Literals among parameters""" + params = [] + for p in parameters: + if isinstance(p, _LiteralGenericAlias): + params.extend(p.__args__) + else: + params.append(p) + return tuple(params) + + def _value_and_type_iter(params): + for p in params: + yield p, type(p) + + class _LiteralGenericAlias(typing._GenericAlias, _root=True): + def __eq__(self, other): + if not isinstance(other, _LiteralGenericAlias): + return NotImplemented + these_args_deduped = set(_value_and_type_iter(self.__args__)) + other_args_deduped = set(_value_and_type_iter(other.__args__)) + return these_args_deduped == other_args_deduped + + def __hash__(self): + return hash(frozenset(_value_and_type_iter(self.__args__))) + + class _LiteralForm(_ExtensionsSpecialForm, _root=True): + def __init__(self, doc: str): + self._name = 'Literal' + self._doc = self.__doc__ = doc + + def __getitem__(self, parameters): + if not isinstance(parameters, tuple): + parameters = (parameters,) + + parameters = _flatten_literal_params(parameters) + + val_type_pairs = list(_value_and_type_iter(parameters)) + try: + deduped_pairs = set(val_type_pairs) + except TypeError: + # unhashable parameters + pass + else: + # similar logic to typing._deduplicate on Python 3.9+ + if len(deduped_pairs) < len(val_type_pairs): + new_parameters = [] + for pair in val_type_pairs: + if pair in deduped_pairs: + new_parameters.append(pair[0]) + deduped_pairs.remove(pair) + assert not deduped_pairs, deduped_pairs + parameters = tuple(new_parameters) + + return _LiteralGenericAlias(self, parameters) + + Literal = _LiteralForm(doc="""\ + A type that can be used to indicate to type checkers + that the corresponding value has a value literally equivalent + to the provided parameter. For example: + + var: Literal[4] = 4 + + The type checker understands that 'var' is literally equal to + the value 4 and no other value. + + Literal[...] cannot be subclassed. There is no runtime + checking verifying that the parameter is actually a value + instead of a type.""") + + +_overload_dummy = typing._overload_dummy + + +if hasattr(typing, "get_overloads"): # 3.11+ + overload = typing.overload + get_overloads = typing.get_overloads + clear_overloads = typing.clear_overloads +else: + # {module: {qualname: {firstlineno: func}}} + _overload_registry = collections.defaultdict( + functools.partial(collections.defaultdict, dict) + ) + + def overload(func): + """Decorator for overloaded functions/methods. + + In a stub file, place two or more stub definitions for the same + function in a row, each decorated with @overload. For example: + + @overload + def utf8(value: None) -> None: ... + @overload + def utf8(value: bytes) -> bytes: ... + @overload + def utf8(value: str) -> bytes: ... + + In a non-stub file (i.e. a regular .py file), do the same but + follow it with an implementation. The implementation should *not* + be decorated with @overload. For example: + + @overload + def utf8(value: None) -> None: ... + @overload + def utf8(value: bytes) -> bytes: ... + @overload + def utf8(value: str) -> bytes: ... + def utf8(value): + # implementation goes here + + The overloads for a function can be retrieved at runtime using the + get_overloads() function. + """ + # classmethod and staticmethod + f = getattr(func, "__func__", func) + try: + _overload_registry[f.__module__][f.__qualname__][ + f.__code__.co_firstlineno + ] = func + except AttributeError: + # Not a normal function; ignore. + pass + return _overload_dummy + + def get_overloads(func): + """Return all defined overloads for *func* as a sequence.""" + # classmethod and staticmethod + f = getattr(func, "__func__", func) + if f.__module__ not in _overload_registry: + return [] + mod_dict = _overload_registry[f.__module__] + if f.__qualname__ not in mod_dict: + return [] + return list(mod_dict[f.__qualname__].values()) + + def clear_overloads(): + """Clear all overloads in the registry.""" + _overload_registry.clear() + + +# This is not a real generic class. Don't use outside annotations. +Type = typing.Type + +# Various ABCs mimicking those in collections.abc. +# A few are simply re-exported for completeness. +Awaitable = typing.Awaitable +Coroutine = typing.Coroutine +AsyncIterable = typing.AsyncIterable +AsyncIterator = typing.AsyncIterator +Deque = typing.Deque +DefaultDict = typing.DefaultDict +OrderedDict = typing.OrderedDict +Counter = typing.Counter +ChainMap = typing.ChainMap +Text = typing.Text +TYPE_CHECKING = typing.TYPE_CHECKING + + +if sys.version_info >= (3, 13, 0, "beta"): + from typing import AsyncContextManager, AsyncGenerator, ContextManager, Generator +else: + def _is_dunder(attr): + return attr.startswith('__') and attr.endswith('__') + + # Python <3.9 doesn't have typing._SpecialGenericAlias + _special_generic_alias_base = getattr( + typing, "_SpecialGenericAlias", typing._GenericAlias + ) + + class _SpecialGenericAlias(_special_generic_alias_base, _root=True): + def __init__(self, origin, nparams, *, inst=True, name=None, defaults=()): + if _special_generic_alias_base is typing._GenericAlias: + # Python <3.9 + self.__origin__ = origin + self._nparams = nparams + super().__init__(origin, nparams, special=True, inst=inst, name=name) + else: + # Python >= 3.9 + super().__init__(origin, nparams, inst=inst, name=name) + self._defaults = defaults + + def __setattr__(self, attr, val): + allowed_attrs = {'_name', '_inst', '_nparams', '_defaults'} + if _special_generic_alias_base is typing._GenericAlias: + # Python <3.9 + allowed_attrs.add("__origin__") + if _is_dunder(attr) or attr in allowed_attrs: + object.__setattr__(self, attr, val) + else: + setattr(self.__origin__, attr, val) + + @typing._tp_cache + def __getitem__(self, params): + if not isinstance(params, tuple): + params = (params,) + msg = "Parameters to generic types must be types." + params = tuple(typing._type_check(p, msg) for p in params) + if ( + self._defaults + and len(params) < self._nparams + and len(params) + len(self._defaults) >= self._nparams + ): + params = (*params, *self._defaults[len(params) - self._nparams:]) + actual_len = len(params) + + if actual_len != self._nparams: + if self._defaults: + expected = f"at least {self._nparams - len(self._defaults)}" + else: + expected = str(self._nparams) + if not self._nparams: + raise TypeError(f"{self} is not a generic class") + raise TypeError( + f"Too {'many' if actual_len > self._nparams else 'few'}" + f" arguments for {self};" + f" actual {actual_len}, expected {expected}" + ) + return self.copy_with(params) + + _NoneType = type(None) + Generator = _SpecialGenericAlias( + collections.abc.Generator, 3, defaults=(_NoneType, _NoneType) + ) + AsyncGenerator = _SpecialGenericAlias( + collections.abc.AsyncGenerator, 2, defaults=(_NoneType,) + ) + ContextManager = _SpecialGenericAlias( + contextlib.AbstractContextManager, + 2, + name="ContextManager", + defaults=(typing.Optional[bool],) + ) + AsyncContextManager = _SpecialGenericAlias( + contextlib.AbstractAsyncContextManager, + 2, + name="AsyncContextManager", + defaults=(typing.Optional[bool],) + ) + + +_PROTO_ALLOWLIST = { + 'collections.abc': [ + 'Callable', 'Awaitable', 'Iterable', 'Iterator', 'AsyncIterable', + 'Hashable', 'Sized', 'Container', 'Collection', 'Reversible', 'Buffer', + ], + 'contextlib': ['AbstractContextManager', 'AbstractAsyncContextManager'], + 'typing_extensions': ['Buffer'], +} + + +_EXCLUDED_ATTRS = frozenset(typing.EXCLUDED_ATTRIBUTES) | { + "__match_args__", "__protocol_attrs__", "__non_callable_proto_members__", + "__final__", +} + + +def _get_protocol_attrs(cls): + attrs = set() + for base in cls.__mro__[:-1]: # without object + if base.__name__ in {'Protocol', 'Generic'}: + continue + annotations = getattr(base, '__annotations__', {}) + for attr in (*base.__dict__, *annotations): + if (not attr.startswith('_abc_') and attr not in _EXCLUDED_ATTRS): + attrs.add(attr) + return attrs + + +def _caller(depth=2): + try: + return sys._getframe(depth).f_globals.get('__name__', '__main__') + except (AttributeError, ValueError): # For platforms without _getframe() + return None + + +# `__match_args__` attribute was removed from protocol members in 3.13, +# we want to backport this change to older Python versions. +if sys.version_info >= (3, 13): + Protocol = typing.Protocol +else: + def _allow_reckless_class_checks(depth=3): + """Allow instance and class checks for special stdlib modules. + The abc and functools modules indiscriminately call isinstance() and + issubclass() on the whole MRO of a user class, which may contain protocols. + """ + return _caller(depth) in {'abc', 'functools', None} + + def _no_init(self, *args, **kwargs): + if type(self)._is_protocol: + raise TypeError('Protocols cannot be instantiated') + + def _type_check_issubclass_arg_1(arg): + """Raise TypeError if `arg` is not an instance of `type` + in `issubclass(arg, )`. + + In most cases, this is verified by type.__subclasscheck__. + Checking it again unnecessarily would slow down issubclass() checks, + so, we don't perform this check unless we absolutely have to. + + For various error paths, however, + we want to ensure that *this* error message is shown to the user + where relevant, rather than a typing.py-specific error message. + """ + if not isinstance(arg, type): + # Same error message as for issubclass(1, int). + raise TypeError('issubclass() arg 1 must be a class') + + # Inheriting from typing._ProtocolMeta isn't actually desirable, + # but is necessary to allow typing.Protocol and typing_extensions.Protocol + # to mix without getting TypeErrors about "metaclass conflict" + class _ProtocolMeta(type(typing.Protocol)): + # This metaclass is somewhat unfortunate, + # but is necessary for several reasons... + # + # NOTE: DO NOT call super() in any methods in this class + # That would call the methods on typing._ProtocolMeta on Python 3.8-3.11 + # and those are slow + def __new__(mcls, name, bases, namespace, **kwargs): + if name == "Protocol" and len(bases) < 2: + pass + elif {Protocol, typing.Protocol} & set(bases): + for base in bases: + if not ( + base in {object, typing.Generic, Protocol, typing.Protocol} + or base.__name__ in _PROTO_ALLOWLIST.get(base.__module__, []) + or is_protocol(base) + ): + raise TypeError( + f"Protocols can only inherit from other protocols, " + f"got {base!r}" + ) + return abc.ABCMeta.__new__(mcls, name, bases, namespace, **kwargs) + + def __init__(cls, *args, **kwargs): + abc.ABCMeta.__init__(cls, *args, **kwargs) + if getattr(cls, "_is_protocol", False): + cls.__protocol_attrs__ = _get_protocol_attrs(cls) + + def __subclasscheck__(cls, other): + if cls is Protocol: + return type.__subclasscheck__(cls, other) + if ( + getattr(cls, '_is_protocol', False) + and not _allow_reckless_class_checks() + ): + if not getattr(cls, '_is_runtime_protocol', False): + _type_check_issubclass_arg_1(other) + raise TypeError( + "Instance and class checks can only be used with " + "@runtime_checkable protocols" + ) + if ( + # this attribute is set by @runtime_checkable: + cls.__non_callable_proto_members__ + and cls.__dict__.get("__subclasshook__") is _proto_hook + ): + _type_check_issubclass_arg_1(other) + non_method_attrs = sorted(cls.__non_callable_proto_members__) + raise TypeError( + "Protocols with non-method members don't support issubclass()." + f" Non-method members: {str(non_method_attrs)[1:-1]}." + ) + return abc.ABCMeta.__subclasscheck__(cls, other) + + def __instancecheck__(cls, instance): + # We need this method for situations where attributes are + # assigned in __init__. + if cls is Protocol: + return type.__instancecheck__(cls, instance) + if not getattr(cls, "_is_protocol", False): + # i.e., it's a concrete subclass of a protocol + return abc.ABCMeta.__instancecheck__(cls, instance) + + if ( + not getattr(cls, '_is_runtime_protocol', False) and + not _allow_reckless_class_checks() + ): + raise TypeError("Instance and class checks can only be used with" + " @runtime_checkable protocols") + + if abc.ABCMeta.__instancecheck__(cls, instance): + return True + + for attr in cls.__protocol_attrs__: + try: + val = inspect.getattr_static(instance, attr) + except AttributeError: + break + # this attribute is set by @runtime_checkable: + if val is None and attr not in cls.__non_callable_proto_members__: + break + else: + return True + + return False + + def __eq__(cls, other): + # Hack so that typing.Generic.__class_getitem__ + # treats typing_extensions.Protocol + # as equivalent to typing.Protocol + if abc.ABCMeta.__eq__(cls, other) is True: + return True + return cls is Protocol and other is typing.Protocol + + # This has to be defined, or the abc-module cache + # complains about classes with this metaclass being unhashable, + # if we define only __eq__! + def __hash__(cls) -> int: + return type.__hash__(cls) + + @classmethod + def _proto_hook(cls, other): + if not cls.__dict__.get('_is_protocol', False): + return NotImplemented + + for attr in cls.__protocol_attrs__: + for base in other.__mro__: + # Check if the members appears in the class dictionary... + if attr in base.__dict__: + if base.__dict__[attr] is None: + return NotImplemented + break + + # ...or in annotations, if it is a sub-protocol. + annotations = getattr(base, '__annotations__', {}) + if ( + isinstance(annotations, collections.abc.Mapping) + and attr in annotations + and is_protocol(other) + ): + break + else: + return NotImplemented + return True + + class Protocol(typing.Generic, metaclass=_ProtocolMeta): + __doc__ = typing.Protocol.__doc__ + __slots__ = () + _is_protocol = True + _is_runtime_protocol = False + + def __init_subclass__(cls, *args, **kwargs): + super().__init_subclass__(*args, **kwargs) + + # Determine if this is a protocol or a concrete subclass. + if not cls.__dict__.get('_is_protocol', False): + cls._is_protocol = any(b is Protocol for b in cls.__bases__) + + # Set (or override) the protocol subclass hook. + if '__subclasshook__' not in cls.__dict__: + cls.__subclasshook__ = _proto_hook + + # Prohibit instantiation for protocol classes + if cls._is_protocol and cls.__init__ is Protocol.__init__: + cls.__init__ = _no_init + + +if sys.version_info >= (3, 13): + runtime_checkable = typing.runtime_checkable +else: + def runtime_checkable(cls): + """Mark a protocol class as a runtime protocol. + + Such protocol can be used with isinstance() and issubclass(). + Raise TypeError if applied to a non-protocol class. + This allows a simple-minded structural check very similar to + one trick ponies in collections.abc such as Iterable. + + For example:: + + @runtime_checkable + class Closable(Protocol): + def close(self): ... + + assert isinstance(open('/some/file'), Closable) + + Warning: this will check only the presence of the required methods, + not their type signatures! + """ + if not issubclass(cls, typing.Generic) or not getattr(cls, '_is_protocol', False): + raise TypeError(f'@runtime_checkable can be only applied to protocol classes,' + f' got {cls!r}') + cls._is_runtime_protocol = True + + # typing.Protocol classes on <=3.11 break if we execute this block, + # because typing.Protocol classes on <=3.11 don't have a + # `__protocol_attrs__` attribute, and this block relies on the + # `__protocol_attrs__` attribute. Meanwhile, typing.Protocol classes on 3.12.2+ + # break if we *don't* execute this block, because *they* assume that all + # protocol classes have a `__non_callable_proto_members__` attribute + # (which this block sets) + if isinstance(cls, _ProtocolMeta) or sys.version_info >= (3, 12, 2): + # PEP 544 prohibits using issubclass() + # with protocols that have non-method members. + # See gh-113320 for why we compute this attribute here, + # rather than in `_ProtocolMeta.__init__` + cls.__non_callable_proto_members__ = set() + for attr in cls.__protocol_attrs__: + try: + is_callable = callable(getattr(cls, attr, None)) + except Exception as e: + raise TypeError( + f"Failed to determine whether protocol member {attr!r} " + "is a method member" + ) from e + else: + if not is_callable: + cls.__non_callable_proto_members__.add(attr) + + return cls + + +# The "runtime" alias exists for backwards compatibility. +runtime = runtime_checkable + + +# Our version of runtime-checkable protocols is faster on Python 3.8-3.11 +if sys.version_info >= (3, 12): + SupportsInt = typing.SupportsInt + SupportsFloat = typing.SupportsFloat + SupportsComplex = typing.SupportsComplex + SupportsBytes = typing.SupportsBytes + SupportsIndex = typing.SupportsIndex + SupportsAbs = typing.SupportsAbs + SupportsRound = typing.SupportsRound +else: + @runtime_checkable + class SupportsInt(Protocol): + """An ABC with one abstract method __int__.""" + __slots__ = () + + @abc.abstractmethod + def __int__(self) -> int: + pass + + @runtime_checkable + class SupportsFloat(Protocol): + """An ABC with one abstract method __float__.""" + __slots__ = () + + @abc.abstractmethod + def __float__(self) -> float: + pass + + @runtime_checkable + class SupportsComplex(Protocol): + """An ABC with one abstract method __complex__.""" + __slots__ = () + + @abc.abstractmethod + def __complex__(self) -> complex: + pass + + @runtime_checkable + class SupportsBytes(Protocol): + """An ABC with one abstract method __bytes__.""" + __slots__ = () + + @abc.abstractmethod + def __bytes__(self) -> bytes: + pass + + @runtime_checkable + class SupportsIndex(Protocol): + __slots__ = () + + @abc.abstractmethod + def __index__(self) -> int: + pass + + @runtime_checkable + class SupportsAbs(Protocol[T_co]): + """ + An ABC with one abstract method __abs__ that is covariant in its return type. + """ + __slots__ = () + + @abc.abstractmethod + def __abs__(self) -> T_co: + pass + + @runtime_checkable + class SupportsRound(Protocol[T_co]): + """ + An ABC with one abstract method __round__ that is covariant in its return type. + """ + __slots__ = () + + @abc.abstractmethod + def __round__(self, ndigits: int = 0) -> T_co: + pass + + +def _ensure_subclassable(mro_entries): + def inner(func): + if sys.implementation.name == "pypy" and sys.version_info < (3, 9): + cls_dict = { + "__call__": staticmethod(func), + "__mro_entries__": staticmethod(mro_entries) + } + t = type(func.__name__, (), cls_dict) + return functools.update_wrapper(t(), func) + else: + func.__mro_entries__ = mro_entries + return func + return inner + + +# Update this to something like >=3.13.0b1 if and when +# PEP 728 is implemented in CPython +_PEP_728_IMPLEMENTED = False + +if _PEP_728_IMPLEMENTED: + # The standard library TypedDict in Python 3.8 does not store runtime information + # about which (if any) keys are optional. See https://bugs.python.org/issue38834 + # The standard library TypedDict in Python 3.9.0/1 does not honour the "total" + # keyword with old-style TypedDict(). See https://bugs.python.org/issue42059 + # The standard library TypedDict below Python 3.11 does not store runtime + # information about optional and required keys when using Required or NotRequired. + # Generic TypedDicts are also impossible using typing.TypedDict on Python <3.11. + # Aaaand on 3.12 we add __orig_bases__ to TypedDict + # to enable better runtime introspection. + # On 3.13 we deprecate some odd ways of creating TypedDicts. + # Also on 3.13, PEP 705 adds the ReadOnly[] qualifier. + # PEP 728 (still pending) makes more changes. + TypedDict = typing.TypedDict + _TypedDictMeta = typing._TypedDictMeta + is_typeddict = typing.is_typeddict +else: + # 3.10.0 and later + _TAKES_MODULE = "module" in inspect.signature(typing._type_check).parameters + + def _get_typeddict_qualifiers(annotation_type): + while True: + annotation_origin = get_origin(annotation_type) + if annotation_origin is Annotated: + annotation_args = get_args(annotation_type) + if annotation_args: + annotation_type = annotation_args[0] + else: + break + elif annotation_origin is Required: + yield Required + annotation_type, = get_args(annotation_type) + elif annotation_origin is NotRequired: + yield NotRequired + annotation_type, = get_args(annotation_type) + elif annotation_origin is ReadOnly: + yield ReadOnly + annotation_type, = get_args(annotation_type) + else: + break + + class _TypedDictMeta(type): + def __new__(cls, name, bases, ns, *, total=True, closed=False): + """Create new typed dict class object. + + This method is called when TypedDict is subclassed, + or when TypedDict is instantiated. This way + TypedDict supports all three syntax forms described in its docstring. + Subclasses and instances of TypedDict return actual dictionaries. + """ + for base in bases: + if type(base) is not _TypedDictMeta and base is not typing.Generic: + raise TypeError('cannot inherit from both a TypedDict type ' + 'and a non-TypedDict base class') + + if any(issubclass(b, typing.Generic) for b in bases): + generic_base = (typing.Generic,) + else: + generic_base = () + + # typing.py generally doesn't let you inherit from plain Generic, unless + # the name of the class happens to be "Protocol" + tp_dict = type.__new__(_TypedDictMeta, "Protocol", (*generic_base, dict), ns) + tp_dict.__name__ = name + if tp_dict.__qualname__ == "Protocol": + tp_dict.__qualname__ = name + + if not hasattr(tp_dict, '__orig_bases__'): + tp_dict.__orig_bases__ = bases + + annotations = {} + if "__annotations__" in ns: + own_annotations = ns["__annotations__"] + elif "__annotate__" in ns: + # TODO: Use inspect.VALUE here, and make the annotations lazily evaluated + own_annotations = ns["__annotate__"](1) + else: + own_annotations = {} + msg = "TypedDict('Name', {f0: t0, f1: t1, ...}); each t must be a type" + if _TAKES_MODULE: + own_annotations = { + n: typing._type_check(tp, msg, module=tp_dict.__module__) + for n, tp in own_annotations.items() + } + else: + own_annotations = { + n: typing._type_check(tp, msg) + for n, tp in own_annotations.items() + } + required_keys = set() + optional_keys = set() + readonly_keys = set() + mutable_keys = set() + extra_items_type = None + + for base in bases: + base_dict = base.__dict__ + + annotations.update(base_dict.get('__annotations__', {})) + required_keys.update(base_dict.get('__required_keys__', ())) + optional_keys.update(base_dict.get('__optional_keys__', ())) + readonly_keys.update(base_dict.get('__readonly_keys__', ())) + mutable_keys.update(base_dict.get('__mutable_keys__', ())) + base_extra_items_type = base_dict.get('__extra_items__', None) + if base_extra_items_type is not None: + extra_items_type = base_extra_items_type + + if closed and extra_items_type is None: + extra_items_type = Never + if closed and "__extra_items__" in own_annotations: + annotation_type = own_annotations.pop("__extra_items__") + qualifiers = set(_get_typeddict_qualifiers(annotation_type)) + if Required in qualifiers: + raise TypeError( + "Special key __extra_items__ does not support " + "Required" + ) + if NotRequired in qualifiers: + raise TypeError( + "Special key __extra_items__ does not support " + "NotRequired" + ) + extra_items_type = annotation_type + + annotations.update(own_annotations) + for annotation_key, annotation_type in own_annotations.items(): + qualifiers = set(_get_typeddict_qualifiers(annotation_type)) + + if Required in qualifiers: + required_keys.add(annotation_key) + elif NotRequired in qualifiers: + optional_keys.add(annotation_key) + elif total: + required_keys.add(annotation_key) + else: + optional_keys.add(annotation_key) + if ReadOnly in qualifiers: + mutable_keys.discard(annotation_key) + readonly_keys.add(annotation_key) + else: + mutable_keys.add(annotation_key) + readonly_keys.discard(annotation_key) + + tp_dict.__annotations__ = annotations + tp_dict.__required_keys__ = frozenset(required_keys) + tp_dict.__optional_keys__ = frozenset(optional_keys) + tp_dict.__readonly_keys__ = frozenset(readonly_keys) + tp_dict.__mutable_keys__ = frozenset(mutable_keys) + if not hasattr(tp_dict, '__total__'): + tp_dict.__total__ = total + tp_dict.__closed__ = closed + tp_dict.__extra_items__ = extra_items_type + return tp_dict + + __call__ = dict # static method + + def __subclasscheck__(cls, other): + # Typed dicts are only for static structural subtyping. + raise TypeError('TypedDict does not support instance and class checks') + + __instancecheck__ = __subclasscheck__ + + _TypedDict = type.__new__(_TypedDictMeta, 'TypedDict', (), {}) + + @_ensure_subclassable(lambda bases: (_TypedDict,)) + def TypedDict(typename, fields=_marker, /, *, total=True, closed=False, **kwargs): + """A simple typed namespace. At runtime it is equivalent to a plain dict. + + TypedDict creates a dictionary type such that a type checker will expect all + instances to have a certain set of keys, where each key is + associated with a value of a consistent type. This expectation + is not checked at runtime. + + Usage:: + + class Point2D(TypedDict): + x: int + y: int + label: str + + a: Point2D = {'x': 1, 'y': 2, 'label': 'good'} # OK + b: Point2D = {'z': 3, 'label': 'bad'} # Fails type check + + assert Point2D(x=1, y=2, label='first') == dict(x=1, y=2, label='first') + + The type info can be accessed via the Point2D.__annotations__ dict, and + the Point2D.__required_keys__ and Point2D.__optional_keys__ frozensets. + TypedDict supports an additional equivalent form:: + + Point2D = TypedDict('Point2D', {'x': int, 'y': int, 'label': str}) + + By default, all keys must be present in a TypedDict. It is possible + to override this by specifying totality:: + + class Point2D(TypedDict, total=False): + x: int + y: int + + This means that a Point2D TypedDict can have any of the keys omitted. A type + checker is only expected to support a literal False or True as the value of + the total argument. True is the default, and makes all items defined in the + class body be required. + + The Required and NotRequired special forms can also be used to mark + individual keys as being required or not required:: + + class Point2D(TypedDict): + x: int # the "x" key must always be present (Required is the default) + y: NotRequired[int] # the "y" key can be omitted + + See PEP 655 for more details on Required and NotRequired. + """ + if fields is _marker or fields is None: + if fields is _marker: + deprecated_thing = "Failing to pass a value for the 'fields' parameter" + else: + deprecated_thing = "Passing `None` as the 'fields' parameter" + + example = f"`{typename} = TypedDict({typename!r}, {{}})`" + deprecation_msg = ( + f"{deprecated_thing} is deprecated and will be disallowed in " + "Python 3.15. To create a TypedDict class with 0 fields " + "using the functional syntax, pass an empty dictionary, e.g. " + ) + example + "." + warnings.warn(deprecation_msg, DeprecationWarning, stacklevel=2) + if closed is not False and closed is not True: + kwargs["closed"] = closed + closed = False + fields = kwargs + elif kwargs: + raise TypeError("TypedDict takes either a dict or keyword arguments," + " but not both") + if kwargs: + if sys.version_info >= (3, 13): + raise TypeError("TypedDict takes no keyword arguments") + warnings.warn( + "The kwargs-based syntax for TypedDict definitions is deprecated " + "in Python 3.11, will be removed in Python 3.13, and may not be " + "understood by third-party type checkers.", + DeprecationWarning, + stacklevel=2, + ) + + ns = {'__annotations__': dict(fields)} + module = _caller() + if module is not None: + # Setting correct module is necessary to make typed dict classes pickleable. + ns['__module__'] = module + + td = _TypedDictMeta(typename, (), ns, total=total, closed=closed) + td.__orig_bases__ = (TypedDict,) + return td + + if hasattr(typing, "_TypedDictMeta"): + _TYPEDDICT_TYPES = (typing._TypedDictMeta, _TypedDictMeta) + else: + _TYPEDDICT_TYPES = (_TypedDictMeta,) + + def is_typeddict(tp): + """Check if an annotation is a TypedDict class + + For example:: + class Film(TypedDict): + title: str + year: int + + is_typeddict(Film) # => True + is_typeddict(Union[list, str]) # => False + """ + # On 3.8, this would otherwise return True + if hasattr(typing, "TypedDict") and tp is typing.TypedDict: + return False + return isinstance(tp, _TYPEDDICT_TYPES) + + +if hasattr(typing, "assert_type"): + assert_type = typing.assert_type + +else: + def assert_type(val, typ, /): + """Assert (to the type checker) that the value is of the given type. + + When the type checker encounters a call to assert_type(), it + emits an error if the value is not of the specified type:: + + def greet(name: str) -> None: + assert_type(name, str) # ok + assert_type(name, int) # type checker error + + At runtime this returns the first argument unchanged and otherwise + does nothing. + """ + return val + + +if hasattr(typing, "ReadOnly"): # 3.13+ + get_type_hints = typing.get_type_hints +else: # <=3.13 + # replaces _strip_annotations() + def _strip_extras(t): + """Strips Annotated, Required and NotRequired from a given type.""" + if isinstance(t, _AnnotatedAlias): + return _strip_extras(t.__origin__) + if hasattr(t, "__origin__") and t.__origin__ in (Required, NotRequired, ReadOnly): + return _strip_extras(t.__args__[0]) + if isinstance(t, typing._GenericAlias): + stripped_args = tuple(_strip_extras(a) for a in t.__args__) + if stripped_args == t.__args__: + return t + return t.copy_with(stripped_args) + if hasattr(_types, "GenericAlias") and isinstance(t, _types.GenericAlias): + stripped_args = tuple(_strip_extras(a) for a in t.__args__) + if stripped_args == t.__args__: + return t + return _types.GenericAlias(t.__origin__, stripped_args) + if hasattr(_types, "UnionType") and isinstance(t, _types.UnionType): + stripped_args = tuple(_strip_extras(a) for a in t.__args__) + if stripped_args == t.__args__: + return t + return functools.reduce(operator.or_, stripped_args) + + return t + + def get_type_hints(obj, globalns=None, localns=None, include_extras=False): + """Return type hints for an object. + + This is often the same as obj.__annotations__, but it handles + forward references encoded as string literals, adds Optional[t] if a + default value equal to None is set and recursively replaces all + 'Annotated[T, ...]', 'Required[T]' or 'NotRequired[T]' with 'T' + (unless 'include_extras=True'). + + The argument may be a module, class, method, or function. The annotations + are returned as a dictionary. For classes, annotations include also + inherited members. + + TypeError is raised if the argument is not of a type that can contain + annotations, and an empty dictionary is returned if no annotations are + present. + + BEWARE -- the behavior of globalns and localns is counterintuitive + (unless you are familiar with how eval() and exec() work). The + search order is locals first, then globals. + + - If no dict arguments are passed, an attempt is made to use the + globals from obj (or the respective module's globals for classes), + and these are also used as the locals. If the object does not appear + to have globals, an empty dictionary is used. + + - If one dict argument is passed, it is used for both globals and + locals. + + - If two dict arguments are passed, they specify globals and + locals, respectively. + """ + if hasattr(typing, "Annotated"): # 3.9+ + hint = typing.get_type_hints( + obj, globalns=globalns, localns=localns, include_extras=True + ) + else: # 3.8 + hint = typing.get_type_hints(obj, globalns=globalns, localns=localns) + if include_extras: + return hint + return {k: _strip_extras(t) for k, t in hint.items()} + + +# Python 3.9+ has PEP 593 (Annotated) +if hasattr(typing, 'Annotated'): + Annotated = typing.Annotated + # Not exported and not a public API, but needed for get_origin() and get_args() + # to work. + _AnnotatedAlias = typing._AnnotatedAlias +# 3.8 +else: + class _AnnotatedAlias(typing._GenericAlias, _root=True): + """Runtime representation of an annotated type. + + At its core 'Annotated[t, dec1, dec2, ...]' is an alias for the type 't' + with extra annotations. The alias behaves like a normal typing alias, + instantiating is the same as instantiating the underlying type, binding + it to types is also the same. + """ + def __init__(self, origin, metadata): + if isinstance(origin, _AnnotatedAlias): + metadata = origin.__metadata__ + metadata + origin = origin.__origin__ + super().__init__(origin, origin) + self.__metadata__ = metadata + + def copy_with(self, params): + assert len(params) == 1 + new_type = params[0] + return _AnnotatedAlias(new_type, self.__metadata__) + + def __repr__(self): + return (f"typing_extensions.Annotated[{typing._type_repr(self.__origin__)}, " + f"{', '.join(repr(a) for a in self.__metadata__)}]") + + def __reduce__(self): + return operator.getitem, ( + Annotated, (self.__origin__, *self.__metadata__) + ) + + def __eq__(self, other): + if not isinstance(other, _AnnotatedAlias): + return NotImplemented + if self.__origin__ != other.__origin__: + return False + return self.__metadata__ == other.__metadata__ + + def __hash__(self): + return hash((self.__origin__, self.__metadata__)) + + class Annotated: + """Add context specific metadata to a type. + + Example: Annotated[int, runtime_check.Unsigned] indicates to the + hypothetical runtime_check module that this type is an unsigned int. + Every other consumer of this type can ignore this metadata and treat + this type as int. + + The first argument to Annotated must be a valid type (and will be in + the __origin__ field), the remaining arguments are kept as a tuple in + the __extra__ field. + + Details: + + - It's an error to call `Annotated` with less than two arguments. + - Nested Annotated are flattened:: + + Annotated[Annotated[T, Ann1, Ann2], Ann3] == Annotated[T, Ann1, Ann2, Ann3] + + - Instantiating an annotated type is equivalent to instantiating the + underlying type:: + + Annotated[C, Ann1](5) == C(5) + + - Annotated can be used as a generic type alias:: + + Optimized = Annotated[T, runtime.Optimize()] + Optimized[int] == Annotated[int, runtime.Optimize()] + + OptimizedList = Annotated[List[T], runtime.Optimize()] + OptimizedList[int] == Annotated[List[int], runtime.Optimize()] + """ + + __slots__ = () + + def __new__(cls, *args, **kwargs): + raise TypeError("Type Annotated cannot be instantiated.") + + @typing._tp_cache + def __class_getitem__(cls, params): + if not isinstance(params, tuple) or len(params) < 2: + raise TypeError("Annotated[...] should be used " + "with at least two arguments (a type and an " + "annotation).") + allowed_special_forms = (ClassVar, Final) + if get_origin(params[0]) in allowed_special_forms: + origin = params[0] + else: + msg = "Annotated[t, ...]: t must be a type." + origin = typing._type_check(params[0], msg) + metadata = tuple(params[1:]) + return _AnnotatedAlias(origin, metadata) + + def __init_subclass__(cls, *args, **kwargs): + raise TypeError( + f"Cannot subclass {cls.__module__}.Annotated" + ) + +# Python 3.8 has get_origin() and get_args() but those implementations aren't +# Annotated-aware, so we can't use those. Python 3.9's versions don't support +# ParamSpecArgs and ParamSpecKwargs, so only Python 3.10's versions will do. +if sys.version_info[:2] >= (3, 10): + get_origin = typing.get_origin + get_args = typing.get_args +# 3.8-3.9 +else: + try: + # 3.9+ + from typing import _BaseGenericAlias + except ImportError: + _BaseGenericAlias = typing._GenericAlias + try: + # 3.9+ + from typing import GenericAlias as _typing_GenericAlias + except ImportError: + _typing_GenericAlias = typing._GenericAlias + + def get_origin(tp): + """Get the unsubscripted version of a type. + + This supports generic types, Callable, Tuple, Union, Literal, Final, ClassVar + and Annotated. Return None for unsupported types. Examples:: + + get_origin(Literal[42]) is Literal + get_origin(int) is None + get_origin(ClassVar[int]) is ClassVar + get_origin(Generic) is Generic + get_origin(Generic[T]) is Generic + get_origin(Union[T, int]) is Union + get_origin(List[Tuple[T, T]][int]) == list + get_origin(P.args) is P + """ + if isinstance(tp, _AnnotatedAlias): + return Annotated + if isinstance(tp, (typing._GenericAlias, _typing_GenericAlias, _BaseGenericAlias, + ParamSpecArgs, ParamSpecKwargs)): + return tp.__origin__ + if tp is typing.Generic: + return typing.Generic + return None + + def get_args(tp): + """Get type arguments with all substitutions performed. + + For unions, basic simplifications used by Union constructor are performed. + Examples:: + get_args(Dict[str, int]) == (str, int) + get_args(int) == () + get_args(Union[int, Union[T, int], str][int]) == (int, str) + get_args(Union[int, Tuple[T, int]][str]) == (int, Tuple[str, int]) + get_args(Callable[[], T][int]) == ([], int) + """ + if isinstance(tp, _AnnotatedAlias): + return (tp.__origin__, *tp.__metadata__) + if isinstance(tp, (typing._GenericAlias, _typing_GenericAlias)): + if getattr(tp, "_special", False): + return () + res = tp.__args__ + if get_origin(tp) is collections.abc.Callable and res[0] is not Ellipsis: + res = (list(res[:-1]), res[-1]) + return res + return () + + +# 3.10+ +if hasattr(typing, 'TypeAlias'): + TypeAlias = typing.TypeAlias +# 3.9 +elif sys.version_info[:2] >= (3, 9): + @_ExtensionsSpecialForm + def TypeAlias(self, parameters): + """Special marker indicating that an assignment should + be recognized as a proper type alias definition by type + checkers. + + For example:: + + Predicate: TypeAlias = Callable[..., bool] + + It's invalid when used anywhere except as in the example above. + """ + raise TypeError(f"{self} is not subscriptable") +# 3.8 +else: + TypeAlias = _ExtensionsSpecialForm( + 'TypeAlias', + doc="""Special marker indicating that an assignment should + be recognized as a proper type alias definition by type + checkers. + + For example:: + + Predicate: TypeAlias = Callable[..., bool] + + It's invalid when used anywhere except as in the example + above.""" + ) + + +if hasattr(typing, "NoDefault"): + NoDefault = typing.NoDefault +else: + class NoDefaultTypeMeta(type): + def __setattr__(cls, attr, value): + # TypeError is consistent with the behavior of NoneType + raise TypeError( + f"cannot set {attr!r} attribute of immutable type {cls.__name__!r}" + ) + + class NoDefaultType(metaclass=NoDefaultTypeMeta): + """The type of the NoDefault singleton.""" + + __slots__ = () + + def __new__(cls): + return globals().get("NoDefault") or object.__new__(cls) + + def __repr__(self): + return "typing_extensions.NoDefault" + + def __reduce__(self): + return "NoDefault" + + NoDefault = NoDefaultType() + del NoDefaultType, NoDefaultTypeMeta + + +def _set_default(type_param, default): + type_param.has_default = lambda: default is not NoDefault + type_param.__default__ = default + + +def _set_module(typevarlike): + # for pickling: + def_mod = _caller(depth=3) + if def_mod != 'typing_extensions': + typevarlike.__module__ = def_mod + + +class _DefaultMixin: + """Mixin for TypeVarLike defaults.""" + + __slots__ = () + __init__ = _set_default + + +# Classes using this metaclass must provide a _backported_typevarlike ClassVar +class _TypeVarLikeMeta(type): + def __instancecheck__(cls, __instance: Any) -> bool: + return isinstance(__instance, cls._backported_typevarlike) + + +if _PEP_696_IMPLEMENTED: + from typing import TypeVar +else: + # Add default and infer_variance parameters from PEP 696 and 695 + class TypeVar(metaclass=_TypeVarLikeMeta): + """Type variable.""" + + _backported_typevarlike = typing.TypeVar + + def __new__(cls, name, *constraints, bound=None, + covariant=False, contravariant=False, + default=NoDefault, infer_variance=False): + if hasattr(typing, "TypeAliasType"): + # PEP 695 implemented (3.12+), can pass infer_variance to typing.TypeVar + typevar = typing.TypeVar(name, *constraints, bound=bound, + covariant=covariant, contravariant=contravariant, + infer_variance=infer_variance) + else: + typevar = typing.TypeVar(name, *constraints, bound=bound, + covariant=covariant, contravariant=contravariant) + if infer_variance and (covariant or contravariant): + raise ValueError("Variance cannot be specified with infer_variance.") + typevar.__infer_variance__ = infer_variance + + _set_default(typevar, default) + _set_module(typevar) + + def _tvar_prepare_subst(alias, args): + if ( + typevar.has_default() + and alias.__parameters__.index(typevar) == len(args) + ): + args += (typevar.__default__,) + return args + + typevar.__typing_prepare_subst__ = _tvar_prepare_subst + return typevar + + def __init_subclass__(cls) -> None: + raise TypeError(f"type '{__name__}.TypeVar' is not an acceptable base type") + + +# Python 3.10+ has PEP 612 +if hasattr(typing, 'ParamSpecArgs'): + ParamSpecArgs = typing.ParamSpecArgs + ParamSpecKwargs = typing.ParamSpecKwargs +# 3.8-3.9 +else: + class _Immutable: + """Mixin to indicate that object should not be copied.""" + __slots__ = () + + def __copy__(self): + return self + + def __deepcopy__(self, memo): + return self + + class ParamSpecArgs(_Immutable): + """The args for a ParamSpec object. + + Given a ParamSpec object P, P.args is an instance of ParamSpecArgs. + + ParamSpecArgs objects have a reference back to their ParamSpec: + + P.args.__origin__ is P + + This type is meant for runtime introspection and has no special meaning to + static type checkers. + """ + def __init__(self, origin): + self.__origin__ = origin + + def __repr__(self): + return f"{self.__origin__.__name__}.args" + + def __eq__(self, other): + if not isinstance(other, ParamSpecArgs): + return NotImplemented + return self.__origin__ == other.__origin__ + + class ParamSpecKwargs(_Immutable): + """The kwargs for a ParamSpec object. + + Given a ParamSpec object P, P.kwargs is an instance of ParamSpecKwargs. + + ParamSpecKwargs objects have a reference back to their ParamSpec: + + P.kwargs.__origin__ is P + + This type is meant for runtime introspection and has no special meaning to + static type checkers. + """ + def __init__(self, origin): + self.__origin__ = origin + + def __repr__(self): + return f"{self.__origin__.__name__}.kwargs" + + def __eq__(self, other): + if not isinstance(other, ParamSpecKwargs): + return NotImplemented + return self.__origin__ == other.__origin__ + + +if _PEP_696_IMPLEMENTED: + from typing import ParamSpec + +# 3.10+ +elif hasattr(typing, 'ParamSpec'): + + # Add default parameter - PEP 696 + class ParamSpec(metaclass=_TypeVarLikeMeta): + """Parameter specification.""" + + _backported_typevarlike = typing.ParamSpec + + def __new__(cls, name, *, bound=None, + covariant=False, contravariant=False, + infer_variance=False, default=NoDefault): + if hasattr(typing, "TypeAliasType"): + # PEP 695 implemented, can pass infer_variance to typing.TypeVar + paramspec = typing.ParamSpec(name, bound=bound, + covariant=covariant, + contravariant=contravariant, + infer_variance=infer_variance) + else: + paramspec = typing.ParamSpec(name, bound=bound, + covariant=covariant, + contravariant=contravariant) + paramspec.__infer_variance__ = infer_variance + + _set_default(paramspec, default) + _set_module(paramspec) + + def _paramspec_prepare_subst(alias, args): + params = alias.__parameters__ + i = params.index(paramspec) + if i == len(args) and paramspec.has_default(): + args = [*args, paramspec.__default__] + if i >= len(args): + raise TypeError(f"Too few arguments for {alias}") + # Special case where Z[[int, str, bool]] == Z[int, str, bool] in PEP 612. + if len(params) == 1 and not typing._is_param_expr(args[0]): + assert i == 0 + args = (args,) + # Convert lists to tuples to help other libraries cache the results. + elif isinstance(args[i], list): + args = (*args[:i], tuple(args[i]), *args[i + 1:]) + return args + + paramspec.__typing_prepare_subst__ = _paramspec_prepare_subst + return paramspec + + def __init_subclass__(cls) -> None: + raise TypeError(f"type '{__name__}.ParamSpec' is not an acceptable base type") + +# 3.8-3.9 +else: + + # Inherits from list as a workaround for Callable checks in Python < 3.9.2. + class ParamSpec(list, _DefaultMixin): + """Parameter specification variable. + + Usage:: + + P = ParamSpec('P') + + Parameter specification variables exist primarily for the benefit of static + type checkers. They are used to forward the parameter types of one + callable to another callable, a pattern commonly found in higher order + functions and decorators. They are only valid when used in ``Concatenate``, + or s the first argument to ``Callable``. In Python 3.10 and higher, + they are also supported in user-defined Generics at runtime. + See class Generic for more information on generic types. An + example for annotating a decorator:: + + T = TypeVar('T') + P = ParamSpec('P') + + def add_logging(f: Callable[P, T]) -> Callable[P, T]: + '''A type-safe decorator to add logging to a function.''' + def inner(*args: P.args, **kwargs: P.kwargs) -> T: + logging.info(f'{f.__name__} was called') + return f(*args, **kwargs) + return inner + + @add_logging + def add_two(x: float, y: float) -> float: + '''Add two numbers together.''' + return x + y + + Parameter specification variables defined with covariant=True or + contravariant=True can be used to declare covariant or contravariant + generic types. These keyword arguments are valid, but their actual semantics + are yet to be decided. See PEP 612 for details. + + Parameter specification variables can be introspected. e.g.: + + P.__name__ == 'T' + P.__bound__ == None + P.__covariant__ == False + P.__contravariant__ == False + + Note that only parameter specification variables defined in global scope can + be pickled. + """ + + # Trick Generic __parameters__. + __class__ = typing.TypeVar + + @property + def args(self): + return ParamSpecArgs(self) + + @property + def kwargs(self): + return ParamSpecKwargs(self) + + def __init__(self, name, *, bound=None, covariant=False, contravariant=False, + infer_variance=False, default=NoDefault): + list.__init__(self, [self]) + self.__name__ = name + self.__covariant__ = bool(covariant) + self.__contravariant__ = bool(contravariant) + self.__infer_variance__ = bool(infer_variance) + if bound: + self.__bound__ = typing._type_check(bound, 'Bound must be a type.') + else: + self.__bound__ = None + _DefaultMixin.__init__(self, default) + + # for pickling: + def_mod = _caller() + if def_mod != 'typing_extensions': + self.__module__ = def_mod + + def __repr__(self): + if self.__infer_variance__: + prefix = '' + elif self.__covariant__: + prefix = '+' + elif self.__contravariant__: + prefix = '-' + else: + prefix = '~' + return prefix + self.__name__ + + def __hash__(self): + return object.__hash__(self) + + def __eq__(self, other): + return self is other + + def __reduce__(self): + return self.__name__ + + # Hack to get typing._type_check to pass. + def __call__(self, *args, **kwargs): + pass + + +# 3.8-3.9 +if not hasattr(typing, 'Concatenate'): + # Inherits from list as a workaround for Callable checks in Python < 3.9.2. + class _ConcatenateGenericAlias(list): + + # Trick Generic into looking into this for __parameters__. + __class__ = typing._GenericAlias + + # Flag in 3.8. + _special = False + + def __init__(self, origin, args): + super().__init__(args) + self.__origin__ = origin + self.__args__ = args + + def __repr__(self): + _type_repr = typing._type_repr + return (f'{_type_repr(self.__origin__)}' + f'[{", ".join(_type_repr(arg) for arg in self.__args__)}]') + + def __hash__(self): + return hash((self.__origin__, self.__args__)) + + # Hack to get typing._type_check to pass in Generic. + def __call__(self, *args, **kwargs): + pass + + @property + def __parameters__(self): + return tuple( + tp for tp in self.__args__ if isinstance(tp, (typing.TypeVar, ParamSpec)) + ) + + +# 3.8-3.9 +@typing._tp_cache +def _concatenate_getitem(self, parameters): + if parameters == (): + raise TypeError("Cannot take a Concatenate of no types.") + if not isinstance(parameters, tuple): + parameters = (parameters,) + if not isinstance(parameters[-1], ParamSpec): + raise TypeError("The last parameter to Concatenate should be a " + "ParamSpec variable.") + msg = "Concatenate[arg, ...]: each arg must be a type." + parameters = tuple(typing._type_check(p, msg) for p in parameters) + return _ConcatenateGenericAlias(self, parameters) + + +# 3.10+ +if hasattr(typing, 'Concatenate'): + Concatenate = typing.Concatenate + _ConcatenateGenericAlias = typing._ConcatenateGenericAlias +# 3.9 +elif sys.version_info[:2] >= (3, 9): + @_ExtensionsSpecialForm + def Concatenate(self, parameters): + """Used in conjunction with ``ParamSpec`` and ``Callable`` to represent a + higher order function which adds, removes or transforms parameters of a + callable. + + For example:: + + Callable[Concatenate[int, P], int] + + See PEP 612 for detailed information. + """ + return _concatenate_getitem(self, parameters) +# 3.8 +else: + class _ConcatenateForm(_ExtensionsSpecialForm, _root=True): + def __getitem__(self, parameters): + return _concatenate_getitem(self, parameters) + + Concatenate = _ConcatenateForm( + 'Concatenate', + doc="""Used in conjunction with ``ParamSpec`` and ``Callable`` to represent a + higher order function which adds, removes or transforms parameters of a + callable. + + For example:: + + Callable[Concatenate[int, P], int] + + See PEP 612 for detailed information. + """) + +# 3.10+ +if hasattr(typing, 'TypeGuard'): + TypeGuard = typing.TypeGuard +# 3.9 +elif sys.version_info[:2] >= (3, 9): + @_ExtensionsSpecialForm + def TypeGuard(self, parameters): + """Special typing form used to annotate the return type of a user-defined + type guard function. ``TypeGuard`` only accepts a single type argument. + At runtime, functions marked this way should return a boolean. + + ``TypeGuard`` aims to benefit *type narrowing* -- a technique used by static + type checkers to determine a more precise type of an expression within a + program's code flow. Usually type narrowing is done by analyzing + conditional code flow and applying the narrowing to a block of code. The + conditional expression here is sometimes referred to as a "type guard". + + Sometimes it would be convenient to use a user-defined boolean function + as a type guard. Such a function should use ``TypeGuard[...]`` as its + return type to alert static type checkers to this intention. + + Using ``-> TypeGuard`` tells the static type checker that for a given + function: + + 1. The return value is a boolean. + 2. If the return value is ``True``, the type of its argument + is the type inside ``TypeGuard``. + + For example:: + + def is_str(val: Union[str, float]): + # "isinstance" type guard + if isinstance(val, str): + # Type of ``val`` is narrowed to ``str`` + ... + else: + # Else, type of ``val`` is narrowed to ``float``. + ... + + Strict type narrowing is not enforced -- ``TypeB`` need not be a narrower + form of ``TypeA`` (it can even be a wider form) and this may lead to + type-unsafe results. The main reason is to allow for things like + narrowing ``List[object]`` to ``List[str]`` even though the latter is not + a subtype of the former, since ``List`` is invariant. The responsibility of + writing type-safe type guards is left to the user. + + ``TypeGuard`` also works with type variables. For more information, see + PEP 647 (User-Defined Type Guards). + """ + item = typing._type_check(parameters, f'{self} accepts only a single type.') + return typing._GenericAlias(self, (item,)) +# 3.8 +else: + class _TypeGuardForm(_ExtensionsSpecialForm, _root=True): + def __getitem__(self, parameters): + item = typing._type_check(parameters, + f'{self._name} accepts only a single type') + return typing._GenericAlias(self, (item,)) + + TypeGuard = _TypeGuardForm( + 'TypeGuard', + doc="""Special typing form used to annotate the return type of a user-defined + type guard function. ``TypeGuard`` only accepts a single type argument. + At runtime, functions marked this way should return a boolean. + + ``TypeGuard`` aims to benefit *type narrowing* -- a technique used by static + type checkers to determine a more precise type of an expression within a + program's code flow. Usually type narrowing is done by analyzing + conditional code flow and applying the narrowing to a block of code. The + conditional expression here is sometimes referred to as a "type guard". + + Sometimes it would be convenient to use a user-defined boolean function + as a type guard. Such a function should use ``TypeGuard[...]`` as its + return type to alert static type checkers to this intention. + + Using ``-> TypeGuard`` tells the static type checker that for a given + function: + + 1. The return value is a boolean. + 2. If the return value is ``True``, the type of its argument + is the type inside ``TypeGuard``. + + For example:: + + def is_str(val: Union[str, float]): + # "isinstance" type guard + if isinstance(val, str): + # Type of ``val`` is narrowed to ``str`` + ... + else: + # Else, type of ``val`` is narrowed to ``float``. + ... + + Strict type narrowing is not enforced -- ``TypeB`` need not be a narrower + form of ``TypeA`` (it can even be a wider form) and this may lead to + type-unsafe results. The main reason is to allow for things like + narrowing ``List[object]`` to ``List[str]`` even though the latter is not + a subtype of the former, since ``List`` is invariant. The responsibility of + writing type-safe type guards is left to the user. + + ``TypeGuard`` also works with type variables. For more information, see + PEP 647 (User-Defined Type Guards). + """) + +# 3.13+ +if hasattr(typing, 'TypeIs'): + TypeIs = typing.TypeIs +# 3.9 +elif sys.version_info[:2] >= (3, 9): + @_ExtensionsSpecialForm + def TypeIs(self, parameters): + """Special typing form used to annotate the return type of a user-defined + type narrower function. ``TypeIs`` only accepts a single type argument. + At runtime, functions marked this way should return a boolean. + + ``TypeIs`` aims to benefit *type narrowing* -- a technique used by static + type checkers to determine a more precise type of an expression within a + program's code flow. Usually type narrowing is done by analyzing + conditional code flow and applying the narrowing to a block of code. The + conditional expression here is sometimes referred to as a "type guard". + + Sometimes it would be convenient to use a user-defined boolean function + as a type guard. Such a function should use ``TypeIs[...]`` as its + return type to alert static type checkers to this intention. + + Using ``-> TypeIs`` tells the static type checker that for a given + function: + + 1. The return value is a boolean. + 2. If the return value is ``True``, the type of its argument + is the intersection of the type inside ``TypeGuard`` and the argument's + previously known type. + + For example:: + + def is_awaitable(val: object) -> TypeIs[Awaitable[Any]]: + return hasattr(val, '__await__') + + def f(val: Union[int, Awaitable[int]]) -> int: + if is_awaitable(val): + assert_type(val, Awaitable[int]) + else: + assert_type(val, int) + + ``TypeIs`` also works with type variables. For more information, see + PEP 742 (Narrowing types with TypeIs). + """ + item = typing._type_check(parameters, f'{self} accepts only a single type.') + return typing._GenericAlias(self, (item,)) +# 3.8 +else: + class _TypeIsForm(_ExtensionsSpecialForm, _root=True): + def __getitem__(self, parameters): + item = typing._type_check(parameters, + f'{self._name} accepts only a single type') + return typing._GenericAlias(self, (item,)) + + TypeIs = _TypeIsForm( + 'TypeIs', + doc="""Special typing form used to annotate the return type of a user-defined + type narrower function. ``TypeIs`` only accepts a single type argument. + At runtime, functions marked this way should return a boolean. + + ``TypeIs`` aims to benefit *type narrowing* -- a technique used by static + type checkers to determine a more precise type of an expression within a + program's code flow. Usually type narrowing is done by analyzing + conditional code flow and applying the narrowing to a block of code. The + conditional expression here is sometimes referred to as a "type guard". + + Sometimes it would be convenient to use a user-defined boolean function + as a type guard. Such a function should use ``TypeIs[...]`` as its + return type to alert static type checkers to this intention. + + Using ``-> TypeIs`` tells the static type checker that for a given + function: + + 1. The return value is a boolean. + 2. If the return value is ``True``, the type of its argument + is the intersection of the type inside ``TypeGuard`` and the argument's + previously known type. + + For example:: + + def is_awaitable(val: object) -> TypeIs[Awaitable[Any]]: + return hasattr(val, '__await__') + + def f(val: Union[int, Awaitable[int]]) -> int: + if is_awaitable(val): + assert_type(val, Awaitable[int]) + else: + assert_type(val, int) + + ``TypeIs`` also works with type variables. For more information, see + PEP 742 (Narrowing types with TypeIs). + """) + + +# Vendored from cpython typing._SpecialFrom +class _SpecialForm(typing._Final, _root=True): + __slots__ = ('_name', '__doc__', '_getitem') + + def __init__(self, getitem): + self._getitem = getitem + self._name = getitem.__name__ + self.__doc__ = getitem.__doc__ + + def __getattr__(self, item): + if item in {'__name__', '__qualname__'}: + return self._name + + raise AttributeError(item) + + def __mro_entries__(self, bases): + raise TypeError(f"Cannot subclass {self!r}") + + def __repr__(self): + return f'typing_extensions.{self._name}' + + def __reduce__(self): + return self._name + + def __call__(self, *args, **kwds): + raise TypeError(f"Cannot instantiate {self!r}") + + def __or__(self, other): + return typing.Union[self, other] + + def __ror__(self, other): + return typing.Union[other, self] + + def __instancecheck__(self, obj): + raise TypeError(f"{self} cannot be used with isinstance()") + + def __subclasscheck__(self, cls): + raise TypeError(f"{self} cannot be used with issubclass()") + + @typing._tp_cache + def __getitem__(self, parameters): + return self._getitem(self, parameters) + + +if hasattr(typing, "LiteralString"): # 3.11+ + LiteralString = typing.LiteralString +else: + @_SpecialForm + def LiteralString(self, params): + """Represents an arbitrary literal string. + + Example:: + + from typing_extensions import LiteralString + + def query(sql: LiteralString) -> ...: + ... + + query("SELECT * FROM table") # ok + query(f"SELECT * FROM {input()}") # not ok + + See PEP 675 for details. + + """ + raise TypeError(f"{self} is not subscriptable") + + +if hasattr(typing, "Self"): # 3.11+ + Self = typing.Self +else: + @_SpecialForm + def Self(self, params): + """Used to spell the type of "self" in classes. + + Example:: + + from typing import Self + + class ReturnsSelf: + def parse(self, data: bytes) -> Self: + ... + return self + + """ + + raise TypeError(f"{self} is not subscriptable") + + +if hasattr(typing, "Never"): # 3.11+ + Never = typing.Never +else: + @_SpecialForm + def Never(self, params): + """The bottom type, a type that has no members. + + This can be used to define a function that should never be + called, or a function that never returns:: + + from typing_extensions import Never + + def never_call_me(arg: Never) -> None: + pass + + def int_or_str(arg: int | str) -> None: + never_call_me(arg) # type checker error + match arg: + case int(): + print("It's an int") + case str(): + print("It's a str") + case _: + never_call_me(arg) # ok, arg is of type Never + + """ + + raise TypeError(f"{self} is not subscriptable") + + +if hasattr(typing, 'Required'): # 3.11+ + Required = typing.Required + NotRequired = typing.NotRequired +elif sys.version_info[:2] >= (3, 9): # 3.9-3.10 + @_ExtensionsSpecialForm + def Required(self, parameters): + """A special typing construct to mark a key of a total=False TypedDict + as required. For example: + + class Movie(TypedDict, total=False): + title: Required[str] + year: int + + m = Movie( + title='The Matrix', # typechecker error if key is omitted + year=1999, + ) + + There is no runtime checking that a required key is actually provided + when instantiating a related TypedDict. + """ + item = typing._type_check(parameters, f'{self._name} accepts only a single type.') + return typing._GenericAlias(self, (item,)) + + @_ExtensionsSpecialForm + def NotRequired(self, parameters): + """A special typing construct to mark a key of a TypedDict as + potentially missing. For example: + + class Movie(TypedDict): + title: str + year: NotRequired[int] + + m = Movie( + title='The Matrix', # typechecker error if key is omitted + year=1999, + ) + """ + item = typing._type_check(parameters, f'{self._name} accepts only a single type.') + return typing._GenericAlias(self, (item,)) + +else: # 3.8 + class _RequiredForm(_ExtensionsSpecialForm, _root=True): + def __getitem__(self, parameters): + item = typing._type_check(parameters, + f'{self._name} accepts only a single type.') + return typing._GenericAlias(self, (item,)) + + Required = _RequiredForm( + 'Required', + doc="""A special typing construct to mark a key of a total=False TypedDict + as required. For example: + + class Movie(TypedDict, total=False): + title: Required[str] + year: int + + m = Movie( + title='The Matrix', # typechecker error if key is omitted + year=1999, + ) + + There is no runtime checking that a required key is actually provided + when instantiating a related TypedDict. + """) + NotRequired = _RequiredForm( + 'NotRequired', + doc="""A special typing construct to mark a key of a TypedDict as + potentially missing. For example: + + class Movie(TypedDict): + title: str + year: NotRequired[int] + + m = Movie( + title='The Matrix', # typechecker error if key is omitted + year=1999, + ) + """) + + +if hasattr(typing, 'ReadOnly'): + ReadOnly = typing.ReadOnly +elif sys.version_info[:2] >= (3, 9): # 3.9-3.12 + @_ExtensionsSpecialForm + def ReadOnly(self, parameters): + """A special typing construct to mark an item of a TypedDict as read-only. + + For example: + + class Movie(TypedDict): + title: ReadOnly[str] + year: int + + def mutate_movie(m: Movie) -> None: + m["year"] = 1992 # allowed + m["title"] = "The Matrix" # typechecker error + + There is no runtime checking for this property. + """ + item = typing._type_check(parameters, f'{self._name} accepts only a single type.') + return typing._GenericAlias(self, (item,)) + +else: # 3.8 + class _ReadOnlyForm(_ExtensionsSpecialForm, _root=True): + def __getitem__(self, parameters): + item = typing._type_check(parameters, + f'{self._name} accepts only a single type.') + return typing._GenericAlias(self, (item,)) + + ReadOnly = _ReadOnlyForm( + 'ReadOnly', + doc="""A special typing construct to mark a key of a TypedDict as read-only. + + For example: + + class Movie(TypedDict): + title: ReadOnly[str] + year: int + + def mutate_movie(m: Movie) -> None: + m["year"] = 1992 # allowed + m["title"] = "The Matrix" # typechecker error + + There is no runtime checking for this propery. + """) + + +_UNPACK_DOC = """\ +Type unpack operator. + +The type unpack operator takes the child types from some container type, +such as `tuple[int, str]` or a `TypeVarTuple`, and 'pulls them out'. For +example: + + # For some generic class `Foo`: + Foo[Unpack[tuple[int, str]]] # Equivalent to Foo[int, str] + + Ts = TypeVarTuple('Ts') + # Specifies that `Bar` is generic in an arbitrary number of types. + # (Think of `Ts` as a tuple of an arbitrary number of individual + # `TypeVar`s, which the `Unpack` is 'pulling out' directly into the + # `Generic[]`.) + class Bar(Generic[Unpack[Ts]]): ... + Bar[int] # Valid + Bar[int, str] # Also valid + +From Python 3.11, this can also be done using the `*` operator: + + Foo[*tuple[int, str]] + class Bar(Generic[*Ts]): ... + +The operator can also be used along with a `TypedDict` to annotate +`**kwargs` in a function signature. For instance: + + class Movie(TypedDict): + name: str + year: int + + # This function expects two keyword arguments - *name* of type `str` and + # *year* of type `int`. + def foo(**kwargs: Unpack[Movie]): ... + +Note that there is only some runtime checking of this operator. Not +everything the runtime allows may be accepted by static type checkers. + +For more information, see PEP 646 and PEP 692. +""" + + +if sys.version_info >= (3, 12): # PEP 692 changed the repr of Unpack[] + Unpack = typing.Unpack + + def _is_unpack(obj): + return get_origin(obj) is Unpack + +elif sys.version_info[:2] >= (3, 9): # 3.9+ + class _UnpackSpecialForm(_ExtensionsSpecialForm, _root=True): + def __init__(self, getitem): + super().__init__(getitem) + self.__doc__ = _UNPACK_DOC + + class _UnpackAlias(typing._GenericAlias, _root=True): + __class__ = typing.TypeVar + + @property + def __typing_unpacked_tuple_args__(self): + assert self.__origin__ is Unpack + assert len(self.__args__) == 1 + arg, = self.__args__ + if isinstance(arg, (typing._GenericAlias, _types.GenericAlias)): + if arg.__origin__ is not tuple: + raise TypeError("Unpack[...] must be used with a tuple type") + return arg.__args__ + return None + + @_UnpackSpecialForm + def Unpack(self, parameters): + item = typing._type_check(parameters, f'{self._name} accepts only a single type.') + return _UnpackAlias(self, (item,)) + + def _is_unpack(obj): + return isinstance(obj, _UnpackAlias) + +else: # 3.8 + class _UnpackAlias(typing._GenericAlias, _root=True): + __class__ = typing.TypeVar + + class _UnpackForm(_ExtensionsSpecialForm, _root=True): + def __getitem__(self, parameters): + item = typing._type_check(parameters, + f'{self._name} accepts only a single type.') + return _UnpackAlias(self, (item,)) + + Unpack = _UnpackForm('Unpack', doc=_UNPACK_DOC) + + def _is_unpack(obj): + return isinstance(obj, _UnpackAlias) + + +if _PEP_696_IMPLEMENTED: + from typing import TypeVarTuple + +elif hasattr(typing, "TypeVarTuple"): # 3.11+ + + def _unpack_args(*args): + newargs = [] + for arg in args: + subargs = getattr(arg, '__typing_unpacked_tuple_args__', None) + if subargs is not None and not (subargs and subargs[-1] is ...): + newargs.extend(subargs) + else: + newargs.append(arg) + return newargs + + # Add default parameter - PEP 696 + class TypeVarTuple(metaclass=_TypeVarLikeMeta): + """Type variable tuple.""" + + _backported_typevarlike = typing.TypeVarTuple + + def __new__(cls, name, *, default=NoDefault): + tvt = typing.TypeVarTuple(name) + _set_default(tvt, default) + _set_module(tvt) + + def _typevartuple_prepare_subst(alias, args): + params = alias.__parameters__ + typevartuple_index = params.index(tvt) + for param in params[typevartuple_index + 1:]: + if isinstance(param, TypeVarTuple): + raise TypeError( + f"More than one TypeVarTuple parameter in {alias}" + ) + + alen = len(args) + plen = len(params) + left = typevartuple_index + right = plen - typevartuple_index - 1 + var_tuple_index = None + fillarg = None + for k, arg in enumerate(args): + if not isinstance(arg, type): + subargs = getattr(arg, '__typing_unpacked_tuple_args__', None) + if subargs and len(subargs) == 2 and subargs[-1] is ...: + if var_tuple_index is not None: + raise TypeError( + "More than one unpacked " + "arbitrary-length tuple argument" + ) + var_tuple_index = k + fillarg = subargs[0] + if var_tuple_index is not None: + left = min(left, var_tuple_index) + right = min(right, alen - var_tuple_index - 1) + elif left + right > alen: + raise TypeError(f"Too few arguments for {alias};" + f" actual {alen}, expected at least {plen - 1}") + if left == alen - right and tvt.has_default(): + replacement = _unpack_args(tvt.__default__) + else: + replacement = args[left: alen - right] + + return ( + *args[:left], + *([fillarg] * (typevartuple_index - left)), + replacement, + *([fillarg] * (plen - right - left - typevartuple_index - 1)), + *args[alen - right:], + ) + + tvt.__typing_prepare_subst__ = _typevartuple_prepare_subst + return tvt + + def __init_subclass__(self, *args, **kwds): + raise TypeError("Cannot subclass special typing classes") + +else: # <=3.10 + class TypeVarTuple(_DefaultMixin): + """Type variable tuple. + + Usage:: + + Ts = TypeVarTuple('Ts') + + In the same way that a normal type variable is a stand-in for a single + type such as ``int``, a type variable *tuple* is a stand-in for a *tuple* + type such as ``Tuple[int, str]``. + + Type variable tuples can be used in ``Generic`` declarations. + Consider the following example:: + + class Array(Generic[*Ts]): ... + + The ``Ts`` type variable tuple here behaves like ``tuple[T1, T2]``, + where ``T1`` and ``T2`` are type variables. To use these type variables + as type parameters of ``Array``, we must *unpack* the type variable tuple using + the star operator: ``*Ts``. The signature of ``Array`` then behaves + as if we had simply written ``class Array(Generic[T1, T2]): ...``. + In contrast to ``Generic[T1, T2]``, however, ``Generic[*Shape]`` allows + us to parameterise the class with an *arbitrary* number of type parameters. + + Type variable tuples can be used anywhere a normal ``TypeVar`` can. + This includes class definitions, as shown above, as well as function + signatures and variable annotations:: + + class Array(Generic[*Ts]): + + def __init__(self, shape: Tuple[*Ts]): + self._shape: Tuple[*Ts] = shape + + def get_shape(self) -> Tuple[*Ts]: + return self._shape + + shape = (Height(480), Width(640)) + x: Array[Height, Width] = Array(shape) + y = abs(x) # Inferred type is Array[Height, Width] + z = x + x # ... is Array[Height, Width] + x.get_shape() # ... is tuple[Height, Width] + + """ + + # Trick Generic __parameters__. + __class__ = typing.TypeVar + + def __iter__(self): + yield self.__unpacked__ + + def __init__(self, name, *, default=NoDefault): + self.__name__ = name + _DefaultMixin.__init__(self, default) + + # for pickling: + def_mod = _caller() + if def_mod != 'typing_extensions': + self.__module__ = def_mod + + self.__unpacked__ = Unpack[self] + + def __repr__(self): + return self.__name__ + + def __hash__(self): + return object.__hash__(self) + + def __eq__(self, other): + return self is other + + def __reduce__(self): + return self.__name__ + + def __init_subclass__(self, *args, **kwds): + if '_root' not in kwds: + raise TypeError("Cannot subclass special typing classes") + + +if hasattr(typing, "reveal_type"): # 3.11+ + reveal_type = typing.reveal_type +else: # <=3.10 + def reveal_type(obj: T, /) -> T: + """Reveal the inferred type of a variable. + + When a static type checker encounters a call to ``reveal_type()``, + it will emit the inferred type of the argument:: + + x: int = 1 + reveal_type(x) + + Running a static type checker (e.g., ``mypy``) on this example + will produce output similar to 'Revealed type is "builtins.int"'. + + At runtime, the function prints the runtime type of the + argument and returns it unchanged. + + """ + print(f"Runtime type is {type(obj).__name__!r}", file=sys.stderr) + return obj + + +if hasattr(typing, "_ASSERT_NEVER_REPR_MAX_LENGTH"): # 3.11+ + _ASSERT_NEVER_REPR_MAX_LENGTH = typing._ASSERT_NEVER_REPR_MAX_LENGTH +else: # <=3.10 + _ASSERT_NEVER_REPR_MAX_LENGTH = 100 + + +if hasattr(typing, "assert_never"): # 3.11+ + assert_never = typing.assert_never +else: # <=3.10 + def assert_never(arg: Never, /) -> Never: + """Assert to the type checker that a line of code is unreachable. + + Example:: + + def int_or_str(arg: int | str) -> None: + match arg: + case int(): + print("It's an int") + case str(): + print("It's a str") + case _: + assert_never(arg) + + If a type checker finds that a call to assert_never() is + reachable, it will emit an error. + + At runtime, this throws an exception when called. + + """ + value = repr(arg) + if len(value) > _ASSERT_NEVER_REPR_MAX_LENGTH: + value = value[:_ASSERT_NEVER_REPR_MAX_LENGTH] + '...' + raise AssertionError(f"Expected code to be unreachable, but got: {value}") + + +if sys.version_info >= (3, 12): # 3.12+ + # dataclass_transform exists in 3.11 but lacks the frozen_default parameter + dataclass_transform = typing.dataclass_transform +else: # <=3.11 + def dataclass_transform( + *, + eq_default: bool = True, + order_default: bool = False, + kw_only_default: bool = False, + frozen_default: bool = False, + field_specifiers: typing.Tuple[ + typing.Union[typing.Type[typing.Any], typing.Callable[..., typing.Any]], + ... + ] = (), + **kwargs: typing.Any, + ) -> typing.Callable[[T], T]: + """Decorator that marks a function, class, or metaclass as providing + dataclass-like behavior. + + Example: + + from typing_extensions import dataclass_transform + + _T = TypeVar("_T") + + # Used on a decorator function + @dataclass_transform() + def create_model(cls: type[_T]) -> type[_T]: + ... + return cls + + @create_model + class CustomerModel: + id: int + name: str + + # Used on a base class + @dataclass_transform() + class ModelBase: ... + + class CustomerModel(ModelBase): + id: int + name: str + + # Used on a metaclass + @dataclass_transform() + class ModelMeta(type): ... + + class ModelBase(metaclass=ModelMeta): ... + + class CustomerModel(ModelBase): + id: int + name: str + + Each of the ``CustomerModel`` classes defined in this example will now + behave similarly to a dataclass created with the ``@dataclasses.dataclass`` + decorator. For example, the type checker will synthesize an ``__init__`` + method. + + The arguments to this decorator can be used to customize this behavior: + - ``eq_default`` indicates whether the ``eq`` parameter is assumed to be + True or False if it is omitted by the caller. + - ``order_default`` indicates whether the ``order`` parameter is + assumed to be True or False if it is omitted by the caller. + - ``kw_only_default`` indicates whether the ``kw_only`` parameter is + assumed to be True or False if it is omitted by the caller. + - ``frozen_default`` indicates whether the ``frozen`` parameter is + assumed to be True or False if it is omitted by the caller. + - ``field_specifiers`` specifies a static list of supported classes + or functions that describe fields, similar to ``dataclasses.field()``. + + At runtime, this decorator records its arguments in the + ``__dataclass_transform__`` attribute on the decorated object. + + See PEP 681 for details. + + """ + def decorator(cls_or_fn): + cls_or_fn.__dataclass_transform__ = { + "eq_default": eq_default, + "order_default": order_default, + "kw_only_default": kw_only_default, + "frozen_default": frozen_default, + "field_specifiers": field_specifiers, + "kwargs": kwargs, + } + return cls_or_fn + return decorator + + +if hasattr(typing, "override"): # 3.12+ + override = typing.override +else: # <=3.11 + _F = typing.TypeVar("_F", bound=typing.Callable[..., typing.Any]) + + def override(arg: _F, /) -> _F: + """Indicate that a method is intended to override a method in a base class. + + Usage: + + class Base: + def method(self) -> None: + pass + + class Child(Base): + @override + def method(self) -> None: + super().method() + + When this decorator is applied to a method, the type checker will + validate that it overrides a method with the same name on a base class. + This helps prevent bugs that may occur when a base class is changed + without an equivalent change to a child class. + + There is no runtime checking of these properties. The decorator + sets the ``__override__`` attribute to ``True`` on the decorated object + to allow runtime introspection. + + See PEP 698 for details. + + """ + try: + arg.__override__ = True + except (AttributeError, TypeError): + # Skip the attribute silently if it is not writable. + # AttributeError happens if the object has __slots__ or a + # read-only property, TypeError if it's a builtin class. + pass + return arg + + +if hasattr(warnings, "deprecated"): + deprecated = warnings.deprecated +else: + _T = typing.TypeVar("_T") + + class deprecated: + """Indicate that a class, function or overload is deprecated. + + When this decorator is applied to an object, the type checker + will generate a diagnostic on usage of the deprecated object. + + Usage: + + @deprecated("Use B instead") + class A: + pass + + @deprecated("Use g instead") + def f(): + pass + + @overload + @deprecated("int support is deprecated") + def g(x: int) -> int: ... + @overload + def g(x: str) -> int: ... + + The warning specified by *category* will be emitted at runtime + on use of deprecated objects. For functions, that happens on calls; + for classes, on instantiation and on creation of subclasses. + If the *category* is ``None``, no warning is emitted at runtime. + The *stacklevel* determines where the + warning is emitted. If it is ``1`` (the default), the warning + is emitted at the direct caller of the deprecated object; if it + is higher, it is emitted further up the stack. + Static type checker behavior is not affected by the *category* + and *stacklevel* arguments. + + The deprecation message passed to the decorator is saved in the + ``__deprecated__`` attribute on the decorated object. + If applied to an overload, the decorator + must be after the ``@overload`` decorator for the attribute to + exist on the overload as returned by ``get_overloads()``. + + See PEP 702 for details. + + """ + def __init__( + self, + message: str, + /, + *, + category: typing.Optional[typing.Type[Warning]] = DeprecationWarning, + stacklevel: int = 1, + ) -> None: + if not isinstance(message, str): + raise TypeError( + "Expected an object of type str for 'message', not " + f"{type(message).__name__!r}" + ) + self.message = message + self.category = category + self.stacklevel = stacklevel + + def __call__(self, arg: _T, /) -> _T: + # Make sure the inner functions created below don't + # retain a reference to self. + msg = self.message + category = self.category + stacklevel = self.stacklevel + if category is None: + arg.__deprecated__ = msg + return arg + elif isinstance(arg, type): + import functools + from types import MethodType + + original_new = arg.__new__ + + @functools.wraps(original_new) + def __new__(cls, *args, **kwargs): + if cls is arg: + warnings.warn(msg, category=category, stacklevel=stacklevel + 1) + if original_new is not object.__new__: + return original_new(cls, *args, **kwargs) + # Mirrors a similar check in object.__new__. + elif cls.__init__ is object.__init__ and (args or kwargs): + raise TypeError(f"{cls.__name__}() takes no arguments") + else: + return original_new(cls) + + arg.__new__ = staticmethod(__new__) + + original_init_subclass = arg.__init_subclass__ + # We need slightly different behavior if __init_subclass__ + # is a bound method (likely if it was implemented in Python) + if isinstance(original_init_subclass, MethodType): + original_init_subclass = original_init_subclass.__func__ + + @functools.wraps(original_init_subclass) + def __init_subclass__(*args, **kwargs): + warnings.warn(msg, category=category, stacklevel=stacklevel + 1) + return original_init_subclass(*args, **kwargs) + + arg.__init_subclass__ = classmethod(__init_subclass__) + # Or otherwise, which likely means it's a builtin such as + # object's implementation of __init_subclass__. + else: + @functools.wraps(original_init_subclass) + def __init_subclass__(*args, **kwargs): + warnings.warn(msg, category=category, stacklevel=stacklevel + 1) + return original_init_subclass(*args, **kwargs) + + arg.__init_subclass__ = __init_subclass__ + + arg.__deprecated__ = __new__.__deprecated__ = msg + __init_subclass__.__deprecated__ = msg + return arg + elif callable(arg): + import functools + + @functools.wraps(arg) + def wrapper(*args, **kwargs): + warnings.warn(msg, category=category, stacklevel=stacklevel + 1) + return arg(*args, **kwargs) + + arg.__deprecated__ = wrapper.__deprecated__ = msg + return wrapper + else: + raise TypeError( + "@deprecated decorator with non-None category must be applied to " + f"a class or callable, not {arg!r}" + ) + + +# We have to do some monkey patching to deal with the dual nature of +# Unpack/TypeVarTuple: +# - We want Unpack to be a kind of TypeVar so it gets accepted in +# Generic[Unpack[Ts]] +# - We want it to *not* be treated as a TypeVar for the purposes of +# counting generic parameters, so that when we subscript a generic, +# the runtime doesn't try to substitute the Unpack with the subscripted type. +if not hasattr(typing, "TypeVarTuple"): + def _check_generic(cls, parameters, elen=_marker): + """Check correct count for parameters of a generic cls (internal helper). + + This gives a nice error message in case of count mismatch. + """ + if not elen: + raise TypeError(f"{cls} is not a generic class") + if elen is _marker: + if not hasattr(cls, "__parameters__") or not cls.__parameters__: + raise TypeError(f"{cls} is not a generic class") + elen = len(cls.__parameters__) + alen = len(parameters) + if alen != elen: + expect_val = elen + if hasattr(cls, "__parameters__"): + parameters = [p for p in cls.__parameters__ if not _is_unpack(p)] + num_tv_tuples = sum(isinstance(p, TypeVarTuple) for p in parameters) + if (num_tv_tuples > 0) and (alen >= elen - num_tv_tuples): + return + + # deal with TypeVarLike defaults + # required TypeVarLikes cannot appear after a defaulted one. + if alen < elen: + # since we validate TypeVarLike default in _collect_type_vars + # or _collect_parameters we can safely check parameters[alen] + if ( + getattr(parameters[alen], '__default__', NoDefault) + is not NoDefault + ): + return + + num_default_tv = sum(getattr(p, '__default__', NoDefault) + is not NoDefault for p in parameters) + + elen -= num_default_tv + + expect_val = f"at least {elen}" + + things = "arguments" if sys.version_info >= (3, 10) else "parameters" + raise TypeError(f"Too {'many' if alen > elen else 'few'} {things}" + f" for {cls}; actual {alen}, expected {expect_val}") +else: + # Python 3.11+ + + def _check_generic(cls, parameters, elen): + """Check correct count for parameters of a generic cls (internal helper). + + This gives a nice error message in case of count mismatch. + """ + if not elen: + raise TypeError(f"{cls} is not a generic class") + alen = len(parameters) + if alen != elen: + expect_val = elen + if hasattr(cls, "__parameters__"): + parameters = [p for p in cls.__parameters__ if not _is_unpack(p)] + + # deal with TypeVarLike defaults + # required TypeVarLikes cannot appear after a defaulted one. + if alen < elen: + # since we validate TypeVarLike default in _collect_type_vars + # or _collect_parameters we can safely check parameters[alen] + if ( + getattr(parameters[alen], '__default__', NoDefault) + is not NoDefault + ): + return + + num_default_tv = sum(getattr(p, '__default__', NoDefault) + is not NoDefault for p in parameters) + + elen -= num_default_tv + + expect_val = f"at least {elen}" + + raise TypeError(f"Too {'many' if alen > elen else 'few'} arguments" + f" for {cls}; actual {alen}, expected {expect_val}") + +if not _PEP_696_IMPLEMENTED: + typing._check_generic = _check_generic + + +def _has_generic_or_protocol_as_origin() -> bool: + try: + frame = sys._getframe(2) + # - Catch AttributeError: not all Python implementations have sys._getframe() + # - Catch ValueError: maybe we're called from an unexpected module + # and the call stack isn't deep enough + except (AttributeError, ValueError): + return False # err on the side of leniency + else: + # If we somehow get invoked from outside typing.py, + # also err on the side of leniency + if frame.f_globals.get("__name__") != "typing": + return False + origin = frame.f_locals.get("origin") + # Cannot use "in" because origin may be an object with a buggy __eq__ that + # throws an error. + return origin is typing.Generic or origin is Protocol or origin is typing.Protocol + + +_TYPEVARTUPLE_TYPES = {TypeVarTuple, getattr(typing, "TypeVarTuple", None)} + + +def _is_unpacked_typevartuple(x) -> bool: + if get_origin(x) is not Unpack: + return False + args = get_args(x) + return ( + bool(args) + and len(args) == 1 + and type(args[0]) in _TYPEVARTUPLE_TYPES + ) + + +# Python 3.11+ _collect_type_vars was renamed to _collect_parameters +if hasattr(typing, '_collect_type_vars'): + def _collect_type_vars(types, typevar_types=None): + """Collect all type variable contained in types in order of + first appearance (lexicographic order). For example:: + + _collect_type_vars((T, List[S, T])) == (T, S) + """ + if typevar_types is None: + typevar_types = typing.TypeVar + tvars = [] + + # A required TypeVarLike cannot appear after a TypeVarLike with a default + # if it was a direct call to `Generic[]` or `Protocol[]` + enforce_default_ordering = _has_generic_or_protocol_as_origin() + default_encountered = False + + # Also, a TypeVarLike with a default cannot appear after a TypeVarTuple + type_var_tuple_encountered = False + + for t in types: + if _is_unpacked_typevartuple(t): + type_var_tuple_encountered = True + elif isinstance(t, typevar_types) and t not in tvars: + if enforce_default_ordering: + has_default = getattr(t, '__default__', NoDefault) is not NoDefault + if has_default: + if type_var_tuple_encountered: + raise TypeError('Type parameter with a default' + ' follows TypeVarTuple') + default_encountered = True + elif default_encountered: + raise TypeError(f'Type parameter {t!r} without a default' + ' follows type parameter with a default') + + tvars.append(t) + if _should_collect_from_parameters(t): + tvars.extend([t for t in t.__parameters__ if t not in tvars]) + return tuple(tvars) + + typing._collect_type_vars = _collect_type_vars +else: + def _collect_parameters(args): + """Collect all type variables and parameter specifications in args + in order of first appearance (lexicographic order). + + For example:: + + assert _collect_parameters((T, Callable[P, T])) == (T, P) + """ + parameters = [] + + # A required TypeVarLike cannot appear after a TypeVarLike with default + # if it was a direct call to `Generic[]` or `Protocol[]` + enforce_default_ordering = _has_generic_or_protocol_as_origin() + default_encountered = False + + # Also, a TypeVarLike with a default cannot appear after a TypeVarTuple + type_var_tuple_encountered = False + + for t in args: + if isinstance(t, type): + # We don't want __parameters__ descriptor of a bare Python class. + pass + elif isinstance(t, tuple): + # `t` might be a tuple, when `ParamSpec` is substituted with + # `[T, int]`, or `[int, *Ts]`, etc. + for x in t: + for collected in _collect_parameters([x]): + if collected not in parameters: + parameters.append(collected) + elif hasattr(t, '__typing_subst__'): + if t not in parameters: + if enforce_default_ordering: + has_default = ( + getattr(t, '__default__', NoDefault) is not NoDefault + ) + + if type_var_tuple_encountered and has_default: + raise TypeError('Type parameter with a default' + ' follows TypeVarTuple') + + if has_default: + default_encountered = True + elif default_encountered: + raise TypeError(f'Type parameter {t!r} without a default' + ' follows type parameter with a default') + + parameters.append(t) + else: + if _is_unpacked_typevartuple(t): + type_var_tuple_encountered = True + for x in getattr(t, '__parameters__', ()): + if x not in parameters: + parameters.append(x) + + return tuple(parameters) + + if not _PEP_696_IMPLEMENTED: + typing._collect_parameters = _collect_parameters + +# Backport typing.NamedTuple as it exists in Python 3.13. +# In 3.11, the ability to define generic `NamedTuple`s was supported. +# This was explicitly disallowed in 3.9-3.10, and only half-worked in <=3.8. +# On 3.12, we added __orig_bases__ to call-based NamedTuples +# On 3.13, we deprecated kwargs-based NamedTuples +if sys.version_info >= (3, 13): + NamedTuple = typing.NamedTuple +else: + def _make_nmtuple(name, types, module, defaults=()): + fields = [n for n, t in types] + annotations = {n: typing._type_check(t, f"field {n} annotation must be a type") + for n, t in types} + nm_tpl = collections.namedtuple(name, fields, + defaults=defaults, module=module) + nm_tpl.__annotations__ = nm_tpl.__new__.__annotations__ = annotations + # The `_field_types` attribute was removed in 3.9; + # in earlier versions, it is the same as the `__annotations__` attribute + if sys.version_info < (3, 9): + nm_tpl._field_types = annotations + return nm_tpl + + _prohibited_namedtuple_fields = typing._prohibited + _special_namedtuple_fields = frozenset({'__module__', '__name__', '__annotations__'}) + + class _NamedTupleMeta(type): + def __new__(cls, typename, bases, ns): + assert _NamedTuple in bases + for base in bases: + if base is not _NamedTuple and base is not typing.Generic: + raise TypeError( + 'can only inherit from a NamedTuple type and Generic') + bases = tuple(tuple if base is _NamedTuple else base for base in bases) + if "__annotations__" in ns: + types = ns["__annotations__"] + elif "__annotate__" in ns: + # TODO: Use inspect.VALUE here, and make the annotations lazily evaluated + types = ns["__annotate__"](1) + else: + types = {} + default_names = [] + for field_name in types: + if field_name in ns: + default_names.append(field_name) + elif default_names: + raise TypeError(f"Non-default namedtuple field {field_name} " + f"cannot follow default field" + f"{'s' if len(default_names) > 1 else ''} " + f"{', '.join(default_names)}") + nm_tpl = _make_nmtuple( + typename, types.items(), + defaults=[ns[n] for n in default_names], + module=ns['__module__'] + ) + nm_tpl.__bases__ = bases + if typing.Generic in bases: + if hasattr(typing, '_generic_class_getitem'): # 3.12+ + nm_tpl.__class_getitem__ = classmethod(typing._generic_class_getitem) + else: + class_getitem = typing.Generic.__class_getitem__.__func__ + nm_tpl.__class_getitem__ = classmethod(class_getitem) + # update from user namespace without overriding special namedtuple attributes + for key, val in ns.items(): + if key in _prohibited_namedtuple_fields: + raise AttributeError("Cannot overwrite NamedTuple attribute " + key) + elif key not in _special_namedtuple_fields: + if key not in nm_tpl._fields: + setattr(nm_tpl, key, ns[key]) + try: + set_name = type(val).__set_name__ + except AttributeError: + pass + else: + try: + set_name(val, nm_tpl, key) + except BaseException as e: + msg = ( + f"Error calling __set_name__ on {type(val).__name__!r} " + f"instance {key!r} in {typename!r}" + ) + # BaseException.add_note() existed on py311, + # but the __set_name__ machinery didn't start + # using add_note() until py312. + # Making sure exceptions are raised in the same way + # as in "normal" classes seems most important here. + if sys.version_info >= (3, 12): + e.add_note(msg) + raise + else: + raise RuntimeError(msg) from e + + if typing.Generic in bases: + nm_tpl.__init_subclass__() + return nm_tpl + + _NamedTuple = type.__new__(_NamedTupleMeta, 'NamedTuple', (), {}) + + def _namedtuple_mro_entries(bases): + assert NamedTuple in bases + return (_NamedTuple,) + + @_ensure_subclassable(_namedtuple_mro_entries) + def NamedTuple(typename, fields=_marker, /, **kwargs): + """Typed version of namedtuple. + + Usage:: + + class Employee(NamedTuple): + name: str + id: int + + This is equivalent to:: + + Employee = collections.namedtuple('Employee', ['name', 'id']) + + The resulting class has an extra __annotations__ attribute, giving a + dict that maps field names to types. (The field names are also in + the _fields attribute, which is part of the namedtuple API.) + An alternative equivalent functional syntax is also accepted:: + + Employee = NamedTuple('Employee', [('name', str), ('id', int)]) + """ + if fields is _marker: + if kwargs: + deprecated_thing = "Creating NamedTuple classes using keyword arguments" + deprecation_msg = ( + "{name} is deprecated and will be disallowed in Python {remove}. " + "Use the class-based or functional syntax instead." + ) + else: + deprecated_thing = "Failing to pass a value for the 'fields' parameter" + example = f"`{typename} = NamedTuple({typename!r}, [])`" + deprecation_msg = ( + "{name} is deprecated and will be disallowed in Python {remove}. " + "To create a NamedTuple class with 0 fields " + "using the functional syntax, " + "pass an empty list, e.g. " + ) + example + "." + elif fields is None: + if kwargs: + raise TypeError( + "Cannot pass `None` as the 'fields' parameter " + "and also specify fields using keyword arguments" + ) + else: + deprecated_thing = "Passing `None` as the 'fields' parameter" + example = f"`{typename} = NamedTuple({typename!r}, [])`" + deprecation_msg = ( + "{name} is deprecated and will be disallowed in Python {remove}. " + "To create a NamedTuple class with 0 fields " + "using the functional syntax, " + "pass an empty list, e.g. " + ) + example + "." + elif kwargs: + raise TypeError("Either list of fields or keywords" + " can be provided to NamedTuple, not both") + if fields is _marker or fields is None: + warnings.warn( + deprecation_msg.format(name=deprecated_thing, remove="3.15"), + DeprecationWarning, + stacklevel=2, + ) + fields = kwargs.items() + nt = _make_nmtuple(typename, fields, module=_caller()) + nt.__orig_bases__ = (NamedTuple,) + return nt + + +if hasattr(collections.abc, "Buffer"): + Buffer = collections.abc.Buffer +else: + class Buffer(abc.ABC): # noqa: B024 + """Base class for classes that implement the buffer protocol. + + The buffer protocol allows Python objects to expose a low-level + memory buffer interface. Before Python 3.12, it is not possible + to implement the buffer protocol in pure Python code, or even + to check whether a class implements the buffer protocol. In + Python 3.12 and higher, the ``__buffer__`` method allows access + to the buffer protocol from Python code, and the + ``collections.abc.Buffer`` ABC allows checking whether a class + implements the buffer protocol. + + To indicate support for the buffer protocol in earlier versions, + inherit from this ABC, either in a stub file or at runtime, + or use ABC registration. This ABC provides no methods, because + there is no Python-accessible methods shared by pre-3.12 buffer + classes. It is useful primarily for static checks. + + """ + + # As a courtesy, register the most common stdlib buffer classes. + Buffer.register(memoryview) + Buffer.register(bytearray) + Buffer.register(bytes) + + +# Backport of types.get_original_bases, available on 3.12+ in CPython +if hasattr(_types, "get_original_bases"): + get_original_bases = _types.get_original_bases +else: + def get_original_bases(cls, /): + """Return the class's "original" bases prior to modification by `__mro_entries__`. + + Examples:: + + from typing import TypeVar, Generic + from typing_extensions import NamedTuple, TypedDict + + T = TypeVar("T") + class Foo(Generic[T]): ... + class Bar(Foo[int], float): ... + class Baz(list[str]): ... + Eggs = NamedTuple("Eggs", [("a", int), ("b", str)]) + Spam = TypedDict("Spam", {"a": int, "b": str}) + + assert get_original_bases(Bar) == (Foo[int], float) + assert get_original_bases(Baz) == (list[str],) + assert get_original_bases(Eggs) == (NamedTuple,) + assert get_original_bases(Spam) == (TypedDict,) + assert get_original_bases(int) == (object,) + """ + try: + return cls.__dict__.get("__orig_bases__", cls.__bases__) + except AttributeError: + raise TypeError( + f'Expected an instance of type, not {type(cls).__name__!r}' + ) from None + + +# NewType is a class on Python 3.10+, making it pickleable +# The error message for subclassing instances of NewType was improved on 3.11+ +if sys.version_info >= (3, 11): + NewType = typing.NewType +else: + class NewType: + """NewType creates simple unique types with almost zero + runtime overhead. NewType(name, tp) is considered a subtype of tp + by static type checkers. At runtime, NewType(name, tp) returns + a dummy callable that simply returns its argument. Usage:: + UserId = NewType('UserId', int) + def name_by_id(user_id: UserId) -> str: + ... + UserId('user') # Fails type check + name_by_id(42) # Fails type check + name_by_id(UserId(42)) # OK + num = UserId(5) + 1 # type: int + """ + + def __call__(self, obj, /): + return obj + + def __init__(self, name, tp): + self.__qualname__ = name + if '.' in name: + name = name.rpartition('.')[-1] + self.__name__ = name + self.__supertype__ = tp + def_mod = _caller() + if def_mod != 'typing_extensions': + self.__module__ = def_mod + + def __mro_entries__(self, bases): + # We defined __mro_entries__ to get a better error message + # if a user attempts to subclass a NewType instance. bpo-46170 + supercls_name = self.__name__ + + class Dummy: + def __init_subclass__(cls): + subcls_name = cls.__name__ + raise TypeError( + f"Cannot subclass an instance of NewType. " + f"Perhaps you were looking for: " + f"`{subcls_name} = NewType({subcls_name!r}, {supercls_name})`" + ) + + return (Dummy,) + + def __repr__(self): + return f'{self.__module__}.{self.__qualname__}' + + def __reduce__(self): + return self.__qualname__ + + if sys.version_info >= (3, 10): + # PEP 604 methods + # It doesn't make sense to have these methods on Python <3.10 + + def __or__(self, other): + return typing.Union[self, other] + + def __ror__(self, other): + return typing.Union[other, self] + + +if hasattr(typing, "TypeAliasType"): + TypeAliasType = typing.TypeAliasType +else: + def _is_unionable(obj): + """Corresponds to is_unionable() in unionobject.c in CPython.""" + return obj is None or isinstance(obj, ( + type, + _types.GenericAlias, + _types.UnionType, + TypeAliasType, + )) + + class TypeAliasType: + """Create named, parameterized type aliases. + + This provides a backport of the new `type` statement in Python 3.12: + + type ListOrSet[T] = list[T] | set[T] + + is equivalent to: + + T = TypeVar("T") + ListOrSet = TypeAliasType("ListOrSet", list[T] | set[T], type_params=(T,)) + + The name ListOrSet can then be used as an alias for the type it refers to. + + The type_params argument should contain all the type parameters used + in the value of the type alias. If the alias is not generic, this + argument is omitted. + + Static type checkers should only support type aliases declared using + TypeAliasType that follow these rules: + + - The first argument (the name) must be a string literal. + - The TypeAliasType instance must be immediately assigned to a variable + of the same name. (For example, 'X = TypeAliasType("Y", int)' is invalid, + as is 'X, Y = TypeAliasType("X", int), TypeAliasType("Y", int)'). + + """ + + def __init__(self, name: str, value, *, type_params=()): + if not isinstance(name, str): + raise TypeError("TypeAliasType name must be a string") + self.__value__ = value + self.__type_params__ = type_params + + parameters = [] + for type_param in type_params: + if isinstance(type_param, TypeVarTuple): + parameters.extend(type_param) + else: + parameters.append(type_param) + self.__parameters__ = tuple(parameters) + def_mod = _caller() + if def_mod != 'typing_extensions': + self.__module__ = def_mod + # Setting this attribute closes the TypeAliasType from further modification + self.__name__ = name + + def __setattr__(self, name: str, value: object, /) -> None: + if hasattr(self, "__name__"): + self._raise_attribute_error(name) + super().__setattr__(name, value) + + def __delattr__(self, name: str, /) -> Never: + self._raise_attribute_error(name) + + def _raise_attribute_error(self, name: str) -> Never: + # Match the Python 3.12 error messages exactly + if name == "__name__": + raise AttributeError("readonly attribute") + elif name in {"__value__", "__type_params__", "__parameters__", "__module__"}: + raise AttributeError( + f"attribute '{name}' of 'typing.TypeAliasType' objects " + "is not writable" + ) + else: + raise AttributeError( + f"'typing.TypeAliasType' object has no attribute '{name}'" + ) + + def __repr__(self) -> str: + return self.__name__ + + def __getitem__(self, parameters): + if not isinstance(parameters, tuple): + parameters = (parameters,) + parameters = [ + typing._type_check( + item, f'Subscripting {self.__name__} requires a type.' + ) + for item in parameters + ] + return typing._GenericAlias(self, tuple(parameters)) + + def __reduce__(self): + return self.__name__ + + def __init_subclass__(cls, *args, **kwargs): + raise TypeError( + "type 'typing_extensions.TypeAliasType' is not an acceptable base type" + ) + + # The presence of this method convinces typing._type_check + # that TypeAliasTypes are types. + def __call__(self): + raise TypeError("Type alias is not callable") + + if sys.version_info >= (3, 10): + def __or__(self, right): + # For forward compatibility with 3.12, reject Unions + # that are not accepted by the built-in Union. + if not _is_unionable(right): + return NotImplemented + return typing.Union[self, right] + + def __ror__(self, left): + if not _is_unionable(left): + return NotImplemented + return typing.Union[left, self] + + +if hasattr(typing, "is_protocol"): + is_protocol = typing.is_protocol + get_protocol_members = typing.get_protocol_members +else: + def is_protocol(tp: type, /) -> bool: + """Return True if the given type is a Protocol. + + Example:: + + >>> from typing_extensions import Protocol, is_protocol + >>> class P(Protocol): + ... def a(self) -> str: ... + ... b: int + >>> is_protocol(P) + True + >>> is_protocol(int) + False + """ + return ( + isinstance(tp, type) + and getattr(tp, '_is_protocol', False) + and tp is not Protocol + and tp is not typing.Protocol + ) + + def get_protocol_members(tp: type, /) -> typing.FrozenSet[str]: + """Return the set of members defined in a Protocol. + + Example:: + + >>> from typing_extensions import Protocol, get_protocol_members + >>> class P(Protocol): + ... def a(self) -> str: ... + ... b: int + >>> get_protocol_members(P) + frozenset({'a', 'b'}) + + Raise a TypeError for arguments that are not Protocols. + """ + if not is_protocol(tp): + raise TypeError(f'{tp!r} is not a Protocol') + if hasattr(tp, '__protocol_attrs__'): + return frozenset(tp.__protocol_attrs__) + return frozenset(_get_protocol_attrs(tp)) + + +if hasattr(typing, "Doc"): + Doc = typing.Doc +else: + class Doc: + """Define the documentation of a type annotation using ``Annotated``, to be + used in class attributes, function and method parameters, return values, + and variables. + + The value should be a positional-only string literal to allow static tools + like editors and documentation generators to use it. + + This complements docstrings. + + The string value passed is available in the attribute ``documentation``. + + Example:: + + >>> from typing_extensions import Annotated, Doc + >>> def hi(to: Annotated[str, Doc("Who to say hi to")]) -> None: ... + """ + def __init__(self, documentation: str, /) -> None: + self.documentation = documentation + + def __repr__(self) -> str: + return f"Doc({self.documentation!r})" + + def __hash__(self) -> int: + return hash(self.documentation) + + def __eq__(self, other: object) -> bool: + if not isinstance(other, Doc): + return NotImplemented + return self.documentation == other.documentation + + +_CapsuleType = getattr(_types, "CapsuleType", None) + +if _CapsuleType is None: + try: + import _socket + except ImportError: + pass + else: + _CAPI = getattr(_socket, "CAPI", None) + if _CAPI is not None: + _CapsuleType = type(_CAPI) + +if _CapsuleType is not None: + CapsuleType = _CapsuleType + __all__.append("CapsuleType") + + +# Aliases for items that have always been in typing. +# Explicitly assign these (rather than using `from typing import *` at the top), +# so that we get a CI error if one of these is deleted from typing.py +# in a future version of Python +AbstractSet = typing.AbstractSet +AnyStr = typing.AnyStr +BinaryIO = typing.BinaryIO +Callable = typing.Callable +Collection = typing.Collection +Container = typing.Container +Dict = typing.Dict +ForwardRef = typing.ForwardRef +FrozenSet = typing.FrozenSet +Generic = typing.Generic +Hashable = typing.Hashable +IO = typing.IO +ItemsView = typing.ItemsView +Iterable = typing.Iterable +Iterator = typing.Iterator +KeysView = typing.KeysView +List = typing.List +Mapping = typing.Mapping +MappingView = typing.MappingView +Match = typing.Match +MutableMapping = typing.MutableMapping +MutableSequence = typing.MutableSequence +MutableSet = typing.MutableSet +Optional = typing.Optional +Pattern = typing.Pattern +Reversible = typing.Reversible +Sequence = typing.Sequence +Set = typing.Set +Sized = typing.Sized +TextIO = typing.TextIO +Tuple = typing.Tuple +Union = typing.Union +ValuesView = typing.ValuesView +cast = typing.cast +no_type_check = typing.no_type_check +no_type_check_decorator = typing.no_type_check_decorator diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/setuptools/command/__init__.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/setuptools/command/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..50e6c2f54f38db5bf5175a7a4db219815785f1bc --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/setuptools/command/__init__.py @@ -0,0 +1,21 @@ +# mypy: disable_error_code=call-overload +# pyright: reportCallIssue=false, reportArgumentType=false +# Can't disable on the exact line because distutils doesn't exists on Python 3.12 +# and type-checkers aren't aware of distutils_hack, +# causing distutils.command.bdist.bdist.format_commands to be Any. + +import sys + +from distutils.command.bdist import bdist + +if 'egg' not in bdist.format_commands: + try: + # format_commands is a dict in vendored distutils + # It used to be a list in older (stdlib) distutils + # We support both for backwards compatibility + bdist.format_commands['egg'] = ('bdist_egg', "Python .egg file") + except TypeError: + bdist.format_command['egg'] = ('bdist_egg', "Python .egg file") + bdist.format_commands.append('egg') + +del bdist, sys diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/setuptools/command/_requirestxt.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/setuptools/command/_requirestxt.py new file mode 100644 index 0000000000000000000000000000000000000000..9029b125141158d4b12de1db8215819ce6ccf437 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/setuptools/command/_requirestxt.py @@ -0,0 +1,131 @@ +"""Helper code used to generate ``requires.txt`` files in the egg-info directory. + +The ``requires.txt`` file has an specific format: + - Environment markers need to be part of the section headers and + should not be part of the requirement spec itself. + +See https://setuptools.pypa.io/en/latest/deprecated/python_eggs.html#requires-txt +""" + +from __future__ import annotations + +import io +from collections import defaultdict +from collections.abc import Mapping +from itertools import filterfalse +from typing import TypeVar + +from jaraco.text import yield_lines +from packaging.requirements import Requirement + +from .. import _reqs +from .._reqs import _StrOrIter + +# dict can work as an ordered set +_T = TypeVar("_T") +_Ordered = dict[_T, None] + + +def _prepare( + install_requires: _StrOrIter, extras_require: Mapping[str, _StrOrIter] +) -> tuple[list[str], dict[str, list[str]]]: + """Given values for ``install_requires`` and ``extras_require`` + create modified versions in a way that can be written in ``requires.txt`` + """ + extras = _convert_extras_requirements(extras_require) + return _move_install_requirements_markers(install_requires, extras) + + +def _convert_extras_requirements( + extras_require: Mapping[str, _StrOrIter], +) -> defaultdict[str, _Ordered[Requirement]]: + """ + Convert requirements in `extras_require` of the form + `"extra": ["barbazquux; {marker}"]` to + `"extra:{marker}": ["barbazquux"]`. + """ + output = defaultdict[str, _Ordered[Requirement]](dict) + for section, v in extras_require.items(): + # Do not strip empty sections. + output[section] + for r in _reqs.parse(v): + output[section + _suffix_for(r)].setdefault(r) + + return output + + +def _move_install_requirements_markers( + install_requires: _StrOrIter, extras_require: Mapping[str, _Ordered[Requirement]] +) -> tuple[list[str], dict[str, list[str]]]: + """ + The ``requires.txt`` file has an specific format: + - Environment markers need to be part of the section headers and + should not be part of the requirement spec itself. + + Move requirements in ``install_requires`` that are using environment + markers ``extras_require``. + """ + + # divide the install_requires into two sets, simple ones still + # handled by install_requires and more complex ones handled by extras_require. + + inst_reqs = list(_reqs.parse(install_requires)) + simple_reqs = filter(_no_marker, inst_reqs) + complex_reqs = filterfalse(_no_marker, inst_reqs) + simple_install_requires = list(map(str, simple_reqs)) + + for r in complex_reqs: + extras_require[':' + str(r.marker)].setdefault(r) + + expanded_extras = dict( + # list(dict.fromkeys(...)) ensures a list of unique strings + (k, list(dict.fromkeys(str(r) for r in map(_clean_req, v)))) + for k, v in extras_require.items() + ) + + return simple_install_requires, expanded_extras + + +def _suffix_for(req): + """Return the 'extras_require' suffix for a given requirement.""" + return ':' + str(req.marker) if req.marker else '' + + +def _clean_req(req): + """Given a Requirement, remove environment markers and return it""" + r = Requirement(str(req)) # create a copy before modifying + r.marker = None + return r + + +def _no_marker(req): + return not req.marker + + +def _write_requirements(stream, reqs): + lines = yield_lines(reqs or ()) + + def append_cr(line): + return line + '\n' + + lines = map(append_cr, lines) + stream.writelines(lines) + + +def write_requirements(cmd, basename, filename): + dist = cmd.distribution + data = io.StringIO() + install_requires, extras_require = _prepare( + dist.install_requires or (), dist.extras_require or {} + ) + _write_requirements(data, install_requires) + for extra in sorted(extras_require): + data.write('\n[{extra}]\n'.format(**vars())) + _write_requirements(data, extras_require[extra]) + cmd.write_or_delete_file("requirements", filename, data.getvalue()) + + +def write_setup_requirements(cmd, basename, filename): + data = io.StringIO() + _write_requirements(data, cmd.distribution.setup_requires) + cmd.write_or_delete_file("setup-requirements", filename, data.getvalue()) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/setuptools/command/alias.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/setuptools/command/alias.py new file mode 100644 index 0000000000000000000000000000000000000000..b8d74af71d4464abf899b9f872b5741f23167165 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/setuptools/command/alias.py @@ -0,0 +1,77 @@ +from setuptools.command.setopt import config_file, edit_config, option_base + +from distutils.errors import DistutilsOptionError + + +def shquote(arg): + """Quote an argument for later parsing by shlex.split()""" + for c in '"', "'", "\\", "#": + if c in arg: + return repr(arg) + if arg.split() != [arg]: + return repr(arg) + return arg + + +class alias(option_base): + """Define a shortcut that invokes one or more commands""" + + description = "define a shortcut to invoke one or more commands" + command_consumes_arguments = True + + user_options = [ + ('remove', 'r', 'remove (unset) the alias'), + ] + option_base.user_options + + boolean_options = option_base.boolean_options + ['remove'] + + def initialize_options(self): + option_base.initialize_options(self) + self.args = None + self.remove = None + + def finalize_options(self) -> None: + option_base.finalize_options(self) + if self.remove and len(self.args) != 1: + raise DistutilsOptionError( + "Must specify exactly one argument (the alias name) when using --remove" + ) + + def run(self) -> None: + aliases = self.distribution.get_option_dict('aliases') + + if not self.args: + print("Command Aliases") + print("---------------") + for alias in aliases: + print("setup.py alias", format_alias(alias, aliases)) + return + + elif len(self.args) == 1: + (alias,) = self.args + if self.remove: + command = None + elif alias in aliases: + print("setup.py alias", format_alias(alias, aliases)) + return + else: + print(f"No alias definition found for {alias!r}") + return + else: + alias = self.args[0] + command = ' '.join(map(shquote, self.args[1:])) + + edit_config(self.filename, {'aliases': {alias: command}}, self.dry_run) + + +def format_alias(name, aliases): + source, command = aliases[name] + if source == config_file('global'): + source = '--global-config ' + elif source == config_file('user'): + source = '--user-config ' + elif source == config_file('local'): + source = '' + else: + source = f'--filename={source!r}' + return source + name + ' ' + command diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/setuptools/command/bdist_egg.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/setuptools/command/bdist_egg.py new file mode 100644 index 0000000000000000000000000000000000000000..7f66c3ba6a7f554e910800a21c5ccc3733abf07d --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/setuptools/command/bdist_egg.py @@ -0,0 +1,479 @@ +"""setuptools.command.bdist_egg + +Build .egg distributions""" + +from __future__ import annotations + +import marshal +import os +import re +import sys +import textwrap +from sysconfig import get_path, get_python_version +from types import CodeType +from typing import TYPE_CHECKING, Literal + +from setuptools import Command +from setuptools.extension import Library + +from .._path import StrPathT, ensure_directory + +from distutils import log +from distutils.dir_util import mkpath, remove_tree + +if TYPE_CHECKING: + from typing_extensions import TypeAlias + +# Same as zipfile._ZipFileMode from typeshed +_ZipFileMode: TypeAlias = Literal["r", "w", "x", "a"] + + +def _get_purelib(): + return get_path("purelib") + + +def strip_module(filename): + if '.' in filename: + filename = os.path.splitext(filename)[0] + if filename.endswith('module'): + filename = filename[:-6] + return filename + + +def sorted_walk(dir): + """Do os.walk in a reproducible way, + independent of indeterministic filesystem readdir order + """ + for base, dirs, files in os.walk(dir): + dirs.sort() + files.sort() + yield base, dirs, files + + +def write_stub(resource, pyfile) -> None: + _stub_template = textwrap.dedent( + """ + def __bootstrap__(): + global __bootstrap__, __loader__, __file__ + import sys, pkg_resources, importlib.util + __file__ = pkg_resources.resource_filename(__name__, %r) + __loader__ = None; del __bootstrap__, __loader__ + spec = importlib.util.spec_from_file_location(__name__,__file__) + mod = importlib.util.module_from_spec(spec) + spec.loader.exec_module(mod) + __bootstrap__() + """ + ).lstrip() + with open(pyfile, 'w', encoding="utf-8") as f: + f.write(_stub_template % resource) + + +class bdist_egg(Command): + description = 'create an "egg" distribution' + + user_options = [ + ('bdist-dir=', 'b', "temporary directory for creating the distribution"), + ( + 'plat-name=', + 'p', + "platform name to embed in generated filenames " + "(by default uses `pkg_resources.get_build_platform()`)", + ), + ('exclude-source-files', None, "remove all .py files from the generated egg"), + ( + 'keep-temp', + 'k', + "keep the pseudo-installation tree around after " + "creating the distribution archive", + ), + ('dist-dir=', 'd', "directory to put final built distributions in"), + ('skip-build', None, "skip rebuilding everything (for testing/debugging)"), + ] + + boolean_options = ['keep-temp', 'skip-build', 'exclude-source-files'] + + def initialize_options(self): + self.bdist_dir = None + self.plat_name = None + self.keep_temp = False + self.dist_dir = None + self.skip_build = False + self.egg_output = None + self.exclude_source_files = None + + def finalize_options(self) -> None: + ei_cmd = self.ei_cmd = self.get_finalized_command("egg_info") + self.egg_info = ei_cmd.egg_info + + if self.bdist_dir is None: + bdist_base = self.get_finalized_command('bdist').bdist_base + self.bdist_dir = os.path.join(bdist_base, 'egg') + + if self.plat_name is None: + from pkg_resources import get_build_platform + + self.plat_name = get_build_platform() + + self.set_undefined_options('bdist', ('dist_dir', 'dist_dir')) + + if self.egg_output is None: + # Compute filename of the output egg + basename = ei_cmd._get_egg_basename( + py_version=get_python_version(), + platform=self.distribution.has_ext_modules() and self.plat_name, + ) + + self.egg_output = os.path.join(self.dist_dir, basename + '.egg') + + def do_install_data(self) -> None: + # Hack for packages that install data to install's --install-lib + self.get_finalized_command('install').install_lib = self.bdist_dir + + site_packages = os.path.normcase(os.path.realpath(_get_purelib())) + old, self.distribution.data_files = self.distribution.data_files, [] + + for item in old: + if isinstance(item, tuple) and len(item) == 2: + if os.path.isabs(item[0]): + realpath = os.path.realpath(item[0]) + normalized = os.path.normcase(realpath) + if normalized == site_packages or normalized.startswith( + site_packages + os.sep + ): + item = realpath[len(site_packages) + 1 :], item[1] + # XXX else: raise ??? + self.distribution.data_files.append(item) + + try: + log.info("installing package data to %s", self.bdist_dir) + self.call_command('install_data', force=False, root=None) + finally: + self.distribution.data_files = old + + def get_outputs(self): + return [self.egg_output] + + def call_command(self, cmdname, **kw): + """Invoke reinitialized command `cmdname` with keyword args""" + for dirname in INSTALL_DIRECTORY_ATTRS: + kw.setdefault(dirname, self.bdist_dir) + kw.setdefault('skip_build', self.skip_build) + kw.setdefault('dry_run', self.dry_run) + cmd = self.reinitialize_command(cmdname, **kw) + self.run_command(cmdname) + return cmd + + def run(self): # noqa: C901 # is too complex (14) # FIXME + # Generate metadata first + self.run_command("egg_info") + # We run install_lib before install_data, because some data hacks + # pull their data path from the install_lib command. + log.info("installing library code to %s", self.bdist_dir) + instcmd = self.get_finalized_command('install') + old_root = instcmd.root + instcmd.root = None + if self.distribution.has_c_libraries() and not self.skip_build: + self.run_command('build_clib') + cmd = self.call_command('install_lib', warn_dir=False) + instcmd.root = old_root + + all_outputs, ext_outputs = self.get_ext_outputs() + self.stubs = [] + to_compile = [] + for p, ext_name in enumerate(ext_outputs): + filename, _ext = os.path.splitext(ext_name) + pyfile = os.path.join(self.bdist_dir, strip_module(filename) + '.py') + self.stubs.append(pyfile) + log.info("creating stub loader for %s", ext_name) + if not self.dry_run: + write_stub(os.path.basename(ext_name), pyfile) + to_compile.append(pyfile) + ext_outputs[p] = ext_name.replace(os.sep, '/') + + if to_compile: + cmd.byte_compile(to_compile) + if self.distribution.data_files: + self.do_install_data() + + # Make the EGG-INFO directory + archive_root = self.bdist_dir + egg_info = os.path.join(archive_root, 'EGG-INFO') + self.mkpath(egg_info) + if self.distribution.scripts: + script_dir = os.path.join(egg_info, 'scripts') + log.info("installing scripts to %s", script_dir) + self.call_command('install_scripts', install_dir=script_dir, no_ep=True) + + self.copy_metadata_to(egg_info) + native_libs = os.path.join(egg_info, "native_libs.txt") + if all_outputs: + log.info("writing %s", native_libs) + if not self.dry_run: + ensure_directory(native_libs) + with open(native_libs, 'wt', encoding="utf-8") as libs_file: + libs_file.write('\n'.join(all_outputs)) + libs_file.write('\n') + elif os.path.isfile(native_libs): + log.info("removing %s", native_libs) + if not self.dry_run: + os.unlink(native_libs) + + write_safety_flag(os.path.join(archive_root, 'EGG-INFO'), self.zip_safe()) + + if os.path.exists(os.path.join(self.egg_info, 'depends.txt')): + log.warn( + "WARNING: 'depends.txt' will not be used by setuptools 0.6!\n" + "Use the install_requires/extras_require setup() args instead." + ) + + if self.exclude_source_files: + self.zap_pyfiles() + + # Make the archive + make_zipfile( + self.egg_output, + archive_root, + verbose=self.verbose, + dry_run=self.dry_run, + mode=self.gen_header(), + ) + if not self.keep_temp: + remove_tree(self.bdist_dir, dry_run=self.dry_run) + + # Add to 'Distribution.dist_files' so that the "upload" command works + getattr(self.distribution, 'dist_files', []).append(( + 'bdist_egg', + get_python_version(), + self.egg_output, + )) + + def zap_pyfiles(self): + log.info("Removing .py files from temporary directory") + for base, dirs, files in walk_egg(self.bdist_dir): + for name in files: + path = os.path.join(base, name) + + if name.endswith('.py'): + log.debug("Deleting %s", path) + os.unlink(path) + + if base.endswith('__pycache__'): + path_old = path + + pattern = r'(?P.+)\.(?P[^.]+)\.pyc' + m = re.match(pattern, name) + path_new = os.path.join(base, os.pardir, m.group('name') + '.pyc') + log.info(f"Renaming file from [{path_old}] to [{path_new}]") + try: + os.remove(path_new) + except OSError: + pass + os.rename(path_old, path_new) + + def zip_safe(self): + safe = getattr(self.distribution, 'zip_safe', None) + if safe is not None: + return safe + log.warn("zip_safe flag not set; analyzing archive contents...") + return analyze_egg(self.bdist_dir, self.stubs) + + def gen_header(self) -> Literal["w"]: + return 'w' + + def copy_metadata_to(self, target_dir) -> None: + "Copy metadata (egg info) to the target_dir" + # normalize the path (so that a forward-slash in egg_info will + # match using startswith below) + norm_egg_info = os.path.normpath(self.egg_info) + prefix = os.path.join(norm_egg_info, '') + for path in self.ei_cmd.filelist.files: + if path.startswith(prefix): + target = os.path.join(target_dir, path[len(prefix) :]) + ensure_directory(target) + self.copy_file(path, target) + + def get_ext_outputs(self): + """Get a list of relative paths to C extensions in the output distro""" + + all_outputs = [] + ext_outputs = [] + + paths = {self.bdist_dir: ''} + for base, dirs, files in sorted_walk(self.bdist_dir): + all_outputs.extend( + paths[base] + filename + for filename in files + if os.path.splitext(filename)[1].lower() in NATIVE_EXTENSIONS + ) + for filename in dirs: + paths[os.path.join(base, filename)] = paths[base] + filename + '/' + + if self.distribution.has_ext_modules(): + build_cmd = self.get_finalized_command('build_ext') + for ext in build_cmd.extensions: + if isinstance(ext, Library): + continue + fullname = build_cmd.get_ext_fullname(ext.name) + filename = build_cmd.get_ext_filename(fullname) + if not os.path.basename(filename).startswith('dl-'): + if os.path.exists(os.path.join(self.bdist_dir, filename)): + ext_outputs.append(filename) + + return all_outputs, ext_outputs + + +NATIVE_EXTENSIONS: dict[str, None] = dict.fromkeys('.dll .so .dylib .pyd'.split()) + + +def walk_egg(egg_dir): + """Walk an unpacked egg's contents, skipping the metadata directory""" + walker = sorted_walk(egg_dir) + base, dirs, files = next(walker) + if 'EGG-INFO' in dirs: + dirs.remove('EGG-INFO') + yield base, dirs, files + yield from walker + + +def analyze_egg(egg_dir, stubs): + # check for existing flag in EGG-INFO + for flag, fn in safety_flags.items(): + if os.path.exists(os.path.join(egg_dir, 'EGG-INFO', fn)): + return flag + if not can_scan(): + return False + safe = True + for base, dirs, files in walk_egg(egg_dir): + for name in files: + if name.endswith('.py') or name.endswith('.pyw'): + continue + elif name.endswith('.pyc') or name.endswith('.pyo'): + # always scan, even if we already know we're not safe + safe = scan_module(egg_dir, base, name, stubs) and safe + return safe + + +def write_safety_flag(egg_dir, safe) -> None: + # Write or remove zip safety flag file(s) + for flag, fn in safety_flags.items(): + fn = os.path.join(egg_dir, fn) + if os.path.exists(fn): + if safe is None or bool(safe) != flag: + os.unlink(fn) + elif safe is not None and bool(safe) == flag: + with open(fn, 'wt', encoding="utf-8") as f: + f.write('\n') + + +safety_flags = { + True: 'zip-safe', + False: 'not-zip-safe', +} + + +def scan_module(egg_dir, base, name, stubs): + """Check whether module possibly uses unsafe-for-zipfile stuff""" + + filename = os.path.join(base, name) + if filename[:-1] in stubs: + return True # Extension module + pkg = base[len(egg_dir) + 1 :].replace(os.sep, '.') + module = pkg + (pkg and '.' or '') + os.path.splitext(name)[0] + skip = 16 # skip magic & reserved? & date & file size + f = open(filename, 'rb') + f.read(skip) + code = marshal.load(f) + f.close() + safe = True + symbols = dict.fromkeys(iter_symbols(code)) + for bad in ['__file__', '__path__']: + if bad in symbols: + log.warn("%s: module references %s", module, bad) + safe = False + if 'inspect' in symbols: + for bad in [ + 'getsource', + 'getabsfile', + 'getfile', + 'getsourcefile', + 'getsourcelines', + 'findsource', + 'getcomments', + 'getframeinfo', + 'getinnerframes', + 'getouterframes', + 'stack', + 'trace', + ]: + if bad in symbols: + log.warn("%s: module MAY be using inspect.%s", module, bad) + safe = False + return safe + + +def iter_symbols(code): + """Yield names and strings used by `code` and its nested code objects""" + yield from code.co_names + for const in code.co_consts: + if isinstance(const, str): + yield const + elif isinstance(const, CodeType): + yield from iter_symbols(const) + + +def can_scan() -> bool: + if not sys.platform.startswith('java') and sys.platform != 'cli': + # CPython, PyPy, etc. + return True + log.warn("Unable to analyze compiled code on this platform.") + log.warn( + "Please ask the author to include a 'zip_safe'" + " setting (either True or False) in the package's setup.py" + ) + return False + + +# Attribute names of options for commands that might need to be convinced to +# install to the egg build directory + +INSTALL_DIRECTORY_ATTRS = ['install_lib', 'install_dir', 'install_data', 'install_base'] + + +def make_zipfile( + zip_filename: StrPathT, + base_dir, + verbose: bool = False, + dry_run: bool = False, + compress=True, + mode: _ZipFileMode = 'w', +) -> StrPathT: + """Create a zip file from all the files under 'base_dir'. The output + zip file will be named 'base_dir' + ".zip". Uses either the "zipfile" + Python module (if available) or the InfoZIP "zip" utility (if installed + and found on the default search path). If neither tool is available, + raises DistutilsExecError. Returns the name of the output zip file. + """ + import zipfile + + mkpath(os.path.dirname(zip_filename), dry_run=dry_run) # type: ignore[arg-type] # python/mypy#18075 + log.info("creating '%s' and adding '%s' to it", zip_filename, base_dir) + + def visit(z, dirname, names): + for name in names: + path = os.path.normpath(os.path.join(dirname, name)) + if os.path.isfile(path): + p = path[len(base_dir) + 1 :] + if not dry_run: + z.write(path, p) + log.debug("adding '%s'", p) + + compression = zipfile.ZIP_DEFLATED if compress else zipfile.ZIP_STORED + if not dry_run: + z = zipfile.ZipFile(zip_filename, mode, compression=compression) + for dirname, dirs, files in sorted_walk(base_dir): + visit(z, dirname, files) + z.close() + else: + for dirname, dirs, files in sorted_walk(base_dir): + visit(None, dirname, files) + return zip_filename diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/setuptools/command/bdist_rpm.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/setuptools/command/bdist_rpm.py new file mode 100644 index 0000000000000000000000000000000000000000..6dbb27002a8cc7f1cafd4f5cbb75ba39aa7430e2 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/setuptools/command/bdist_rpm.py @@ -0,0 +1,42 @@ +from ..dist import Distribution +from ..warnings import SetuptoolsDeprecationWarning + +import distutils.command.bdist_rpm as orig + + +class bdist_rpm(orig.bdist_rpm): + """ + Override the default bdist_rpm behavior to do the following: + + 1. Run egg_info to ensure the name and version are properly calculated. + 2. Always run 'install' using --single-version-externally-managed to + disable eggs in RPM distributions. + """ + + distribution: Distribution # override distutils.dist.Distribution with setuptools.dist.Distribution + + def run(self) -> None: + SetuptoolsDeprecationWarning.emit( + "Deprecated command", + """ + bdist_rpm is deprecated and will be removed in a future version. + Use bdist_wheel (wheel packages) instead. + """, + see_url="https://github.com/pypa/setuptools/issues/1988", + due_date=(2023, 10, 30), # Deprecation introduced in 22 Oct 2021. + ) + + # ensure distro name is up-to-date + self.run_command('egg_info') + + orig.bdist_rpm.run(self) + + def _make_spec_file(self): + spec = orig.bdist_rpm._make_spec_file(self) + return [ + line.replace( + "setup.py install ", + "setup.py install --single-version-externally-managed ", + ).replace("%setup", "%setup -n %{name}-%{unmangled_version}") + for line in spec + ] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/setuptools/command/bdist_wheel.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/setuptools/command/bdist_wheel.py new file mode 100644 index 0000000000000000000000000000000000000000..27b36232ec156c409e932f2c4a0700ce84bff75c --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/setuptools/command/bdist_wheel.py @@ -0,0 +1,600 @@ +""" +Create a wheel (.whl) distribution. + +A wheel is a built archive format. +""" + +from __future__ import annotations + +import os +import re +import shutil +import struct +import sys +import sysconfig +import warnings +from collections.abc import Iterable, Sequence +from email.generator import BytesGenerator +from glob import iglob +from typing import Literal, cast +from zipfile import ZIP_DEFLATED, ZIP_STORED + +from packaging import tags, version as _packaging_version +from wheel.wheelfile import WheelFile + +from .. import Command, __version__, _shutil +from .._normalization import safer_name +from ..warnings import SetuptoolsDeprecationWarning +from .egg_info import egg_info as egg_info_cls + +from distutils import log + + +def safe_version(version: str) -> str: + """ + Convert an arbitrary string to a standard version string + """ + try: + # normalize the version + return str(_packaging_version.Version(version)) + except _packaging_version.InvalidVersion: + version = version.replace(" ", ".") + return re.sub("[^A-Za-z0-9.]+", "-", version) + + +setuptools_major_version = int(__version__.split(".")[0]) + +PY_LIMITED_API_PATTERN = r"cp3\d" + + +def _is_32bit_interpreter() -> bool: + return struct.calcsize("P") == 4 + + +def python_tag() -> str: + return f"py{sys.version_info.major}" + + +def get_platform(archive_root: str | None) -> str: + """Return our platform name 'win32', 'linux_x86_64'""" + result = sysconfig.get_platform() + if result.startswith("macosx") and archive_root is not None: # pragma: no cover + from wheel.macosx_libfile import calculate_macosx_platform_tag + + result = calculate_macosx_platform_tag(archive_root, result) + elif _is_32bit_interpreter(): + if result == "linux-x86_64": + # pip pull request #3497 + result = "linux-i686" + elif result == "linux-aarch64": + # packaging pull request #234 + # TODO armv8l, packaging pull request #690 => this did not land + # in pip/packaging yet + result = "linux-armv7l" + + return result.replace("-", "_") + + +def get_flag( + var: str, fallback: bool, expected: bool = True, warn: bool = True +) -> bool: + """Use a fallback value for determining SOABI flags if the needed config + var is unset or unavailable.""" + val = sysconfig.get_config_var(var) + if val is None: + if warn: + warnings.warn( + f"Config variable '{var}' is unset, Python ABI tag may be incorrect", + RuntimeWarning, + stacklevel=2, + ) + return fallback + return val == expected + + +def get_abi_tag() -> str | None: + """Return the ABI tag based on SOABI (if available) or emulate SOABI (PyPy2).""" + soabi: str = sysconfig.get_config_var("SOABI") + impl = tags.interpreter_name() + if not soabi and impl in ("cp", "pp") and hasattr(sys, "maxunicode"): + d = "" + u = "" + if get_flag("Py_DEBUG", hasattr(sys, "gettotalrefcount"), warn=(impl == "cp")): + d = "d" + + abi = f"{impl}{tags.interpreter_version()}{d}{u}" + elif soabi and impl == "cp" and soabi.startswith("cpython"): + # non-Windows + abi = "cp" + soabi.split("-")[1] + elif soabi and impl == "cp" and soabi.startswith("cp"): + # Windows + abi = soabi.split("-")[0] + if hasattr(sys, "gettotalrefcount"): + # using debug build; append "d" flag + abi += "d" + elif soabi and impl == "pp": + # we want something like pypy36-pp73 + abi = "-".join(soabi.split("-")[:2]) + abi = abi.replace(".", "_").replace("-", "_") + elif soabi and impl == "graalpy": + abi = "-".join(soabi.split("-")[:3]) + abi = abi.replace(".", "_").replace("-", "_") + elif soabi: + abi = soabi.replace(".", "_").replace("-", "_") + else: + abi = None + + return abi + + +def safer_version(version: str) -> str: + return safe_version(version).replace("-", "_") + + +class bdist_wheel(Command): + description = "create a wheel distribution" + + supported_compressions = { + "stored": ZIP_STORED, + "deflated": ZIP_DEFLATED, + } + + user_options = [ + ("bdist-dir=", "b", "temporary directory for creating the distribution"), + ( + "plat-name=", + "p", + "platform name to embed in generated filenames " + f"[default: {get_platform(None)}]", + ), + ( + "keep-temp", + "k", + "keep the pseudo-installation tree around after " + "creating the distribution archive", + ), + ("dist-dir=", "d", "directory to put final built distributions in"), + ("skip-build", None, "skip rebuilding everything (for testing/debugging)"), + ( + "relative", + None, + "build the archive using relative paths [default: false]", + ), + ( + "owner=", + "u", + "Owner name used when creating a tar file [default: current user]", + ), + ( + "group=", + "g", + "Group name used when creating a tar file [default: current group]", + ), + ("universal", None, "*DEPRECATED* make a universal wheel [default: false]"), + ( + "compression=", + None, + f"zipfile compression (one of: {', '.join(supported_compressions)}) [default: 'deflated']", + ), + ( + "python-tag=", + None, + f"Python implementation compatibility tag [default: '{python_tag()}']", + ), + ( + "build-number=", + None, + "Build number for this particular version. " + "As specified in PEP-0427, this must start with a digit. " + "[default: None]", + ), + ( + "py-limited-api=", + None, + "Python tag (cp32|cp33|cpNN) for abi3 wheel tag [default: false]", + ), + ( + "dist-info-dir=", + None, + "directory where a pre-generated dist-info can be found (e.g. as a " + "result of calling the PEP517 'prepare_metadata_for_build_wheel' " + "method)", + ), + ] + + boolean_options = ["keep-temp", "skip-build", "relative", "universal"] + + def initialize_options(self) -> None: + self.bdist_dir: str | None = None + self.data_dir = "" + self.plat_name: str | None = None + self.plat_tag: str | None = None + self.format = "zip" + self.keep_temp = False + self.dist_dir: str | None = None + self.dist_info_dir = None + self.egginfo_dir: str | None = None + self.root_is_pure: bool | None = None + self.skip_build = False + self.relative = False + self.owner = None + self.group = None + self.universal = False + self.compression: str | int = "deflated" + self.python_tag = python_tag() + self.build_number: str | None = None + self.py_limited_api: str | Literal[False] = False + self.plat_name_supplied = False + + def finalize_options(self) -> None: + if not self.bdist_dir: + bdist_base = self.get_finalized_command("bdist").bdist_base + self.bdist_dir = os.path.join(bdist_base, "wheel") + + if self.dist_info_dir is None: + egg_info = cast(egg_info_cls, self.distribution.get_command_obj("egg_info")) + egg_info.ensure_finalized() # needed for correct `wheel_dist_name` + + self.data_dir = self.wheel_dist_name + ".data" + self.plat_name_supplied = bool(self.plat_name) + + need_options = ("dist_dir", "plat_name", "skip_build") + + self.set_undefined_options("bdist", *zip(need_options, need_options)) + + self.root_is_pure = not ( + self.distribution.has_ext_modules() or self.distribution.has_c_libraries() + ) + + self._validate_py_limited_api() + + # Support legacy [wheel] section for setting universal + wheel = self.distribution.get_option_dict("wheel") + if "universal" in wheel: # pragma: no cover + # please don't define this in your global configs + log.warn("The [wheel] section is deprecated. Use [bdist_wheel] instead.") + val = wheel["universal"][1].strip() + if val.lower() in ("1", "true", "yes"): + self.universal = True + + if self.universal: + SetuptoolsDeprecationWarning.emit( + "bdist_wheel.universal is deprecated", + """ + With Python 2.7 end-of-life, support for building universal wheels + (i.e., wheels that support both Python 2 and Python 3) + is being obviated. + Please discontinue using this option, or if you still need it, + file an issue with pypa/setuptools describing your use case. + """, + due_date=(2025, 8, 30), # Introduced in 2024-08-30 + ) + + if self.build_number is not None and not self.build_number[:1].isdigit(): + raise ValueError("Build tag (build-number) must start with a digit.") + + def _validate_py_limited_api(self) -> None: + if not self.py_limited_api: + return + + if not re.match(PY_LIMITED_API_PATTERN, self.py_limited_api): + raise ValueError(f"py-limited-api must match '{PY_LIMITED_API_PATTERN}'") + + if sysconfig.get_config_var("Py_GIL_DISABLED"): + raise ValueError( + f"`py_limited_api={self.py_limited_api!r}` not supported. " + "`Py_LIMITED_API` is currently incompatible with " + "`Py_GIL_DISABLED`." + "See https://github.com/python/cpython/issues/111506." + ) + + @property + def wheel_dist_name(self) -> str: + """Return distribution full name with - replaced with _""" + components = [ + safer_name(self.distribution.get_name()), + safer_version(self.distribution.get_version()), + ] + if self.build_number: + components.append(self.build_number) + return "-".join(components) + + def get_tag(self) -> tuple[str, str, str]: + # bdist sets self.plat_name if unset, we should only use it for purepy + # wheels if the user supplied it. + if self.plat_name_supplied and self.plat_name: + plat_name = self.plat_name + elif self.root_is_pure: + plat_name = "any" + else: + # macosx contains system version in platform name so need special handle + if self.plat_name and not self.plat_name.startswith("macosx"): + plat_name = self.plat_name + else: + # on macosx always limit the platform name to comply with any + # c-extension modules in bdist_dir, since the user can specify + # a higher MACOSX_DEPLOYMENT_TARGET via tools like CMake + + # on other platforms, and on macosx if there are no c-extension + # modules, use the default platform name. + plat_name = get_platform(self.bdist_dir) + + if _is_32bit_interpreter(): + if plat_name in ("linux-x86_64", "linux_x86_64"): + plat_name = "linux_i686" + if plat_name in ("linux-aarch64", "linux_aarch64"): + # TODO armv8l, packaging pull request #690 => this did not land + # in pip/packaging yet + plat_name = "linux_armv7l" + + plat_name = ( + plat_name.lower().replace("-", "_").replace(".", "_").replace(" ", "_") + ) + + if self.root_is_pure: + if self.universal: + impl = "py2.py3" + else: + impl = self.python_tag + tag = (impl, "none", plat_name) + else: + impl_name = tags.interpreter_name() + impl_ver = tags.interpreter_version() + impl = impl_name + impl_ver + # We don't work on CPython 3.1, 3.0. + if self.py_limited_api and (impl_name + impl_ver).startswith("cp3"): + impl = self.py_limited_api + abi_tag = "abi3" + else: + abi_tag = str(get_abi_tag()).lower() + tag = (impl, abi_tag, plat_name) + # issue gh-374: allow overriding plat_name + supported_tags = [ + (t.interpreter, t.abi, plat_name) for t in tags.sys_tags() + ] + assert tag in supported_tags, ( + f"would build wheel with unsupported tag {tag}" + ) + return tag + + def run(self): + build_scripts = self.reinitialize_command("build_scripts") + build_scripts.executable = "python" + build_scripts.force = True + + build_ext = self.reinitialize_command("build_ext") + build_ext.inplace = False + + if not self.skip_build: + self.run_command("build") + + install = self.reinitialize_command("install", reinit_subcommands=True) + install.root = self.bdist_dir + install.compile = False + install.skip_build = self.skip_build + install.warn_dir = False + + # A wheel without setuptools scripts is more cross-platform. + # Use the (undocumented) `no_ep` option to setuptools' + # install_scripts command to avoid creating entry point scripts. + install_scripts = self.reinitialize_command("install_scripts") + install_scripts.no_ep = True + + # Use a custom scheme for the archive, because we have to decide + # at installation time which scheme to use. + for key in ("headers", "scripts", "data", "purelib", "platlib"): + setattr(install, "install_" + key, os.path.join(self.data_dir, key)) + + basedir_observed = "" + + if os.name == "nt": + # win32 barfs if any of these are ''; could be '.'? + # (distutils.command.install:change_roots bug) + basedir_observed = os.path.normpath(os.path.join(self.data_dir, "..")) + self.install_libbase = self.install_lib = basedir_observed + + setattr( + install, + "install_purelib" if self.root_is_pure else "install_platlib", + basedir_observed, + ) + + log.info(f"installing to {self.bdist_dir}") + + self.run_command("install") + + impl_tag, abi_tag, plat_tag = self.get_tag() + archive_basename = f"{self.wheel_dist_name}-{impl_tag}-{abi_tag}-{plat_tag}" + if not self.relative: + archive_root = self.bdist_dir + else: + archive_root = os.path.join( + self.bdist_dir, self._ensure_relative(install.install_base) + ) + + self.set_undefined_options("install_egg_info", ("target", "egginfo_dir")) + distinfo_dirname = ( + f"{safer_name(self.distribution.get_name())}-" + f"{safer_version(self.distribution.get_version())}.dist-info" + ) + distinfo_dir = os.path.join(self.bdist_dir, distinfo_dirname) + if self.dist_info_dir: + # Use the given dist-info directly. + log.debug(f"reusing {self.dist_info_dir}") + shutil.copytree(self.dist_info_dir, distinfo_dir) + # Egg info is still generated, so remove it now to avoid it getting + # copied into the wheel. + _shutil.rmtree(self.egginfo_dir) + else: + # Convert the generated egg-info into dist-info. + self.egg2dist(self.egginfo_dir, distinfo_dir) + + self.write_wheelfile(distinfo_dir) + + # Make the archive + if not os.path.exists(self.dist_dir): + os.makedirs(self.dist_dir) + + wheel_path = os.path.join(self.dist_dir, archive_basename + ".whl") + with WheelFile(wheel_path, "w", self._zip_compression()) as wf: + wf.write_files(archive_root) + + # Add to 'Distribution.dist_files' so that the "upload" command works + getattr(self.distribution, "dist_files", []).append(( + "bdist_wheel", + f"{sys.version_info.major}.{sys.version_info.minor}", + wheel_path, + )) + + if not self.keep_temp: + log.info(f"removing {self.bdist_dir}") + if not self.dry_run: + _shutil.rmtree(self.bdist_dir) + + def write_wheelfile( + self, wheelfile_base: str, generator: str = f"setuptools ({__version__})" + ) -> None: + from email.message import Message + + msg = Message() + msg["Wheel-Version"] = "1.0" # of the spec + msg["Generator"] = generator + msg["Root-Is-Purelib"] = str(self.root_is_pure).lower() + if self.build_number is not None: + msg["Build"] = self.build_number + + # Doesn't work for bdist_wininst + impl_tag, abi_tag, plat_tag = self.get_tag() + for impl in impl_tag.split("."): + for abi in abi_tag.split("."): + for plat in plat_tag.split("."): + msg["Tag"] = "-".join((impl, abi, plat)) + + wheelfile_path = os.path.join(wheelfile_base, "WHEEL") + log.info(f"creating {wheelfile_path}") + with open(wheelfile_path, "wb") as f: + BytesGenerator(f, maxheaderlen=0).flatten(msg) + + def _ensure_relative(self, path: str) -> str: + # copied from dir_util, deleted + drive, path = os.path.splitdrive(path) + if path[0:1] == os.sep: + path = drive + path[1:] + return path + + @property + def license_paths(self) -> Iterable[str]: + if setuptools_major_version >= 57: + # Setuptools has resolved any patterns to actual file names + return self.distribution.metadata.license_files or () + + files = set[str]() + metadata = self.distribution.get_option_dict("metadata") + if setuptools_major_version >= 42: + # Setuptools recognizes the license_files option but does not do globbing + patterns = cast(Sequence[str], self.distribution.metadata.license_files) + else: + # Prior to those, wheel is entirely responsible for handling license files + if "license_files" in metadata: + patterns = metadata["license_files"][1].split() + else: + patterns = () + + if "license_file" in metadata: + warnings.warn( + 'The "license_file" option is deprecated. Use "license_files" instead.', + DeprecationWarning, + stacklevel=2, + ) + files.add(metadata["license_file"][1]) + + if not files and not patterns and not isinstance(patterns, list): + patterns = ("LICEN[CS]E*", "COPYING*", "NOTICE*", "AUTHORS*") + + for pattern in patterns: + for path in iglob(pattern): + if path.endswith("~"): + log.debug( + f'ignoring license file "{path}" as it looks like a backup' + ) + continue + + if path not in files and os.path.isfile(path): + log.info( + f'adding license file "{path}" (matched pattern "{pattern}")' + ) + files.add(path) + + return files + + def egg2dist(self, egginfo_path: str, distinfo_path: str) -> None: + """Convert an .egg-info directory into a .dist-info directory""" + + def adios(p: str) -> None: + """Appropriately delete directory, file or link.""" + if os.path.exists(p) and not os.path.islink(p) and os.path.isdir(p): + _shutil.rmtree(p) + elif os.path.exists(p): + os.unlink(p) + + adios(distinfo_path) + + if not os.path.exists(egginfo_path): + # There is no egg-info. This is probably because the egg-info + # file/directory is not named matching the distribution name used + # to name the archive file. Check for this case and report + # accordingly. + import glob + + pat = os.path.join(os.path.dirname(egginfo_path), "*.egg-info") + possible = glob.glob(pat) + err = f"Egg metadata expected at {egginfo_path} but not found" + if possible: + alt = os.path.basename(possible[0]) + err += f" ({alt} found - possible misnamed archive file?)" + + raise ValueError(err) + + # .egg-info is a directory + pkginfo_path = os.path.join(egginfo_path, "PKG-INFO") + + # ignore common egg metadata that is useless to wheel + shutil.copytree( + egginfo_path, + distinfo_path, + ignore=lambda x, y: { + "PKG-INFO", + "requires.txt", + "SOURCES.txt", + "not-zip-safe", + }, + ) + + # delete dependency_links if it is only whitespace + dependency_links_path = os.path.join(distinfo_path, "dependency_links.txt") + with open(dependency_links_path, encoding="utf-8") as dependency_links_file: + dependency_links = dependency_links_file.read().strip() + if not dependency_links: + adios(dependency_links_path) + + metadata_path = os.path.join(distinfo_path, "METADATA") + shutil.copy(pkginfo_path, metadata_path) + + for license_path in self.license_paths: + filename = os.path.basename(license_path) + shutil.copy(license_path, os.path.join(distinfo_path, filename)) + + adios(egginfo_path) + + def _zip_compression(self) -> int: + if ( + isinstance(self.compression, int) + and self.compression in self.supported_compressions.values() + ): + return self.compression + + compression = self.supported_compressions.get(str(self.compression)) + if compression is not None: + return compression + + raise ValueError(f"Unsupported compression: {self.compression!r}") diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/setuptools/command/build.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/setuptools/command/build.py new file mode 100644 index 0000000000000000000000000000000000000000..54cbb8d2e7bf41de4e187613718879f2a4c3d8d3 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/setuptools/command/build.py @@ -0,0 +1,135 @@ +from __future__ import annotations + +from typing import Protocol + +from ..dist import Distribution + +from distutils.command.build import build as _build + +_ORIGINAL_SUBCOMMANDS = {"build_py", "build_clib", "build_ext", "build_scripts"} + + +class build(_build): + distribution: Distribution # override distutils.dist.Distribution with setuptools.dist.Distribution + + # copy to avoid sharing the object with parent class + sub_commands = _build.sub_commands[:] + + +class SubCommand(Protocol): + """In order to support editable installations (see :pep:`660`) all + build subcommands **SHOULD** implement this protocol. They also **MUST** inherit + from ``setuptools.Command``. + + When creating an :pep:`editable wheel <660>`, ``setuptools`` will try to evaluate + custom ``build`` subcommands using the following procedure: + + 1. ``setuptools`` will set the ``editable_mode`` attribute to ``True`` + 2. ``setuptools`` will execute the ``run()`` command. + + .. important:: + Subcommands **SHOULD** take advantage of ``editable_mode=True`` to adequate + its behaviour or perform optimisations. + + For example, if a subcommand doesn't need to generate an extra file and + all it does is to copy a source file into the build directory, + ``run()`` **SHOULD** simply "early return". + + Similarly, if the subcommand creates files that would be placed alongside + Python files in the final distribution, during an editable install + the command **SHOULD** generate these files "in place" (i.e. write them to + the original source directory, instead of using the build directory). + Note that ``get_output_mapping()`` should reflect that and include mappings + for "in place" builds accordingly. + + 3. ``setuptools`` use any knowledge it can derive from the return values of + ``get_outputs()`` and ``get_output_mapping()`` to create an editable wheel. + When relevant ``setuptools`` **MAY** attempt to use file links based on the value + of ``get_output_mapping()``. Alternatively, ``setuptools`` **MAY** attempt to use + :doc:`import hooks ` to redirect any attempt to import + to the directory with the original source code and other files built in place. + + Please note that custom sub-commands **SHOULD NOT** rely on ``run()`` being + executed (or not) to provide correct return values for ``get_outputs()``, + ``get_output_mapping()`` or ``get_source_files()``. The ``get_*`` methods should + work independently of ``run()``. + """ + + editable_mode: bool = False + """Boolean flag that will be set to ``True`` when setuptools is used for an + editable installation (see :pep:`660`). + Implementations **SHOULD** explicitly set the default value of this attribute to + ``False``. + When subcommands run, they can use this flag to perform optimizations or change + their behaviour accordingly. + """ + + build_lib: str + """String representing the directory where the build artifacts should be stored, + e.g. ``build/lib``. + For example, if a distribution wants to provide a Python module named ``pkg.mod``, + then a corresponding file should be written to ``{build_lib}/package/module.py``. + A way of thinking about this is that the files saved under ``build_lib`` + would be eventually copied to one of the directories in :obj:`site.PREFIXES` + upon installation. + + A command that produces platform-independent files (e.g. compiling text templates + into Python functions), **CAN** initialize ``build_lib`` by copying its value from + the ``build_py`` command. On the other hand, a command that produces + platform-specific files **CAN** initialize ``build_lib`` by copying its value from + the ``build_ext`` command. In general this is done inside the ``finalize_options`` + method with the help of the ``set_undefined_options`` command:: + + def finalize_options(self): + self.set_undefined_options("build_py", ("build_lib", "build_lib")) + ... + """ + + def initialize_options(self) -> None: + """(Required by the original :class:`setuptools.Command` interface)""" + ... + + def finalize_options(self) -> None: + """(Required by the original :class:`setuptools.Command` interface)""" + ... + + def run(self) -> None: + """(Required by the original :class:`setuptools.Command` interface)""" + ... + + def get_source_files(self) -> list[str]: + """ + Return a list of all files that are used by the command to create the expected + outputs. + For example, if your build command transpiles Java files into Python, you should + list here all the Java files. + The primary purpose of this function is to help populating the ``sdist`` + with all the files necessary to build the distribution. + All files should be strings relative to the project root directory. + """ + ... + + def get_outputs(self) -> list[str]: + """ + Return a list of files intended for distribution as they would have been + produced by the build. + These files should be strings in the form of + ``"{build_lib}/destination/file/path"``. + + .. note:: + The return value of ``get_output()`` should include all files used as keys + in ``get_output_mapping()`` plus files that are generated during the build + and don't correspond to any source file already present in the project. + """ + ... + + def get_output_mapping(self) -> dict[str, str]: + """ + Return a mapping between destination files as they would be produced by the + build (dict keys) into the respective existing (source) files (dict values). + Existing (source) files should be represented as strings relative to the project + root directory. + Destination files should be strings in the form of + ``"{build_lib}/destination/file/path"``. + """ + ... diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/setuptools/command/build_clib.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/setuptools/command/build_clib.py new file mode 100644 index 0000000000000000000000000000000000000000..f376f4ce4d2afc4a58f1fa0e85624136edc93835 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/setuptools/command/build_clib.py @@ -0,0 +1,103 @@ +from ..dist import Distribution +from ..modified import newer_pairwise_group + +import distutils.command.build_clib as orig +from distutils import log +from distutils.errors import DistutilsSetupError + + +class build_clib(orig.build_clib): + """ + Override the default build_clib behaviour to do the following: + + 1. Implement a rudimentary timestamp-based dependency system + so 'compile()' doesn't run every time. + 2. Add more keys to the 'build_info' dictionary: + * obj_deps - specify dependencies for each object compiled. + this should be a dictionary mapping a key + with the source filename to a list of + dependencies. Use an empty string for global + dependencies. + * cflags - specify a list of additional flags to pass to + the compiler. + """ + + distribution: Distribution # override distutils.dist.Distribution with setuptools.dist.Distribution + + def build_libraries(self, libraries) -> None: + for lib_name, build_info in libraries: + sources = build_info.get('sources') + if sources is None or not isinstance(sources, (list, tuple)): + raise DistutilsSetupError( + f"in 'libraries' option (library '{lib_name}'), " + "'sources' must be present and must be " + "a list of source filenames" + ) + sources = sorted(list(sources)) + + log.info("building '%s' library", lib_name) + + # Make sure everything is the correct type. + # obj_deps should be a dictionary of keys as sources + # and a list/tuple of files that are its dependencies. + obj_deps = build_info.get('obj_deps', dict()) + if not isinstance(obj_deps, dict): + raise DistutilsSetupError( + f"in 'libraries' option (library '{lib_name}'), " + "'obj_deps' must be a dictionary of " + "type 'source: list'" + ) + dependencies = [] + + # Get the global dependencies that are specified by the '' key. + # These will go into every source's dependency list. + global_deps = obj_deps.get('', list()) + if not isinstance(global_deps, (list, tuple)): + raise DistutilsSetupError( + f"in 'libraries' option (library '{lib_name}'), " + "'obj_deps' must be a dictionary of " + "type 'source: list'" + ) + + # Build the list to be used by newer_pairwise_group + # each source will be auto-added to its dependencies. + for source in sources: + src_deps = [source] + src_deps.extend(global_deps) + extra_deps = obj_deps.get(source, list()) + if not isinstance(extra_deps, (list, tuple)): + raise DistutilsSetupError( + f"in 'libraries' option (library '{lib_name}'), " + "'obj_deps' must be a dictionary of " + "type 'source: list'" + ) + src_deps.extend(extra_deps) + dependencies.append(src_deps) + + expected_objects = self.compiler.object_filenames( + sources, + output_dir=self.build_temp, + ) + + if newer_pairwise_group(dependencies, expected_objects) != ([], []): + # First, compile the source code to object files in the library + # directory. (This should probably change to putting object + # files in a temporary build directory.) + macros = build_info.get('macros') + include_dirs = build_info.get('include_dirs') + cflags = build_info.get('cflags') + self.compiler.compile( + sources, + output_dir=self.build_temp, + macros=macros, + include_dirs=include_dirs, + extra_postargs=cflags, + debug=self.debug, + ) + + # Now "link" the object files together into a static library. + # (On Unix at least, this isn't really linking -- it just + # builds an archive. Whatever.) + self.compiler.create_static_lib( + expected_objects, lib_name, output_dir=self.build_clib, debug=self.debug + ) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/setuptools/command/build_ext.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/setuptools/command/build_ext.py new file mode 100644 index 0000000000000000000000000000000000000000..be833a379c71c3a6b1a85a89f8e9703c66d79824 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/setuptools/command/build_ext.py @@ -0,0 +1,469 @@ +from __future__ import annotations + +import itertools +import os +import sys +from collections.abc import Iterator +from importlib.machinery import EXTENSION_SUFFIXES +from importlib.util import cache_from_source as _compiled_file_name +from pathlib import Path +from typing import TYPE_CHECKING + +from setuptools.dist import Distribution +from setuptools.errors import BaseError +from setuptools.extension import Extension, Library + +from distutils import log +from distutils.ccompiler import new_compiler +from distutils.sysconfig import customize_compiler, get_config_var + +if TYPE_CHECKING: + # Cython not installed on CI tests, causing _build_ext to be `Any` + from distutils.command.build_ext import build_ext as _build_ext +else: + try: + # Attempt to use Cython for building extensions, if available + from Cython.Distutils.build_ext import build_ext as _build_ext + + # Additionally, assert that the compiler module will load + # also. Ref #1229. + __import__('Cython.Compiler.Main') + except ImportError: + from distutils.command.build_ext import build_ext as _build_ext + +# make sure _config_vars is initialized +get_config_var("LDSHARED") +# Not publicly exposed in typeshed distutils stubs, but this is done on purpose +# See https://github.com/pypa/setuptools/pull/4228#issuecomment-1959856400 +from distutils.sysconfig import _config_vars as _CONFIG_VARS # noqa: E402 + + +def _customize_compiler_for_shlib(compiler): + if sys.platform == "darwin": + # building .dylib requires additional compiler flags on OSX; here we + # temporarily substitute the pyconfig.h variables so that distutils' + # 'customize_compiler' uses them before we build the shared libraries. + tmp = _CONFIG_VARS.copy() + try: + # XXX Help! I don't have any idea whether these are right... + _CONFIG_VARS['LDSHARED'] = ( + "gcc -Wl,-x -dynamiclib -undefined dynamic_lookup" + ) + _CONFIG_VARS['CCSHARED'] = " -dynamiclib" + _CONFIG_VARS['SO'] = ".dylib" + customize_compiler(compiler) + finally: + _CONFIG_VARS.clear() + _CONFIG_VARS.update(tmp) + else: + customize_compiler(compiler) + + +have_rtld = False +use_stubs = False +libtype = 'shared' + +if sys.platform == "darwin": + use_stubs = True +elif os.name != 'nt': + try: + import dl # type: ignore[import-not-found] # https://github.com/python/mypy/issues/13002 + + use_stubs = have_rtld = hasattr(dl, 'RTLD_NOW') + except ImportError: + pass + + +def if_dl(s): + return s if have_rtld else '' + + +def get_abi3_suffix(): + """Return the file extension for an abi3-compliant Extension()""" + for suffix in EXTENSION_SUFFIXES: + if '.abi3' in suffix: # Unix + return suffix + elif suffix == '.pyd': # Windows + return suffix + return None + + +class build_ext(_build_ext): + distribution: Distribution # override distutils.dist.Distribution with setuptools.dist.Distribution + editable_mode = False + inplace = False + + def run(self): + """Build extensions in build directory, then copy if --inplace""" + old_inplace, self.inplace = self.inplace, False + _build_ext.run(self) + self.inplace = old_inplace + if old_inplace: + self.copy_extensions_to_source() + + def _get_inplace_equivalent(self, build_py, ext: Extension) -> tuple[str, str]: + fullname = self.get_ext_fullname(ext.name) + filename = self.get_ext_filename(fullname) + modpath = fullname.split('.') + package = '.'.join(modpath[:-1]) + package_dir = build_py.get_package_dir(package) + inplace_file = os.path.join(package_dir, os.path.basename(filename)) + regular_file = os.path.join(self.build_lib, filename) + return (inplace_file, regular_file) + + def copy_extensions_to_source(self) -> None: + build_py = self.get_finalized_command('build_py') + for ext in self.extensions: + inplace_file, regular_file = self._get_inplace_equivalent(build_py, ext) + + # Always copy, even if source is older than destination, to ensure + # that the right extensions for the current Python/platform are + # used. + if os.path.exists(regular_file) or not ext.optional: + self.copy_file(regular_file, inplace_file, level=self.verbose) + + if ext._needs_stub: + inplace_stub = self._get_equivalent_stub(ext, inplace_file) + self._write_stub_file(inplace_stub, ext, compile=True) + # Always compile stub and remove the original (leave the cache behind) + # (this behaviour was observed in previous iterations of the code) + + def _get_equivalent_stub(self, ext: Extension, output_file: str) -> str: + dir_ = os.path.dirname(output_file) + _, _, name = ext.name.rpartition(".") + return f"{os.path.join(dir_, name)}.py" + + def _get_output_mapping(self) -> Iterator[tuple[str, str]]: + if not self.inplace: + return + + build_py = self.get_finalized_command('build_py') + opt = self.get_finalized_command('install_lib').optimize or "" + + for ext in self.extensions: + inplace_file, regular_file = self._get_inplace_equivalent(build_py, ext) + yield (regular_file, inplace_file) + + if ext._needs_stub: + # This version of `build_ext` always builds artifacts in another dir, + # when "inplace=True" is given it just copies them back. + # This is done in the `copy_extensions_to_source` function, which + # always compile stub files via `_compile_and_remove_stub`. + # At the end of the process, a `.pyc` stub file is created without the + # corresponding `.py`. + + inplace_stub = self._get_equivalent_stub(ext, inplace_file) + regular_stub = self._get_equivalent_stub(ext, regular_file) + inplace_cache = _compiled_file_name(inplace_stub, optimization=opt) + output_cache = _compiled_file_name(regular_stub, optimization=opt) + yield (output_cache, inplace_cache) + + def get_ext_filename(self, fullname: str) -> str: + so_ext = os.getenv('SETUPTOOLS_EXT_SUFFIX') + if so_ext: + filename = os.path.join(*fullname.split('.')) + so_ext + else: + filename = _build_ext.get_ext_filename(self, fullname) + ext_suffix = get_config_var('EXT_SUFFIX') + if not isinstance(ext_suffix, str): + raise OSError( + "Configuration variable EXT_SUFFIX not found for this platform " + "and environment variable SETUPTOOLS_EXT_SUFFIX is missing" + ) + so_ext = ext_suffix + + if fullname in self.ext_map: + ext = self.ext_map[fullname] + abi3_suffix = get_abi3_suffix() + if ext.py_limited_api and abi3_suffix: # Use abi3 + filename = filename[: -len(so_ext)] + abi3_suffix + if isinstance(ext, Library): + fn, ext = os.path.splitext(filename) + return self.shlib_compiler.library_filename(fn, libtype) + elif use_stubs and ext._links_to_dynamic: + d, fn = os.path.split(filename) + return os.path.join(d, 'dl-' + fn) + return filename + + def initialize_options(self): + _build_ext.initialize_options(self) + self.shlib_compiler = None + self.shlibs = [] + self.ext_map = {} + self.editable_mode = False + + def finalize_options(self) -> None: + _build_ext.finalize_options(self) + self.extensions = self.extensions or [] + self.check_extensions_list(self.extensions) + self.shlibs = [ext for ext in self.extensions if isinstance(ext, Library)] + if self.shlibs: + self.setup_shlib_compiler() + for ext in self.extensions: + ext._full_name = self.get_ext_fullname(ext.name) + for ext in self.extensions: + fullname = ext._full_name + self.ext_map[fullname] = ext + + # distutils 3.1 will also ask for module names + # XXX what to do with conflicts? + self.ext_map[fullname.split('.')[-1]] = ext + + ltd = self.shlibs and self.links_to_dynamic(ext) or False + ns = ltd and use_stubs and not isinstance(ext, Library) + ext._links_to_dynamic = ltd + ext._needs_stub = ns + filename = ext._file_name = self.get_ext_filename(fullname) + libdir = os.path.dirname(os.path.join(self.build_lib, filename)) + if ltd and libdir not in ext.library_dirs: + ext.library_dirs.append(libdir) + if ltd and use_stubs and os.curdir not in ext.runtime_library_dirs: + ext.runtime_library_dirs.append(os.curdir) + + if self.editable_mode: + self.inplace = True + + def setup_shlib_compiler(self): + compiler = self.shlib_compiler = new_compiler( + compiler=self.compiler, dry_run=self.dry_run, force=self.force + ) + _customize_compiler_for_shlib(compiler) + + if self.include_dirs is not None: + compiler.set_include_dirs(self.include_dirs) + if self.define is not None: + # 'define' option is a list of (name,value) tuples + for name, value in self.define: + compiler.define_macro(name, value) + if self.undef is not None: + for macro in self.undef: + compiler.undefine_macro(macro) + if self.libraries is not None: + compiler.set_libraries(self.libraries) + if self.library_dirs is not None: + compiler.set_library_dirs(self.library_dirs) + if self.rpath is not None: + compiler.set_runtime_library_dirs(self.rpath) + if self.link_objects is not None: + compiler.set_link_objects(self.link_objects) + + # hack so distutils' build_extension() builds a library instead + compiler.link_shared_object = link_shared_object.__get__(compiler) # type: ignore[method-assign] + + def get_export_symbols(self, ext): + if isinstance(ext, Library): + return ext.export_symbols + return _build_ext.get_export_symbols(self, ext) + + def build_extension(self, ext) -> None: + ext._convert_pyx_sources_to_lang() + _compiler = self.compiler + try: + if isinstance(ext, Library): + self.compiler = self.shlib_compiler + _build_ext.build_extension(self, ext) + if ext._needs_stub: + build_lib = self.get_finalized_command('build_py').build_lib + self.write_stub(build_lib, ext) + finally: + self.compiler = _compiler + + def links_to_dynamic(self, ext): + """Return true if 'ext' links to a dynamic lib in the same package""" + # XXX this should check to ensure the lib is actually being built + # XXX as dynamic, and not just using a locally-found version or a + # XXX static-compiled version + libnames = dict.fromkeys([lib._full_name for lib in self.shlibs]) + pkg = '.'.join(ext._full_name.split('.')[:-1] + ['']) + return any(pkg + libname in libnames for libname in ext.libraries) + + def get_source_files(self) -> list[str]: + return [*_build_ext.get_source_files(self), *self._get_internal_depends()] + + def _get_internal_depends(self) -> Iterator[str]: + """Yield ``ext.depends`` that are contained by the project directory""" + project_root = Path(self.distribution.src_root or os.curdir).resolve() + depends = (dep for ext in self.extensions for dep in ext.depends) + + def skip(orig_path: str, reason: str) -> None: + log.info( + "dependency %s won't be automatically " + "included in the manifest: the path %s", + orig_path, + reason, + ) + + for dep in depends: + path = Path(dep) + + if path.is_absolute(): + skip(dep, "must be relative") + continue + + if ".." in path.parts: + skip(dep, "can't have `..` segments") + continue + + try: + resolved = (project_root / path).resolve(strict=True) + except OSError: + skip(dep, "doesn't exist") + continue + + try: + resolved.relative_to(project_root) + except ValueError: + skip(dep, "must be inside the project root") + continue + + yield path.as_posix() + + def get_outputs(self) -> list[str]: + if self.inplace: + return list(self.get_output_mapping().keys()) + return sorted(_build_ext.get_outputs(self) + self.__get_stubs_outputs()) + + def get_output_mapping(self) -> dict[str, str]: + """See :class:`setuptools.commands.build.SubCommand`""" + mapping = self._get_output_mapping() + return dict(sorted(mapping, key=lambda x: x[0])) + + def __get_stubs_outputs(self): + # assemble the base name for each extension that needs a stub + ns_ext_bases = ( + os.path.join(self.build_lib, *ext._full_name.split('.')) + for ext in self.extensions + if ext._needs_stub + ) + # pair each base with the extension + pairs = itertools.product(ns_ext_bases, self.__get_output_extensions()) + return list(base + fnext for base, fnext in pairs) + + def __get_output_extensions(self): + yield '.py' + yield '.pyc' + if self.get_finalized_command('build_py').optimize: + yield '.pyo' + + def write_stub(self, output_dir, ext, compile=False) -> None: + stub_file = os.path.join(output_dir, *ext._full_name.split('.')) + '.py' + self._write_stub_file(stub_file, ext, compile) + + def _write_stub_file(self, stub_file: str, ext: Extension, compile=False): + log.info("writing stub loader for %s to %s", ext._full_name, stub_file) + if compile and os.path.exists(stub_file): + raise BaseError(stub_file + " already exists! Please delete.") + if not self.dry_run: + with open(stub_file, 'w', encoding="utf-8") as f: + content = '\n'.join([ + "def __bootstrap__():", + " global __bootstrap__, __file__, __loader__", + " import sys, os, pkg_resources, importlib.util" + if_dl(", dl"), + " __file__ = pkg_resources.resource_filename" + f"(__name__,{os.path.basename(ext._file_name)!r})", + " del __bootstrap__", + " if '__loader__' in globals():", + " del __loader__", + if_dl(" old_flags = sys.getdlopenflags()"), + " old_dir = os.getcwd()", + " try:", + " os.chdir(os.path.dirname(__file__))", + if_dl(" sys.setdlopenflags(dl.RTLD_NOW)"), + " spec = importlib.util.spec_from_file_location(", + " __name__, __file__)", + " mod = importlib.util.module_from_spec(spec)", + " spec.loader.exec_module(mod)", + " finally:", + if_dl(" sys.setdlopenflags(old_flags)"), + " os.chdir(old_dir)", + "__bootstrap__()", + "", # terminal \n + ]) + f.write(content) + if compile: + self._compile_and_remove_stub(stub_file) + + def _compile_and_remove_stub(self, stub_file: str): + from distutils.util import byte_compile + + byte_compile([stub_file], optimize=0, force=True, dry_run=self.dry_run) + optimize = self.get_finalized_command('install_lib').optimize + if optimize > 0: + byte_compile( + [stub_file], + optimize=optimize, + force=True, + dry_run=self.dry_run, + ) + if os.path.exists(stub_file) and not self.dry_run: + os.unlink(stub_file) + + +if use_stubs or os.name == 'nt': + # Build shared libraries + # + def link_shared_object( + self, + objects, + output_libname, + output_dir=None, + libraries=None, + library_dirs=None, + runtime_library_dirs=None, + export_symbols=None, + debug: bool = False, + extra_preargs=None, + extra_postargs=None, + build_temp=None, + target_lang=None, + ) -> None: + self.link( + self.SHARED_LIBRARY, + objects, + output_libname, + output_dir, + libraries, + library_dirs, + runtime_library_dirs, + export_symbols, + debug, + extra_preargs, + extra_postargs, + build_temp, + target_lang, + ) + +else: + # Build static libraries everywhere else + libtype = 'static' + + def link_shared_object( + self, + objects, + output_libname, + output_dir=None, + libraries=None, + library_dirs=None, + runtime_library_dirs=None, + export_symbols=None, + debug: bool = False, + extra_preargs=None, + extra_postargs=None, + build_temp=None, + target_lang=None, + ) -> None: + # XXX we need to either disallow these attrs on Library instances, + # or warn/abort here if set, or something... + # libraries=None, library_dirs=None, runtime_library_dirs=None, + # export_symbols=None, extra_preargs=None, extra_postargs=None, + # build_temp=None + + assert output_dir is None # distutils build_ext doesn't pass this + output_dir, filename = os.path.split(output_libname) + basename, _ext = os.path.splitext(filename) + if self.library_filename("x").startswith('lib'): + # strip 'lib' prefix; this is kludgy if some platform uses + # a different prefix + basename = basename[3:] + + self.create_static_lib(objects, basename, output_dir, debug, target_lang) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/setuptools/command/build_py.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/setuptools/command/build_py.py new file mode 100644 index 0000000000000000000000000000000000000000..2f6fcb7cdcb39a61007acbb399c84125cce93c9f --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/setuptools/command/build_py.py @@ -0,0 +1,400 @@ +from __future__ import annotations + +import fnmatch +import itertools +import os +import stat +import textwrap +from collections.abc import Iterable, Iterator +from functools import partial +from glob import glob +from pathlib import Path + +from more_itertools import unique_everseen + +from .._path import StrPath, StrPathT +from ..dist import Distribution +from ..warnings import SetuptoolsDeprecationWarning + +import distutils.command.build_py as orig +import distutils.errors +from distutils.util import convert_path + +_IMPLICIT_DATA_FILES = ('*.pyi', 'py.typed') + + +def make_writable(target) -> None: + os.chmod(target, os.stat(target).st_mode | stat.S_IWRITE) + + +class build_py(orig.build_py): + """Enhanced 'build_py' command that includes data files with packages + + The data files are specified via a 'package_data' argument to 'setup()'. + See 'setuptools.dist.Distribution' for more details. + + Also, this version of the 'build_py' command allows you to specify both + 'py_modules' and 'packages' in the same setup operation. + """ + + distribution: Distribution # override distutils.dist.Distribution with setuptools.dist.Distribution + editable_mode: bool = False + existing_egg_info_dir: StrPath | None = None #: Private API, internal use only. + + def finalize_options(self): + orig.build_py.finalize_options(self) + self.package_data = self.distribution.package_data + self.exclude_package_data = self.distribution.exclude_package_data or {} + if 'data_files' in self.__dict__: + del self.__dict__['data_files'] + + def copy_file( # type: ignore[override] # No overload, no bytes support + self, + infile: StrPath, + outfile: StrPathT, + preserve_mode: bool = True, + preserve_times: bool = True, + link: str | None = None, + level: object = 1, + ) -> tuple[StrPathT | str, bool]: + # Overwrite base class to allow using links + if link: + infile = str(Path(infile).resolve()) + outfile = str(Path(outfile).resolve()) # type: ignore[assignment] # Re-assigning a str when outfile is StrPath is ok + return super().copy_file( # pyright: ignore[reportReturnType] # pypa/distutils#309 + infile, outfile, preserve_mode, preserve_times, link, level + ) + + def run(self) -> None: + """Build modules, packages, and copy data files to build directory""" + if not (self.py_modules or self.packages) or self.editable_mode: + return + + if self.py_modules: + self.build_modules() + + if self.packages: + self.build_packages() + self.build_package_data() + + # Only compile actual .py files, using our base class' idea of what our + # output files are. + self.byte_compile(orig.build_py.get_outputs(self, include_bytecode=False)) + + def __getattr__(self, attr: str): + "lazily compute data files" + if attr == 'data_files': + self.data_files = self._get_data_files() + return self.data_files + return orig.build_py.__getattr__(self, attr) + + def _get_data_files(self): + """Generate list of '(package,src_dir,build_dir,filenames)' tuples""" + self.analyze_manifest() + return list(map(self._get_pkg_data_files, self.packages or ())) + + def get_data_files_without_manifest(self): + """ + Generate list of ``(package,src_dir,build_dir,filenames)`` tuples, + but without triggering any attempt to analyze or build the manifest. + """ + # Prevent eventual errors from unset `manifest_files` + # (that would otherwise be set by `analyze_manifest`) + self.__dict__.setdefault('manifest_files', {}) + return list(map(self._get_pkg_data_files, self.packages or ())) + + def _get_pkg_data_files(self, package): + # Locate package source directory + src_dir = self.get_package_dir(package) + + # Compute package build directory + build_dir = os.path.join(*([self.build_lib] + package.split('.'))) + + # Strip directory from globbed filenames + filenames = [ + os.path.relpath(file, src_dir) + for file in self.find_data_files(package, src_dir) + ] + return package, src_dir, build_dir, filenames + + def find_data_files(self, package, src_dir): + """Return filenames for package's data files in 'src_dir'""" + patterns = self._get_platform_patterns( + self.package_data, + package, + src_dir, + extra_patterns=_IMPLICIT_DATA_FILES, + ) + globs_expanded = map(partial(glob, recursive=True), patterns) + # flatten the expanded globs into an iterable of matches + globs_matches = itertools.chain.from_iterable(globs_expanded) + glob_files = filter(os.path.isfile, globs_matches) + files = itertools.chain( + self.manifest_files.get(package, []), + glob_files, + ) + return self.exclude_data_files(package, src_dir, files) + + def get_outputs(self, include_bytecode: bool = True) -> list[str]: # type: ignore[override] # Using a real boolean instead of 0|1 + """See :class:`setuptools.commands.build.SubCommand`""" + if self.editable_mode: + return list(self.get_output_mapping().keys()) + return super().get_outputs(include_bytecode) + + def get_output_mapping(self) -> dict[str, str]: + """See :class:`setuptools.commands.build.SubCommand`""" + mapping = itertools.chain( + self._get_package_data_output_mapping(), + self._get_module_mapping(), + ) + return dict(sorted(mapping, key=lambda x: x[0])) + + def _get_module_mapping(self) -> Iterator[tuple[str, str]]: + """Iterate over all modules producing (dest, src) pairs.""" + for package, module, module_file in self.find_all_modules(): + package = package.split('.') + filename = self.get_module_outfile(self.build_lib, package, module) + yield (filename, module_file) + + def _get_package_data_output_mapping(self) -> Iterator[tuple[str, str]]: + """Iterate over package data producing (dest, src) pairs.""" + for package, src_dir, build_dir, filenames in self.data_files: + for filename in filenames: + target = os.path.join(build_dir, filename) + srcfile = os.path.join(src_dir, filename) + yield (target, srcfile) + + def build_package_data(self) -> None: + """Copy data files into build directory""" + for target, srcfile in self._get_package_data_output_mapping(): + self.mkpath(os.path.dirname(target)) + _outf, _copied = self.copy_file(srcfile, target) + make_writable(target) + + def analyze_manifest(self) -> None: + self.manifest_files: dict[str, list[str]] = {} + if not self.distribution.include_package_data: + return + src_dirs: dict[str, str] = {} + for package in self.packages or (): + # Locate package source directory + src_dirs[assert_relative(self.get_package_dir(package))] = package + + if ( + self.existing_egg_info_dir + and Path(self.existing_egg_info_dir, "SOURCES.txt").exists() + ): + egg_info_dir = self.existing_egg_info_dir + manifest = Path(egg_info_dir, "SOURCES.txt") + files = manifest.read_text(encoding="utf-8").splitlines() + else: + self.run_command('egg_info') + ei_cmd = self.get_finalized_command('egg_info') + egg_info_dir = ei_cmd.egg_info + files = ei_cmd.filelist.files + + check = _IncludePackageDataAbuse() + for path in self._filter_build_files(files, egg_info_dir): + d, f = os.path.split(assert_relative(path)) + prev = None + oldf = f + while d and d != prev and d not in src_dirs: + prev = d + d, df = os.path.split(d) + f = os.path.join(df, f) + if d in src_dirs: + if f == oldf: + if check.is_module(f): + continue # it's a module, not data + else: + importable = check.importable_subpackage(src_dirs[d], f) + if importable: + check.warn(importable) + self.manifest_files.setdefault(src_dirs[d], []).append(path) + + def _filter_build_files( + self, files: Iterable[str], egg_info: StrPath + ) -> Iterator[str]: + """ + ``build_meta`` may try to create egg_info outside of the project directory, + and this can be problematic for certain plugins (reported in issue #3500). + + Extensions might also include between their sources files created on the + ``build_lib`` and ``build_temp`` directories. + + This function should filter this case of invalid files out. + """ + build = self.get_finalized_command("build") + build_dirs = (egg_info, self.build_lib, build.build_temp, build.build_base) + norm_dirs = [os.path.normpath(p) for p in build_dirs if p] + + for file in files: + norm_path = os.path.normpath(file) + if not os.path.isabs(file) or all(d not in norm_path for d in norm_dirs): + yield file + + def get_data_files(self) -> None: + pass # Lazily compute data files in _get_data_files() function. + + def check_package(self, package, package_dir): + """Check namespace packages' __init__ for declare_namespace""" + try: + return self.packages_checked[package] + except KeyError: + pass + + init_py = orig.build_py.check_package(self, package, package_dir) + self.packages_checked[package] = init_py + + if not init_py or not self.distribution.namespace_packages: + return init_py + + for pkg in self.distribution.namespace_packages: + if pkg == package or pkg.startswith(package + '.'): + break + else: + return init_py + + with open(init_py, 'rb') as f: + contents = f.read() + if b'declare_namespace' not in contents: + raise distutils.errors.DistutilsError( + f"Namespace package problem: {package} is a namespace package, but " + "its\n__init__.py does not call declare_namespace()! Please " + 'fix it.\n(See the setuptools manual under ' + '"Namespace Packages" for details.)\n"' + ) + return init_py + + def initialize_options(self): + self.packages_checked = {} + orig.build_py.initialize_options(self) + self.editable_mode = False + self.existing_egg_info_dir = None + + def get_package_dir(self, package): + res = orig.build_py.get_package_dir(self, package) + if self.distribution.src_root is not None: + return os.path.join(self.distribution.src_root, res) + return res + + def exclude_data_files(self, package, src_dir, files): + """Filter filenames for package's data files in 'src_dir'""" + files = list(files) + patterns = self._get_platform_patterns( + self.exclude_package_data, + package, + src_dir, + ) + match_groups = (fnmatch.filter(files, pattern) for pattern in patterns) + # flatten the groups of matches into an iterable of matches + matches = itertools.chain.from_iterable(match_groups) + bad = set(matches) + keepers = (fn for fn in files if fn not in bad) + # ditch dupes + return list(unique_everseen(keepers)) + + @staticmethod + def _get_platform_patterns(spec, package, src_dir, extra_patterns=()): + """ + yield platform-specific path patterns (suitable for glob + or fn_match) from a glob-based spec (such as + self.package_data or self.exclude_package_data) + matching package in src_dir. + """ + raw_patterns = itertools.chain( + extra_patterns, + spec.get('', []), + spec.get(package, []), + ) + return ( + # Each pattern has to be converted to a platform-specific path + os.path.join(src_dir, convert_path(pattern)) + for pattern in raw_patterns + ) + + +def assert_relative(path): + if not os.path.isabs(path): + return path + from distutils.errors import DistutilsSetupError + + msg = ( + textwrap.dedent( + """ + Error: setup script specifies an absolute path: + + %s + + setup() arguments must *always* be /-separated paths relative to the + setup.py directory, *never* absolute paths. + """ + ).lstrip() + % path + ) + raise DistutilsSetupError(msg) + + +class _IncludePackageDataAbuse: + """Inform users that package or module is included as 'data file'""" + + class _Warning(SetuptoolsDeprecationWarning): + _SUMMARY = """ + Package {importable!r} is absent from the `packages` configuration. + """ + + _DETAILS = """ + ############################ + # Package would be ignored # + ############################ + Python recognizes {importable!r} as an importable package[^1], + but it is absent from setuptools' `packages` configuration. + + This leads to an ambiguous overall configuration. If you want to distribute this + package, please make sure that {importable!r} is explicitly added + to the `packages` configuration field. + + Alternatively, you can also rely on setuptools' discovery methods + (for example by using `find_namespace_packages(...)`/`find_namespace:` + instead of `find_packages(...)`/`find:`). + + You can read more about "package discovery" on setuptools documentation page: + + - https://setuptools.pypa.io/en/latest/userguide/package_discovery.html + + If you don't want {importable!r} to be distributed and are + already explicitly excluding {importable!r} via + `find_namespace_packages(...)/find_namespace` or `find_packages(...)/find`, + you can try to use `exclude_package_data`, or `include-package-data=False` in + combination with a more fine grained `package-data` configuration. + + You can read more about "package data files" on setuptools documentation page: + + - https://setuptools.pypa.io/en/latest/userguide/datafiles.html + + + [^1]: For Python, any directory (with suitable naming) can be imported, + even if it does not contain any `.py` files. + On the other hand, currently there is no concept of package data + directory, all directories are treated like packages. + """ + # _DUE_DATE: still not defined as this is particularly controversial. + # Warning initially introduced in May 2022. See issue #3340 for discussion. + + def __init__(self): + self._already_warned = set() + + def is_module(self, file): + return file.endswith(".py") and file[: -len(".py")].isidentifier() + + def importable_subpackage(self, parent, file): + pkg = Path(file).parent + parts = list(itertools.takewhile(str.isidentifier, pkg.parts)) + if parts: + return ".".join([parent, *parts]) + return None + + def warn(self, importable): + if importable not in self._already_warned: + self._Warning.emit(importable=importable) + self._already_warned.add(importable) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/setuptools/command/develop.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/setuptools/command/develop.py new file mode 100644 index 0000000000000000000000000000000000000000..7eee29d491f72d8b0869493013a781d88c0aa3cd --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/setuptools/command/develop.py @@ -0,0 +1,195 @@ +import glob +import os + +import setuptools +from setuptools import _normalization, _path, namespaces +from setuptools.command.easy_install import easy_install + +from ..unicode_utils import _read_utf8_with_fallback + +from distutils import log +from distutils.errors import DistutilsOptionError +from distutils.util import convert_path + + +class develop(namespaces.DevelopInstaller, easy_install): + """Set up package for development""" + + description = "install package in 'development mode'" + + user_options = easy_install.user_options + [ + ("uninstall", "u", "Uninstall this source package"), + ("egg-path=", None, "Set the path to be used in the .egg-link file"), + ] + + boolean_options = easy_install.boolean_options + ['uninstall'] + + command_consumes_arguments = False # override base + + def run(self): + if self.uninstall: + self.multi_version = True + self.uninstall_link() + self.uninstall_namespaces() + else: + self.install_for_development() + self.warn_deprecated_options() + + def initialize_options(self): + self.uninstall = None + self.egg_path = None + easy_install.initialize_options(self) + self.setup_path = None + self.always_copy_from = '.' # always copy eggs installed in curdir + + def finalize_options(self) -> None: + import pkg_resources + + ei = self.get_finalized_command("egg_info") + self.args = [ei.egg_name] + + easy_install.finalize_options(self) + self.expand_basedirs() + self.expand_dirs() + # pick up setup-dir .egg files only: no .egg-info + self.package_index.scan(glob.glob('*.egg')) + + egg_link_fn = ( + _normalization.filename_component_broken(ei.egg_name) + '.egg-link' + ) + self.egg_link = os.path.join(self.install_dir, egg_link_fn) + self.egg_base = ei.egg_base + if self.egg_path is None: + self.egg_path = os.path.abspath(ei.egg_base) + + target = _path.normpath(self.egg_base) + egg_path = _path.normpath(os.path.join(self.install_dir, self.egg_path)) + if egg_path != target: + raise DistutilsOptionError( + "--egg-path must be a relative path from the install" + " directory to " + target + ) + + # Make a distribution for the package's source + self.dist = pkg_resources.Distribution( + target, + pkg_resources.PathMetadata(target, os.path.abspath(ei.egg_info)), + project_name=ei.egg_name, + ) + + self.setup_path = self._resolve_setup_path( + self.egg_base, + self.install_dir, + self.egg_path, + ) + + @staticmethod + def _resolve_setup_path(egg_base, install_dir, egg_path): + """ + Generate a path from egg_base back to '.' where the + setup script resides and ensure that path points to the + setup path from $install_dir/$egg_path. + """ + path_to_setup = egg_base.replace(os.sep, '/').rstrip('/') + if path_to_setup != os.curdir: + path_to_setup = '../' * (path_to_setup.count('/') + 1) + resolved = _path.normpath(os.path.join(install_dir, egg_path, path_to_setup)) + curdir = _path.normpath(os.curdir) + if resolved != curdir: + raise DistutilsOptionError( + "Can't get a consistent path to setup script from" + " installation directory", + resolved, + curdir, + ) + return path_to_setup + + def install_for_development(self) -> None: + self.run_command('egg_info') + + # Build extensions in-place + self.reinitialize_command('build_ext', inplace=True) + self.run_command('build_ext') + + if setuptools.bootstrap_install_from: + self.easy_install(setuptools.bootstrap_install_from) + setuptools.bootstrap_install_from = None + + self.install_namespaces() + + # create an .egg-link in the installation dir, pointing to our egg + log.info("Creating %s (link to %s)", self.egg_link, self.egg_base) + if not self.dry_run: + with open(self.egg_link, "w", encoding="utf-8") as f: + f.write(self.egg_path + "\n" + self.setup_path) + # postprocess the installed distro, fixing up .pth, installing scripts, + # and handling requirements + self.process_distribution(None, self.dist, not self.no_deps) + + def uninstall_link(self) -> None: + if os.path.exists(self.egg_link): + log.info("Removing %s (link to %s)", self.egg_link, self.egg_base) + + contents = [ + line.rstrip() + for line in _read_utf8_with_fallback(self.egg_link).splitlines() + ] + + if contents not in ([self.egg_path], [self.egg_path, self.setup_path]): + log.warn("Link points to %s: uninstall aborted", contents) + return + if not self.dry_run: + os.unlink(self.egg_link) + if not self.dry_run: + self.update_pth(self.dist) # remove any .pth link to us + if self.distribution.scripts: + # XXX should also check for entry point scripts! + log.warn("Note: you must uninstall or replace scripts manually!") + + def install_egg_scripts(self, dist): + if dist is not self.dist: + # Installing a dependency, so fall back to normal behavior + return easy_install.install_egg_scripts(self, dist) + + # create wrapper scripts in the script dir, pointing to dist.scripts + + # new-style... + self.install_wrapper_scripts(dist) + + # ...and old-style + for script_name in self.distribution.scripts or []: + script_path = os.path.abspath(convert_path(script_name)) + script_name = os.path.basename(script_path) + script_text = _read_utf8_with_fallback(script_path) + self.install_script(dist, script_name, script_text, script_path) + + return None + + def install_wrapper_scripts(self, dist): + dist = VersionlessRequirement(dist) + return easy_install.install_wrapper_scripts(self, dist) + + +class VersionlessRequirement: + """ + Adapt a pkg_resources.Distribution to simply return the project + name as the 'requirement' so that scripts will work across + multiple versions. + + >>> from pkg_resources import Distribution + >>> dist = Distribution(project_name='foo', version='1.0') + >>> str(dist.as_requirement()) + 'foo==1.0' + >>> adapted_dist = VersionlessRequirement(dist) + >>> str(adapted_dist.as_requirement()) + 'foo' + """ + + def __init__(self, dist) -> None: + self.__dist = dist + + def __getattr__(self, name: str): + return getattr(self.__dist, name) + + def as_requirement(self): + return self.project_name diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/setuptools/command/dist_info.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/setuptools/command/dist_info.py new file mode 100644 index 0000000000000000000000000000000000000000..dca01ff0ce355dfd375512e4a06b05b909ee41b6 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/setuptools/command/dist_info.py @@ -0,0 +1,103 @@ +""" +Create a dist_info directory +As defined in the wheel specification +""" + +import os +import shutil +from contextlib import contextmanager +from pathlib import Path +from typing import cast + +from .. import _normalization +from .._shutil import rmdir as _rm +from .egg_info import egg_info as egg_info_cls + +from distutils import log +from distutils.core import Command + + +class dist_info(Command): + """ + This command is private and reserved for internal use of setuptools, + users should rely on ``setuptools.build_meta`` APIs. + """ + + description = "DO NOT CALL DIRECTLY, INTERNAL ONLY: create .dist-info directory" + + user_options = [ + ( + 'output-dir=', + 'o', + "directory inside of which the .dist-info will be" + "created [default: top of the source tree]", + ), + ('tag-date', 'd', "Add date stamp (e.g. 20050528) to version number"), + ('tag-build=', 'b', "Specify explicit tag to add to version number"), + ('no-date', 'D', "Don't include date stamp [default]"), + ('keep-egg-info', None, "*TRANSITIONAL* will be removed in the future"), + ] + + boolean_options = ['tag-date', 'keep-egg-info'] + negative_opt = {'no-date': 'tag-date'} + + def initialize_options(self): + self.output_dir = None + self.name = None + self.dist_info_dir = None + self.tag_date = None + self.tag_build = None + self.keep_egg_info = False + + def finalize_options(self) -> None: + dist = self.distribution + project_dir = dist.src_root or os.curdir + self.output_dir = Path(self.output_dir or project_dir) + + egg_info = cast(egg_info_cls, self.reinitialize_command("egg_info")) + egg_info.egg_base = str(self.output_dir) + + if self.tag_date: + egg_info.tag_date = self.tag_date + else: + self.tag_date = egg_info.tag_date + + if self.tag_build: + egg_info.tag_build = self.tag_build + else: + self.tag_build = egg_info.tag_build + + egg_info.finalize_options() + self.egg_info = egg_info + + name = _normalization.safer_name(dist.get_name()) + version = _normalization.safer_best_effort_version(dist.get_version()) + self.name = f"{name}-{version}" + self.dist_info_dir = os.path.join(self.output_dir, f"{self.name}.dist-info") + + @contextmanager + def _maybe_bkp_dir(self, dir_path: str, requires_bkp: bool): + if requires_bkp: + bkp_name = f"{dir_path}.__bkp__" + _rm(bkp_name, ignore_errors=True) + shutil.copytree(dir_path, bkp_name, dirs_exist_ok=True, symlinks=True) + try: + yield + finally: + _rm(dir_path, ignore_errors=True) + shutil.move(bkp_name, dir_path) + else: + yield + + def run(self) -> None: + self.output_dir.mkdir(parents=True, exist_ok=True) + self.egg_info.run() + egg_info_dir = self.egg_info.egg_info + assert os.path.isdir(egg_info_dir), ".egg-info dir should have been created" + + log.info(f"creating '{os.path.abspath(self.dist_info_dir)}'") + bdist_wheel = self.get_finalized_command('bdist_wheel') + + # TODO: if bdist_wheel if merged into setuptools, just add "keep_egg_info" there + with self._maybe_bkp_dir(egg_info_dir, self.keep_egg_info): + bdist_wheel.egg2dist(egg_info_dir, self.dist_info_dir) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/setuptools/command/easy_install.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/setuptools/command/easy_install.py new file mode 100644 index 0000000000000000000000000000000000000000..eb1b4c1fcc21dbebf23b5669913a59aaf3ce104a --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/setuptools/command/easy_install.py @@ -0,0 +1,2365 @@ +""" +Easy Install +------------ + +A tool for doing automatic download/extract/build of distutils-based Python +packages. For detailed documentation, see the accompanying EasyInstall.txt +file, or visit the `EasyInstall home page`__. + +__ https://setuptools.pypa.io/en/latest/deprecated/easy_install.html + +""" + +from __future__ import annotations + +import configparser +import contextlib +import io +import os +import random +import re +import shlex +import shutil +import site +import stat +import struct +import subprocess +import sys +import sysconfig +import tempfile +import textwrap +import warnings +import zipfile +import zipimport +from collections.abc import Iterable +from glob import glob +from sysconfig import get_path +from typing import TYPE_CHECKING, NoReturn, TypedDict + +from jaraco.text import yield_lines + +import pkg_resources +from pkg_resources import ( + DEVELOP_DIST, + Distribution, + DistributionNotFound, + EggMetadata, + Environment, + PathMetadata, + Requirement, + VersionConflict, + WorkingSet, + find_distributions, + get_distribution, + normalize_path, + resource_string, +) +from setuptools import Command +from setuptools.archive_util import unpack_archive +from setuptools.command import bdist_egg, egg_info, setopt +from setuptools.package_index import URL_SCHEME, PackageIndex, parse_requirement_arg +from setuptools.sandbox import run_setup +from setuptools.warnings import SetuptoolsDeprecationWarning, SetuptoolsWarning +from setuptools.wheel import Wheel + +from .._path import ensure_directory +from .._shutil import attempt_chmod_verbose as chmod, rmtree as _rmtree +from ..compat import py39, py312 + +from distutils import dir_util, log +from distutils.command import install +from distutils.command.build_scripts import first_line_re +from distutils.errors import ( + DistutilsArgError, + DistutilsError, + DistutilsOptionError, + DistutilsPlatformError, +) +from distutils.util import convert_path, get_platform, subst_vars + +if TYPE_CHECKING: + from typing_extensions import Self + +# Turn on PEP440Warnings +warnings.filterwarnings("default", category=pkg_resources.PEP440Warning) + +__all__ = [ + 'easy_install', + 'PthDistributions', + 'extract_wininst_cfg', + 'get_exe_prefixes', +] + + +def is_64bit(): + return struct.calcsize("P") == 8 + + +def _to_bytes(s): + return s.encode('utf8') + + +def isascii(s): + try: + s.encode('ascii') + except UnicodeError: + return False + return True + + +def _one_liner(text): + return textwrap.dedent(text).strip().replace('\n', '; ') + + +class easy_install(Command): + """Manage a download/build/install process""" + + description = "Find/get/install Python packages" + command_consumes_arguments = True + + user_options = [ + ('prefix=', None, "installation prefix"), + ("zip-ok", "z", "install package as a zipfile"), + ("multi-version", "m", "make apps have to require() a version"), + ("upgrade", "U", "force upgrade (searches PyPI for latest versions)"), + ("install-dir=", "d", "install package to DIR"), + ("script-dir=", "s", "install scripts to DIR"), + ("exclude-scripts", "x", "Don't install scripts"), + ("always-copy", "a", "Copy all needed packages to install dir"), + ("index-url=", "i", "base URL of Python Package Index"), + ("find-links=", "f", "additional URL(s) to search for packages"), + ("build-directory=", "b", "download/extract/build in DIR; keep the results"), + ( + 'optimize=', + 'O', + 'also compile with optimization: -O1 for "python -O", ' + '-O2 for "python -OO", and -O0 to disable [default: -O0]', + ), + ('record=', None, "filename in which to record list of installed files"), + ('always-unzip', 'Z', "don't install as a zipfile, no matter what"), + ('site-dirs=', 'S', "list of directories where .pth files work"), + ('editable', 'e', "Install specified packages in editable form"), + ('no-deps', 'N', "don't install dependencies"), + ('allow-hosts=', 'H', "pattern(s) that hostnames must match"), + ('local-snapshots-ok', 'l', "allow building eggs from local checkouts"), + ('version', None, "print version information and exit"), + ( + 'no-find-links', + None, + "Don't load find-links defined in packages being installed", + ), + ('user', None, f"install in user site-package '{site.USER_SITE}'"), + ] + boolean_options = [ + 'zip-ok', + 'multi-version', + 'exclude-scripts', + 'upgrade', + 'always-copy', + 'editable', + 'no-deps', + 'local-snapshots-ok', + 'version', + 'user', + ] + + negative_opt = {'always-unzip': 'zip-ok'} + create_index = PackageIndex + + def initialize_options(self): + EasyInstallDeprecationWarning.emit() + + # the --user option seems to be an opt-in one, + # so the default should be False. + self.user = False + self.zip_ok = self.local_snapshots_ok = None + self.install_dir = self.script_dir = self.exclude_scripts = None + self.index_url = None + self.find_links = None + self.build_directory = None + self.args = None + self.optimize = self.record = None + self.upgrade = self.always_copy = self.multi_version = None + self.editable = self.no_deps = self.allow_hosts = None + self.root = self.prefix = self.no_report = None + self.version = None + self.install_purelib = None # for pure module distributions + self.install_platlib = None # non-pure (dists w/ extensions) + self.install_headers = None # for C/C++ headers + self.install_lib = None # set to either purelib or platlib + self.install_scripts = None + self.install_data = None + self.install_base = None + self.install_platbase = None + self.install_userbase = site.USER_BASE + self.install_usersite = site.USER_SITE + self.no_find_links = None + + # Options not specifiable via command line + self.package_index = None + self.pth_file = self.always_copy_from = None + self.site_dirs = None + self.installed_projects = {} + # Always read easy_install options, even if we are subclassed, or have + # an independent instance created. This ensures that defaults will + # always come from the standard configuration file(s)' "easy_install" + # section, even if this is a "develop" or "install" command, or some + # other embedding. + self._dry_run = None + self.verbose = self.distribution.verbose + self.distribution._set_command_options( + self, self.distribution.get_option_dict('easy_install') + ) + + def delete_blockers(self, blockers) -> None: + extant_blockers = ( + filename + for filename in blockers + if os.path.exists(filename) or os.path.islink(filename) + ) + list(map(self._delete_path, extant_blockers)) + + def _delete_path(self, path): + log.info("Deleting %s", path) + if self.dry_run: + return + + is_tree = os.path.isdir(path) and not os.path.islink(path) + remover = _rmtree if is_tree else os.unlink + remover(path) + + @staticmethod + def _render_version(): + """ + Render the Setuptools version and installation details, then exit. + """ + ver = f'{sys.version_info.major}.{sys.version_info.minor}' + dist = get_distribution('setuptools') + print(f'setuptools {dist.version} from {dist.location} (Python {ver})') + raise SystemExit + + def finalize_options(self) -> None: # noqa: C901 # is too complex (25) # FIXME + self.version and self._render_version() + + py_version = sys.version.split()[0] + + self.config_vars = dict(sysconfig.get_config_vars()) + + self.config_vars.update({ + 'dist_name': self.distribution.get_name(), + 'dist_version': self.distribution.get_version(), + 'dist_fullname': self.distribution.get_fullname(), + 'py_version': py_version, + 'py_version_short': f'{sys.version_info.major}.{sys.version_info.minor}', + 'py_version_nodot': f'{sys.version_info.major}{sys.version_info.minor}', + 'sys_prefix': self.config_vars['prefix'], + 'sys_exec_prefix': self.config_vars['exec_prefix'], + # Only POSIX systems have abiflags + 'abiflags': getattr(sys, 'abiflags', ''), + # Only python 3.9+ has platlibdir + 'platlibdir': getattr(sys, 'platlibdir', 'lib'), + }) + with contextlib.suppress(AttributeError): + # only for distutils outside stdlib + self.config_vars.update({ + 'implementation_lower': install._get_implementation().lower(), + 'implementation': install._get_implementation(), + }) + + # pypa/distutils#113 Python 3.9 compat + self.config_vars.setdefault( + 'py_version_nodot_plat', + getattr(sys, 'windir', '').replace('.', ''), + ) + + self.config_vars['userbase'] = self.install_userbase + self.config_vars['usersite'] = self.install_usersite + if self.user and not site.ENABLE_USER_SITE: + log.warn("WARNING: The user site-packages directory is disabled.") + + self._fix_install_dir_for_user_site() + + self.expand_basedirs() + self.expand_dirs() + + self._expand( + 'install_dir', + 'script_dir', + 'build_directory', + 'site_dirs', + ) + # If a non-default installation directory was specified, default the + # script directory to match it. + if self.script_dir is None: + self.script_dir = self.install_dir + + if self.no_find_links is None: + self.no_find_links = False + + # Let install_dir get set by install_lib command, which in turn + # gets its info from the install command, and takes into account + # --prefix and --home and all that other crud. + self.set_undefined_options('install_lib', ('install_dir', 'install_dir')) + # Likewise, set default script_dir from 'install_scripts.install_dir' + self.set_undefined_options('install_scripts', ('install_dir', 'script_dir')) + + if self.user and self.install_purelib: + self.install_dir = self.install_purelib + self.script_dir = self.install_scripts + # default --record from the install command + self.set_undefined_options('install', ('record', 'record')) + self.all_site_dirs = get_site_dirs() + self.all_site_dirs.extend(self._process_site_dirs(self.site_dirs)) + + if not self.editable: + self.check_site_dir() + default_index = os.getenv("__EASYINSTALL_INDEX", "https://pypi.org/simple/") + # ^ Private API for testing purposes only + self.index_url = self.index_url or default_index + self.shadow_path = self.all_site_dirs[:] + for path_item in self.install_dir, normalize_path(self.script_dir): + if path_item not in self.shadow_path: + self.shadow_path.insert(0, path_item) + + if self.allow_hosts is not None: + hosts = [s.strip() for s in self.allow_hosts.split(',')] + else: + hosts = ['*'] + if self.package_index is None: + self.package_index = self.create_index( + self.index_url, + search_path=self.shadow_path, + hosts=hosts, + ) + self.local_index = Environment(self.shadow_path + sys.path) + + if self.find_links is not None: + if isinstance(self.find_links, str): + self.find_links = self.find_links.split() + else: + self.find_links = [] + if self.local_snapshots_ok: + self.package_index.scan_egg_links(self.shadow_path + sys.path) + if not self.no_find_links: + self.package_index.add_find_links(self.find_links) + self.set_undefined_options('install_lib', ('optimize', 'optimize')) + self.optimize = self._validate_optimize(self.optimize) + + if self.editable and not self.build_directory: + raise DistutilsArgError( + "Must specify a build directory (-b) when using --editable" + ) + if not self.args: + raise DistutilsArgError( + "No urls, filenames, or requirements specified (see --help)" + ) + + self.outputs: list[str] = [] + + @staticmethod + def _process_site_dirs(site_dirs): + if site_dirs is None: + return + + normpath = map(normalize_path, sys.path) + site_dirs = [os.path.expanduser(s.strip()) for s in site_dirs.split(',')] + for d in site_dirs: + if not os.path.isdir(d): + log.warn("%s (in --site-dirs) does not exist", d) + elif normalize_path(d) not in normpath: + raise DistutilsOptionError(d + " (in --site-dirs) is not on sys.path") + else: + yield normalize_path(d) + + @staticmethod + def _validate_optimize(value): + try: + value = int(value) + if value not in range(3): + raise ValueError + except ValueError as e: + raise DistutilsOptionError("--optimize must be 0, 1, or 2") from e + + return value + + def _fix_install_dir_for_user_site(self): + """ + Fix the install_dir if "--user" was used. + """ + if not self.user: + return + + self.create_home_path() + if self.install_userbase is None: + msg = "User base directory is not specified" + raise DistutilsPlatformError(msg) + self.install_base = self.install_platbase = self.install_userbase + scheme_name = f'{os.name}_user' + self.select_scheme(scheme_name) + + def _expand_attrs(self, attrs): + for attr in attrs: + val = getattr(self, attr) + if val is not None: + if os.name == 'posix' or os.name == 'nt': + val = os.path.expanduser(val) + val = subst_vars(val, self.config_vars) + setattr(self, attr, val) + + def expand_basedirs(self) -> None: + """Calls `os.path.expanduser` on install_base, install_platbase and + root.""" + self._expand_attrs(['install_base', 'install_platbase', 'root']) + + def expand_dirs(self) -> None: + """Calls `os.path.expanduser` on install dirs.""" + dirs = [ + 'install_purelib', + 'install_platlib', + 'install_lib', + 'install_headers', + 'install_scripts', + 'install_data', + ] + self._expand_attrs(dirs) + + def run(self, show_deprecation: bool = True) -> None: + if show_deprecation: + self.announce( + "WARNING: The easy_install command is deprecated " + "and will be removed in a future version.", + log.WARN, + ) + if self.verbose != self.distribution.verbose: + log.set_verbosity(self.verbose) + try: + for spec in self.args: + self.easy_install(spec, not self.no_deps) + if self.record: + outputs = self.outputs + if self.root: # strip any package prefix + root_len = len(self.root) + for counter in range(len(outputs)): + outputs[counter] = outputs[counter][root_len:] + from distutils import file_util + + self.execute( + file_util.write_file, + (self.record, outputs), + f"writing list of installed files to '{self.record}'", + ) + self.warn_deprecated_options() + finally: + log.set_verbosity(self.distribution.verbose) + + def pseudo_tempname(self): + """Return a pseudo-tempname base in the install directory. + This code is intentionally naive; if a malicious party can write to + the target directory you're already in deep doodoo. + """ + try: + pid = os.getpid() + except Exception: + pid = random.randint(0, sys.maxsize) + return os.path.join(self.install_dir, f"test-easy-install-{pid}") + + def warn_deprecated_options(self) -> None: + pass + + def check_site_dir(self) -> None: # is too complex (12) # FIXME + """Verify that self.install_dir is .pth-capable dir, if needed""" + + instdir = normalize_path(self.install_dir) + pth_file = os.path.join(instdir, 'easy-install.pth') + + if not os.path.exists(instdir): + try: + os.makedirs(instdir) + except OSError: + self.cant_write_to_target() + + # Is it a configured, PYTHONPATH, implicit, or explicit site dir? + is_site_dir = instdir in self.all_site_dirs + + if not is_site_dir and not self.multi_version: + # No? Then directly test whether it does .pth file processing + is_site_dir = self.check_pth_processing() + else: + # make sure we can write to target dir + testfile = self.pseudo_tempname() + '.write-test' + test_exists = os.path.exists(testfile) + try: + if test_exists: + os.unlink(testfile) + open(testfile, 'wb').close() + os.unlink(testfile) + except OSError: + self.cant_write_to_target() + + if not is_site_dir and not self.multi_version: + # Can't install non-multi to non-site dir with easy_install + pythonpath = os.environ.get('PYTHONPATH', '') + log.warn(self.__no_default_msg, self.install_dir, pythonpath) + + if is_site_dir: + if self.pth_file is None: + self.pth_file = PthDistributions(pth_file, self.all_site_dirs) + else: + self.pth_file = None + + if self.multi_version and not os.path.exists(pth_file): + self.pth_file = None # don't create a .pth file + self.install_dir = instdir + + __cant_write_msg = textwrap.dedent( + """ + can't create or remove files in install directory + + The following error occurred while trying to add or remove files in the + installation directory: + + %s + + The installation directory you specified (via --install-dir, --prefix, or + the distutils default setting) was: + + %s + """ + ).lstrip() + + __not_exists_id = textwrap.dedent( + """ + This directory does not currently exist. Please create it and try again, or + choose a different installation directory (using the -d or --install-dir + option). + """ + ).lstrip() + + __access_msg = textwrap.dedent( + """ + Perhaps your account does not have write access to this directory? If the + installation directory is a system-owned directory, you may need to sign in + as the administrator or "root" account. If you do not have administrative + access to this machine, you may wish to choose a different installation + directory, preferably one that is listed in your PYTHONPATH environment + variable. + + For information on other options, you may wish to consult the + documentation at: + + https://setuptools.pypa.io/en/latest/deprecated/easy_install.html + + Please make the appropriate changes for your system and try again. + """ + ).lstrip() + + def cant_write_to_target(self) -> NoReturn: + msg = self.__cant_write_msg % ( + sys.exc_info()[1], + self.install_dir, + ) + + if not os.path.exists(self.install_dir): + msg += '\n' + self.__not_exists_id + else: + msg += '\n' + self.__access_msg + raise DistutilsError(msg) + + def check_pth_processing(self): # noqa: C901 + """Empirically verify whether .pth files are supported in inst. dir""" + instdir = self.install_dir + log.info("Checking .pth file support in %s", instdir) + pth_file = self.pseudo_tempname() + ".pth" + ok_file = pth_file + '.ok' + ok_exists = os.path.exists(ok_file) + tmpl = ( + _one_liner( + """ + import os + f = open({ok_file!r}, 'w', encoding="utf-8") + f.write('OK') + f.close() + """ + ) + + '\n' + ) + try: + if ok_exists: + os.unlink(ok_file) + dirname = os.path.dirname(ok_file) + os.makedirs(dirname, exist_ok=True) + f = open(pth_file, 'w', encoding=py312.PTH_ENCODING) + # ^-- Python<3.13 require encoding="locale" instead of "utf-8", + # see python/cpython#77102. + except OSError: + self.cant_write_to_target() + else: + try: + f.write(tmpl.format(**locals())) + f.close() + f = None + executable = sys.executable + if os.name == 'nt': + dirname, basename = os.path.split(executable) + alt = os.path.join(dirname, 'pythonw.exe') + use_alt = basename.lower() == 'python.exe' and os.path.exists(alt) + if use_alt: + # use pythonw.exe to avoid opening a console window + executable = alt + + from distutils.spawn import spawn + + spawn([executable, '-E', '-c', 'pass'], 0) + + if os.path.exists(ok_file): + log.info("TEST PASSED: %s appears to support .pth files", instdir) + return True + finally: + if f: + f.close() + if os.path.exists(ok_file): + os.unlink(ok_file) + if os.path.exists(pth_file): + os.unlink(pth_file) + if not self.multi_version: + log.warn("TEST FAILED: %s does NOT support .pth files", instdir) + return False + + def install_egg_scripts(self, dist) -> None: + """Write all the scripts for `dist`, unless scripts are excluded""" + if not self.exclude_scripts and dist.metadata_isdir('scripts'): + for script_name in dist.metadata_listdir('scripts'): + if dist.metadata_isdir('scripts/' + script_name): + # The "script" is a directory, likely a Python 3 + # __pycache__ directory, so skip it. + continue + self.install_script( + dist, script_name, dist.get_metadata('scripts/' + script_name) + ) + self.install_wrapper_scripts(dist) + + def add_output(self, path) -> None: + if os.path.isdir(path): + for base, dirs, files in os.walk(path): + for filename in files: + self.outputs.append(os.path.join(base, filename)) + else: + self.outputs.append(path) + + def not_editable(self, spec) -> None: + if self.editable: + raise DistutilsArgError( + f"Invalid argument {spec!r}: you can't use filenames or URLs " + "with --editable (except via the --find-links option)." + ) + + def check_editable(self, spec) -> None: + if not self.editable: + return + + if os.path.exists(os.path.join(self.build_directory, spec.key)): + raise DistutilsArgError( + f"{spec.key!r} already exists in {self.build_directory}; can't do a checkout there" + ) + + @contextlib.contextmanager + def _tmpdir(self): + tmpdir = tempfile.mkdtemp(prefix="easy_install-") + try: + # cast to str as workaround for #709 and #710 and #712 + yield str(tmpdir) + finally: + os.path.exists(tmpdir) and _rmtree(tmpdir) + + def easy_install(self, spec, deps: bool = False) -> Distribution | None: + with self._tmpdir() as tmpdir: + if not isinstance(spec, Requirement): + if URL_SCHEME(spec): + # It's a url, download it to tmpdir and process + self.not_editable(spec) + dl = self.package_index.download(spec, tmpdir) + return self.install_item(None, dl, tmpdir, deps, True) + + elif os.path.exists(spec): + # Existing file or directory, just process it directly + self.not_editable(spec) + return self.install_item(None, spec, tmpdir, deps, True) + else: + spec = parse_requirement_arg(spec) + + self.check_editable(spec) + dist = self.package_index.fetch_distribution( + spec, + tmpdir, + self.upgrade, + self.editable, + not self.always_copy, + self.local_index, + ) + if dist is None: + msg = f"Could not find suitable distribution for {spec!r}" + if self.always_copy: + msg += " (--always-copy skips system and development eggs)" + raise DistutilsError(msg) + elif dist.precedence == DEVELOP_DIST: + # .egg-info dists don't need installing, just process deps + self.process_distribution(spec, dist, deps, "Using") + return dist + else: + return self.install_item(spec, dist.location, tmpdir, deps) + + def install_item( + self, spec, download, tmpdir, deps, install_needed: bool = False + ) -> Distribution | None: + # Installation is also needed if file in tmpdir or is not an egg + install_needed = install_needed or bool(self.always_copy) + install_needed = install_needed or os.path.dirname(download) == tmpdir + install_needed = install_needed or not download.endswith('.egg') + install_needed = install_needed or ( + self.always_copy_from is not None + and os.path.dirname(normalize_path(download)) + == normalize_path(self.always_copy_from) + ) + + if spec and not install_needed: + # at this point, we know it's a local .egg, we just don't know if + # it's already installed. + for dist in self.local_index[spec.project_name]: + if dist.location == download: + break + else: + install_needed = True # it's not in the local index + + log.info("Processing %s", os.path.basename(download)) + + if install_needed: + dists = self.install_eggs(spec, download, tmpdir) + for dist in dists: + self.process_distribution(spec, dist, deps) + else: + dists = [self.egg_distribution(download)] + self.process_distribution(spec, dists[0], deps, "Using") + + if spec is not None: + for dist in dists: + if dist in spec: + return dist + return None + + def select_scheme(self, name): + try: + install._select_scheme(self, name) + except AttributeError: + # stdlib distutils + install.install.select_scheme(self, name.replace('posix', 'unix')) + + # FIXME: 'easy_install.process_distribution' is too complex (12) + def process_distribution( # noqa: C901 + self, + requirement, + dist, + deps: bool = True, + *info, + ) -> None: + self.update_pth(dist) + self.package_index.add(dist) + if dist in self.local_index[dist.key]: + self.local_index.remove(dist) + self.local_index.add(dist) + self.install_egg_scripts(dist) + self.installed_projects[dist.key] = dist + log.info(self.installation_report(requirement, dist, *info)) + if dist.has_metadata('dependency_links.txt') and not self.no_find_links: + self.package_index.add_find_links( + dist.get_metadata_lines('dependency_links.txt') + ) + if not deps and not self.always_copy: + return + elif requirement is not None and dist.key != requirement.key: + log.warn("Skipping dependencies for %s", dist) + return # XXX this is not the distribution we were looking for + elif requirement is None or dist not in requirement: + # if we wound up with a different version, resolve what we've got + distreq = dist.as_requirement() + requirement = Requirement(str(distreq)) + log.info("Processing dependencies for %s", requirement) + try: + distros = WorkingSet([]).resolve( + [requirement], self.local_index, self.easy_install + ) + except DistributionNotFound as e: + raise DistutilsError(str(e)) from e + except VersionConflict as e: + raise DistutilsError(e.report()) from e + if self.always_copy or self.always_copy_from: + # Force all the relevant distros to be copied or activated + for dist in distros: + if dist.key not in self.installed_projects: + self.easy_install(dist.as_requirement()) + log.info("Finished processing dependencies for %s", requirement) + + def should_unzip(self, dist) -> bool: + if self.zip_ok is not None: + return not self.zip_ok + if dist.has_metadata('not-zip-safe'): + return True + if not dist.has_metadata('zip-safe'): + return True + return False + + def maybe_move(self, spec, dist_filename, setup_base): + dst = os.path.join(self.build_directory, spec.key) + if os.path.exists(dst): + msg = "%r already exists in %s; build directory %s will not be kept" + log.warn(msg, spec.key, self.build_directory, setup_base) + return setup_base + if os.path.isdir(dist_filename): + setup_base = dist_filename + else: + if os.path.dirname(dist_filename) == setup_base: + os.unlink(dist_filename) # get it out of the tmp dir + contents = os.listdir(setup_base) + if len(contents) == 1: + dist_filename = os.path.join(setup_base, contents[0]) + if os.path.isdir(dist_filename): + # if the only thing there is a directory, move it instead + setup_base = dist_filename + ensure_directory(dst) + shutil.move(setup_base, dst) + return dst + + def install_wrapper_scripts(self, dist) -> None: + if self.exclude_scripts: + return + for args in ScriptWriter.best().get_args(dist): + self.write_script(*args) + + def install_script(self, dist, script_name, script_text, dev_path=None) -> None: + """Generate a legacy script wrapper and install it""" + spec = str(dist.as_requirement()) + is_script = is_python_script(script_text, script_name) + + if is_script: + body = self._load_template(dev_path) % locals() + script_text = ScriptWriter.get_header(script_text) + body + self.write_script(script_name, _to_bytes(script_text), 'b') + + @staticmethod + def _load_template(dev_path): + """ + There are a couple of template scripts in the package. This + function loads one of them and prepares it for use. + """ + # See https://github.com/pypa/setuptools/issues/134 for info + # on script file naming and downstream issues with SVR4 + name = 'script.tmpl' + if dev_path: + name = name.replace('.tmpl', ' (dev).tmpl') + + raw_bytes = resource_string('setuptools', name) + return raw_bytes.decode('utf-8') + + def write_script(self, script_name, contents, mode: str = "t", blockers=()) -> None: + """Write an executable file to the scripts directory""" + self.delete_blockers( # clean up old .py/.pyw w/o a script + [os.path.join(self.script_dir, x) for x in blockers] + ) + log.info("Installing %s script to %s", script_name, self.script_dir) + target = os.path.join(self.script_dir, script_name) + self.add_output(target) + + if self.dry_run: + return + + mask = current_umask() + ensure_directory(target) + if os.path.exists(target): + os.unlink(target) + + encoding = None if "b" in mode else "utf-8" + with open(target, "w" + mode, encoding=encoding) as f: + f.write(contents) + chmod(target, 0o777 - mask) + + def install_eggs(self, spec, dist_filename, tmpdir) -> list[Distribution]: + # .egg dirs or files are already built, so just return them + installer_map = { + '.egg': self.install_egg, + '.exe': self.install_exe, + '.whl': self.install_wheel, + } + try: + install_dist = installer_map[dist_filename.lower()[-4:]] + except KeyError: + pass + else: + return [install_dist(dist_filename, tmpdir)] + + # Anything else, try to extract and build + setup_base = tmpdir + if os.path.isfile(dist_filename) and not dist_filename.endswith('.py'): + unpack_archive(dist_filename, tmpdir, self.unpack_progress) + elif os.path.isdir(dist_filename): + setup_base = os.path.abspath(dist_filename) + + if ( + setup_base.startswith(tmpdir) # something we downloaded + and self.build_directory + and spec is not None + ): + setup_base = self.maybe_move(spec, dist_filename, setup_base) + + # Find the setup.py file + setup_script = os.path.join(setup_base, 'setup.py') + + if not os.path.exists(setup_script): + setups = glob(os.path.join(setup_base, '*', 'setup.py')) + if not setups: + raise DistutilsError( + f"Couldn't find a setup script in {os.path.abspath(dist_filename)}" + ) + if len(setups) > 1: + raise DistutilsError( + f"Multiple setup scripts in {os.path.abspath(dist_filename)}" + ) + setup_script = setups[0] + + # Now run it, and return the result + if self.editable: + log.info(self.report_editable(spec, setup_script)) + return [] + else: + return self.build_and_install(setup_script, setup_base) + + def egg_distribution(self, egg_path): + if os.path.isdir(egg_path): + metadata = PathMetadata(egg_path, os.path.join(egg_path, 'EGG-INFO')) + else: + metadata = EggMetadata(zipimport.zipimporter(egg_path)) + return Distribution.from_filename(egg_path, metadata=metadata) + + # FIXME: 'easy_install.install_egg' is too complex (11) + def install_egg(self, egg_path, tmpdir): + destination = os.path.join( + self.install_dir, + os.path.basename(egg_path), + ) + destination = os.path.abspath(destination) + if not self.dry_run: + ensure_directory(destination) + + dist = self.egg_distribution(egg_path) + if not ( + os.path.exists(destination) and os.path.samefile(egg_path, destination) + ): + if os.path.isdir(destination) and not os.path.islink(destination): + dir_util.remove_tree(destination, dry_run=self.dry_run) + elif os.path.exists(destination): + self.execute( + os.unlink, + (destination,), + "Removing " + destination, + ) + try: + new_dist_is_zipped = False + if os.path.isdir(egg_path): + if egg_path.startswith(tmpdir): + f, m = shutil.move, "Moving" + else: + f, m = shutil.copytree, "Copying" + elif self.should_unzip(dist): + self.mkpath(destination) + f, m = self.unpack_and_compile, "Extracting" + else: + new_dist_is_zipped = True + if egg_path.startswith(tmpdir): + f, m = shutil.move, "Moving" + else: + f, m = shutil.copy2, "Copying" + self.execute( + f, + (egg_path, destination), + (m + " %s to %s") + % (os.path.basename(egg_path), os.path.dirname(destination)), + ) + update_dist_caches( + destination, + fix_zipimporter_caches=new_dist_is_zipped, + ) + except Exception: + update_dist_caches(destination, fix_zipimporter_caches=False) + raise + + self.add_output(destination) + return self.egg_distribution(destination) + + def install_exe(self, dist_filename, tmpdir): + # See if it's valid, get data + cfg = extract_wininst_cfg(dist_filename) + if cfg is None: + raise DistutilsError( + f"{dist_filename} is not a valid distutils Windows .exe" + ) + # Create a dummy distribution object until we build the real distro + dist = Distribution( + None, + project_name=cfg.get('metadata', 'name'), + version=cfg.get('metadata', 'version'), + platform=get_platform(), + ) + + # Convert the .exe to an unpacked egg + egg_path = os.path.join(tmpdir, dist.egg_name() + '.egg') + dist.location = egg_path + egg_tmp = egg_path + '.tmp' + _egg_info = os.path.join(egg_tmp, 'EGG-INFO') + pkg_inf = os.path.join(_egg_info, 'PKG-INFO') + ensure_directory(pkg_inf) # make sure EGG-INFO dir exists + dist._provider = PathMetadata(egg_tmp, _egg_info) # XXX + self.exe_to_egg(dist_filename, egg_tmp) + + # Write EGG-INFO/PKG-INFO + if not os.path.exists(pkg_inf): + with open(pkg_inf, 'w', encoding="utf-8") as f: + f.write('Metadata-Version: 1.0\n') + for k, v in cfg.items('metadata'): + if k != 'target_version': + k = k.replace('_', '-').title() + f.write(f'{k}: {v}\n') + script_dir = os.path.join(_egg_info, 'scripts') + # delete entry-point scripts to avoid duping + self.delete_blockers([ + os.path.join(script_dir, args[0]) for args in ScriptWriter.get_args(dist) + ]) + # Build .egg file from tmpdir + bdist_egg.make_zipfile( + egg_path, + egg_tmp, + verbose=self.verbose, + dry_run=self.dry_run, + ) + # install the .egg + return self.install_egg(egg_path, tmpdir) + + # FIXME: 'easy_install.exe_to_egg' is too complex (12) + def exe_to_egg(self, dist_filename, egg_tmp) -> None: # noqa: C901 + """Extract a bdist_wininst to the directories an egg would use""" + # Check for .pth file and set up prefix translations + prefixes = get_exe_prefixes(dist_filename) + to_compile = [] + native_libs = [] + top_level = set() + + def process(src, dst): + s = src.lower() + for old, new in prefixes: + if s.startswith(old): + src = new + src[len(old) :] + parts = src.split('/') + dst = os.path.join(egg_tmp, *parts) + dl = dst.lower() + if dl.endswith('.pyd') or dl.endswith('.dll'): + parts[-1] = bdist_egg.strip_module(parts[-1]) + top_level.add([os.path.splitext(parts[0])[0]]) + native_libs.append(src) + elif dl.endswith('.py') and old != 'SCRIPTS/': + top_level.add([os.path.splitext(parts[0])[0]]) + to_compile.append(dst) + return dst + if not src.endswith('.pth'): + log.warn("WARNING: can't process %s", src) + return None + + # extract, tracking .pyd/.dll->native_libs and .py -> to_compile + unpack_archive(dist_filename, egg_tmp, process) + stubs = [] + for res in native_libs: + if res.lower().endswith('.pyd'): # create stubs for .pyd's + parts = res.split('/') + resource = parts[-1] + parts[-1] = bdist_egg.strip_module(parts[-1]) + '.py' + pyfile = os.path.join(egg_tmp, *parts) + to_compile.append(pyfile) + stubs.append(pyfile) + bdist_egg.write_stub(resource, pyfile) + self.byte_compile(to_compile) # compile .py's + bdist_egg.write_safety_flag( + os.path.join(egg_tmp, 'EGG-INFO'), bdist_egg.analyze_egg(egg_tmp, stubs) + ) # write zip-safety flag + + for name in 'top_level', 'native_libs': + if locals()[name]: + txt = os.path.join(egg_tmp, 'EGG-INFO', name + '.txt') + if not os.path.exists(txt): + with open(txt, 'w', encoding="utf-8") as f: + f.write('\n'.join(locals()[name]) + '\n') + + def install_wheel(self, wheel_path, tmpdir): + wheel = Wheel(wheel_path) + assert wheel.is_compatible() + destination = os.path.join(self.install_dir, wheel.egg_name()) + destination = os.path.abspath(destination) + if not self.dry_run: + ensure_directory(destination) + if os.path.isdir(destination) and not os.path.islink(destination): + dir_util.remove_tree(destination, dry_run=self.dry_run) + elif os.path.exists(destination): + self.execute( + os.unlink, + (destination,), + "Removing " + destination, + ) + try: + self.execute( + wheel.install_as_egg, + (destination,), + ( + f"Installing {os.path.basename(wheel_path)} to {os.path.dirname(destination)}" + ), + ) + finally: + update_dist_caches(destination, fix_zipimporter_caches=False) + self.add_output(destination) + return self.egg_distribution(destination) + + __mv_warning = textwrap.dedent( + """ + Because this distribution was installed --multi-version, before you can + import modules from this package in an application, you will need to + 'import pkg_resources' and then use a 'require()' call similar to one of + these examples, in order to select the desired version: + + pkg_resources.require("%(name)s") # latest installed version + pkg_resources.require("%(name)s==%(version)s") # this exact version + pkg_resources.require("%(name)s>=%(version)s") # this version or higher + """ + ).lstrip() + + __id_warning = textwrap.dedent( + """ + Note also that the installation directory must be on sys.path at runtime for + this to work. (e.g. by being the application's script directory, by being on + PYTHONPATH, or by being added to sys.path by your code.) + """ + ) + + def installation_report(self, req, dist, what: str = "Installed") -> str: + """Helpful installation message for display to package users""" + msg = "\n%(what)s %(eggloc)s%(extras)s" + if self.multi_version and not self.no_report: + msg += '\n' + self.__mv_warning + if self.install_dir not in map(normalize_path, sys.path): + msg += '\n' + self.__id_warning + + eggloc = dist.location + name = dist.project_name + version = dist.version + extras = '' # TODO: self.report_extras(req, dist) + return msg % locals() + + __editable_msg = textwrap.dedent( + """ + Extracted editable version of %(spec)s to %(dirname)s + + If it uses setuptools in its setup script, you can activate it in + "development" mode by going to that directory and running:: + + %(python)s setup.py develop + + See the setuptools documentation for the "develop" command for more info. + """ + ).lstrip() + + def report_editable(self, spec, setup_script): + dirname = os.path.dirname(setup_script) + python = sys.executable + return '\n' + self.__editable_msg % locals() + + def run_setup(self, setup_script, setup_base, args) -> None: + sys.modules.setdefault('distutils.command.bdist_egg', bdist_egg) + sys.modules.setdefault('distutils.command.egg_info', egg_info) + + args = list(args) + if self.verbose > 2: + v = 'v' * (self.verbose - 1) + args.insert(0, '-' + v) + elif self.verbose < 2: + args.insert(0, '-q') + if self.dry_run: + args.insert(0, '-n') + log.info("Running %s %s", setup_script[len(setup_base) + 1 :], ' '.join(args)) + try: + run_setup(setup_script, args) + except SystemExit as v: + raise DistutilsError(f"Setup script exited with {v.args[0]}") from v + + def build_and_install(self, setup_script, setup_base): + args = ['bdist_egg', '--dist-dir'] + + dist_dir = tempfile.mkdtemp( + prefix='egg-dist-tmp-', dir=os.path.dirname(setup_script) + ) + try: + self._set_fetcher_options(os.path.dirname(setup_script)) + args.append(dist_dir) + + self.run_setup(setup_script, setup_base, args) + all_eggs = Environment([dist_dir]) + eggs = [ + self.install_egg(dist.location, setup_base) + for key in all_eggs + for dist in all_eggs[key] + ] + if not eggs and not self.dry_run: + log.warn("No eggs found in %s (setup script problem?)", dist_dir) + return eggs + finally: + _rmtree(dist_dir) + log.set_verbosity(self.verbose) # restore our log verbosity + + def _set_fetcher_options(self, base): + """ + When easy_install is about to run bdist_egg on a source dist, that + source dist might have 'setup_requires' directives, requiring + additional fetching. Ensure the fetcher options given to easy_install + are available to that command as well. + """ + # find the fetch options from easy_install and write them out + # to the setup.cfg file. + ei_opts = self.distribution.get_option_dict('easy_install').copy() + fetch_directives = ( + 'find_links', + 'site_dirs', + 'index_url', + 'optimize', + 'allow_hosts', + ) + fetch_options = {} + for key, val in ei_opts.items(): + if key not in fetch_directives: + continue + fetch_options[key] = val[1] + # create a settings dictionary suitable for `edit_config` + settings = dict(easy_install=fetch_options) + cfg_filename = os.path.join(base, 'setup.cfg') + setopt.edit_config(cfg_filename, settings) + + def update_pth(self, dist) -> None: # noqa: C901 # is too complex (11) # FIXME + if self.pth_file is None: + return + + for d in self.pth_file[dist.key]: # drop old entries + if not self.multi_version and d.location == dist.location: + continue + + log.info("Removing %s from easy-install.pth file", d) + self.pth_file.remove(d) + if d.location in self.shadow_path: + self.shadow_path.remove(d.location) + + if not self.multi_version: + if dist.location in self.pth_file.paths: + log.info( + "%s is already the active version in easy-install.pth", + dist, + ) + else: + log.info("Adding %s to easy-install.pth file", dist) + self.pth_file.add(dist) # add new entry + if dist.location not in self.shadow_path: + self.shadow_path.append(dist.location) + + if self.dry_run: + return + + self.pth_file.save() + + if dist.key != 'setuptools': + return + + # Ensure that setuptools itself never becomes unavailable! + # XXX should this check for latest version? + filename = os.path.join(self.install_dir, 'setuptools.pth') + if os.path.islink(filename): + os.unlink(filename) + + with open(filename, 'wt', encoding=py312.PTH_ENCODING) as f: + # ^-- Python<3.13 require encoding="locale" instead of "utf-8", + # see python/cpython#77102. + f.write(self.pth_file.make_relative(dist.location) + '\n') + + def unpack_progress(self, src, dst): + # Progress filter for unpacking + log.debug("Unpacking %s to %s", src, dst) + return dst # only unpack-and-compile skips files for dry run + + def unpack_and_compile(self, egg_path, destination) -> None: + to_compile = [] + to_chmod = [] + + def pf(src, dst): + if dst.endswith('.py') and not src.startswith('EGG-INFO/'): + to_compile.append(dst) + elif dst.endswith('.dll') or dst.endswith('.so'): + to_chmod.append(dst) + self.unpack_progress(src, dst) + return not self.dry_run and dst or None + + unpack_archive(egg_path, destination, pf) + self.byte_compile(to_compile) + if not self.dry_run: + for f in to_chmod: + mode = ((os.stat(f)[stat.ST_MODE]) | 0o555) & 0o7755 + chmod(f, mode) + + def byte_compile(self, to_compile) -> None: + if sys.dont_write_bytecode: + return + + from distutils.util import byte_compile + + try: + # try to make the byte compile messages quieter + log.set_verbosity(self.verbose - 1) + + byte_compile(to_compile, optimize=0, force=True, dry_run=self.dry_run) + if self.optimize: + byte_compile( + to_compile, + optimize=self.optimize, + force=True, + dry_run=self.dry_run, + ) + finally: + log.set_verbosity(self.verbose) # restore original verbosity + + __no_default_msg = textwrap.dedent( + """ + bad install directory or PYTHONPATH + + You are attempting to install a package to a directory that is not + on PYTHONPATH and which Python does not read ".pth" files from. The + installation directory you specified (via --install-dir, --prefix, or + the distutils default setting) was: + + %s + + and your PYTHONPATH environment variable currently contains: + + %r + + Here are some of your options for correcting the problem: + + * You can choose a different installation directory, i.e., one that is + on PYTHONPATH or supports .pth files + + * You can add the installation directory to the PYTHONPATH environment + variable. (It must then also be on PYTHONPATH whenever you run + Python and want to use the package(s) you are installing.) + + * You can set up the installation directory to support ".pth" files by + using one of the approaches described here: + + https://setuptools.pypa.io/en/latest/deprecated/easy_install.html#custom-installation-locations + + + Please make the appropriate changes for your system and try again. + """ + ).strip() + + def create_home_path(self) -> None: + """Create directories under ~.""" + if not self.user: + return + home = convert_path(os.path.expanduser("~")) + for path in only_strs(self.config_vars.values()): + if path.startswith(home) and not os.path.isdir(path): + self.debug_print(f"os.makedirs('{path}', 0o700)") + os.makedirs(path, 0o700) + + INSTALL_SCHEMES = dict( + posix=dict( + install_dir='$base/lib/python$py_version_short/site-packages', + script_dir='$base/bin', + ), + ) + + DEFAULT_SCHEME = dict( + install_dir='$base/Lib/site-packages', + script_dir='$base/Scripts', + ) + + def _expand(self, *attrs): + config_vars = self.get_finalized_command('install').config_vars + + if self.prefix: + # Set default install_dir/scripts from --prefix + config_vars = dict(config_vars) + config_vars['base'] = self.prefix + scheme = self.INSTALL_SCHEMES.get(os.name, self.DEFAULT_SCHEME) + for attr, val in scheme.items(): + if getattr(self, attr, None) is None: + setattr(self, attr, val) + + from distutils.util import subst_vars + + for attr in attrs: + val = getattr(self, attr) + if val is not None: + val = subst_vars(val, config_vars) + if os.name == 'posix': + val = os.path.expanduser(val) + setattr(self, attr, val) + + +def _pythonpath(): + items = os.environ.get('PYTHONPATH', '').split(os.pathsep) + return filter(None, items) + + +def get_site_dirs(): + """ + Return a list of 'site' dirs + """ + + sitedirs = [] + + # start with PYTHONPATH + sitedirs.extend(_pythonpath()) + + prefixes = [sys.prefix] + if sys.exec_prefix != sys.prefix: + prefixes.append(sys.exec_prefix) + for prefix in prefixes: + if not prefix: + continue + + if sys.platform in ('os2emx', 'riscos'): + sitedirs.append(os.path.join(prefix, "Lib", "site-packages")) + elif os.sep == '/': + sitedirs.extend([ + os.path.join( + prefix, + "lib", + f"python{sys.version_info.major}.{sys.version_info.minor}", + "site-packages", + ), + os.path.join(prefix, "lib", "site-python"), + ]) + else: + sitedirs.extend([ + prefix, + os.path.join(prefix, "lib", "site-packages"), + ]) + if sys.platform != 'darwin': + continue + + # for framework builds *only* we add the standard Apple + # locations. Currently only per-user, but /Library and + # /Network/Library could be added too + if 'Python.framework' not in prefix: + continue + + home = os.environ.get('HOME') + if not home: + continue + + home_sp = os.path.join( + home, + 'Library', + 'Python', + f'{sys.version_info.major}.{sys.version_info.minor}', + 'site-packages', + ) + sitedirs.append(home_sp) + lib_paths = get_path('purelib'), get_path('platlib') + + sitedirs.extend(s for s in lib_paths if s not in sitedirs) + + if site.ENABLE_USER_SITE: + sitedirs.append(site.USER_SITE) + + with contextlib.suppress(AttributeError): + sitedirs.extend(site.getsitepackages()) + + return list(map(normalize_path, sitedirs)) + + +def expand_paths(inputs): # noqa: C901 # is too complex (11) # FIXME + """Yield sys.path directories that might contain "old-style" packages""" + + seen = set() + + for dirname in inputs: + dirname = normalize_path(dirname) + if dirname in seen: + continue + + seen.add(dirname) + if not os.path.isdir(dirname): + continue + + files = os.listdir(dirname) + yield dirname, files + + for name in files: + if not name.endswith('.pth'): + # We only care about the .pth files + continue + if name in ('easy-install.pth', 'setuptools.pth'): + # Ignore .pth files that we control + continue + + # Read the .pth file + content = _read_pth(os.path.join(dirname, name)) + lines = list(yield_lines(content)) + + # Yield existing non-dupe, non-import directory lines from it + for line in lines: + if line.startswith("import"): + continue + + line = normalize_path(line.rstrip()) + if line in seen: + continue + + seen.add(line) + if not os.path.isdir(line): + continue + + yield line, os.listdir(line) + + +def extract_wininst_cfg(dist_filename): + """Extract configuration data from a bdist_wininst .exe + + Returns a configparser.RawConfigParser, or None + """ + f = open(dist_filename, 'rb') + try: + endrec = zipfile._EndRecData(f) + if endrec is None: + return None + + prepended = (endrec[9] - endrec[5]) - endrec[6] + if prepended < 12: # no wininst data here + return None + f.seek(prepended - 12) + + tag, cfglen, _bmlen = struct.unpack("egg path translations for a given .exe file""" + + prefixes = [ + ('PURELIB/', ''), + ('PLATLIB/pywin32_system32', ''), + ('PLATLIB/', ''), + ('SCRIPTS/', 'EGG-INFO/scripts/'), + ('DATA/lib/site-packages', ''), + ] + z = zipfile.ZipFile(exe_filename) + try: + for info in z.infolist(): + name = info.filename + parts = name.split('/') + if len(parts) == 3 and parts[2] == 'PKG-INFO': + if parts[1].endswith('.egg-info'): + prefixes.insert(0, ('/'.join(parts[:2]), 'EGG-INFO/')) + break + if len(parts) != 2 or not name.endswith('.pth'): + continue + if name.endswith('-nspkg.pth'): + continue + if parts[0].upper() in ('PURELIB', 'PLATLIB'): + contents = z.read(name).decode() + for pth in yield_lines(contents): + pth = pth.strip().replace('\\', '/') + if not pth.startswith('import'): + prefixes.append(((f'{parts[0]}/{pth}/'), '')) + finally: + z.close() + prefixes = [(x.lower(), y) for x, y in prefixes] + prefixes.sort() + prefixes.reverse() + return prefixes + + +class PthDistributions(Environment): + """A .pth file with Distribution paths in it""" + + def __init__(self, filename, sitedirs=()) -> None: + self.filename = filename + self.sitedirs = list(map(normalize_path, sitedirs)) + self.basedir = normalize_path(os.path.dirname(self.filename)) + self.paths, self.dirty = self._load() + # keep a copy if someone manually updates the paths attribute on the instance + self._init_paths = self.paths[:] + super().__init__([], None, None) + for path in yield_lines(self.paths): + list(map(self.add, find_distributions(path, True))) + + def _load_raw(self): + paths = [] + dirty = saw_import = False + seen = set(self.sitedirs) + content = _read_pth(self.filename) + for line in content.splitlines(): + path = line.rstrip() + # still keep imports and empty/commented lines for formatting + paths.append(path) + if line.startswith(('import ', 'from ')): + saw_import = True + continue + stripped_path = path.strip() + if not stripped_path or stripped_path.startswith('#'): + continue + # skip non-existent paths, in case somebody deleted a package + # manually, and duplicate paths as well + normalized_path = normalize_path(os.path.join(self.basedir, path)) + if normalized_path in seen or not os.path.exists(normalized_path): + log.debug("cleaned up dirty or duplicated %r", path) + dirty = True + paths.pop() + continue + seen.add(normalized_path) + # remove any trailing empty/blank line + while paths and not paths[-1].strip(): + paths.pop() + dirty = True + return paths, dirty or (paths and saw_import) + + def _load(self): + if os.path.isfile(self.filename): + return self._load_raw() + return [], False + + def save(self) -> None: + """Write changed .pth file back to disk""" + # first reload the file + last_paths, last_dirty = self._load() + # and check that there are no difference with what we have. + # there can be difference if someone else has written to the file + # since we first loaded it. + # we don't want to lose the eventual new paths added since then. + for path in last_paths[:]: + if path not in self.paths: + self.paths.append(path) + log.info("detected new path %r", path) + last_dirty = True + else: + last_paths.remove(path) + # also, re-check that all paths are still valid before saving them + for path in self.paths[:]: + if path not in last_paths and not path.startswith(( + 'import ', + 'from ', + '#', + )): + absolute_path = os.path.join(self.basedir, path) + if not os.path.exists(absolute_path): + self.paths.remove(path) + log.info("removing now non-existent path %r", path) + last_dirty = True + + self.dirty |= last_dirty or self.paths != self._init_paths + if not self.dirty: + return + + rel_paths = list(map(self.make_relative, self.paths)) + if rel_paths: + log.debug("Saving %s", self.filename) + lines = self._wrap_lines(rel_paths) + data = '\n'.join(lines) + '\n' + if os.path.islink(self.filename): + os.unlink(self.filename) + with open(self.filename, 'wt', encoding=py312.PTH_ENCODING) as f: + # ^-- Python<3.13 require encoding="locale" instead of "utf-8", + # see python/cpython#77102. + f.write(data) + elif os.path.exists(self.filename): + log.debug("Deleting empty %s", self.filename) + os.unlink(self.filename) + + self.dirty = False + self._init_paths[:] = self.paths[:] + + @staticmethod + def _wrap_lines(lines): + return lines + + def add(self, dist) -> None: + """Add `dist` to the distribution map""" + new_path = dist.location not in self.paths and ( + dist.location not in self.sitedirs + or + # account for '.' being in PYTHONPATH + dist.location == os.getcwd() + ) + if new_path: + self.paths.append(dist.location) + self.dirty = True + super().add(dist) + + def remove(self, dist) -> None: + """Remove `dist` from the distribution map""" + while dist.location in self.paths: + self.paths.remove(dist.location) + self.dirty = True + super().remove(dist) + + def make_relative(self, path): + npath, last = os.path.split(normalize_path(path)) + baselen = len(self.basedir) + parts = [last] + sep = os.altsep == '/' and '/' or os.sep + while len(npath) >= baselen: + if npath == self.basedir: + parts.append(os.curdir) + parts.reverse() + return sep.join(parts) + npath, last = os.path.split(npath) + parts.append(last) + else: + return path + + +class RewritePthDistributions(PthDistributions): + @classmethod + def _wrap_lines(cls, lines): + yield cls.prelude + yield from lines + yield cls.postlude + + prelude = _one_liner( + """ + import sys + sys.__plen = len(sys.path) + """ + ) + postlude = _one_liner( + """ + import sys + new = sys.path[sys.__plen:] + del sys.path[sys.__plen:] + p = getattr(sys, '__egginsert', 0) + sys.path[p:p] = new + sys.__egginsert = p + len(new) + """ + ) + + +if os.environ.get('SETUPTOOLS_SYS_PATH_TECHNIQUE', 'raw') == 'rewrite': + PthDistributions = RewritePthDistributions # type: ignore[misc] # Overwriting type + + +def _first_line_re(): + """ + Return a regular expression based on first_line_re suitable for matching + strings. + """ + if isinstance(first_line_re.pattern, str): + return first_line_re + + # first_line_re in Python >=3.1.4 and >=3.2.1 is a bytes pattern. + return re.compile(first_line_re.pattern.decode()) + + +def update_dist_caches(dist_path, fix_zipimporter_caches): + """ + Fix any globally cached `dist_path` related data + + `dist_path` should be a path of a newly installed egg distribution (zipped + or unzipped). + + sys.path_importer_cache contains finder objects that have been cached when + importing data from the original distribution. Any such finders need to be + cleared since the replacement distribution might be packaged differently, + e.g. a zipped egg distribution might get replaced with an unzipped egg + folder or vice versa. Having the old finders cached may then cause Python + to attempt loading modules from the replacement distribution using an + incorrect loader. + + zipimport.zipimporter objects are Python loaders charged with importing + data packaged inside zip archives. If stale loaders referencing the + original distribution, are left behind, they can fail to load modules from + the replacement distribution. E.g. if an old zipimport.zipimporter instance + is used to load data from a new zipped egg archive, it may cause the + operation to attempt to locate the requested data in the wrong location - + one indicated by the original distribution's zip archive directory + information. Such an operation may then fail outright, e.g. report having + read a 'bad local file header', or even worse, it may fail silently & + return invalid data. + + zipimport._zip_directory_cache contains cached zip archive directory + information for all existing zipimport.zipimporter instances and all such + instances connected to the same archive share the same cached directory + information. + + If asked, and the underlying Python implementation allows it, we can fix + all existing zipimport.zipimporter instances instead of having to track + them down and remove them one by one, by updating their shared cached zip + archive directory information. This, of course, assumes that the + replacement distribution is packaged as a zipped egg. + + If not asked to fix existing zipimport.zipimporter instances, we still do + our best to clear any remaining zipimport.zipimporter related cached data + that might somehow later get used when attempting to load data from the new + distribution and thus cause such load operations to fail. Note that when + tracking down such remaining stale data, we can not catch every conceivable + usage from here, and we clear only those that we know of and have found to + cause problems if left alive. Any remaining caches should be updated by + whomever is in charge of maintaining them, i.e. they should be ready to + handle us replacing their zip archives with new distributions at runtime. + + """ + # There are several other known sources of stale zipimport.zipimporter + # instances that we do not clear here, but might if ever given a reason to + # do so: + # * Global setuptools pkg_resources.working_set (a.k.a. 'master working + # set') may contain distributions which may in turn contain their + # zipimport.zipimporter loaders. + # * Several zipimport.zipimporter loaders held by local variables further + # up the function call stack when running the setuptools installation. + # * Already loaded modules may have their __loader__ attribute set to the + # exact loader instance used when importing them. Python 3.4 docs state + # that this information is intended mostly for introspection and so is + # not expected to cause us problems. + normalized_path = normalize_path(dist_path) + _uncache(normalized_path, sys.path_importer_cache) + if fix_zipimporter_caches: + _replace_zip_directory_cache_data(normalized_path) + else: + # Here, even though we do not want to fix existing and now stale + # zipimporter cache information, we still want to remove it. Related to + # Python's zip archive directory information cache, we clear each of + # its stale entries in two phases: + # 1. Clear the entry so attempting to access zip archive information + # via any existing stale zipimport.zipimporter instances fails. + # 2. Remove the entry from the cache so any newly constructed + # zipimport.zipimporter instances do not end up using old stale + # zip archive directory information. + # This whole stale data removal step does not seem strictly necessary, + # but has been left in because it was done before we started replacing + # the zip archive directory information cache content if possible, and + # there are no relevant unit tests that we can depend on to tell us if + # this is really needed. + _remove_and_clear_zip_directory_cache_data(normalized_path) + + +def _collect_zipimporter_cache_entries(normalized_path, cache): + """ + Return zipimporter cache entry keys related to a given normalized path. + + Alternative path spellings (e.g. those using different character case or + those using alternative path separators) related to the same path are + included. Any sub-path entries are included as well, i.e. those + corresponding to zip archives embedded in other zip archives. + + """ + result = [] + prefix_len = len(normalized_path) + for p in cache: + np = normalize_path(p) + if np.startswith(normalized_path) and np[prefix_len : prefix_len + 1] in ( + os.sep, + '', + ): + result.append(p) + return result + + +def _update_zipimporter_cache(normalized_path, cache, updater=None): + """ + Update zipimporter cache data for a given normalized path. + + Any sub-path entries are processed as well, i.e. those corresponding to zip + archives embedded in other zip archives. + + Given updater is a callable taking a cache entry key and the original entry + (after already removing the entry from the cache), and expected to update + the entry and possibly return a new one to be inserted in its place. + Returning None indicates that the entry should not be replaced with a new + one. If no updater is given, the cache entries are simply removed without + any additional processing, the same as if the updater simply returned None. + + """ + for p in _collect_zipimporter_cache_entries(normalized_path, cache): + # N.B. pypy's custom zipimport._zip_directory_cache implementation does + # not support the complete dict interface: + # * Does not support item assignment, thus not allowing this function + # to be used only for removing existing cache entries. + # * Does not support the dict.pop() method, forcing us to use the + # get/del patterns instead. For more detailed information see the + # following links: + # https://github.com/pypa/setuptools/issues/202#issuecomment-202913420 + # https://foss.heptapod.net/pypy/pypy/-/blob/144c4e65cb6accb8e592f3a7584ea38265d1873c/pypy/module/zipimport/interp_zipimport.py + old_entry = cache[p] + del cache[p] + new_entry = updater and updater(p, old_entry) + if new_entry is not None: + cache[p] = new_entry + + +def _uncache(normalized_path, cache): + _update_zipimporter_cache(normalized_path, cache) + + +def _remove_and_clear_zip_directory_cache_data(normalized_path): + def clear_and_remove_cached_zip_archive_directory_data(path, old_entry): + old_entry.clear() + + _update_zipimporter_cache( + normalized_path, + zipimport._zip_directory_cache, + updater=clear_and_remove_cached_zip_archive_directory_data, + ) + + +# PyPy Python implementation does not allow directly writing to the +# zipimport._zip_directory_cache and so prevents us from attempting to correct +# its content. The best we can do there is clear the problematic cache content +# and have PyPy repopulate it as needed. The downside is that if there are any +# stale zipimport.zipimporter instances laying around, attempting to use them +# will fail due to not having its zip archive directory information available +# instead of being automatically corrected to use the new correct zip archive +# directory information. +if '__pypy__' in sys.builtin_module_names: + _replace_zip_directory_cache_data = _remove_and_clear_zip_directory_cache_data +else: + + def _replace_zip_directory_cache_data(normalized_path): + def replace_cached_zip_archive_directory_data(path, old_entry): + # N.B. In theory, we could load the zip directory information just + # once for all updated path spellings, and then copy it locally and + # update its contained path strings to contain the correct + # spelling, but that seems like a way too invasive move (this cache + # structure is not officially documented anywhere and could in + # theory change with new Python releases) for no significant + # benefit. + old_entry.clear() + zipimport.zipimporter(path) + old_entry.update(zipimport._zip_directory_cache[path]) + return old_entry + + _update_zipimporter_cache( + normalized_path, + zipimport._zip_directory_cache, + updater=replace_cached_zip_archive_directory_data, + ) + + +def is_python(text, filename=''): + "Is this string a valid Python script?" + try: + compile(text, filename, 'exec') + except (SyntaxError, TypeError): + return False + else: + return True + + +def is_sh(executable): + """Determine if the specified executable is a .sh (contains a #! line)""" + try: + with open(executable, encoding='latin-1') as fp: + magic = fp.read(2) + except OSError: + return executable + return magic == '#!' + + +def nt_quote_arg(arg): + """Quote a command line argument according to Windows parsing rules""" + return subprocess.list2cmdline([arg]) + + +def is_python_script(script_text, filename): + """Is this text, as a whole, a Python script? (as opposed to shell/bat/etc.""" + if filename.endswith('.py') or filename.endswith('.pyw'): + return True # extension says it's Python + if is_python(script_text, filename): + return True # it's syntactically valid Python + if script_text.startswith('#!'): + # It begins with a '#!' line, so check if 'python' is in it somewhere + return 'python' in script_text.splitlines()[0].lower() + + return False # Not any Python I can recognize + + +class _SplitArgs(TypedDict, total=False): + comments: bool + posix: bool + + +class CommandSpec(list): + """ + A command spec for a #! header, specified as a list of arguments akin to + those passed to Popen. + """ + + options: list[str] = [] + split_args = _SplitArgs() + + @classmethod + def best(cls): + """ + Choose the best CommandSpec class based on environmental conditions. + """ + return cls + + @classmethod + def _sys_executable(cls): + _default = os.path.normpath(sys.executable) + return os.environ.get('__PYVENV_LAUNCHER__', _default) + + @classmethod + def from_param(cls, param: Self | str | Iterable[str] | None) -> Self: + """ + Construct a CommandSpec from a parameter to build_scripts, which may + be None. + """ + if isinstance(param, cls): + return param + if isinstance(param, str): + return cls.from_string(param) + if isinstance(param, Iterable): + return cls(param) + if param is None: + return cls.from_environment() + raise TypeError(f"Argument has an unsupported type {type(param)}") + + @classmethod + def from_environment(cls): + return cls([cls._sys_executable()]) + + @classmethod + def from_string(cls, string: str) -> Self: + """ + Construct a command spec from a simple string representing a command + line parseable by shlex.split. + """ + items = shlex.split(string, **cls.split_args) + return cls(items) + + def install_options(self, script_text: str): + self.options = shlex.split(self._extract_options(script_text)) + cmdline = subprocess.list2cmdline(self) + if not isascii(cmdline): + self.options[:0] = ['-x'] + + @staticmethod + def _extract_options(orig_script): + """ + Extract any options from the first line of the script. + """ + first = (orig_script + '\n').splitlines()[0] + match = _first_line_re().match(first) + options = match.group(1) or '' if match else '' + return options.strip() + + def as_header(self): + return self._render(self + list(self.options)) + + @staticmethod + def _strip_quotes(item): + _QUOTES = '"\'' + for q in _QUOTES: + if item.startswith(q) and item.endswith(q): + return item[1:-1] + return item + + @staticmethod + def _render(items): + cmdline = subprocess.list2cmdline( + CommandSpec._strip_quotes(item.strip()) for item in items + ) + return '#!' + cmdline + '\n' + + +# For pbr compat; will be removed in a future version. +sys_executable = CommandSpec._sys_executable() + + +class WindowsCommandSpec(CommandSpec): + split_args = _SplitArgs(posix=False) + + +class ScriptWriter: + """ + Encapsulates behavior around writing entry point scripts for console and + gui apps. + """ + + template = textwrap.dedent( + r""" + # EASY-INSTALL-ENTRY-SCRIPT: %(spec)r,%(group)r,%(name)r + import re + import sys + + # for compatibility with easy_install; see #2198 + __requires__ = %(spec)r + + try: + from importlib.metadata import distribution + except ImportError: + try: + from importlib_metadata import distribution + except ImportError: + from pkg_resources import load_entry_point + + + def importlib_load_entry_point(spec, group, name): + dist_name, _, _ = spec.partition('==') + matches = ( + entry_point + for entry_point in distribution(dist_name).entry_points + if entry_point.group == group and entry_point.name == name + ) + return next(matches).load() + + + globals().setdefault('load_entry_point', importlib_load_entry_point) + + + if __name__ == '__main__': + sys.argv[0] = re.sub(r'(-script\.pyw?|\.exe)?$', '', sys.argv[0]) + sys.exit(load_entry_point(%(spec)r, %(group)r, %(name)r)()) + """ + ).lstrip() + + command_spec_class = CommandSpec + + @classmethod + def get_args(cls, dist, header=None): + """ + Yield write_script() argument tuples for a distribution's + console_scripts and gui_scripts entry points. + """ + if header is None: + header = cls.get_header() + spec = str(dist.as_requirement()) + for type_ in 'console', 'gui': + group = type_ + '_scripts' + for name in dist.get_entry_map(group).keys(): + cls._ensure_safe_name(name) + script_text = cls.template % locals() + args = cls._get_script_args(type_, name, header, script_text) + yield from args + + @staticmethod + def _ensure_safe_name(name): + """ + Prevent paths in *_scripts entry point names. + """ + has_path_sep = re.search(r'[\\/]', name) + if has_path_sep: + raise ValueError("Path separators not allowed in script names") + + @classmethod + def best(cls): + """ + Select the best ScriptWriter for this environment. + """ + if sys.platform == 'win32' or (os.name == 'java' and os._name == 'nt'): + return WindowsScriptWriter.best() + else: + return cls + + @classmethod + def _get_script_args(cls, type_, name, header, script_text): + # Simply write the stub with no extension. + yield (name, header + script_text) + + @classmethod + def get_header( + cls, + script_text: str = "", + executable: str | CommandSpec | Iterable[str] | None = None, + ) -> str: + """Create a #! line, getting options (if any) from script_text""" + cmd = cls.command_spec_class.best().from_param(executable) + cmd.install_options(script_text) + return cmd.as_header() + + +class WindowsScriptWriter(ScriptWriter): + command_spec_class = WindowsCommandSpec + + @classmethod + def best(cls): + """ + Select the best ScriptWriter suitable for Windows + """ + writer_lookup = dict( + executable=WindowsExecutableLauncherWriter, + natural=cls, + ) + # for compatibility, use the executable launcher by default + launcher = os.environ.get('SETUPTOOLS_LAUNCHER', 'executable') + return writer_lookup[launcher] + + @classmethod + def _get_script_args(cls, type_, name, header, script_text): + "For Windows, add a .py extension" + ext = dict(console='.pya', gui='.pyw')[type_] + if ext not in os.environ['PATHEXT'].lower().split(';'): + msg = ( + "{ext} not listed in PATHEXT; scripts will not be " + "recognized as executables." + ).format(**locals()) + SetuptoolsWarning.emit(msg) + old = ['.pya', '.py', '-script.py', '.pyc', '.pyo', '.pyw', '.exe'] + old.remove(ext) + header = cls._adjust_header(type_, header) + blockers = [name + x for x in old] + yield name + ext, header + script_text, 't', blockers + + @classmethod + def _adjust_header(cls, type_, orig_header): + """ + Make sure 'pythonw' is used for gui and 'python' is used for + console (regardless of what sys.executable is). + """ + pattern = 'pythonw.exe' + repl = 'python.exe' + if type_ == 'gui': + pattern, repl = repl, pattern + pattern_ob = re.compile(re.escape(pattern), re.IGNORECASE) + new_header = pattern_ob.sub(string=orig_header, repl=repl) + return new_header if cls._use_header(new_header) else orig_header + + @staticmethod + def _use_header(new_header): + """ + Should _adjust_header use the replaced header? + + On non-windows systems, always use. On + Windows systems, only use the replaced header if it resolves + to an executable on the system. + """ + clean_header = new_header[2:-1].strip('"') + return sys.platform != 'win32' or shutil.which(clean_header) + + +class WindowsExecutableLauncherWriter(WindowsScriptWriter): + @classmethod + def _get_script_args(cls, type_, name, header, script_text): + """ + For Windows, add a .py extension and an .exe launcher + """ + if type_ == 'gui': + launcher_type = 'gui' + ext = '-script.pyw' + old = ['.pyw'] + else: + launcher_type = 'cli' + ext = '-script.py' + old = ['.py', '.pyc', '.pyo'] + hdr = cls._adjust_header(type_, header) + blockers = [name + x for x in old] + yield (name + ext, hdr + script_text, 't', blockers) + yield ( + name + '.exe', + get_win_launcher(launcher_type), + 'b', # write in binary mode + ) + if not is_64bit(): + # install a manifest for the launcher to prevent Windows + # from detecting it as an installer (which it will for + # launchers like easy_install.exe). Consider only + # adding a manifest for launchers detected as installers. + # See Distribute #143 for details. + m_name = name + '.exe.manifest' + yield (m_name, load_launcher_manifest(name), 't') + + +def get_win_launcher(type): + """ + Load the Windows launcher (executable) suitable for launching a script. + + `type` should be either 'cli' or 'gui' + + Returns the executable as a byte string. + """ + launcher_fn = f'{type}.exe' + if is_64bit(): + if get_platform() == "win-arm64": + launcher_fn = launcher_fn.replace(".", "-arm64.") + else: + launcher_fn = launcher_fn.replace(".", "-64.") + else: + launcher_fn = launcher_fn.replace(".", "-32.") + return resource_string('setuptools', launcher_fn) + + +def load_launcher_manifest(name): + manifest = pkg_resources.resource_string(__name__, 'launcher manifest.xml') + return manifest.decode('utf-8') % vars() + + +def current_umask(): + tmp = os.umask(0o022) + os.umask(tmp) + return tmp + + +def only_strs(values): + """ + Exclude non-str values. Ref #3063. + """ + return filter(lambda val: isinstance(val, str), values) + + +def _read_pth(fullname: str) -> str: + # Python<3.13 require encoding="locale" instead of "utf-8", see python/cpython#77102 + # In the case old versions of setuptools are producing `pth` files with + # different encodings that might be problematic... So we fallback to "locale". + + try: + with open(fullname, encoding=py312.PTH_ENCODING) as f: + return f.read() + except UnicodeDecodeError: # pragma: no cover + # This error may only happen for Python >= 3.13 + # TODO: Possible deprecation warnings to be added in the future: + # ``.pth file {fullname!r} is not UTF-8.`` + # Your environment contain {fullname!r} that cannot be read as UTF-8. + # This is likely to have been produced with an old version of setuptools. + # Please be mindful that this is deprecated and in the future, non-utf8 + # .pth files may cause setuptools to fail. + with open(fullname, encoding=py39.LOCALE_ENCODING) as f: + return f.read() + + +class EasyInstallDeprecationWarning(SetuptoolsDeprecationWarning): + _SUMMARY = "easy_install command is deprecated." + _DETAILS = """ + Please avoid running ``setup.py`` and ``easy_install``. + Instead, use pypa/build, pypa/installer or other + standards-based tools. + """ + _SEE_URL = "https://github.com/pypa/setuptools/issues/917" + # _DUE_DATE not defined yet diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/setuptools/command/editable_wheel.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/setuptools/command/editable_wheel.py new file mode 100644 index 0000000000000000000000000000000000000000..b03e677757a9b808265e419b03da302e52a41df8 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/setuptools/command/editable_wheel.py @@ -0,0 +1,925 @@ +""" +Create a wheel that, when installed, will make the source package 'editable' +(add it to the interpreter's path, including metadata) per PEP 660. Replaces +'setup.py develop'. + +.. note:: + One of the mechanisms briefly mentioned in PEP 660 to implement editable installs is + to create a separated directory inside ``build`` and use a .pth file to point to that + directory. In the context of this file such directory is referred as + *auxiliary build directory* or ``auxiliary_dir``. +""" + +from __future__ import annotations + +import io +import logging +import os +import shutil +import traceback +from collections.abc import Iterable, Iterator, Mapping +from contextlib import suppress +from enum import Enum +from inspect import cleandoc +from itertools import chain, starmap +from pathlib import Path +from tempfile import TemporaryDirectory +from types import TracebackType +from typing import TYPE_CHECKING, Protocol, TypeVar, cast + +from .. import Command, _normalization, _path, _shutil, errors, namespaces +from .._path import StrPath +from ..compat import py312 +from ..discovery import find_package_path +from ..dist import Distribution +from ..warnings import InformationOnly, SetuptoolsDeprecationWarning, SetuptoolsWarning +from .build import build as build_cls +from .build_py import build_py as build_py_cls +from .dist_info import dist_info as dist_info_cls +from .egg_info import egg_info as egg_info_cls +from .install import install as install_cls +from .install_scripts import install_scripts as install_scripts_cls + +if TYPE_CHECKING: + from typing_extensions import Self + + from .._vendor.wheel.wheelfile import WheelFile + +_P = TypeVar("_P", bound=StrPath) +_logger = logging.getLogger(__name__) + + +class _EditableMode(Enum): + """ + Possible editable installation modes: + `lenient` (new files automatically added to the package - DEFAULT); + `strict` (requires a new installation when files are added/removed); or + `compat` (attempts to emulate `python setup.py develop` - DEPRECATED). + """ + + STRICT = "strict" + LENIENT = "lenient" + COMPAT = "compat" # TODO: Remove `compat` after Dec/2022. + + @classmethod + def convert(cls, mode: str | None) -> _EditableMode: + if not mode: + return _EditableMode.LENIENT # default + + _mode = mode.upper() + if _mode not in _EditableMode.__members__: + raise errors.OptionError(f"Invalid editable mode: {mode!r}. Try: 'strict'.") + + if _mode == "COMPAT": + SetuptoolsDeprecationWarning.emit( + "Compat editable installs", + """ + The 'compat' editable mode is transitional and will be removed + in future versions of `setuptools`. + Please adapt your code accordingly to use either the 'strict' or the + 'lenient' modes. + """, + see_docs="userguide/development_mode.html", + # TODO: define due_date + # There is a series of shortcomings with the available editable install + # methods, and they are very controversial. This is something that still + # needs work. + # Moreover, `pip` is still hiding this warning, so users are not aware. + ) + + return _EditableMode[_mode] + + +_STRICT_WARNING = """ +New or renamed files may not be automatically picked up without a new installation. +""" + +_LENIENT_WARNING = """ +Options like `package-data`, `include/exclude-package-data` or +`packages.find.exclude/include` may have no effect. +""" + + +class editable_wheel(Command): + """Build 'editable' wheel for development. + This command is private and reserved for internal use of setuptools, + users should rely on ``setuptools.build_meta`` APIs. + """ + + description = "DO NOT CALL DIRECTLY, INTERNAL ONLY: create PEP 660 editable wheel" + + user_options = [ + ("dist-dir=", "d", "directory to put final built distributions in"), + ("dist-info-dir=", "I", "path to a pre-build .dist-info directory"), + ("mode=", None, cleandoc(_EditableMode.__doc__ or "")), + ] + + def initialize_options(self): + self.dist_dir = None + self.dist_info_dir = None + self.project_dir = None + self.mode = None + + def finalize_options(self) -> None: + dist = self.distribution + self.project_dir = dist.src_root or os.curdir + self.package_dir = dist.package_dir or {} + self.dist_dir = Path(self.dist_dir or os.path.join(self.project_dir, "dist")) + + def run(self) -> None: + try: + self.dist_dir.mkdir(exist_ok=True) + self._ensure_dist_info() + + # Add missing dist_info files + self.reinitialize_command("bdist_wheel") + bdist_wheel = self.get_finalized_command("bdist_wheel") + bdist_wheel.write_wheelfile(self.dist_info_dir) + + self._create_wheel_file(bdist_wheel) + except Exception: + traceback.print_exc() + project = self.distribution.name or self.distribution.get_name() + _DebuggingTips.emit(project=project) + raise + + def _ensure_dist_info(self): + if self.dist_info_dir is None: + dist_info = cast(dist_info_cls, self.reinitialize_command("dist_info")) + dist_info.output_dir = self.dist_dir + dist_info.ensure_finalized() + dist_info.run() + self.dist_info_dir = dist_info.dist_info_dir + else: + assert str(self.dist_info_dir).endswith(".dist-info") + assert Path(self.dist_info_dir, "METADATA").exists() + + def _install_namespaces(self, installation_dir, pth_prefix): + # XXX: Only required to support the deprecated namespace practice + dist = self.distribution + if not dist.namespace_packages: + return + + src_root = Path(self.project_dir, self.package_dir.get("", ".")).resolve() + installer = _NamespaceInstaller(dist, installation_dir, pth_prefix, src_root) + installer.install_namespaces() + + def _find_egg_info_dir(self) -> str | None: + parent_dir = Path(self.dist_info_dir).parent if self.dist_info_dir else Path() + candidates = map(str, parent_dir.glob("*.egg-info")) + return next(candidates, None) + + def _configure_build( + self, name: str, unpacked_wheel: StrPath, build_lib: StrPath, tmp_dir: StrPath + ): + """Configure commands to behave in the following ways: + + - Build commands can write to ``build_lib`` if they really want to... + (but this folder is expected to be ignored and modules are expected to live + in the project directory...) + - Binary extensions should be built in-place (editable_mode = True) + - Data/header/script files are not part of the "editable" specification + so they are written directly to the unpacked_wheel directory. + """ + # Non-editable files (data, headers, scripts) are written directly to the + # unpacked_wheel + + dist = self.distribution + wheel = str(unpacked_wheel) + build_lib = str(build_lib) + data = str(Path(unpacked_wheel, f"{name}.data", "data")) + headers = str(Path(unpacked_wheel, f"{name}.data", "headers")) + scripts = str(Path(unpacked_wheel, f"{name}.data", "scripts")) + + # egg-info may be generated again to create a manifest (used for package data) + egg_info = cast( + egg_info_cls, dist.reinitialize_command("egg_info", reinit_subcommands=True) + ) + egg_info.egg_base = str(tmp_dir) + egg_info.ignore_egg_info_in_manifest = True + + build = cast( + build_cls, dist.reinitialize_command("build", reinit_subcommands=True) + ) + install = cast( + install_cls, dist.reinitialize_command("install", reinit_subcommands=True) + ) + + build.build_platlib = build.build_purelib = build.build_lib = build_lib + install.install_purelib = install.install_platlib = install.install_lib = wheel + install.install_scripts = build.build_scripts = scripts + install.install_headers = headers + install.install_data = data + + install_scripts = cast( + install_scripts_cls, dist.get_command_obj("install_scripts") + ) + install_scripts.no_ep = True + + build.build_temp = str(tmp_dir) + + build_py = cast(build_py_cls, dist.get_command_obj("build_py")) + build_py.compile = False + build_py.existing_egg_info_dir = self._find_egg_info_dir() + + self._set_editable_mode() + + build.ensure_finalized() + install.ensure_finalized() + + def _set_editable_mode(self): + """Set the ``editable_mode`` flag in the build sub-commands""" + dist = self.distribution + build = dist.get_command_obj("build") + for cmd_name in build.get_sub_commands(): + cmd = dist.get_command_obj(cmd_name) + if hasattr(cmd, "editable_mode"): + cmd.editable_mode = True + elif hasattr(cmd, "inplace"): + cmd.inplace = True # backward compatibility with distutils + + def _collect_build_outputs(self) -> tuple[list[str], dict[str, str]]: + files: list[str] = [] + mapping: dict[str, str] = {} + build = self.get_finalized_command("build") + + for cmd_name in build.get_sub_commands(): + cmd = self.get_finalized_command(cmd_name) + if hasattr(cmd, "get_outputs"): + files.extend(cmd.get_outputs() or []) + if hasattr(cmd, "get_output_mapping"): + mapping.update(cmd.get_output_mapping() or {}) + + return files, mapping + + def _run_build_commands( + self, + dist_name: str, + unpacked_wheel: StrPath, + build_lib: StrPath, + tmp_dir: StrPath, + ) -> tuple[list[str], dict[str, str]]: + self._configure_build(dist_name, unpacked_wheel, build_lib, tmp_dir) + self._run_build_subcommands() + files, mapping = self._collect_build_outputs() + self._run_install("headers") + self._run_install("scripts") + self._run_install("data") + return files, mapping + + def _run_build_subcommands(self) -> None: + """ + Issue #3501 indicates that some plugins/customizations might rely on: + + 1. ``build_py`` not running + 2. ``build_py`` always copying files to ``build_lib`` + + However both these assumptions may be false in editable_wheel. + This method implements a temporary workaround to support the ecosystem + while the implementations catch up. + """ + # TODO: Once plugins/customisations had the chance to catch up, replace + # `self._run_build_subcommands()` with `self.run_command("build")`. + # Also remove _safely_run, TestCustomBuildPy. Suggested date: Aug/2023. + build = self.get_finalized_command("build") + for name in build.get_sub_commands(): + cmd = self.get_finalized_command(name) + if name == "build_py" and type(cmd) is not build_py_cls: + self._safely_run(name) + else: + self.run_command(name) + + def _safely_run(self, cmd_name: str): + try: + return self.run_command(cmd_name) + except Exception: + SetuptoolsDeprecationWarning.emit( + "Customization incompatible with editable install", + f""" + {traceback.format_exc()} + + If you are seeing this warning it is very likely that a setuptools + plugin or customization overrides the `{cmd_name}` command, without + taking into consideration how editable installs run build steps + starting from setuptools v64.0.0. + + Plugin authors and developers relying on custom build steps are + encouraged to update their `{cmd_name}` implementation considering the + information about editable installs in + https://setuptools.pypa.io/en/latest/userguide/extension.html. + + For the time being `setuptools` will silence this error and ignore + the faulty command, but this behaviour will change in future versions. + """, + # TODO: define due_date + # There is a series of shortcomings with the available editable install + # methods, and they are very controversial. This is something that still + # needs work. + ) + + def _create_wheel_file(self, bdist_wheel): + from wheel.wheelfile import WheelFile + + dist_info = self.get_finalized_command("dist_info") + dist_name = dist_info.name + tag = "-".join(bdist_wheel.get_tag()) + build_tag = "0.editable" # According to PEP 427 needs to start with digit + archive_name = f"{dist_name}-{build_tag}-{tag}.whl" + wheel_path = Path(self.dist_dir, archive_name) + if wheel_path.exists(): + wheel_path.unlink() + + unpacked_wheel = TemporaryDirectory(suffix=archive_name) + build_lib = TemporaryDirectory(suffix=".build-lib") + build_tmp = TemporaryDirectory(suffix=".build-temp") + + with unpacked_wheel as unpacked, build_lib as lib, build_tmp as tmp: + unpacked_dist_info = Path(unpacked, Path(self.dist_info_dir).name) + shutil.copytree(self.dist_info_dir, unpacked_dist_info) + self._install_namespaces(unpacked, dist_name) + files, mapping = self._run_build_commands(dist_name, unpacked, lib, tmp) + strategy = self._select_strategy(dist_name, tag, lib) + with strategy, WheelFile(wheel_path, "w") as wheel_obj: + strategy(wheel_obj, files, mapping) + wheel_obj.write_files(unpacked) + + return wheel_path + + def _run_install(self, category: str): + has_category = getattr(self.distribution, f"has_{category}", None) + if has_category and has_category(): + _logger.info(f"Installing {category} as non editable") + self.run_command(f"install_{category}") + + def _select_strategy( + self, + name: str, + tag: str, + build_lib: StrPath, + ) -> EditableStrategy: + """Decides which strategy to use to implement an editable installation.""" + build_name = f"__editable__.{name}-{tag}" + project_dir = Path(self.project_dir) + mode = _EditableMode.convert(self.mode) + + if mode is _EditableMode.STRICT: + auxiliary_dir = _empty_dir(Path(self.project_dir, "build", build_name)) + return _LinkTree(self.distribution, name, auxiliary_dir, build_lib) + + packages = _find_packages(self.distribution) + has_simple_layout = _simple_layout(packages, self.package_dir, project_dir) + is_compat_mode = mode is _EditableMode.COMPAT + if set(self.package_dir) == {""} and has_simple_layout or is_compat_mode: + # src-layout(ish) is relatively safe for a simple pth file + src_dir = self.package_dir.get("", ".") + return _StaticPth(self.distribution, name, [Path(project_dir, src_dir)]) + + # Use a MetaPathFinder to avoid adding accidental top-level packages/modules + return _TopLevelFinder(self.distribution, name) + + +class EditableStrategy(Protocol): + def __call__( + self, wheel: WheelFile, files: list[str], mapping: Mapping[str, str] + ) -> object: ... + def __enter__(self) -> Self: ... + def __exit__( + self, + _exc_type: type[BaseException] | None, + _exc_value: BaseException | None, + _traceback: TracebackType | None, + ) -> object: ... + + +class _StaticPth: + def __init__(self, dist: Distribution, name: str, path_entries: list[Path]) -> None: + self.dist = dist + self.name = name + self.path_entries = path_entries + + def __call__(self, wheel: WheelFile, files: list[str], mapping: Mapping[str, str]): + entries = "\n".join(str(p.resolve()) for p in self.path_entries) + contents = _encode_pth(f"{entries}\n") + wheel.writestr(f"__editable__.{self.name}.pth", contents) + + def __enter__(self) -> Self: + msg = f""" + Editable install will be performed using .pth file to extend `sys.path` with: + {list(map(os.fspath, self.path_entries))!r} + """ + _logger.warning(msg + _LENIENT_WARNING) + return self + + def __exit__( + self, + _exc_type: object, + _exc_value: object, + _traceback: object, + ) -> None: + pass + + +class _LinkTree(_StaticPth): + """ + Creates a ``.pth`` file that points to a link tree in the ``auxiliary_dir``. + + This strategy will only link files (not dirs), so it can be implemented in + any OS, even if that means using hardlinks instead of symlinks. + + By collocating ``auxiliary_dir`` and the original source code, limitations + with hardlinks should be avoided. + """ + + def __init__( + self, + dist: Distribution, + name: str, + auxiliary_dir: StrPath, + build_lib: StrPath, + ) -> None: + self.auxiliary_dir = Path(auxiliary_dir) + self.build_lib = Path(build_lib).resolve() + self._file = dist.get_command_obj("build_py").copy_file + super().__init__(dist, name, [self.auxiliary_dir]) + + def __call__(self, wheel: WheelFile, files: list[str], mapping: Mapping[str, str]): + self._create_links(files, mapping) + super().__call__(wheel, files, mapping) + + def _normalize_output(self, file: str) -> str | None: + # Files relative to build_lib will be normalized to None + with suppress(ValueError): + path = Path(file).resolve().relative_to(self.build_lib) + return str(path).replace(os.sep, '/') + return None + + def _create_file(self, relative_output: str, src_file: str, link=None): + dest = self.auxiliary_dir / relative_output + if not dest.parent.is_dir(): + dest.parent.mkdir(parents=True) + self._file(src_file, dest, link=link) + + def _create_links(self, outputs, output_mapping: Mapping[str, str]): + self.auxiliary_dir.mkdir(parents=True, exist_ok=True) + link_type = "sym" if _can_symlink_files(self.auxiliary_dir) else "hard" + normalised = ((self._normalize_output(k), v) for k, v in output_mapping.items()) + # remove files that are not relative to build_lib + mappings = {k: v for k, v in normalised if k is not None} + + for output in outputs: + relative = self._normalize_output(output) + if relative and relative not in mappings: + self._create_file(relative, output) + + for relative, src in mappings.items(): + self._create_file(relative, src, link=link_type) + + def __enter__(self) -> Self: + msg = "Strict editable install will be performed using a link tree.\n" + _logger.warning(msg + _STRICT_WARNING) + return self + + def __exit__( + self, + _exc_type: object, + _exc_value: object, + _traceback: object, + ) -> None: + msg = f"""\n + Strict editable installation performed using the auxiliary directory: + {self.auxiliary_dir} + + Please be careful to not remove this directory, otherwise you might not be able + to import/use your package. + """ + InformationOnly.emit("Editable installation.", msg) + + +class _TopLevelFinder: + def __init__(self, dist: Distribution, name: str) -> None: + self.dist = dist + self.name = name + + def template_vars(self) -> tuple[str, str, dict[str, str], dict[str, list[str]]]: + src_root = self.dist.src_root or os.curdir + top_level = chain(_find_packages(self.dist), _find_top_level_modules(self.dist)) + package_dir = self.dist.package_dir or {} + roots = _find_package_roots(top_level, package_dir, src_root) + + namespaces_ = dict( + chain( + _find_namespaces(self.dist.packages or [], roots), + ((ns, []) for ns in _find_virtual_namespaces(roots)), + ) + ) + + legacy_namespaces = { + pkg: find_package_path(pkg, roots, self.dist.src_root or "") + for pkg in self.dist.namespace_packages or [] + } + + mapping = {**roots, **legacy_namespaces} + # ^-- We need to explicitly add the legacy_namespaces to the mapping to be + # able to import their modules even if another package sharing the same + # namespace is installed in a conventional (non-editable) way. + + name = f"__editable__.{self.name}.finder" + finder = _normalization.safe_identifier(name) + return finder, name, mapping, namespaces_ + + def get_implementation(self) -> Iterator[tuple[str, bytes]]: + finder, name, mapping, namespaces_ = self.template_vars() + + content = bytes(_finder_template(name, mapping, namespaces_), "utf-8") + yield (f"{finder}.py", content) + + content = _encode_pth(f"import {finder}; {finder}.install()") + yield (f"__editable__.{self.name}.pth", content) + + def __call__(self, wheel: WheelFile, files: list[str], mapping: Mapping[str, str]): + for file, content in self.get_implementation(): + wheel.writestr(file, content) + + def __enter__(self) -> Self: + msg = "Editable install will be performed using a meta path finder.\n" + _logger.warning(msg + _LENIENT_WARNING) + return self + + def __exit__( + self, + _exc_type: object, + _exc_value: object, + _traceback: object, + ) -> None: + msg = """\n + Please be careful with folders in your working directory with the same + name as your package as they may take precedence during imports. + """ + InformationOnly.emit("Editable installation.", msg) + + +def _encode_pth(content: str) -> bytes: + """ + Prior to Python 3.13 (see https://github.com/python/cpython/issues/77102), + .pth files are always read with 'locale' encoding, the recommendation + from the cpython core developers is to write them as ``open(path, "w")`` + and ignore warnings (see python/cpython#77102, pypa/setuptools#3937). + This function tries to simulate this behaviour without having to create an + actual file, in a way that supports a range of active Python versions. + (There seems to be some variety in the way different version of Python handle + ``encoding=None``, not all of them use ``locale.getpreferredencoding(False)`` + or ``locale.getencoding()``). + """ + with io.BytesIO() as buffer: + wrapper = io.TextIOWrapper(buffer, encoding=py312.PTH_ENCODING) + # TODO: Python 3.13 replace the whole function with `bytes(content, "utf-8")` + wrapper.write(content) + wrapper.flush() + buffer.seek(0) + return buffer.read() + + +def _can_symlink_files(base_dir: Path) -> bool: + with TemporaryDirectory(dir=str(base_dir.resolve())) as tmp: + path1, path2 = Path(tmp, "file1.txt"), Path(tmp, "file2.txt") + path1.write_text("file1", encoding="utf-8") + with suppress(AttributeError, NotImplementedError, OSError): + os.symlink(path1, path2) + if path2.is_symlink() and path2.read_text(encoding="utf-8") == "file1": + return True + + try: + os.link(path1, path2) # Ensure hard links can be created + except Exception as ex: + msg = ( + "File system does not seem to support either symlinks or hard links. " + "Strict editable installs require one of them to be supported." + ) + raise LinksNotSupported(msg) from ex + return False + + +def _simple_layout( + packages: Iterable[str], package_dir: dict[str, str], project_dir: StrPath +) -> bool: + """Return ``True`` if: + - all packages are contained by the same parent directory, **and** + - all packages become importable if the parent directory is added to ``sys.path``. + + >>> _simple_layout(['a'], {"": "src"}, "/tmp/myproj") + True + >>> _simple_layout(['a', 'a.b'], {"": "src"}, "/tmp/myproj") + True + >>> _simple_layout(['a', 'a.b'], {}, "/tmp/myproj") + True + >>> _simple_layout(['a', 'a.a1', 'a.a1.a2', 'b'], {"": "src"}, "/tmp/myproj") + True + >>> _simple_layout(['a', 'a.a1', 'a.a1.a2', 'b'], {"a": "a", "b": "b"}, ".") + True + >>> _simple_layout(['a', 'a.a1', 'a.a1.a2', 'b'], {"a": "_a", "b": "_b"}, ".") + False + >>> _simple_layout(['a', 'a.a1', 'a.a1.a2', 'b'], {"a": "_a"}, "/tmp/myproj") + False + >>> _simple_layout(['a', 'a.a1', 'a.a1.a2', 'b'], {"a.a1.a2": "_a2"}, ".") + False + >>> _simple_layout(['a', 'a.b'], {"": "src", "a.b": "_ab"}, "/tmp/myproj") + False + >>> # Special cases, no packages yet: + >>> _simple_layout([], {"": "src"}, "/tmp/myproj") + True + >>> _simple_layout([], {"a": "_a", "": "src"}, "/tmp/myproj") + False + """ + layout = {pkg: find_package_path(pkg, package_dir, project_dir) for pkg in packages} + if not layout: + return set(package_dir) in ({}, {""}) + parent = os.path.commonpath(starmap(_parent_path, layout.items())) + return all( + _path.same_path(Path(parent, *key.split('.')), value) + for key, value in layout.items() + ) + + +def _parent_path(pkg, pkg_path): + """Infer the parent path containing a package, that if added to ``sys.path`` would + allow importing that package. + When ``pkg`` is directly mapped into a directory with a different name, return its + own path. + >>> _parent_path("a", "src/a") + 'src' + >>> _parent_path("b", "src/c") + 'src/c' + """ + parent = pkg_path[: -len(pkg)] if pkg_path.endswith(pkg) else pkg_path + return parent.rstrip("/" + os.sep) + + +def _find_packages(dist: Distribution) -> Iterator[str]: + yield from iter(dist.packages or []) + + py_modules = dist.py_modules or [] + nested_modules = [mod for mod in py_modules if "." in mod] + if dist.ext_package: + yield dist.ext_package + else: + ext_modules = dist.ext_modules or [] + nested_modules += [x.name for x in ext_modules if "." in x.name] + + for module in nested_modules: + package, _, _ = module.rpartition(".") + yield package + + +def _find_top_level_modules(dist: Distribution) -> Iterator[str]: + py_modules = dist.py_modules or [] + yield from (mod for mod in py_modules if "." not in mod) + + if not dist.ext_package: + ext_modules = dist.ext_modules or [] + yield from (x.name for x in ext_modules if "." not in x.name) + + +def _find_package_roots( + packages: Iterable[str], + package_dir: Mapping[str, str], + src_root: StrPath, +) -> dict[str, str]: + pkg_roots: dict[str, str] = { + pkg: _absolute_root(find_package_path(pkg, package_dir, src_root)) + for pkg in sorted(packages) + } + + return _remove_nested(pkg_roots) + + +def _absolute_root(path: StrPath) -> str: + """Works for packages and top-level modules""" + path_ = Path(path) + parent = path_.parent + + if path_.exists(): + return str(path_.resolve()) + else: + return str(parent.resolve() / path_.name) + + +def _find_virtual_namespaces(pkg_roots: dict[str, str]) -> Iterator[str]: + """By carefully designing ``package_dir``, it is possible to implement the logical + structure of PEP 420 in a package without the corresponding directories. + + Moreover a parent package can be purposefully/accidentally skipped in the discovery + phase (e.g. ``find_packages(include=["mypkg.*"])``, when ``mypkg.foo`` is included + by ``mypkg`` itself is not). + We consider this case to also be a virtual namespace (ignoring the original + directory) to emulate a non-editable installation. + + This function will try to find these kinds of namespaces. + """ + for pkg in pkg_roots: + if "." not in pkg: + continue + parts = pkg.split(".") + for i in range(len(parts) - 1, 0, -1): + partial_name = ".".join(parts[:i]) + path = Path(find_package_path(partial_name, pkg_roots, "")) + if not path.exists() or partial_name not in pkg_roots: + # partial_name not in pkg_roots ==> purposefully/accidentally skipped + yield partial_name + + +def _find_namespaces( + packages: list[str], pkg_roots: dict[str, str] +) -> Iterator[tuple[str, list[str]]]: + for pkg in packages: + path = find_package_path(pkg, pkg_roots, "") + if Path(path).exists() and not Path(path, "__init__.py").exists(): + yield (pkg, [path]) + + +def _remove_nested(pkg_roots: dict[str, str]) -> dict[str, str]: + output = dict(pkg_roots.copy()) + + for pkg, path in reversed(list(pkg_roots.items())): + if any( + pkg != other and _is_nested(pkg, path, other, other_path) + for other, other_path in pkg_roots.items() + ): + output.pop(pkg) + + return output + + +def _is_nested(pkg: str, pkg_path: str, parent: str, parent_path: str) -> bool: + """ + Return ``True`` if ``pkg`` is nested inside ``parent`` both logically and in the + file system. + >>> _is_nested("a.b", "path/a/b", "a", "path/a") + True + >>> _is_nested("a.b", "path/a/b", "a", "otherpath/a") + False + >>> _is_nested("a.b", "path/a/b", "c", "path/c") + False + >>> _is_nested("a.a", "path/a/a", "a", "path/a") + True + >>> _is_nested("b.a", "path/b/a", "a", "path/a") + False + """ + norm_pkg_path = _path.normpath(pkg_path) + rest = pkg.replace(parent, "", 1).strip(".").split(".") + return pkg.startswith(parent) and norm_pkg_path == _path.normpath( + Path(parent_path, *rest) + ) + + +def _empty_dir(dir_: _P) -> _P: + """Create a directory ensured to be empty. Existing files may be removed.""" + _shutil.rmtree(dir_, ignore_errors=True) + os.makedirs(dir_) + return dir_ + + +class _NamespaceInstaller(namespaces.Installer): + def __init__(self, distribution, installation_dir, editable_name, src_root) -> None: + self.distribution = distribution + self.src_root = src_root + self.installation_dir = installation_dir + self.editable_name = editable_name + self.outputs: list[str] = [] + self.dry_run = False + + def _get_nspkg_file(self): + """Installation target.""" + return os.path.join(self.installation_dir, self.editable_name + self.nspkg_ext) + + def _get_root(self): + """Where the modules/packages should be loaded from.""" + return repr(str(self.src_root)) + + +_FINDER_TEMPLATE = """\ +from __future__ import annotations +import sys +from importlib.machinery import ModuleSpec, PathFinder +from importlib.machinery import all_suffixes as module_suffixes +from importlib.util import spec_from_file_location +from itertools import chain +from pathlib import Path + +MAPPING: dict[str, str] = {mapping!r} +NAMESPACES: dict[str, list[str]] = {namespaces!r} +PATH_PLACEHOLDER = {name!r} + ".__path_hook__" + + +class _EditableFinder: # MetaPathFinder + @classmethod + def find_spec(cls, fullname: str, path=None, target=None) -> ModuleSpec | None: # type: ignore + # Top-level packages and modules (we know these exist in the FS) + if fullname in MAPPING: + pkg_path = MAPPING[fullname] + return cls._find_spec(fullname, Path(pkg_path)) + + # Handle immediate children modules (required for namespaces to work) + # To avoid problems with case sensitivity in the file system we delegate + # to the importlib.machinery implementation. + parent, _, child = fullname.rpartition(".") + if parent and parent in MAPPING: + return PathFinder.find_spec(fullname, path=[MAPPING[parent]]) + + # Other levels of nesting should be handled automatically by importlib + # using the parent path. + return None + + @classmethod + def _find_spec(cls, fullname: str, candidate_path: Path) -> ModuleSpec | None: + init = candidate_path / "__init__.py" + candidates = (candidate_path.with_suffix(x) for x in module_suffixes()) + for candidate in chain([init], candidates): + if candidate.exists(): + return spec_from_file_location(fullname, candidate) + return None + + +class _EditableNamespaceFinder: # PathEntryFinder + @classmethod + def _path_hook(cls, path) -> type[_EditableNamespaceFinder]: + if path == PATH_PLACEHOLDER: + return cls + raise ImportError + + @classmethod + def _paths(cls, fullname: str) -> list[str]: + paths = NAMESPACES[fullname] + if not paths and fullname in MAPPING: + paths = [MAPPING[fullname]] + # Always add placeholder, for 2 reasons: + # 1. __path__ cannot be empty for the spec to be considered namespace. + # 2. In the case of nested namespaces, we need to force + # import machinery to query _EditableNamespaceFinder again. + return [*paths, PATH_PLACEHOLDER] + + @classmethod + def find_spec(cls, fullname: str, target=None) -> ModuleSpec | None: # type: ignore + if fullname in NAMESPACES: + spec = ModuleSpec(fullname, None, is_package=True) + spec.submodule_search_locations = cls._paths(fullname) + return spec + return None + + @classmethod + def find_module(cls, _fullname) -> None: + return None + + +def install(): + if not any(finder == _EditableFinder for finder in sys.meta_path): + sys.meta_path.append(_EditableFinder) + + if not NAMESPACES: + return + + if not any(hook == _EditableNamespaceFinder._path_hook for hook in sys.path_hooks): + # PathEntryFinder is needed to create NamespaceSpec without private APIS + sys.path_hooks.append(_EditableNamespaceFinder._path_hook) + if PATH_PLACEHOLDER not in sys.path: + sys.path.append(PATH_PLACEHOLDER) # Used just to trigger the path hook +""" + + +def _finder_template( + name: str, mapping: Mapping[str, str], namespaces: dict[str, list[str]] +) -> str: + """Create a string containing the code for the``MetaPathFinder`` and + ``PathEntryFinder``. + """ + mapping = dict(sorted(mapping.items(), key=lambda p: p[0])) + return _FINDER_TEMPLATE.format(name=name, mapping=mapping, namespaces=namespaces) + + +class LinksNotSupported(errors.FileError): + """File system does not seem to support either symlinks or hard links.""" + + +class _DebuggingTips(SetuptoolsWarning): + _SUMMARY = "Problem in editable installation." + _DETAILS = """ + An error happened while installing `{project}` in editable mode. + + The following steps are recommended to help debug this problem: + + - Try to install the project normally, without using the editable mode. + Does the error still persist? + (If it does, try fixing the problem before attempting the editable mode). + - If you are using binary extensions, make sure you have all OS-level + dependencies installed (e.g. compilers, toolchains, binary libraries, ...). + - Try the latest version of setuptools (maybe the error was already fixed). + - If you (or your project dependencies) are using any setuptools extension + or customization, make sure they support the editable mode. + + After following the steps above, if the problem still persists and + you think this is related to how setuptools handles editable installations, + please submit a reproducible example + (see https://stackoverflow.com/help/minimal-reproducible-example) to: + + https://github.com/pypa/setuptools/issues + """ + _SEE_DOCS = "userguide/development_mode.html" diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/setuptools/command/egg_info.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/setuptools/command/egg_info.py new file mode 100644 index 0000000000000000000000000000000000000000..f77631168fcdb60e14ddd757e18b4c301473aa82 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/setuptools/command/egg_info.py @@ -0,0 +1,720 @@ +"""setuptools.command.egg_info + +Create a distribution's .egg-info directory and contents""" + +import functools +import os +import re +import sys +import time +from collections.abc import Callable + +import packaging +import packaging.requirements +import packaging.version + +import setuptools.unicode_utils as unicode_utils +from setuptools import Command +from setuptools.command import bdist_egg +from setuptools.command.sdist import sdist, walk_revctrl +from setuptools.command.setopt import edit_config +from setuptools.glob import glob + +from .. import _entry_points, _normalization +from .._importlib import metadata +from ..warnings import SetuptoolsDeprecationWarning +from . import _requirestxt + +import distutils.errors +import distutils.filelist +from distutils import log +from distutils.errors import DistutilsInternalError +from distutils.filelist import FileList as _FileList +from distutils.util import convert_path + +PY_MAJOR = f'{sys.version_info.major}.{sys.version_info.minor}' + + +def translate_pattern(glob): # noqa: C901 # is too complex (14) # FIXME + """ + Translate a file path glob like '*.txt' in to a regular expression. + This differs from fnmatch.translate which allows wildcards to match + directory separators. It also knows about '**/' which matches any number of + directories. + """ + pat = '' + + # This will split on '/' within [character classes]. This is deliberate. + chunks = glob.split(os.path.sep) + + sep = re.escape(os.sep) + valid_char = f'[^{sep}]' + + for c, chunk in enumerate(chunks): + last_chunk = c == len(chunks) - 1 + + # Chunks that are a literal ** are globstars. They match anything. + if chunk == '**': + if last_chunk: + # Match anything if this is the last component + pat += '.*' + else: + # Match '(name/)*' + pat += f'(?:{valid_char}+{sep})*' + continue # Break here as the whole path component has been handled + + # Find any special characters in the remainder + i = 0 + chunk_len = len(chunk) + while i < chunk_len: + char = chunk[i] + if char == '*': + # Match any number of name characters + pat += valid_char + '*' + elif char == '?': + # Match a name character + pat += valid_char + elif char == '[': + # Character class + inner_i = i + 1 + # Skip initial !/] chars + if inner_i < chunk_len and chunk[inner_i] == '!': + inner_i = inner_i + 1 + if inner_i < chunk_len and chunk[inner_i] == ']': + inner_i = inner_i + 1 + + # Loop till the closing ] is found + while inner_i < chunk_len and chunk[inner_i] != ']': + inner_i = inner_i + 1 + + if inner_i >= chunk_len: + # Got to the end of the string without finding a closing ] + # Do not treat this as a matching group, but as a literal [ + pat += re.escape(char) + else: + # Grab the insides of the [brackets] + inner = chunk[i + 1 : inner_i] + char_class = '' + + # Class negation + if inner[0] == '!': + char_class = '^' + inner = inner[1:] + + char_class += re.escape(inner) + pat += f'[{char_class}]' + + # Skip to the end ] + i = inner_i + else: + pat += re.escape(char) + i += 1 + + # Join each chunk with the dir separator + if not last_chunk: + pat += sep + + pat += r'\Z' + return re.compile(pat, flags=re.MULTILINE | re.DOTALL) + + +class InfoCommon: + tag_build = None + tag_date = None + + @property + def name(self): + return _normalization.safe_name(self.distribution.get_name()) + + def tagged_version(self): + tagged = self._maybe_tag(self.distribution.get_version()) + return _normalization.safe_version(tagged) + + def _maybe_tag(self, version): + """ + egg_info may be called more than once for a distribution, + in which case the version string already contains all tags. + """ + return ( + version + if self.vtags and self._already_tagged(version) + else version + self.vtags + ) + + def _already_tagged(self, version: str) -> bool: + # Depending on their format, tags may change with version normalization. + # So in addition the regular tags, we have to search for the normalized ones. + return version.endswith(self.vtags) or version.endswith(self._safe_tags()) + + def _safe_tags(self) -> str: + # To implement this we can rely on `safe_version` pretending to be version 0 + # followed by tags. Then we simply discard the starting 0 (fake version number) + try: + return _normalization.safe_version(f"0{self.vtags}")[1:] + except packaging.version.InvalidVersion: + return _normalization.safe_name(self.vtags.replace(' ', '.')) + + def tags(self) -> str: + version = '' + if self.tag_build: + version += self.tag_build + if self.tag_date: + version += time.strftime("%Y%m%d") + return version + + vtags = property(tags) + + +class egg_info(InfoCommon, Command): + description = "create a distribution's .egg-info directory" + + user_options = [ + ( + 'egg-base=', + 'e', + "directory containing .egg-info directories" + " [default: top of the source tree]", + ), + ('tag-date', 'd', "Add date stamp (e.g. 20050528) to version number"), + ('tag-build=', 'b', "Specify explicit tag to add to version number"), + ('no-date', 'D', "Don't include date stamp [default]"), + ] + + boolean_options = ['tag-date'] + negative_opt = { + 'no-date': 'tag-date', + } + + def initialize_options(self): + self.egg_base = None + self.egg_name = None + self.egg_info = None + self.egg_version = None + self.ignore_egg_info_in_manifest = False + + #################################### + # allow the 'tag_svn_revision' to be detected and + # set, supporting sdists built on older Setuptools. + @property + def tag_svn_revision(self) -> None: + pass + + @tag_svn_revision.setter + def tag_svn_revision(self, value): + pass + + #################################### + + def save_version_info(self, filename) -> None: + """ + Materialize the value of date into the + build tag. Install build keys in a deterministic order + to avoid arbitrary reordering on subsequent builds. + """ + # follow the order these keys would have been added + # when PYTHONHASHSEED=0 + egg_info = dict(tag_build=self.tags(), tag_date=0) + edit_config(filename, dict(egg_info=egg_info)) + + def finalize_options(self) -> None: + # Note: we need to capture the current value returned + # by `self.tagged_version()`, so we can later update + # `self.distribution.metadata.version` without + # repercussions. + self.egg_name = self.name + self.egg_version = self.tagged_version() + parsed_version = packaging.version.Version(self.egg_version) + + try: + is_version = isinstance(parsed_version, packaging.version.Version) + spec = "%s==%s" if is_version else "%s===%s" + packaging.requirements.Requirement(spec % (self.egg_name, self.egg_version)) + except ValueError as e: + raise distutils.errors.DistutilsOptionError( + f"Invalid distribution name or version syntax: {self.egg_name}-{self.egg_version}" + ) from e + + if self.egg_base is None: + dirs = self.distribution.package_dir + self.egg_base = (dirs or {}).get('', os.curdir) + + self.ensure_dirname('egg_base') + self.egg_info = _normalization.filename_component(self.egg_name) + '.egg-info' + if self.egg_base != os.curdir: + self.egg_info = os.path.join(self.egg_base, self.egg_info) + + # Set package version for the benefit of dumber commands + # (e.g. sdist, bdist_wininst, etc.) + # + self.distribution.metadata.version = self.egg_version + + def _get_egg_basename(self, py_version=PY_MAJOR, platform=None): + """Compute filename of the output egg. Private API.""" + return _egg_basename(self.egg_name, self.egg_version, py_version, platform) + + def write_or_delete_file(self, what, filename, data, force: bool = False) -> None: + """Write `data` to `filename` or delete if empty + + If `data` is non-empty, this routine is the same as ``write_file()``. + If `data` is empty but not ``None``, this is the same as calling + ``delete_file(filename)`. If `data` is ``None``, then this is a no-op + unless `filename` exists, in which case a warning is issued about the + orphaned file (if `force` is false), or deleted (if `force` is true). + """ + if data: + self.write_file(what, filename, data) + elif os.path.exists(filename): + if data is None and not force: + log.warn("%s not set in setup(), but %s exists", what, filename) + return + else: + self.delete_file(filename) + + def write_file(self, what, filename, data) -> None: + """Write `data` to `filename` (if not a dry run) after announcing it + + `what` is used in a log message to identify what is being written + to the file. + """ + log.info("writing %s to %s", what, filename) + data = data.encode("utf-8") + if not self.dry_run: + f = open(filename, 'wb') + f.write(data) + f.close() + + def delete_file(self, filename) -> None: + """Delete `filename` (if not a dry run) after announcing it""" + log.info("deleting %s", filename) + if not self.dry_run: + os.unlink(filename) + + def run(self) -> None: + # Pre-load to avoid iterating over entry-points while an empty .egg-info + # exists in sys.path. See pypa/pyproject-hooks#206 + writers = list(metadata.entry_points(group='egg_info.writers')) + + self.mkpath(self.egg_info) + try: + os.utime(self.egg_info, None) + except OSError as e: + msg = f"Cannot update time stamp of directory '{self.egg_info}'" + raise distutils.errors.DistutilsFileError(msg) from e + for ep in writers: + writer = ep.load() + writer(self, ep.name, os.path.join(self.egg_info, ep.name)) + + # Get rid of native_libs.txt if it was put there by older bdist_egg + nl = os.path.join(self.egg_info, "native_libs.txt") + if os.path.exists(nl): + self.delete_file(nl) + + self.find_sources() + + def find_sources(self) -> None: + """Generate SOURCES.txt manifest file""" + manifest_filename = os.path.join(self.egg_info, "SOURCES.txt") + mm = manifest_maker(self.distribution) + mm.ignore_egg_info_dir = self.ignore_egg_info_in_manifest + mm.manifest = manifest_filename + mm.run() + self.filelist = mm.filelist + + +class FileList(_FileList): + # Implementations of the various MANIFEST.in commands + + def __init__( + self, warn=None, debug_print=None, ignore_egg_info_dir: bool = False + ) -> None: + super().__init__(warn, debug_print) + self.ignore_egg_info_dir = ignore_egg_info_dir + + def process_template_line(self, line) -> None: + # Parse the line: split it up, make sure the right number of words + # is there, and return the relevant words. 'action' is always + # defined: it's the first word of the line. Which of the other + # three are defined depends on the action; it'll be either + # patterns, (dir and patterns), or (dir_pattern). + (action, patterns, dir, dir_pattern) = self._parse_template_line(line) + + action_map: dict[str, Callable] = { + 'include': self.include, + 'exclude': self.exclude, + 'global-include': self.global_include, + 'global-exclude': self.global_exclude, + 'recursive-include': functools.partial( + self.recursive_include, + dir, + ), + 'recursive-exclude': functools.partial( + self.recursive_exclude, + dir, + ), + 'graft': self.graft, + 'prune': self.prune, + } + log_map = { + 'include': "warning: no files found matching '%s'", + 'exclude': ("warning: no previously-included files found matching '%s'"), + 'global-include': ( + "warning: no files found matching '%s' anywhere in distribution" + ), + 'global-exclude': ( + "warning: no previously-included files matching " + "'%s' found anywhere in distribution" + ), + 'recursive-include': ( + "warning: no files found matching '%s' under directory '%s'" + ), + 'recursive-exclude': ( + "warning: no previously-included files matching " + "'%s' found under directory '%s'" + ), + 'graft': "warning: no directories found matching '%s'", + 'prune': "no previously-included directories found matching '%s'", + } + + try: + process_action = action_map[action] + except KeyError: + msg = f"Invalid MANIFEST.in: unknown action {action!r} in {line!r}" + raise DistutilsInternalError(msg) from None + + # OK, now we know that the action is valid and we have the + # right number of words on the line for that action -- so we + # can proceed with minimal error-checking. + + action_is_recursive = action.startswith('recursive-') + if action in {'graft', 'prune'}: + patterns = [dir_pattern] + extra_log_args = (dir,) if action_is_recursive else () + log_tmpl = log_map[action] + + self.debug_print( + ' '.join( + [action] + ([dir] if action_is_recursive else []) + patterns, + ) + ) + for pattern in patterns: + if not process_action(pattern): + log.warn(log_tmpl, pattern, *extra_log_args) + + def _remove_files(self, predicate): + """ + Remove all files from the file list that match the predicate. + Return True if any matching files were removed + """ + found = False + for i in range(len(self.files) - 1, -1, -1): + if predicate(self.files[i]): + self.debug_print(" removing " + self.files[i]) + del self.files[i] + found = True + return found + + def include(self, pattern): + """Include files that match 'pattern'.""" + found = [f for f in glob(pattern) if not os.path.isdir(f)] + self.extend(found) + return bool(found) + + def exclude(self, pattern): + """Exclude files that match 'pattern'.""" + match = translate_pattern(pattern) + return self._remove_files(match.match) + + def recursive_include(self, dir, pattern): + """ + Include all files anywhere in 'dir/' that match the pattern. + """ + full_pattern = os.path.join(dir, '**', pattern) + found = [f for f in glob(full_pattern, recursive=True) if not os.path.isdir(f)] + self.extend(found) + return bool(found) + + def recursive_exclude(self, dir, pattern): + """ + Exclude any file anywhere in 'dir/' that match the pattern. + """ + match = translate_pattern(os.path.join(dir, '**', pattern)) + return self._remove_files(match.match) + + def graft(self, dir): + """Include all files from 'dir/'.""" + found = [ + item + for match_dir in glob(dir) + for item in distutils.filelist.findall(match_dir) + ] + self.extend(found) + return bool(found) + + def prune(self, dir): + """Filter out files from 'dir/'.""" + match = translate_pattern(os.path.join(dir, '**')) + return self._remove_files(match.match) + + def global_include(self, pattern): + """ + Include all files anywhere in the current directory that match the + pattern. This is very inefficient on large file trees. + """ + if self.allfiles is None: + self.findall() + match = translate_pattern(os.path.join('**', pattern)) + found = [f for f in self.allfiles if match.match(f)] + self.extend(found) + return bool(found) + + def global_exclude(self, pattern): + """ + Exclude all files anywhere that match the pattern. + """ + match = translate_pattern(os.path.join('**', pattern)) + return self._remove_files(match.match) + + def append(self, item) -> None: + if item.endswith('\r'): # Fix older sdists built on Windows + item = item[:-1] + path = convert_path(item) + + if self._safe_path(path): + self.files.append(path) + + def extend(self, paths) -> None: + self.files.extend(filter(self._safe_path, paths)) + + def _repair(self): + """ + Replace self.files with only safe paths + + Because some owners of FileList manipulate the underlying + ``files`` attribute directly, this method must be called to + repair those paths. + """ + self.files = list(filter(self._safe_path, self.files)) + + def _safe_path(self, path): + enc_warn = "'%s' not %s encodable -- skipping" + + # To avoid accidental trans-codings errors, first to unicode + u_path = unicode_utils.filesys_decode(path) + if u_path is None: + log.warn(f"'{path}' in unexpected encoding -- skipping") + return False + + # Must ensure utf-8 encodability + utf8_path = unicode_utils.try_encode(u_path, "utf-8") + if utf8_path is None: + log.warn(enc_warn, path, 'utf-8') + return False + + try: + # ignore egg-info paths + is_egg_info = ".egg-info" in u_path or b".egg-info" in utf8_path + if self.ignore_egg_info_dir and is_egg_info: + return False + # accept is either way checks out + if os.path.exists(u_path) or os.path.exists(utf8_path): + return True + # this will catch any encode errors decoding u_path + except UnicodeEncodeError: + log.warn(enc_warn, path, sys.getfilesystemencoding()) + + +class manifest_maker(sdist): + template = "MANIFEST.in" + + def initialize_options(self) -> None: + self.use_defaults = True + self.prune = True + self.manifest_only = True + self.force_manifest = True + self.ignore_egg_info_dir = False + + def finalize_options(self) -> None: + pass + + def run(self) -> None: + self.filelist = FileList(ignore_egg_info_dir=self.ignore_egg_info_dir) + if not os.path.exists(self.manifest): + self.write_manifest() # it must exist so it'll get in the list + self.add_defaults() + if os.path.exists(self.template): + self.read_template() + self.add_license_files() + self._add_referenced_files() + self.prune_file_list() + self.filelist.sort() + self.filelist.remove_duplicates() + self.write_manifest() + + def _manifest_normalize(self, path): + path = unicode_utils.filesys_decode(path) + return path.replace(os.sep, '/') + + def write_manifest(self) -> None: + """ + Write the file list in 'self.filelist' to the manifest file + named by 'self.manifest'. + """ + self.filelist._repair() + + # Now _repairs should encodability, but not unicode + files = [self._manifest_normalize(f) for f in self.filelist.files] + msg = f"writing manifest file '{self.manifest}'" + self.execute(write_file, (self.manifest, files), msg) + + def warn(self, msg) -> None: + if not self._should_suppress_warning(msg): + sdist.warn(self, msg) + + @staticmethod + def _should_suppress_warning(msg): + """ + suppress missing-file warnings from sdist + """ + return re.match(r"standard file .*not found", msg) + + def add_defaults(self) -> None: + sdist.add_defaults(self) + self.filelist.append(self.template) + self.filelist.append(self.manifest) + rcfiles = list(walk_revctrl()) + if rcfiles: + self.filelist.extend(rcfiles) + elif os.path.exists(self.manifest): + self.read_manifest() + + if os.path.exists("setup.py"): + # setup.py should be included by default, even if it's not + # the script called to create the sdist + self.filelist.append("setup.py") + + ei_cmd = self.get_finalized_command('egg_info') + self.filelist.graft(ei_cmd.egg_info) + + def add_license_files(self) -> None: + license_files = self.distribution.metadata.license_files or [] + for lf in license_files: + log.info("adding license file '%s'", lf) + self.filelist.extend(license_files) + + def _add_referenced_files(self): + """Add files referenced by the config (e.g. `file:` directive) to filelist""" + referenced = getattr(self.distribution, '_referenced_files', []) + # ^-- fallback if dist comes from distutils or is a custom class + for rf in referenced: + log.debug("adding file referenced by config '%s'", rf) + self.filelist.extend(referenced) + + def _safe_data_files(self, build_py): + """ + The parent class implementation of this method + (``sdist``) will try to include data files, which + might cause recursion problems when + ``include_package_data=True``. + + Therefore, avoid triggering any attempt of + analyzing/building the manifest again. + """ + if hasattr(build_py, 'get_data_files_without_manifest'): + return build_py.get_data_files_without_manifest() + + SetuptoolsDeprecationWarning.emit( + "`build_py` command does not inherit from setuptools' `build_py`.", + """ + Custom 'build_py' does not implement 'get_data_files_without_manifest'. + Please extend command classes from setuptools instead of distutils. + """, + see_url="https://peps.python.org/pep-0632/", + # due_date not defined yet, old projects might still do it? + ) + return build_py.get_data_files() + + +def write_file(filename, contents) -> None: + """Create a file with the specified name and write 'contents' (a + sequence of strings without line terminators) to it. + """ + contents = "\n".join(contents) + + # assuming the contents has been vetted for utf-8 encoding + contents = contents.encode("utf-8") + + with open(filename, "wb") as f: # always write POSIX-style manifest + f.write(contents) + + +def write_pkg_info(cmd, basename, filename) -> None: + log.info("writing %s", filename) + if not cmd.dry_run: + metadata = cmd.distribution.metadata + metadata.version, oldver = cmd.egg_version, metadata.version + metadata.name, oldname = cmd.egg_name, metadata.name + + try: + # write unescaped data to PKG-INFO, so older pkg_resources + # can still parse it + metadata.write_pkg_info(cmd.egg_info) + finally: + metadata.name, metadata.version = oldname, oldver + + safe = getattr(cmd.distribution, 'zip_safe', None) + + bdist_egg.write_safety_flag(cmd.egg_info, safe) + + +def warn_depends_obsolete(cmd, basename, filename) -> None: + """ + Unused: left to avoid errors when updating (from source) from <= 67.8. + Old installations have a .dist-info directory with the entry-point + ``depends.txt = setuptools.command.egg_info:warn_depends_obsolete``. + This may trigger errors when running the first egg_info in build_meta. + TODO: Remove this function in a version sufficiently > 68. + """ + + +# Export API used in entry_points +write_requirements = _requirestxt.write_requirements +write_setup_requirements = _requirestxt.write_setup_requirements + + +def write_toplevel_names(cmd, basename, filename) -> None: + pkgs = dict.fromkeys([ + k.split('.', 1)[0] for k in cmd.distribution.iter_distribution_names() + ]) + cmd.write_file("top-level names", filename, '\n'.join(sorted(pkgs)) + '\n') + + +def overwrite_arg(cmd, basename, filename) -> None: + write_arg(cmd, basename, filename, True) + + +def write_arg(cmd, basename, filename, force: bool = False) -> None: + argname = os.path.splitext(basename)[0] + value = getattr(cmd.distribution, argname, None) + if value is not None: + value = '\n'.join(value) + '\n' + cmd.write_or_delete_file(argname, filename, value, force) + + +def write_entries(cmd, basename, filename) -> None: + eps = _entry_points.load(cmd.distribution.entry_points) + defn = _entry_points.render(eps) + cmd.write_or_delete_file('entry points', filename, defn, True) + + +def _egg_basename(egg_name, egg_version, py_version=None, platform=None): + """Compute filename of the output egg. Private API.""" + name = _normalization.filename_component(egg_name) + version = _normalization.filename_component(egg_version) + egg = f"{name}-{version}-py{py_version or PY_MAJOR}" + if platform: + egg += f"-{platform}" + return egg + + +class EggInfoDeprecationWarning(SetuptoolsDeprecationWarning): + """Deprecated behavior warning for EggInfo, bypassing suppression.""" diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/setuptools/command/install.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/setuptools/command/install.py new file mode 100644 index 0000000000000000000000000000000000000000..741b140c7027bf5686474101c84e3912b389813f --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/setuptools/command/install.py @@ -0,0 +1,183 @@ +from __future__ import annotations + +import glob +import inspect +import platform +from collections.abc import Callable +from typing import TYPE_CHECKING, Any, ClassVar, cast + +import setuptools + +from ..dist import Distribution +from ..warnings import SetuptoolsDeprecationWarning, SetuptoolsWarning +from .bdist_egg import bdist_egg as bdist_egg_cls + +import distutils.command.install as orig +from distutils.errors import DistutilsArgError + +if TYPE_CHECKING: + # This is only used for a type-cast, don't import at runtime or it'll cause deprecation warnings + from .easy_install import easy_install as easy_install_cls +else: + easy_install_cls = None + + +def __getattr__(name: str): # pragma: no cover + if name == "_install": + SetuptoolsDeprecationWarning.emit( + "`setuptools.command._install` was an internal implementation detail " + + "that was left in for numpy<1.9 support.", + due_date=(2025, 5, 2), # Originally added on 2024-11-01 + ) + return orig.install + raise AttributeError(f"module {__name__!r} has no attribute {name!r}") + + +class install(orig.install): + """Use easy_install to install the package, w/dependencies""" + + distribution: Distribution # override distutils.dist.Distribution with setuptools.dist.Distribution + + user_options = orig.install.user_options + [ + ('old-and-unmanageable', None, "Try not to use this!"), + ( + 'single-version-externally-managed', + None, + "used by system package builders to create 'flat' eggs", + ), + ] + boolean_options = orig.install.boolean_options + [ + 'old-and-unmanageable', + 'single-version-externally-managed', + ] + # Type the same as distutils.command.install.install.sub_commands + # Must keep the second tuple item potentially None due to invariance + new_commands: ClassVar[list[tuple[str, Callable[[Any], bool] | None]]] = [ + ('install_egg_info', lambda self: True), + ('install_scripts', lambda self: True), + ] + _nc = dict(new_commands) + + def initialize_options(self): + SetuptoolsDeprecationWarning.emit( + "setup.py install is deprecated.", + """ + Please avoid running ``setup.py`` directly. + Instead, use pypa/build, pypa/installer or other + standards-based tools. + """, + see_url="https://blog.ganssle.io/articles/2021/10/setup-py-deprecated.html", + # TODO: Document how to bootstrap setuptools without install + # (e.g. by unziping the wheel file) + # and then add a due_date to this warning. + ) + + super().initialize_options() + self.old_and_unmanageable = None + self.single_version_externally_managed = None + + def finalize_options(self) -> None: + super().finalize_options() + if self.root: + self.single_version_externally_managed = True + elif self.single_version_externally_managed: + if not self.root and not self.record: + raise DistutilsArgError( + "You must specify --record or --root when building system packages" + ) + + def handle_extra_path(self): + if self.root or self.single_version_externally_managed: + # explicit backward-compatibility mode, allow extra_path to work + return orig.install.handle_extra_path(self) + + # Ignore extra_path when installing an egg (or being run by another + # command without --root or --single-version-externally-managed + self.path_file = None + self.extra_dirs = '' + return None + + def run(self): + # Explicit request for old-style install? Just do it + if self.old_and_unmanageable or self.single_version_externally_managed: + return super().run() + + if not self._called_from_setup(inspect.currentframe()): + # Run in backward-compatibility mode to support bdist_* commands. + super().run() + else: + self.do_egg_install() + + return None + + @staticmethod + def _called_from_setup(run_frame): + """ + Attempt to detect whether run() was called from setup() or by another + command. If called by setup(), the parent caller will be the + 'run_command' method in 'distutils.dist', and *its* caller will be + the 'run_commands' method. If called any other way, the + immediate caller *might* be 'run_command', but it won't have been + called by 'run_commands'. Return True in that case or if a call stack + is unavailable. Return False otherwise. + """ + if run_frame is None: + msg = "Call stack not available. bdist_* commands may fail." + SetuptoolsWarning.emit(msg) + if platform.python_implementation() == 'IronPython': + msg = "For best results, pass -X:Frames to enable call stack." + SetuptoolsWarning.emit(msg) + return True + + frames = inspect.getouterframes(run_frame) + for frame in frames[2:4]: + (caller,) = frame[:1] + info = inspect.getframeinfo(caller) + caller_module = caller.f_globals.get('__name__', '') + + if caller_module == "setuptools.dist" and info.function == "run_command": + # Starting from v61.0.0 setuptools overwrites dist.run_command + continue + + return caller_module == 'distutils.dist' and info.function == 'run_commands' + + return False + + def do_egg_install(self) -> None: + easy_install = self.distribution.get_command_class('easy_install') + + cmd = cast( + # We'd want to cast easy_install as type[easy_install_cls] but a bug in + # mypy makes it think easy_install() returns a Command on Python 3.12+ + # https://github.com/python/mypy/issues/18088 + easy_install_cls, + easy_install( # type: ignore[call-arg] + self.distribution, + args="x", + root=self.root, + record=self.record, + ), + ) + cmd.ensure_finalized() # finalize before bdist_egg munges install cmd + cmd.always_copy_from = '.' # make sure local-dir eggs get installed + + # pick up setup-dir .egg files only: no .egg-info + cmd.package_index.scan(glob.glob('*.egg')) + + self.run_command('bdist_egg') + bdist_egg = cast(bdist_egg_cls, self.distribution.get_command_obj('bdist_egg')) + args = [bdist_egg.egg_output] + + if setuptools.bootstrap_install_from: + # Bootstrap self-installation of setuptools + args.insert(0, setuptools.bootstrap_install_from) + + cmd.args = args + cmd.run(show_deprecation=False) + setuptools.bootstrap_install_from = None + + +# XXX Python 3.1 doesn't see _nc if this is inside the class +install.sub_commands = [ + cmd for cmd in orig.install.sub_commands if cmd[0] not in install._nc +] + install.new_commands diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/setuptools/command/install_egg_info.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/setuptools/command/install_egg_info.py new file mode 100644 index 0000000000000000000000000000000000000000..44f22ccf51cc3fb1662b326c3150845356d77fc3 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/setuptools/command/install_egg_info.py @@ -0,0 +1,58 @@ +import os + +from setuptools import Command, namespaces +from setuptools.archive_util import unpack_archive + +from .._path import ensure_directory + +from distutils import dir_util, log + + +class install_egg_info(namespaces.Installer, Command): + """Install an .egg-info directory for the package""" + + description = "Install an .egg-info directory for the package" + + user_options = [ + ('install-dir=', 'd', "directory to install to"), + ] + + def initialize_options(self): + self.install_dir = None + + def finalize_options(self) -> None: + self.set_undefined_options('install_lib', ('install_dir', 'install_dir')) + ei_cmd = self.get_finalized_command("egg_info") + basename = f"{ei_cmd._get_egg_basename()}.egg-info" + self.source = ei_cmd.egg_info + self.target = os.path.join(self.install_dir, basename) + self.outputs: list[str] = [] + + def run(self) -> None: + self.run_command('egg_info') + if os.path.isdir(self.target) and not os.path.islink(self.target): + dir_util.remove_tree(self.target, dry_run=self.dry_run) + elif os.path.exists(self.target): + self.execute(os.unlink, (self.target,), "Removing " + self.target) + if not self.dry_run: + ensure_directory(self.target) + self.execute(self.copytree, (), f"Copying {self.source} to {self.target}") + self.install_namespaces() + + def get_outputs(self): + return self.outputs + + def copytree(self) -> None: + # Copy the .egg-info tree to site-packages + def skimmer(src, dst): + # filter out source-control directories; note that 'src' is always + # a '/'-separated path, regardless of platform. 'dst' is a + # platform-specific path. + for skip in '.svn/', 'CVS/': + if src.startswith(skip) or '/' + skip in src: + return None + self.outputs.append(dst) + log.debug("Copying %s to %s", src, dst) + return dst + + unpack_archive(self.source, self.target, skimmer) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/setuptools/command/install_lib.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/setuptools/command/install_lib.py new file mode 100644 index 0000000000000000000000000000000000000000..8e1e07271028386e138585cac7619ab0338017cc --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/setuptools/command/install_lib.py @@ -0,0 +1,137 @@ +from __future__ import annotations + +import os +import sys +from itertools import product, starmap + +from .._path import StrPath +from ..dist import Distribution + +import distutils.command.install_lib as orig + + +class install_lib(orig.install_lib): + """Don't add compiled flags to filenames of non-Python files""" + + distribution: Distribution # override distutils.dist.Distribution with setuptools.dist.Distribution + + def run(self) -> None: + self.build() + outfiles = self.install() + if outfiles is not None: + # always compile, in case we have any extension stubs to deal with + self.byte_compile(outfiles) + + def get_exclusions(self): + """ + Return a collections.Sized collections.Container of paths to be + excluded for single_version_externally_managed installations. + """ + all_packages = ( + pkg + for ns_pkg in self._get_SVEM_NSPs() + for pkg in self._all_packages(ns_pkg) + ) + + excl_specs = product(all_packages, self._gen_exclusion_paths()) + return set(starmap(self._exclude_pkg_path, excl_specs)) + + def _exclude_pkg_path(self, pkg, exclusion_path): + """ + Given a package name and exclusion path within that package, + compute the full exclusion path. + """ + parts = pkg.split('.') + [exclusion_path] + return os.path.join(self.install_dir, *parts) + + @staticmethod + def _all_packages(pkg_name): + """ + >>> list(install_lib._all_packages('foo.bar.baz')) + ['foo.bar.baz', 'foo.bar', 'foo'] + """ + while pkg_name: + yield pkg_name + pkg_name, _sep, _child = pkg_name.rpartition('.') + + def _get_SVEM_NSPs(self): + """ + Get namespace packages (list) but only for + single_version_externally_managed installations and empty otherwise. + """ + # TODO: is it necessary to short-circuit here? i.e. what's the cost + # if get_finalized_command is called even when namespace_packages is + # False? + if not self.distribution.namespace_packages: + return [] + + install_cmd = self.get_finalized_command('install') + svem = install_cmd.single_version_externally_managed + + return self.distribution.namespace_packages if svem else [] + + @staticmethod + def _gen_exclusion_paths(): + """ + Generate file paths to be excluded for namespace packages (bytecode + cache files). + """ + # always exclude the package module itself + yield '__init__.py' + + yield '__init__.pyc' + yield '__init__.pyo' + + if not hasattr(sys, 'implementation'): + return + + base = os.path.join('__pycache__', '__init__.' + sys.implementation.cache_tag) + yield base + '.pyc' + yield base + '.pyo' + yield base + '.opt-1.pyc' + yield base + '.opt-2.pyc' + + def copy_tree( + self, + infile: StrPath, + outfile: str, + # override: Using actual booleans + preserve_mode: bool = True, # type: ignore[override] + preserve_times: bool = True, # type: ignore[override] + preserve_symlinks: bool = False, # type: ignore[override] + level: object = 1, + ) -> list[str]: + assert preserve_mode + assert preserve_times + assert not preserve_symlinks + exclude = self.get_exclusions() + + if not exclude: + return orig.install_lib.copy_tree(self, infile, outfile) + + # Exclude namespace package __init__.py* files from the output + + from setuptools.archive_util import unpack_directory + + from distutils import log + + outfiles: list[str] = [] + + def pf(src: str, dst: str): + if dst in exclude: + log.warn("Skipping installation of %s (namespace package)", dst) + return False + + log.info("copying %s -> %s", src, os.path.dirname(dst)) + outfiles.append(dst) + return dst + + unpack_directory(infile, outfile, pf) + return outfiles + + def get_outputs(self): + outputs = orig.install_lib.get_outputs(self) + exclude = self.get_exclusions() + if exclude: + return [f for f in outputs if f not in exclude] + return outputs diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/setuptools/command/install_scripts.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/setuptools/command/install_scripts.py new file mode 100644 index 0000000000000000000000000000000000000000..4401cf693d2642abce449dee238c037038633fec --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/setuptools/command/install_scripts.py @@ -0,0 +1,73 @@ +from __future__ import annotations + +import os +import sys + +from .._path import ensure_directory +from ..dist import Distribution + +import distutils.command.install_scripts as orig +from distutils import log + + +class install_scripts(orig.install_scripts): + """Do normal script install, plus any egg_info wrapper scripts""" + + distribution: Distribution # override distutils.dist.Distribution with setuptools.dist.Distribution + + def initialize_options(self) -> None: + orig.install_scripts.initialize_options(self) + self.no_ep = False + + def run(self) -> None: + self.run_command("egg_info") + if self.distribution.scripts: + orig.install_scripts.run(self) # run first to set up self.outfiles + else: + self.outfiles: list[str] = [] + if self.no_ep: + # don't install entry point scripts into .egg file! + return + self._install_ep_scripts() + + def _install_ep_scripts(self): + # Delay import side-effects + from pkg_resources import Distribution, PathMetadata + + from . import easy_install as ei + + ei_cmd = self.get_finalized_command("egg_info") + dist = Distribution( + ei_cmd.egg_base, + PathMetadata(ei_cmd.egg_base, ei_cmd.egg_info), + ei_cmd.egg_name, + ei_cmd.egg_version, + ) + bs_cmd = self.get_finalized_command('build_scripts') + exec_param = getattr(bs_cmd, 'executable', None) + writer = ei.ScriptWriter + if exec_param == sys.executable: + # In case the path to the Python executable contains a space, wrap + # it so it's not split up. + exec_param = [exec_param] + # resolve the writer to the environment + writer = writer.best() + cmd = writer.command_spec_class.best().from_param(exec_param) + for args in writer.get_args(dist, cmd.as_header()): + self.write_script(*args) + + def write_script(self, script_name, contents, mode: str = "t", *ignored) -> None: + """Write an executable file to the scripts directory""" + from setuptools.command.easy_install import chmod, current_umask + + log.info("Installing %s script to %s", script_name, self.install_dir) + target = os.path.join(self.install_dir, script_name) + self.outfiles.append(target) + + encoding = None if "b" in mode else "utf-8" + mask = current_umask() + if not self.dry_run: + ensure_directory(target) + with open(target, "w" + mode, encoding=encoding) as f: + f.write(contents) + chmod(target, 0o777 - mask) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/setuptools/command/launcher manifest.xml b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/setuptools/command/launcher manifest.xml new file mode 100644 index 0000000000000000000000000000000000000000..5972a96d8ded85cc14147ffc1400ec67c3b5a578 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/setuptools/command/launcher manifest.xml @@ -0,0 +1,15 @@ + + + + + + + + + + + + diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/setuptools/command/rotate.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/setuptools/command/rotate.py new file mode 100644 index 0000000000000000000000000000000000000000..acdce07baaf3e4fc87552c39d7ee6e0271fe2a72 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/setuptools/command/rotate.py @@ -0,0 +1,65 @@ +from __future__ import annotations + +import os +from typing import ClassVar + +from .. import Command, _shutil + +from distutils import log +from distutils.errors import DistutilsOptionError +from distutils.util import convert_path + + +class rotate(Command): + """Delete older distributions""" + + description = "delete older distributions, keeping N newest files" + user_options = [ + ('match=', 'm', "patterns to match (required)"), + ('dist-dir=', 'd', "directory where the distributions are"), + ('keep=', 'k', "number of matching distributions to keep"), + ] + + boolean_options: ClassVar[list[str]] = [] + + def initialize_options(self): + self.match = None + self.dist_dir = None + self.keep = None + + def finalize_options(self) -> None: + if self.match is None: + raise DistutilsOptionError( + "Must specify one or more (comma-separated) match patterns " + "(e.g. '.zip' or '.egg')" + ) + if self.keep is None: + raise DistutilsOptionError("Must specify number of files to keep") + try: + self.keep = int(self.keep) + except ValueError as e: + raise DistutilsOptionError("--keep must be an integer") from e + if isinstance(self.match, str): + self.match = [convert_path(p.strip()) for p in self.match.split(',')] + self.set_undefined_options('bdist', ('dist_dir', 'dist_dir')) + + def run(self) -> None: + self.run_command("egg_info") + from glob import glob + + for pattern in self.match: + pattern = self.distribution.get_name() + '*' + pattern + files = glob(os.path.join(self.dist_dir, pattern)) + files = [(os.path.getmtime(f), f) for f in files] + files.sort() + files.reverse() + + log.info("%d file(s) matching %s", len(files), pattern) + files = files[self.keep :] + for t, f in files: + log.info("Deleting %s", f) + if not self.dry_run: + if os.path.isdir(f): + _shutil.rmtree(f) + else: + os.unlink(f) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/setuptools/command/saveopts.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/setuptools/command/saveopts.py new file mode 100644 index 0000000000000000000000000000000000000000..2a2cbce6e2aad396f4a5ee9788d53387237df96d --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/setuptools/command/saveopts.py @@ -0,0 +1,21 @@ +from setuptools.command.setopt import edit_config, option_base + + +class saveopts(option_base): + """Save command-line options to a file""" + + description = "save supplied options to setup.cfg or other config file" + + def run(self) -> None: + dist = self.distribution + settings: dict[str, dict[str, str]] = {} + + for cmd in dist.command_options: + if cmd == 'saveopts': + continue # don't save our own options! + + for opt, (src, val) in dist.get_option_dict(cmd).items(): + if src == "command line": + settings.setdefault(cmd, {})[opt] = val + + edit_config(self.filename, settings, self.dry_run) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/setuptools/command/sdist.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/setuptools/command/sdist.py new file mode 100644 index 0000000000000000000000000000000000000000..9631cf3114778a8865917d02878525fc0127a253 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/setuptools/command/sdist.py @@ -0,0 +1,217 @@ +from __future__ import annotations + +import contextlib +import os +import re +from itertools import chain +from typing import ClassVar + +from .._importlib import metadata +from ..dist import Distribution +from .build import _ORIGINAL_SUBCOMMANDS + +import distutils.command.sdist as orig +from distutils import log + +_default_revctrl = list + + +def walk_revctrl(dirname=''): + """Find all files under revision control""" + for ep in metadata.entry_points(group='setuptools.file_finders'): + yield from ep.load()(dirname) + + +class sdist(orig.sdist): + """Smart sdist that finds anything supported by revision control""" + + user_options = [ + ('formats=', None, "formats for source distribution (comma-separated list)"), + ( + 'keep-temp', + 'k', + "keep the distribution tree around after creating " + "archive file(s)", + ), + ( + 'dist-dir=', + 'd', + "directory to put the source distribution archive(s) in [default: dist]", + ), + ( + 'owner=', + 'u', + "Owner name used when creating a tar file [default: current user]", + ), + ( + 'group=', + 'g', + "Group name used when creating a tar file [default: current group]", + ), + ] + + distribution: Distribution # override distutils.dist.Distribution with setuptools.dist.Distribution + negative_opt: ClassVar[dict[str, str]] = {} + + README_EXTENSIONS = ['', '.rst', '.txt', '.md'] + READMES = tuple(f'README{ext}' for ext in README_EXTENSIONS) + + def run(self) -> None: + self.run_command('egg_info') + ei_cmd = self.get_finalized_command('egg_info') + self.filelist = ei_cmd.filelist + self.filelist.append(os.path.join(ei_cmd.egg_info, 'SOURCES.txt')) + self.check_readme() + + # Run sub commands + for cmd_name in self.get_sub_commands(): + self.run_command(cmd_name) + + self.make_distribution() + + dist_files = getattr(self.distribution, 'dist_files', []) + for file in self.archive_files: + data = ('sdist', '', file) + if data not in dist_files: + dist_files.append(data) + + def initialize_options(self) -> None: + orig.sdist.initialize_options(self) + + def make_distribution(self) -> None: + """ + Workaround for #516 + """ + with self._remove_os_link(): + orig.sdist.make_distribution(self) + + @staticmethod + @contextlib.contextmanager + def _remove_os_link(): + """ + In a context, remove and restore os.link if it exists + """ + + class NoValue: + pass + + orig_val = getattr(os, 'link', NoValue) + try: + del os.link + except Exception: + pass + try: + yield + finally: + if orig_val is not NoValue: + os.link = orig_val + + def add_defaults(self) -> None: + super().add_defaults() + self._add_defaults_build_sub_commands() + + def _add_defaults_optional(self): + super()._add_defaults_optional() + if os.path.isfile('pyproject.toml'): + self.filelist.append('pyproject.toml') + + def _add_defaults_python(self): + """getting python files""" + if self.distribution.has_pure_modules(): + build_py = self.get_finalized_command('build_py') + self.filelist.extend(build_py.get_source_files()) + self._add_data_files(self._safe_data_files(build_py)) + + def _add_defaults_build_sub_commands(self): + build = self.get_finalized_command("build") + missing_cmds = set(build.get_sub_commands()) - _ORIGINAL_SUBCOMMANDS + # ^-- the original built-in sub-commands are already handled by default. + cmds = (self.get_finalized_command(c) for c in missing_cmds) + files = (c.get_source_files() for c in cmds if hasattr(c, "get_source_files")) + self.filelist.extend(chain.from_iterable(files)) + + def _safe_data_files(self, build_py): + """ + Since the ``sdist`` class is also used to compute the MANIFEST + (via :obj:`setuptools.command.egg_info.manifest_maker`), + there might be recursion problems when trying to obtain the list of + data_files and ``include_package_data=True`` (which in turn depends on + the files included in the MANIFEST). + + To avoid that, ``manifest_maker`` should be able to overwrite this + method and avoid recursive attempts to build/analyze the MANIFEST. + """ + return build_py.data_files + + def _add_data_files(self, data_files): + """ + Add data files as found in build_py.data_files. + """ + self.filelist.extend( + os.path.join(src_dir, name) + for _, src_dir, _, filenames in data_files + for name in filenames + ) + + def _add_defaults_data_files(self): + try: + super()._add_defaults_data_files() + except TypeError: + log.warn("data_files contains unexpected objects") + + def prune_file_list(self) -> None: + super().prune_file_list() + # Prevent accidental inclusion of test-related cache dirs at the project root + sep = re.escape(os.sep) + self.filelist.exclude_pattern(r"^(\.tox|\.nox|\.venv)" + sep, is_regex=True) + + def check_readme(self) -> None: + for f in self.READMES: + if os.path.exists(f): + return + else: + self.warn( + "standard file not found: should have one of " + ', '.join(self.READMES) + ) + + def make_release_tree(self, base_dir, files) -> None: + orig.sdist.make_release_tree(self, base_dir, files) + + # Save any egg_info command line options used to create this sdist + dest = os.path.join(base_dir, 'setup.cfg') + if hasattr(os, 'link') and os.path.exists(dest): + # unlink and re-copy, since it might be hard-linked, and + # we don't want to change the source version + os.unlink(dest) + self.copy_file('setup.cfg', dest) + + self.get_finalized_command('egg_info').save_version_info(dest) + + def _manifest_is_not_generated(self): + # check for special comment used in 2.7.1 and higher + if not os.path.isfile(self.manifest): + return False + + with open(self.manifest, 'rb') as fp: + first_line = fp.readline() + return first_line != b'# file GENERATED by distutils, do NOT edit\n' + + def read_manifest(self): + """Read the manifest file (named by 'self.manifest') and use it to + fill in 'self.filelist', the list of files to include in the source + distribution. + """ + log.info("reading manifest file '%s'", self.manifest) + manifest = open(self.manifest, 'rb') + for bytes_line in manifest: + # The manifest must contain UTF-8. See #303. + try: + line = bytes_line.decode('UTF-8') + except UnicodeDecodeError: + log.warn(f"{line!r} not UTF-8 decodable -- skipping") + continue + # ignore comments and blank lines + line = line.strip() + if line.startswith('#') or not line: + continue + self.filelist.append(line) + manifest.close() diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/setuptools/command/setopt.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/setuptools/command/setopt.py new file mode 100644 index 0000000000000000000000000000000000000000..678a0593d6739db316bd4008860a613b59892f1f --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/setuptools/command/setopt.py @@ -0,0 +1,141 @@ +import configparser +import os + +from .. import Command +from ..unicode_utils import _cfg_read_utf8_with_fallback + +import distutils +from distutils import log +from distutils.errors import DistutilsOptionError +from distutils.util import convert_path + +__all__ = ['config_file', 'edit_config', 'option_base', 'setopt'] + + +def config_file(kind="local"): + """Get the filename of the distutils, local, global, or per-user config + + `kind` must be one of "local", "global", or "user" + """ + if kind == 'local': + return 'setup.cfg' + if kind == 'global': + return os.path.join(os.path.dirname(distutils.__file__), 'distutils.cfg') + if kind == 'user': + dot = os.name == 'posix' and '.' or '' + return os.path.expanduser(convert_path(f"~/{dot}pydistutils.cfg")) + raise ValueError("config_file() type must be 'local', 'global', or 'user'", kind) + + +def edit_config(filename, settings, dry_run=False): + """Edit a configuration file to include `settings` + + `settings` is a dictionary of dictionaries or ``None`` values, keyed by + command/section name. A ``None`` value means to delete the entire section, + while a dictionary lists settings to be changed or deleted in that section. + A setting of ``None`` means to delete that setting. + """ + log.debug("Reading configuration from %s", filename) + opts = configparser.RawConfigParser() + opts.optionxform = lambda optionstr: optionstr # type: ignore[method-assign] # overriding method + _cfg_read_utf8_with_fallback(opts, filename) + + for section, options in settings.items(): + if options is None: + log.info("Deleting section [%s] from %s", section, filename) + opts.remove_section(section) + else: + if not opts.has_section(section): + log.debug("Adding new section [%s] to %s", section, filename) + opts.add_section(section) + for option, value in options.items(): + if value is None: + log.debug("Deleting %s.%s from %s", section, option, filename) + opts.remove_option(section, option) + if not opts.options(section): + log.info( + "Deleting empty [%s] section from %s", section, filename + ) + opts.remove_section(section) + else: + log.debug( + "Setting %s.%s to %r in %s", section, option, value, filename + ) + opts.set(section, option, value) + + log.info("Writing %s", filename) + if not dry_run: + with open(filename, 'w', encoding="utf-8") as f: + opts.write(f) + + +class option_base(Command): + """Abstract base class for commands that mess with config files""" + + user_options = [ + ('global-config', 'g', "save options to the site-wide distutils.cfg file"), + ('user-config', 'u', "save options to the current user's pydistutils.cfg file"), + ('filename=', 'f', "configuration file to use (default=setup.cfg)"), + ] + + boolean_options = [ + 'global-config', + 'user-config', + ] + + def initialize_options(self): + self.global_config = None + self.user_config = None + self.filename = None + + def finalize_options(self): + filenames = [] + if self.global_config: + filenames.append(config_file('global')) + if self.user_config: + filenames.append(config_file('user')) + if self.filename is not None: + filenames.append(self.filename) + if not filenames: + filenames.append(config_file('local')) + if len(filenames) > 1: + raise DistutilsOptionError( + "Must specify only one configuration file option", filenames + ) + (self.filename,) = filenames + + +class setopt(option_base): + """Save command-line options to a file""" + + description = "set an option in setup.cfg or another config file" + + user_options = [ + ('command=', 'c', 'command to set an option for'), + ('option=', 'o', 'option to set'), + ('set-value=', 's', 'value of the option'), + ('remove', 'r', 'remove (unset) the value'), + ] + option_base.user_options + + boolean_options = option_base.boolean_options + ['remove'] + + def initialize_options(self): + option_base.initialize_options(self) + self.command = None + self.option = None + self.set_value = None + self.remove = None + + def finalize_options(self) -> None: + option_base.finalize_options(self) + if self.command is None or self.option is None: + raise DistutilsOptionError("Must specify --command *and* --option") + if self.set_value is None and not self.remove: + raise DistutilsOptionError("Must specify --set-value or --remove") + + def run(self) -> None: + edit_config( + self.filename, + {self.command: {self.option.replace('-', '_'): self.set_value}}, + self.dry_run, + ) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/setuptools/command/test.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/setuptools/command/test.py new file mode 100644 index 0000000000000000000000000000000000000000..341b11a20eb2cce98cdeeebea9ba2de0282d5d38 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/setuptools/command/test.py @@ -0,0 +1,45 @@ +from __future__ import annotations + +from setuptools import Command +from setuptools.warnings import SetuptoolsDeprecationWarning + + +# Would restrict to Literal["test"], but mypy doesn't support it: https://github.com/python/mypy/issues/8203 +def __getattr__(name: str) -> type[_test]: + if name == 'test': + SetuptoolsDeprecationWarning.emit( + "The test command is disabled and references to it are deprecated.", + "Please remove any references to `setuptools.command.test` in all " + "supported versions of the affected package.", + due_date=(2024, 11, 15), + stacklevel=2, + ) + return _test + raise AttributeError(name) + + +class _test(Command): + """ + Stub to warn when test command is referenced or used. + """ + + description = "stub for old test command (do not use)" + + user_options = [ + ('test-module=', 'm', "Run 'test_suite' in specified module"), + ( + 'test-suite=', + 's', + "Run single test, case or suite (e.g. 'module.test_suite')", + ), + ('test-runner=', 'r', "Test runner to use"), + ] + + def initialize_options(self): + pass + + def finalize_options(self): + pass + + def run(self): + raise RuntimeError("Support for the test command was removed in Setuptools 72") diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/setuptools/compat/__init__.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/setuptools/compat/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/setuptools/compat/py310.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/setuptools/compat/py310.py new file mode 100644 index 0000000000000000000000000000000000000000..b3912f8e02a57a5b9ba738673a21a596e94c1122 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/setuptools/compat/py310.py @@ -0,0 +1,9 @@ +import sys + +__all__ = ['tomllib'] + + +if sys.version_info >= (3, 11): + import tomllib +else: # pragma: no cover + import tomli as tomllib diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/setuptools/compat/py311.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/setuptools/compat/py311.py new file mode 100644 index 0000000000000000000000000000000000000000..52b58af32a28817deeaf592cc795f311434e39e7 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/setuptools/compat/py311.py @@ -0,0 +1,27 @@ +from __future__ import annotations + +import shutil +import sys +from typing import TYPE_CHECKING, Any, Callable + +if TYPE_CHECKING: + from _typeshed import ExcInfo, StrOrBytesPath + from typing_extensions import TypeAlias + +# Same as shutil._OnExcCallback from typeshed +_OnExcCallback: TypeAlias = Callable[[Callable[..., Any], str, BaseException], object] + + +def shutil_rmtree( + path: StrOrBytesPath, + ignore_errors: bool = False, + onexc: _OnExcCallback | None = None, +) -> None: + if sys.version_info >= (3, 12): + return shutil.rmtree(path, ignore_errors, onexc=onexc) + + def _handler(fn: Callable[..., Any], path: str, excinfo: ExcInfo) -> None: + if onexc: + onexc(fn, path, excinfo[1]) + + return shutil.rmtree(path, ignore_errors, onerror=_handler) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/setuptools/compat/py312.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/setuptools/compat/py312.py new file mode 100644 index 0000000000000000000000000000000000000000..b20c5f697a861a680c4a747baf9ca9c901e72978 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/setuptools/compat/py312.py @@ -0,0 +1,13 @@ +from __future__ import annotations + +import sys + +if sys.version_info >= (3, 12, 4): + # Python 3.13 should support `.pth` files encoded in UTF-8 + # See discussion in https://github.com/python/cpython/issues/77102 + PTH_ENCODING: str | None = "utf-8" +else: + from .py39 import LOCALE_ENCODING + + # PTH_ENCODING = "locale" + PTH_ENCODING = LOCALE_ENCODING diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/setuptools/compat/py39.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/setuptools/compat/py39.py new file mode 100644 index 0000000000000000000000000000000000000000..04a4abe5a9e61596f97dc3d1b0b3da93a558158f --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/setuptools/compat/py39.py @@ -0,0 +1,9 @@ +import sys + +# Explicitly use the ``"locale"`` encoding in versions that support it, +# otherwise just rely on the implicit handling of ``encoding=None``. +# Since all platforms that support ``EncodingWarning`` also support +# ``encoding="locale"``, this can be used to suppress the warning. +# However, please try to use UTF-8 when possible +# (.pth files are the notorious exception: python/cpython#77102, pypa/setuptools#3937). +LOCALE_ENCODING = "locale" if sys.version_info >= (3, 10) else None diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/setuptools/config/NOTICE b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/setuptools/config/NOTICE new file mode 100644 index 0000000000000000000000000000000000000000..01864511b0f52a6bf944f70b5e15b75364791935 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/setuptools/config/NOTICE @@ -0,0 +1,10 @@ +The following files include code from opensource projects +(either as direct copies or modified versions): + +- `setuptools.schema.json`, `distutils.schema.json`: + - project: `validate-pyproject` - licensed under MPL-2.0 + (https://github.com/abravalheri/validate-pyproject): + + This Source Code Form is subject to the terms of the Mozilla Public + License, v. 2.0. If a copy of the MPL was not distributed with this file, + You can obtain one at https://mozilla.org/MPL/2.0/. diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/setuptools/config/__init__.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/setuptools/config/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..fcc7d008d6c9b3ab2610f2a1b05c4c1eeb5e1416 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/setuptools/config/__init__.py @@ -0,0 +1,43 @@ +"""For backward compatibility, expose main functions from +``setuptools.config.setupcfg`` +""" + +from functools import wraps +from typing import Callable, TypeVar, cast + +from ..warnings import SetuptoolsDeprecationWarning +from . import setupcfg + +Fn = TypeVar("Fn", bound=Callable) + +__all__ = ('parse_configuration', 'read_configuration') + + +def _deprecation_notice(fn: Fn) -> Fn: + @wraps(fn) + def _wrapper(*args, **kwargs): + SetuptoolsDeprecationWarning.emit( + "Deprecated API usage.", + f""" + As setuptools moves its configuration towards `pyproject.toml`, + `{__name__}.{fn.__name__}` became deprecated. + + For the time being, you can use the `{setupcfg.__name__}` module + to access a backward compatible API, but this module is provisional + and might be removed in the future. + + To read project metadata, consider using + ``build.util.project_wheel_metadata`` (https://pypi.org/project/build/). + For simple scenarios, you can also try parsing the file directly + with the help of ``configparser``. + """, + # due_date not defined yet, because the community still heavily relies on it + # Warning introduced in 24 Mar 2022 + ) + return fn(*args, **kwargs) + + return cast(Fn, _wrapper) + + +read_configuration = _deprecation_notice(setupcfg.read_configuration) +parse_configuration = _deprecation_notice(setupcfg.parse_configuration) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/setuptools/config/_apply_pyprojecttoml.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/setuptools/config/_apply_pyprojecttoml.py new file mode 100644 index 0000000000000000000000000000000000000000..331596bdd79224429d8a5884c0a5cd19025a878a --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/setuptools/config/_apply_pyprojecttoml.py @@ -0,0 +1,488 @@ +"""Translation layer between pyproject config and setuptools distribution and +metadata objects. + +The distribution and metadata objects are modeled after (an old version of) +core metadata, therefore configs in the format specified for ``pyproject.toml`` +need to be processed before being applied. + +**PRIVATE MODULE**: API reserved for setuptools internal usage only. +""" + +from __future__ import annotations + +import logging +import os +from collections.abc import Mapping +from email.headerregistry import Address +from functools import partial, reduce +from inspect import cleandoc +from itertools import chain +from types import MappingProxyType +from typing import TYPE_CHECKING, Any, Callable, TypeVar, Union + +from .. import _static +from .._path import StrPath +from ..errors import RemovedConfigError +from ..extension import Extension +from ..warnings import SetuptoolsWarning + +if TYPE_CHECKING: + from typing_extensions import TypeAlias + + from setuptools._importlib import metadata + from setuptools.dist import Distribution + + from distutils.dist import _OptionsList # Comes from typeshed + + +EMPTY: Mapping = MappingProxyType({}) # Immutable dict-like +_ProjectReadmeValue: TypeAlias = Union[str, dict[str, str]] +_Correspondence: TypeAlias = Callable[["Distribution", Any, Union[StrPath, None]], None] +_T = TypeVar("_T") + +_logger = logging.getLogger(__name__) + + +def apply(dist: Distribution, config: dict, filename: StrPath) -> Distribution: + """Apply configuration dict read with :func:`read_configuration`""" + + if not config: + return dist # short-circuit unrelated pyproject.toml file + + root_dir = os.path.dirname(filename) or "." + + _apply_project_table(dist, config, root_dir) + _apply_tool_table(dist, config, filename) + + current_directory = os.getcwd() + os.chdir(root_dir) + try: + dist._finalize_requires() + dist._finalize_license_files() + finally: + os.chdir(current_directory) + + return dist + + +def _apply_project_table(dist: Distribution, config: dict, root_dir: StrPath): + orig_config = config.get("project", {}) + if not orig_config: + return # short-circuit + + project_table = {k: _static.attempt_conversion(v) for k, v in orig_config.items()} + _handle_missing_dynamic(dist, project_table) + _unify_entry_points(project_table) + + for field, value in project_table.items(): + norm_key = json_compatible_key(field) + corresp = PYPROJECT_CORRESPONDENCE.get(norm_key, norm_key) + if callable(corresp): + corresp(dist, value, root_dir) + else: + _set_config(dist, corresp, value) + + +def _apply_tool_table(dist: Distribution, config: dict, filename: StrPath): + tool_table = config.get("tool", {}).get("setuptools", {}) + if not tool_table: + return # short-circuit + + for field, value in tool_table.items(): + norm_key = json_compatible_key(field) + + if norm_key in TOOL_TABLE_REMOVALS: + suggestion = cleandoc(TOOL_TABLE_REMOVALS[norm_key]) + msg = f""" + The parameter `tool.setuptools.{field}` was long deprecated + and has been removed from `pyproject.toml`. + """ + raise RemovedConfigError("\n".join([cleandoc(msg), suggestion])) + + norm_key = TOOL_TABLE_RENAMES.get(norm_key, norm_key) + corresp = TOOL_TABLE_CORRESPONDENCE.get(norm_key, norm_key) + if callable(corresp): + corresp(dist, value) + else: + _set_config(dist, corresp, value) + + _copy_command_options(config, dist, filename) + + +def _handle_missing_dynamic(dist: Distribution, project_table: dict): + """Be temporarily forgiving with ``dynamic`` fields not listed in ``dynamic``""" + dynamic = set(project_table.get("dynamic", [])) + for field, getter in _PREVIOUSLY_DEFINED.items(): + if not (field in project_table or field in dynamic): + value = getter(dist) + if value: + _MissingDynamic.emit(field=field, value=value) + project_table[field] = _RESET_PREVIOUSLY_DEFINED.get(field) + + +def json_compatible_key(key: str) -> str: + """As defined in :pep:`566#json-compatible-metadata`""" + return key.lower().replace("-", "_") + + +def _set_config(dist: Distribution, field: str, value: Any): + val = _PREPROCESS.get(field, _noop)(dist, value) + setter = getattr(dist.metadata, f"set_{field}", None) + if setter: + setter(val) + elif hasattr(dist.metadata, field) or field in SETUPTOOLS_PATCHES: + setattr(dist.metadata, field, val) + else: + setattr(dist, field, val) + + +_CONTENT_TYPES = { + ".md": "text/markdown", + ".rst": "text/x-rst", + ".txt": "text/plain", +} + + +def _guess_content_type(file: str) -> str | None: + _, ext = os.path.splitext(file.lower()) + if not ext: + return None + + if ext in _CONTENT_TYPES: + return _static.Str(_CONTENT_TYPES[ext]) + + valid = ", ".join(f"{k} ({v})" for k, v in _CONTENT_TYPES.items()) + msg = f"only the following file extensions are recognized: {valid}." + raise ValueError(f"Undefined content type for {file}, {msg}") + + +def _long_description( + dist: Distribution, val: _ProjectReadmeValue, root_dir: StrPath | None +): + from setuptools.config import expand + + file: str | tuple[()] + if isinstance(val, str): + file = val + text = expand.read_files(file, root_dir) + ctype = _guess_content_type(file) + else: + file = val.get("file") or () + text = val.get("text") or expand.read_files(file, root_dir) + ctype = val["content-type"] + + # XXX: Is it completely safe to assume static? + _set_config(dist, "long_description", _static.Str(text)) + + if ctype: + _set_config(dist, "long_description_content_type", _static.Str(ctype)) + + if file: + dist._referenced_files.add(file) + + +def _license(dist: Distribution, val: dict, root_dir: StrPath | None): + from setuptools.config import expand + + if "file" in val: + # XXX: Is it completely safe to assume static? + value = expand.read_files([val["file"]], root_dir) + _set_config(dist, "license", _static.Str(value)) + dist._referenced_files.add(val["file"]) + else: + _set_config(dist, "license", _static.Str(val["text"])) + + +def _people(dist: Distribution, val: list[dict], _root_dir: StrPath | None, kind: str): + field = [] + email_field = [] + for person in val: + if "name" not in person: + email_field.append(person["email"]) + elif "email" not in person: + field.append(person["name"]) + else: + addr = Address(display_name=person["name"], addr_spec=person["email"]) + email_field.append(str(addr)) + + if field: + _set_config(dist, kind, _static.Str(", ".join(field))) + if email_field: + _set_config(dist, f"{kind}_email", _static.Str(", ".join(email_field))) + + +def _project_urls(dist: Distribution, val: dict, _root_dir: StrPath | None): + _set_config(dist, "project_urls", val) + + +def _python_requires(dist: Distribution, val: str, _root_dir: StrPath | None): + _set_config(dist, "python_requires", _static.SpecifierSet(val)) + + +def _dependencies(dist: Distribution, val: list, _root_dir: StrPath | None): + if getattr(dist, "install_requires", []): + msg = "`install_requires` overwritten in `pyproject.toml` (dependencies)" + SetuptoolsWarning.emit(msg) + dist.install_requires = val + + +def _optional_dependencies(dist: Distribution, val: dict, _root_dir: StrPath | None): + if getattr(dist, "extras_require", None): + msg = "`extras_require` overwritten in `pyproject.toml` (optional-dependencies)" + SetuptoolsWarning.emit(msg) + dist.extras_require = val + + +def _ext_modules(dist: Distribution, val: list[dict]) -> list[Extension]: + existing = dist.ext_modules or [] + args = ({k.replace("-", "_"): v for k, v in x.items()} for x in val) + new = [Extension(**kw) for kw in args] + return [*existing, *new] + + +def _noop(_dist: Distribution, val: _T) -> _T: + return val + + +def _identity(val: _T) -> _T: + return val + + +def _unify_entry_points(project_table: dict): + project = project_table + given = project.pop("entry-points", project.pop("entry_points", {})) + entry_points = dict(given) # Avoid problems with static + renaming = {"scripts": "console_scripts", "gui_scripts": "gui_scripts"} + for key, value in list(project.items()): # eager to allow modifications + norm_key = json_compatible_key(key) + if norm_key in renaming: + # Don't skip even if value is empty (reason: reset missing `dynamic`) + entry_points[renaming[norm_key]] = project.pop(key) + + if entry_points: + project["entry-points"] = { + name: [f"{k} = {v}" for k, v in group.items()] + for name, group in entry_points.items() + if group # now we can skip empty groups + } + # Sometimes this will set `project["entry-points"] = {}`, and that is + # intentional (for resetting configurations that are missing `dynamic`). + + +def _copy_command_options(pyproject: dict, dist: Distribution, filename: StrPath): + tool_table = pyproject.get("tool", {}) + cmdclass = tool_table.get("setuptools", {}).get("cmdclass", {}) + valid_options = _valid_command_options(cmdclass) + + cmd_opts = dist.command_options + for cmd, config in pyproject.get("tool", {}).get("distutils", {}).items(): + cmd = json_compatible_key(cmd) + valid = valid_options.get(cmd, set()) + cmd_opts.setdefault(cmd, {}) + for key, value in config.items(): + key = json_compatible_key(key) + cmd_opts[cmd][key] = (str(filename), value) + if key not in valid: + # To avoid removing options that are specified dynamically we + # just log a warn... + _logger.warning(f"Command option {cmd}.{key} is not defined") + + +def _valid_command_options(cmdclass: Mapping = EMPTY) -> dict[str, set[str]]: + from setuptools.dist import Distribution + + from .._importlib import metadata + + valid_options = {"global": _normalise_cmd_options(Distribution.global_options)} + + unloaded_entry_points = metadata.entry_points(group='distutils.commands') + loaded_entry_points = (_load_ep(ep) for ep in unloaded_entry_points) + entry_points = (ep for ep in loaded_entry_points if ep) + for cmd, cmd_class in chain(entry_points, cmdclass.items()): + opts = valid_options.get(cmd, set()) + opts = opts | _normalise_cmd_options(getattr(cmd_class, "user_options", [])) + valid_options[cmd] = opts + + return valid_options + + +def _load_ep(ep: metadata.EntryPoint) -> tuple[str, type] | None: + if ep.value.startswith("wheel.bdist_wheel"): + # Ignore deprecated entrypoint from wheel and avoid warning pypa/wheel#631 + # TODO: remove check when `bdist_wheel` has been fully removed from pypa/wheel + return None + + # Ignore all the errors + try: + return (ep.name, ep.load()) + except Exception as ex: + msg = f"{ex.__class__.__name__} while trying to load entry-point {ep.name}" + _logger.warning(f"{msg}: {ex}") + return None + + +def _normalise_cmd_option_key(name: str) -> str: + return json_compatible_key(name).strip("_=") + + +def _normalise_cmd_options(desc: _OptionsList) -> set[str]: + return {_normalise_cmd_option_key(fancy_option[0]) for fancy_option in desc} + + +def _get_previous_entrypoints(dist: Distribution) -> dict[str, list]: + ignore = ("console_scripts", "gui_scripts") + value = getattr(dist, "entry_points", None) or {} + return {k: v for k, v in value.items() if k not in ignore} + + +def _get_previous_scripts(dist: Distribution) -> list | None: + value = getattr(dist, "entry_points", None) or {} + return value.get("console_scripts") + + +def _get_previous_gui_scripts(dist: Distribution) -> list | None: + value = getattr(dist, "entry_points", None) or {} + return value.get("gui_scripts") + + +def _set_static_list_metadata(attr: str, dist: Distribution, val: list) -> None: + """Apply distutils metadata validation but preserve "static" behaviour""" + meta = dist.metadata + setter, getter = getattr(meta, f"set_{attr}"), getattr(meta, f"get_{attr}") + setter(val) + setattr(meta, attr, _static.List(getter())) + + +def _attrgetter(attr): + """ + Similar to ``operator.attrgetter`` but returns None if ``attr`` is not found + >>> from types import SimpleNamespace + >>> obj = SimpleNamespace(a=42, b=SimpleNamespace(c=13)) + >>> _attrgetter("a")(obj) + 42 + >>> _attrgetter("b.c")(obj) + 13 + >>> _attrgetter("d")(obj) is None + True + """ + return partial(reduce, lambda acc, x: getattr(acc, x, None), attr.split(".")) + + +def _some_attrgetter(*items): + """ + Return the first "truth-y" attribute or None + >>> from types import SimpleNamespace + >>> obj = SimpleNamespace(a=42, b=SimpleNamespace(c=13)) + >>> _some_attrgetter("d", "a", "b.c")(obj) + 42 + >>> _some_attrgetter("d", "e", "b.c", "a")(obj) + 13 + >>> _some_attrgetter("d", "e", "f")(obj) is None + True + """ + + def _acessor(obj): + values = (_attrgetter(i)(obj) for i in items) + return next((i for i in values if i is not None), None) + + return _acessor + + +PYPROJECT_CORRESPONDENCE: dict[str, _Correspondence] = { + "readme": _long_description, + "license": _license, + "authors": partial(_people, kind="author"), + "maintainers": partial(_people, kind="maintainer"), + "urls": _project_urls, + "dependencies": _dependencies, + "optional_dependencies": _optional_dependencies, + "requires_python": _python_requires, +} + +TOOL_TABLE_RENAMES = {"script_files": "scripts"} +TOOL_TABLE_REMOVALS = { + "namespace_packages": """ + Please migrate to implicit native namespaces instead. + See https://packaging.python.org/en/latest/guides/packaging-namespace-packages/. + """, +} +TOOL_TABLE_CORRESPONDENCE = { + # Fields with corresponding core metadata need to be marked as static: + "obsoletes": partial(_set_static_list_metadata, "obsoletes"), + "provides": partial(_set_static_list_metadata, "provides"), + "platforms": partial(_set_static_list_metadata, "platforms"), +} + +SETUPTOOLS_PATCHES = { + "long_description_content_type", + "project_urls", + "provides_extras", + "license_file", + "license_files", +} + +_PREPROCESS = { + "ext_modules": _ext_modules, +} + +_PREVIOUSLY_DEFINED = { + "name": _attrgetter("metadata.name"), + "version": _attrgetter("metadata.version"), + "description": _attrgetter("metadata.description"), + "readme": _attrgetter("metadata.long_description"), + "requires-python": _some_attrgetter("python_requires", "metadata.python_requires"), + "license": _attrgetter("metadata.license"), + "authors": _some_attrgetter("metadata.author", "metadata.author_email"), + "maintainers": _some_attrgetter("metadata.maintainer", "metadata.maintainer_email"), + "keywords": _attrgetter("metadata.keywords"), + "classifiers": _attrgetter("metadata.classifiers"), + "urls": _attrgetter("metadata.project_urls"), + "entry-points": _get_previous_entrypoints, + "scripts": _get_previous_scripts, + "gui-scripts": _get_previous_gui_scripts, + "dependencies": _attrgetter("install_requires"), + "optional-dependencies": _attrgetter("extras_require"), +} + + +_RESET_PREVIOUSLY_DEFINED: dict = { + # Fix improper setting: given in `setup.py`, but not listed in `dynamic` + # dict: pyproject name => value to which reset + "license": _static.EMPTY_DICT, + "authors": _static.EMPTY_LIST, + "maintainers": _static.EMPTY_LIST, + "keywords": _static.EMPTY_LIST, + "classifiers": _static.EMPTY_LIST, + "urls": _static.EMPTY_DICT, + "entry-points": _static.EMPTY_DICT, + "scripts": _static.EMPTY_DICT, + "gui-scripts": _static.EMPTY_DICT, + "dependencies": _static.EMPTY_LIST, + "optional-dependencies": _static.EMPTY_DICT, +} + + +class _MissingDynamic(SetuptoolsWarning): + _SUMMARY = "`{field}` defined outside of `pyproject.toml` is ignored." + + _DETAILS = """ + The following seems to be defined outside of `pyproject.toml`: + + `{field} = {value!r}` + + According to the spec (see the link below), however, setuptools CANNOT + consider this value unless `{field}` is listed as `dynamic`. + + https://packaging.python.org/en/latest/specifications/pyproject-toml/#declaring-project-metadata-the-project-table + + To prevent this problem, you can list `{field}` under `dynamic` or alternatively + remove the `[project]` table from your file and rely entirely on other means of + configuration. + """ + # TODO: Consider removing this check in the future? + # There is a trade-off here between improving "debug-ability" and the cost + # of running/testing/maintaining these unnecessary checks... + + @classmethod + def details(cls, field: str, value: Any) -> str: + return cls._DETAILS.format(field=field, value=value) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/setuptools/config/distutils.schema.json b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/setuptools/config/distutils.schema.json new file mode 100644 index 0000000000000000000000000000000000000000..93cd2e868acdaf41bf8b1f77026e84c02a5af3f8 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/setuptools/config/distutils.schema.json @@ -0,0 +1,26 @@ +{ + "$schema": "http://json-schema.org/draft-07/schema#", + + "$id": "https://setuptools.pypa.io/en/latest/deprecated/distutils/configfile.html", + "title": "``tool.distutils`` table", + "$$description": [ + "**EXPERIMENTAL** (NOT OFFICIALLY SUPPORTED): Use ``tool.distutils``", + "subtables to configure arguments for ``distutils`` commands.", + "Originally, ``distutils`` allowed developers to configure arguments for", + "``setup.py`` commands via `distutils configuration files", + "`_.", + "See also `the old Python docs _`." + ], + + "type": "object", + "properties": { + "global": { + "type": "object", + "description": "Global options applied to all ``distutils`` commands" + } + }, + "patternProperties": { + ".+": {"type": "object"} + }, + "$comment": "TODO: Is there a practical way of making this schema more specific?" +} diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/setuptools/config/expand.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/setuptools/config/expand.py new file mode 100644 index 0000000000000000000000000000000000000000..531f965013031a8c5159c8b8505c38cfb0a4302f --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/setuptools/config/expand.py @@ -0,0 +1,452 @@ +"""Utility functions to expand configuration directives or special values +(such glob patterns). + +We can split the process of interpreting configuration files into 2 steps: + +1. The parsing the file contents from strings to value objects + that can be understand by Python (for example a string with a comma + separated list of keywords into an actual Python list of strings). + +2. The expansion (or post-processing) of these values according to the + semantics ``setuptools`` assign to them (for example a configuration field + with the ``file:`` directive should be expanded from a list of file paths to + a single string with the contents of those files concatenated) + +This module focus on the second step, and therefore allow sharing the expansion +functions among several configuration file formats. + +**PRIVATE MODULE**: API reserved for setuptools internal usage only. +""" + +from __future__ import annotations + +import ast +import importlib +import os +import pathlib +import sys +from collections.abc import Iterable, Iterator, Mapping +from configparser import ConfigParser +from glob import iglob +from importlib.machinery import ModuleSpec, all_suffixes +from itertools import chain +from pathlib import Path +from types import ModuleType, TracebackType +from typing import TYPE_CHECKING, Any, Callable, TypeVar + +from .. import _static +from .._path import StrPath, same_path as _same_path +from ..discovery import find_package_path +from ..warnings import SetuptoolsWarning + +from distutils.errors import DistutilsOptionError + +if TYPE_CHECKING: + from typing_extensions import Self + + from setuptools.dist import Distribution + +_K = TypeVar("_K") +_V_co = TypeVar("_V_co", covariant=True) + + +class StaticModule: + """Proxy to a module object that avoids executing arbitrary code.""" + + def __init__(self, name: str, spec: ModuleSpec) -> None: + module = ast.parse(pathlib.Path(spec.origin).read_bytes()) # type: ignore[arg-type] # Let it raise an error on None + vars(self).update(locals()) + del self.self + + def _find_assignments(self) -> Iterator[tuple[ast.AST, ast.AST]]: + for statement in self.module.body: + if isinstance(statement, ast.Assign): + yield from ((target, statement.value) for target in statement.targets) + elif isinstance(statement, ast.AnnAssign) and statement.value: + yield (statement.target, statement.value) + + def __getattr__(self, attr: str): + """Attempt to load an attribute "statically", via :func:`ast.literal_eval`.""" + try: + return next( + ast.literal_eval(value) + for target, value in self._find_assignments() + if isinstance(target, ast.Name) and target.id == attr + ) + except Exception as e: + raise AttributeError(f"{self.name} has no attribute {attr}") from e + + +def glob_relative( + patterns: Iterable[str], root_dir: StrPath | None = None +) -> list[str]: + """Expand the list of glob patterns, but preserving relative paths. + + :param list[str] patterns: List of glob patterns + :param str root_dir: Path to which globs should be relative + (current directory by default) + :rtype: list + """ + glob_characters = {'*', '?', '[', ']', '{', '}'} + expanded_values = [] + root_dir = root_dir or os.getcwd() + for value in patterns: + # Has globby characters? + if any(char in value for char in glob_characters): + # then expand the glob pattern while keeping paths *relative*: + glob_path = os.path.abspath(os.path.join(root_dir, value)) + expanded_values.extend( + sorted( + os.path.relpath(path, root_dir).replace(os.sep, "/") + for path in iglob(glob_path, recursive=True) + ) + ) + + else: + # take the value as-is + path = os.path.relpath(value, root_dir).replace(os.sep, "/") + expanded_values.append(path) + + return expanded_values + + +def read_files( + filepaths: StrPath | Iterable[StrPath], root_dir: StrPath | None = None +) -> str: + """Return the content of the files concatenated using ``\n`` as str + + This function is sandboxed and won't reach anything outside ``root_dir`` + + (By default ``root_dir`` is the current directory). + """ + from more_itertools import always_iterable + + root_dir = os.path.abspath(root_dir or os.getcwd()) + _filepaths = (os.path.join(root_dir, path) for path in always_iterable(filepaths)) + return '\n'.join( + _read_file(path) + for path in _filter_existing_files(_filepaths) + if _assert_local(path, root_dir) + ) + + +def _filter_existing_files(filepaths: Iterable[StrPath]) -> Iterator[StrPath]: + for path in filepaths: + if os.path.isfile(path): + yield path + else: + SetuptoolsWarning.emit(f"File {path!r} cannot be found") + + +def _read_file(filepath: bytes | StrPath) -> str: + with open(filepath, encoding='utf-8') as f: + return f.read() + + +def _assert_local(filepath: StrPath, root_dir: str): + if Path(os.path.abspath(root_dir)) not in Path(os.path.abspath(filepath)).parents: + msg = f"Cannot access {filepath!r} (or anything outside {root_dir!r})" + raise DistutilsOptionError(msg) + + return True + + +def read_attr( + attr_desc: str, + package_dir: Mapping[str, str] | None = None, + root_dir: StrPath | None = None, +) -> Any: + """Reads the value of an attribute from a module. + + This function will try to read the attributed statically first + (via :func:`ast.literal_eval`), and only evaluate the module if it fails. + + Examples: + read_attr("package.attr") + read_attr("package.module.attr") + + :param str attr_desc: Dot-separated string describing how to reach the + attribute (see examples above) + :param dict[str, str] package_dir: Mapping of package names to their + location in disk (represented by paths relative to ``root_dir``). + :param str root_dir: Path to directory containing all the packages in + ``package_dir`` (current directory by default). + :rtype: str + """ + root_dir = root_dir or os.getcwd() + attrs_path = attr_desc.strip().split('.') + attr_name = attrs_path.pop() + module_name = '.'.join(attrs_path) + module_name = module_name or '__init__' + path = _find_module(module_name, package_dir, root_dir) + spec = _find_spec(module_name, path) + + try: + value = getattr(StaticModule(module_name, spec), attr_name) + # XXX: Is marking as static contents coming from modules too optimistic? + return _static.attempt_conversion(value) + except Exception: + # fallback to evaluate module + module = _load_spec(spec, module_name) + return getattr(module, attr_name) + + +def _find_spec(module_name: str, module_path: StrPath | None) -> ModuleSpec: + spec = importlib.util.spec_from_file_location(module_name, module_path) + spec = spec or importlib.util.find_spec(module_name) + + if spec is None: + raise ModuleNotFoundError(module_name) + + return spec + + +def _load_spec(spec: ModuleSpec, module_name: str) -> ModuleType: + name = getattr(spec, "__name__", module_name) + if name in sys.modules: + return sys.modules[name] + module = importlib.util.module_from_spec(spec) + sys.modules[name] = module # cache (it also ensures `==` works on loaded items) + assert spec.loader is not None + spec.loader.exec_module(module) + return module + + +def _find_module( + module_name: str, package_dir: Mapping[str, str] | None, root_dir: StrPath +) -> str | None: + """Find the path to the module named ``module_name``, + considering the ``package_dir`` in the build configuration and ``root_dir``. + + >>> tmp = getfixture('tmpdir') + >>> _ = tmp.ensure("a/b/c.py") + >>> _ = tmp.ensure("a/b/d/__init__.py") + >>> r = lambda x: x.replace(str(tmp), "tmp").replace(os.sep, "/") + >>> r(_find_module("a.b.c", None, tmp)) + 'tmp/a/b/c.py' + >>> r(_find_module("f.g.h", {"": "1", "f": "2", "f.g": "3", "f.g.h": "a/b/d"}, tmp)) + 'tmp/a/b/d/__init__.py' + """ + path_start = find_package_path(module_name, package_dir or {}, root_dir) + candidates = chain.from_iterable( + (f"{path_start}{ext}", os.path.join(path_start, f"__init__{ext}")) + for ext in all_suffixes() + ) + return next((x for x in candidates if os.path.isfile(x)), None) + + +def resolve_class( + qualified_class_name: str, + package_dir: Mapping[str, str] | None = None, + root_dir: StrPath | None = None, +) -> Callable: + """Given a qualified class name, return the associated class object""" + root_dir = root_dir or os.getcwd() + idx = qualified_class_name.rfind('.') + class_name = qualified_class_name[idx + 1 :] + pkg_name = qualified_class_name[:idx] + + path = _find_module(pkg_name, package_dir, root_dir) + module = _load_spec(_find_spec(pkg_name, path), pkg_name) + return getattr(module, class_name) + + +def cmdclass( + values: dict[str, str], + package_dir: Mapping[str, str] | None = None, + root_dir: StrPath | None = None, +) -> dict[str, Callable]: + """Given a dictionary mapping command names to strings for qualified class + names, apply :func:`resolve_class` to the dict values. + """ + return {k: resolve_class(v, package_dir, root_dir) for k, v in values.items()} + + +def find_packages( + *, + namespaces=True, + fill_package_dir: dict[str, str] | None = None, + root_dir: StrPath | None = None, + **kwargs, +) -> list[str]: + """Works similarly to :func:`setuptools.find_packages`, but with all + arguments given as keyword arguments. Moreover, ``where`` can be given + as a list (the results will be simply concatenated). + + When the additional keyword argument ``namespaces`` is ``True``, it will + behave like :func:`setuptools.find_namespace_packages`` (i.e. include + implicit namespaces as per :pep:`420`). + + The ``where`` argument will be considered relative to ``root_dir`` (or the current + working directory when ``root_dir`` is not given). + + If the ``fill_package_dir`` argument is passed, this function will consider it as a + similar data structure to the ``package_dir`` configuration parameter add fill-in + any missing package location. + + :rtype: list + """ + from more_itertools import always_iterable, unique_everseen + + from setuptools.discovery import construct_package_dir + + # check "not namespaces" first due to python/mypy#6232 + if not namespaces: + from setuptools.discovery import PackageFinder + else: + from setuptools.discovery import PEP420PackageFinder as PackageFinder + + root_dir = root_dir or os.curdir + where = kwargs.pop('where', ['.']) + packages: list[str] = [] + fill_package_dir = {} if fill_package_dir is None else fill_package_dir + search = list(unique_everseen(always_iterable(where))) + + if len(search) == 1 and all(not _same_path(search[0], x) for x in (".", root_dir)): + fill_package_dir.setdefault("", search[0]) + + for path in search: + package_path = _nest_path(root_dir, path) + pkgs = PackageFinder.find(package_path, **kwargs) + packages.extend(pkgs) + if pkgs and not ( + fill_package_dir.get("") == path or os.path.samefile(package_path, root_dir) + ): + fill_package_dir.update(construct_package_dir(pkgs, path)) + + return packages + + +def _nest_path(parent: StrPath, path: StrPath) -> str: + path = parent if path in {".", ""} else os.path.join(parent, path) + return os.path.normpath(path) + + +def version(value: Callable | Iterable[str | int] | str) -> str: + """When getting the version directly from an attribute, + it should be normalised to string. + """ + _value = value() if callable(value) else value + + if isinstance(_value, str): + return _value + if hasattr(_value, '__iter__'): + return '.'.join(map(str, _value)) + return f'{_value}' + + +def canonic_package_data(package_data: dict) -> dict: + if "*" in package_data: + package_data[""] = package_data.pop("*") + return package_data + + +def canonic_data_files( + data_files: list | dict, root_dir: StrPath | None = None +) -> list[tuple[str, list[str]]]: + """For compatibility with ``setup.py``, ``data_files`` should be a list + of pairs instead of a dict. + + This function also expands glob patterns. + """ + if isinstance(data_files, list): + return data_files + + return [ + (dest, glob_relative(patterns, root_dir)) + for dest, patterns in data_files.items() + ] + + +def entry_points( + text: str, text_source: str = "entry-points" +) -> dict[str, dict[str, str]]: + """Given the contents of entry-points file, + process it into a 2-level dictionary (``dict[str, dict[str, str]]``). + The first level keys are entry-point groups, the second level keys are + entry-point names, and the second level values are references to objects + (that correspond to the entry-point value). + """ + # Using undocumented behaviour, see python/typeshed#12700 + parser = ConfigParser(default_section=None, delimiters=("=",)) # type: ignore[call-overload] + parser.optionxform = str # case sensitive + parser.read_string(text, text_source) + groups = {k: dict(v.items()) for k, v in parser.items()} + groups.pop(parser.default_section, None) + return groups + + +class EnsurePackagesDiscovered: + """Some expand functions require all the packages to already be discovered before + they run, e.g. :func:`read_attr`, :func:`resolve_class`, :func:`cmdclass`. + + Therefore in some cases we will need to run autodiscovery during the evaluation of + the configuration. However, it is better to postpone calling package discovery as + much as possible, because some parameters can influence it (e.g. ``package_dir``), + and those might not have been processed yet. + """ + + def __init__(self, distribution: Distribution) -> None: + self._dist = distribution + self._called = False + + def __call__(self): + """Trigger the automatic package discovery, if it is still necessary.""" + if not self._called: + self._called = True + self._dist.set_defaults(name=False) # Skip name, we can still be parsing + + def __enter__(self) -> Self: + return self + + def __exit__( + self, + exc_type: type[BaseException] | None, + exc_value: BaseException | None, + traceback: TracebackType | None, + ): + if self._called: + self._dist.set_defaults.analyse_name() # Now we can set a default name + + def _get_package_dir(self) -> Mapping[str, str]: + self() + pkg_dir = self._dist.package_dir + return {} if pkg_dir is None else pkg_dir + + @property + def package_dir(self) -> Mapping[str, str]: + """Proxy to ``package_dir`` that may trigger auto-discovery when used.""" + return LazyMappingProxy(self._get_package_dir) + + +class LazyMappingProxy(Mapping[_K, _V_co]): + """Mapping proxy that delays resolving the target object, until really needed. + + >>> def obtain_mapping(): + ... print("Running expensive function!") + ... return {"key": "value", "other key": "other value"} + >>> mapping = LazyMappingProxy(obtain_mapping) + >>> mapping["key"] + Running expensive function! + 'value' + >>> mapping["other key"] + 'other value' + """ + + def __init__(self, obtain_mapping_value: Callable[[], Mapping[_K, _V_co]]) -> None: + self._obtain = obtain_mapping_value + self._value: Mapping[_K, _V_co] | None = None + + def _target(self) -> Mapping[_K, _V_co]: + if self._value is None: + self._value = self._obtain() + return self._value + + def __getitem__(self, key: _K) -> _V_co: + return self._target()[key] + + def __len__(self) -> int: + return len(self._target()) + + def __iter__(self) -> Iterator[_K]: + return iter(self._target()) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/setuptools/config/pyprojecttoml.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/setuptools/config/pyprojecttoml.py new file mode 100644 index 0000000000000000000000000000000000000000..fd6c5968c820435a0a8a82ecfacbe94abd7e8d3d --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/setuptools/config/pyprojecttoml.py @@ -0,0 +1,468 @@ +""" +Load setuptools configuration from ``pyproject.toml`` files. + +**PRIVATE MODULE**: API reserved for setuptools internal usage only. + +To read project metadata, consider using +``build.util.project_wheel_metadata`` (https://pypi.org/project/build/). +For simple scenarios, you can also try parsing the file directly +with the help of ``tomllib`` or ``tomli``. +""" + +from __future__ import annotations + +import logging +import os +from collections.abc import Mapping +from contextlib import contextmanager +from functools import partial +from types import TracebackType +from typing import TYPE_CHECKING, Any, Callable + +from .._path import StrPath +from ..errors import FileError, InvalidConfigError +from ..warnings import SetuptoolsWarning +from . import expand as _expand +from ._apply_pyprojecttoml import _PREVIOUSLY_DEFINED, _MissingDynamic, apply as _apply + +if TYPE_CHECKING: + from typing_extensions import Self + + from setuptools.dist import Distribution + +_logger = logging.getLogger(__name__) + + +def load_file(filepath: StrPath) -> dict: + from ..compat.py310 import tomllib + + with open(filepath, "rb") as file: + return tomllib.load(file) + + +def validate(config: dict, filepath: StrPath) -> bool: + from . import _validate_pyproject as validator + + trove_classifier = validator.FORMAT_FUNCTIONS.get("trove-classifier") + if hasattr(trove_classifier, "_disable_download"): + # Improve reproducibility by default. See abravalheri/validate-pyproject#31 + trove_classifier._disable_download() # type: ignore[union-attr] + + try: + return validator.validate(config) + except validator.ValidationError as ex: + summary = f"configuration error: {ex.summary}" + if ex.name.strip("`") != "project": + # Probably it is just a field missing/misnamed, not worthy the verbosity... + _logger.debug(summary) + _logger.debug(ex.details) + + error = f"invalid pyproject.toml config: {ex.name}." + raise ValueError(f"{error}\n{summary}") from None + + +def apply_configuration( + dist: Distribution, + filepath: StrPath, + ignore_option_errors: bool = False, +) -> Distribution: + """Apply the configuration from a ``pyproject.toml`` file into an existing + distribution object. + """ + config = read_configuration(filepath, True, ignore_option_errors, dist) + return _apply(dist, config, filepath) + + +def read_configuration( + filepath: StrPath, + expand: bool = True, + ignore_option_errors: bool = False, + dist: Distribution | None = None, +) -> dict[str, Any]: + """Read given configuration file and returns options from it as a dict. + + :param str|unicode filepath: Path to configuration file in the ``pyproject.toml`` + format. + + :param bool expand: Whether to expand directives and other computed values + (i.e. post-process the given configuration) + + :param bool ignore_option_errors: Whether to silently ignore + options, values of which could not be resolved (e.g. due to exceptions + in directives such as file:, attr:, etc.). + If False exceptions are propagated as expected. + + :param Distribution|None: Distribution object to which the configuration refers. + If not given a dummy object will be created and discarded after the + configuration is read. This is used for auto-discovery of packages and in the + case a dynamic configuration (e.g. ``attr`` or ``cmdclass``) is expanded. + When ``expand=False`` this object is simply ignored. + + :rtype: dict + """ + filepath = os.path.abspath(filepath) + + if not os.path.isfile(filepath): + raise FileError(f"Configuration file {filepath!r} does not exist.") + + asdict = load_file(filepath) or {} + project_table = asdict.get("project", {}) + tool_table = asdict.get("tool", {}) + setuptools_table = tool_table.get("setuptools", {}) + if not asdict or not (project_table or setuptools_table): + return {} # User is not using pyproject to configure setuptools + + if "setuptools" in asdict.get("tools", {}): + # let the user know they probably have a typo in their metadata + _ToolsTypoInMetadata.emit() + + if "distutils" in tool_table: + _ExperimentalConfiguration.emit(subject="[tool.distutils]") + + # There is an overall sense in the community that making include_package_data=True + # the default would be an improvement. + # `ini2toml` backfills include_package_data=False when nothing is explicitly given, + # therefore setting a default here is backwards compatible. + if dist and dist.include_package_data is not None: + setuptools_table.setdefault("include-package-data", dist.include_package_data) + else: + setuptools_table.setdefault("include-package-data", True) + # Persist changes: + asdict["tool"] = tool_table + tool_table["setuptools"] = setuptools_table + + if "ext-modules" in setuptools_table: + _ExperimentalConfiguration.emit(subject="[tool.setuptools.ext-modules]") + + with _ignore_errors(ignore_option_errors): + # Don't complain about unrelated errors (e.g. tools not using the "tool" table) + subset = {"project": project_table, "tool": {"setuptools": setuptools_table}} + validate(subset, filepath) + + if expand: + root_dir = os.path.dirname(filepath) + return expand_configuration(asdict, root_dir, ignore_option_errors, dist) + + return asdict + + +def expand_configuration( + config: dict, + root_dir: StrPath | None = None, + ignore_option_errors: bool = False, + dist: Distribution | None = None, +) -> dict: + """Given a configuration with unresolved fields (e.g. dynamic, cmdclass, ...) + find their final values. + + :param dict config: Dict containing the configuration for the distribution + :param str root_dir: Top-level directory for the distribution/project + (the same directory where ``pyproject.toml`` is place) + :param bool ignore_option_errors: see :func:`read_configuration` + :param Distribution|None: Distribution object to which the configuration refers. + If not given a dummy object will be created and discarded after the + configuration is read. Used in the case a dynamic configuration + (e.g. ``attr`` or ``cmdclass``). + + :rtype: dict + """ + return _ConfigExpander(config, root_dir, ignore_option_errors, dist).expand() + + +class _ConfigExpander: + def __init__( + self, + config: dict, + root_dir: StrPath | None = None, + ignore_option_errors: bool = False, + dist: Distribution | None = None, + ) -> None: + self.config = config + self.root_dir = root_dir or os.getcwd() + self.project_cfg = config.get("project", {}) + self.dynamic = self.project_cfg.get("dynamic", []) + self.setuptools_cfg = config.get("tool", {}).get("setuptools", {}) + self.dynamic_cfg = self.setuptools_cfg.get("dynamic", {}) + self.ignore_option_errors = ignore_option_errors + self._dist = dist + self._referenced_files = set[str]() + + def _ensure_dist(self) -> Distribution: + from setuptools.dist import Distribution + + attrs = {"src_root": self.root_dir, "name": self.project_cfg.get("name", None)} + return self._dist or Distribution(attrs) + + def _process_field(self, container: dict, field: str, fn: Callable): + if field in container: + with _ignore_errors(self.ignore_option_errors): + container[field] = fn(container[field]) + + def _canonic_package_data(self, field="package-data"): + package_data = self.setuptools_cfg.get(field, {}) + return _expand.canonic_package_data(package_data) + + def expand(self): + self._expand_packages() + self._canonic_package_data() + self._canonic_package_data("exclude-package-data") + + # A distribution object is required for discovering the correct package_dir + dist = self._ensure_dist() + ctx = _EnsurePackagesDiscovered(dist, self.project_cfg, self.setuptools_cfg) + with ctx as ensure_discovered: + package_dir = ensure_discovered.package_dir + self._expand_data_files() + self._expand_cmdclass(package_dir) + self._expand_all_dynamic(dist, package_dir) + + dist._referenced_files.update(self._referenced_files) + return self.config + + def _expand_packages(self): + packages = self.setuptools_cfg.get("packages") + if packages is None or isinstance(packages, (list, tuple)): + return + + find = packages.get("find") + if isinstance(find, dict): + find["root_dir"] = self.root_dir + find["fill_package_dir"] = self.setuptools_cfg.setdefault("package-dir", {}) + with _ignore_errors(self.ignore_option_errors): + self.setuptools_cfg["packages"] = _expand.find_packages(**find) + + def _expand_data_files(self): + data_files = partial(_expand.canonic_data_files, root_dir=self.root_dir) + self._process_field(self.setuptools_cfg, "data-files", data_files) + + def _expand_cmdclass(self, package_dir: Mapping[str, str]): + root_dir = self.root_dir + cmdclass = partial(_expand.cmdclass, package_dir=package_dir, root_dir=root_dir) + self._process_field(self.setuptools_cfg, "cmdclass", cmdclass) + + def _expand_all_dynamic(self, dist: Distribution, package_dir: Mapping[str, str]): + special = ( # need special handling + "version", + "readme", + "entry-points", + "scripts", + "gui-scripts", + "classifiers", + "dependencies", + "optional-dependencies", + ) + # `_obtain` functions are assumed to raise appropriate exceptions/warnings. + obtained_dynamic = { + field: self._obtain(dist, field, package_dir) + for field in self.dynamic + if field not in special + } + obtained_dynamic.update( + self._obtain_entry_points(dist, package_dir) or {}, + version=self._obtain_version(dist, package_dir), + readme=self._obtain_readme(dist), + classifiers=self._obtain_classifiers(dist), + dependencies=self._obtain_dependencies(dist), + optional_dependencies=self._obtain_optional_dependencies(dist), + ) + # `None` indicates there is nothing in `tool.setuptools.dynamic` but the value + # might have already been set by setup.py/extensions, so avoid overwriting. + updates = {k: v for k, v in obtained_dynamic.items() if v is not None} + self.project_cfg.update(updates) + + def _ensure_previously_set(self, dist: Distribution, field: str): + previous = _PREVIOUSLY_DEFINED[field](dist) + if previous is None and not self.ignore_option_errors: + msg = ( + f"No configuration found for dynamic {field!r}.\n" + "Some dynamic fields need to be specified via `tool.setuptools.dynamic`" + "\nothers must be specified via the equivalent attribute in `setup.py`." + ) + raise InvalidConfigError(msg) + + def _expand_directive( + self, specifier: str, directive, package_dir: Mapping[str, str] + ): + from more_itertools import always_iterable + + with _ignore_errors(self.ignore_option_errors): + root_dir = self.root_dir + if "file" in directive: + self._referenced_files.update(always_iterable(directive["file"])) + return _expand.read_files(directive["file"], root_dir) + if "attr" in directive: + return _expand.read_attr(directive["attr"], package_dir, root_dir) + raise ValueError(f"invalid `{specifier}`: {directive!r}") + return None + + def _obtain(self, dist: Distribution, field: str, package_dir: Mapping[str, str]): + if field in self.dynamic_cfg: + return self._expand_directive( + f"tool.setuptools.dynamic.{field}", + self.dynamic_cfg[field], + package_dir, + ) + self._ensure_previously_set(dist, field) + return None + + def _obtain_version(self, dist: Distribution, package_dir: Mapping[str, str]): + # Since plugins can set version, let's silently skip if it cannot be obtained + if "version" in self.dynamic and "version" in self.dynamic_cfg: + return _expand.version( + # We already do an early check for the presence of "version" + self._obtain(dist, "version", package_dir) # pyright: ignore[reportArgumentType] + ) + return None + + def _obtain_readme(self, dist: Distribution) -> dict[str, str] | None: + if "readme" not in self.dynamic: + return None + + dynamic_cfg = self.dynamic_cfg + if "readme" in dynamic_cfg: + return { + # We already do an early check for the presence of "readme" + "text": self._obtain(dist, "readme", {}), + "content-type": dynamic_cfg["readme"].get("content-type", "text/x-rst"), + } # pyright: ignore[reportReturnType] + + self._ensure_previously_set(dist, "readme") + return None + + def _obtain_entry_points( + self, dist: Distribution, package_dir: Mapping[str, str] + ) -> dict[str, dict[str, Any]] | None: + fields = ("entry-points", "scripts", "gui-scripts") + if not any(field in self.dynamic for field in fields): + return None + + text = self._obtain(dist, "entry-points", package_dir) + if text is None: + return None + + groups = _expand.entry_points(text) + # Any is str | dict[str, str], but causes variance issues + expanded: dict[str, dict[str, Any]] = {"entry-points": groups} + + def _set_scripts(field: str, group: str): + if group in groups: + value = groups.pop(group) + if field not in self.dynamic: + raise InvalidConfigError(_MissingDynamic.details(field, value)) + expanded[field] = value + + _set_scripts("scripts", "console_scripts") + _set_scripts("gui-scripts", "gui_scripts") + + return expanded + + def _obtain_classifiers(self, dist: Distribution): + if "classifiers" in self.dynamic: + value = self._obtain(dist, "classifiers", {}) + if value: + return value.splitlines() + return None + + def _obtain_dependencies(self, dist: Distribution): + if "dependencies" in self.dynamic: + value = self._obtain(dist, "dependencies", {}) + if value: + return _parse_requirements_list(value) + return None + + def _obtain_optional_dependencies(self, dist: Distribution): + if "optional-dependencies" not in self.dynamic: + return None + if "optional-dependencies" in self.dynamic_cfg: + optional_dependencies_map = self.dynamic_cfg["optional-dependencies"] + assert isinstance(optional_dependencies_map, dict) + return { + group: _parse_requirements_list( + self._expand_directive( + f"tool.setuptools.dynamic.optional-dependencies.{group}", + directive, + {}, + ) + ) + for group, directive in optional_dependencies_map.items() + } + self._ensure_previously_set(dist, "optional-dependencies") + return None + + +def _parse_requirements_list(value): + return [ + line + for line in value.splitlines() + if line.strip() and not line.strip().startswith("#") + ] + + +@contextmanager +def _ignore_errors(ignore_option_errors: bool): + if not ignore_option_errors: + yield + return + + try: + yield + except Exception as ex: + _logger.debug(f"ignored error: {ex.__class__.__name__} - {ex}") + + +class _EnsurePackagesDiscovered(_expand.EnsurePackagesDiscovered): + def __init__( + self, distribution: Distribution, project_cfg: dict, setuptools_cfg: dict + ) -> None: + super().__init__(distribution) + self._project_cfg = project_cfg + self._setuptools_cfg = setuptools_cfg + + def __enter__(self) -> Self: + """When entering the context, the values of ``packages``, ``py_modules`` and + ``package_dir`` that are missing in ``dist`` are copied from ``setuptools_cfg``. + """ + dist, cfg = self._dist, self._setuptools_cfg + package_dir: dict[str, str] = cfg.setdefault("package-dir", {}) + package_dir.update(dist.package_dir or {}) + dist.package_dir = package_dir # needs to be the same object + + dist.set_defaults._ignore_ext_modules() # pyproject.toml-specific behaviour + + # Set `name`, `py_modules` and `packages` in dist to short-circuit + # auto-discovery, but avoid overwriting empty lists purposefully set by users. + if dist.metadata.name is None: + dist.metadata.name = self._project_cfg.get("name") + if dist.py_modules is None: + dist.py_modules = cfg.get("py-modules") + if dist.packages is None: + dist.packages = cfg.get("packages") + + return super().__enter__() + + def __exit__( + self, + exc_type: type[BaseException] | None, + exc_value: BaseException | None, + traceback: TracebackType | None, + ) -> None: + """When exiting the context, if values of ``packages``, ``py_modules`` and + ``package_dir`` are missing in ``setuptools_cfg``, copy from ``dist``. + """ + # If anything was discovered set them back, so they count in the final config. + self._setuptools_cfg.setdefault("packages", self._dist.packages) + self._setuptools_cfg.setdefault("py-modules", self._dist.py_modules) + return super().__exit__(exc_type, exc_value, traceback) + + +class _ExperimentalConfiguration(SetuptoolsWarning): + _SUMMARY = ( + "`{subject}` in `pyproject.toml` is still *experimental* " + "and likely to change in future releases." + ) + + +class _ToolsTypoInMetadata(SetuptoolsWarning): + _SUMMARY = ( + "Ignoring [tools.setuptools] in pyproject.toml, did you mean [tool.setuptools]?" + ) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/setuptools/config/setupcfg.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/setuptools/config/setupcfg.py new file mode 100644 index 0000000000000000000000000000000000000000..633aa9d45d086dd0290a2c38bee632ed7f9a63a4 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/setuptools/config/setupcfg.py @@ -0,0 +1,780 @@ +""" +Load setuptools configuration from ``setup.cfg`` files. + +**API will be made private in the future** + +To read project metadata, consider using +``build.util.project_wheel_metadata`` (https://pypi.org/project/build/). +For simple scenarios, you can also try parsing the file directly +with the help of ``configparser``. +""" + +from __future__ import annotations + +import contextlib +import functools +import os +from collections import defaultdict +from collections.abc import Iterable, Iterator +from functools import partial, wraps +from typing import TYPE_CHECKING, Any, Callable, ClassVar, Generic, TypeVar, cast + +from packaging.markers import default_environment as marker_env +from packaging.requirements import InvalidRequirement, Requirement +from packaging.version import InvalidVersion, Version + +from .. import _static +from .._path import StrPath +from ..errors import FileError, OptionError +from ..warnings import SetuptoolsDeprecationWarning +from . import expand + +if TYPE_CHECKING: + from typing_extensions import TypeAlias + + from setuptools.dist import Distribution + + from distutils.dist import DistributionMetadata + +SingleCommandOptions: TypeAlias = dict[str, tuple[str, Any]] +"""Dict that associate the name of the options of a particular command to a +tuple. The first element of the tuple indicates the origin of the option value +(e.g. the name of the configuration file where it was read from), +while the second element of the tuple is the option value itself +""" +AllCommandOptions: TypeAlias = dict[str, SingleCommandOptions] +"""cmd name => its options""" +Target = TypeVar("Target", "Distribution", "DistributionMetadata") + + +def read_configuration( + filepath: StrPath, find_others: bool = False, ignore_option_errors: bool = False +) -> dict: + """Read given configuration file and returns options from it as a dict. + + :param str|unicode filepath: Path to configuration file + to get options from. + + :param bool find_others: Whether to search for other configuration files + which could be on in various places. + + :param bool ignore_option_errors: Whether to silently ignore + options, values of which could not be resolved (e.g. due to exceptions + in directives such as file:, attr:, etc.). + If False exceptions are propagated as expected. + + :rtype: dict + """ + from setuptools.dist import Distribution + + dist = Distribution() + filenames = dist.find_config_files() if find_others else [] + handlers = _apply(dist, filepath, filenames, ignore_option_errors) + return configuration_to_dict(handlers) + + +def apply_configuration(dist: Distribution, filepath: StrPath) -> Distribution: + """Apply the configuration from a ``setup.cfg`` file into an existing + distribution object. + """ + _apply(dist, filepath) + dist._finalize_requires() + return dist + + +def _apply( + dist: Distribution, + filepath: StrPath, + other_files: Iterable[StrPath] = (), + ignore_option_errors: bool = False, +) -> tuple[ConfigMetadataHandler, ConfigOptionsHandler]: + """Read configuration from ``filepath`` and applies to the ``dist`` object.""" + from setuptools.dist import _Distribution + + filepath = os.path.abspath(filepath) + + if not os.path.isfile(filepath): + raise FileError(f'Configuration file {filepath} does not exist.') + + current_directory = os.getcwd() + os.chdir(os.path.dirname(filepath)) + filenames = [*other_files, filepath] + + try: + # TODO: Temporary cast until mypy 1.12 is released with upstream fixes from typeshed + _Distribution.parse_config_files(dist, filenames=cast(list[str], filenames)) + handlers = parse_configuration( + dist, dist.command_options, ignore_option_errors=ignore_option_errors + ) + dist._finalize_license_files() + finally: + os.chdir(current_directory) + + return handlers + + +def _get_option(target_obj: Distribution | DistributionMetadata, key: str): + """ + Given a target object and option key, get that option from + the target object, either through a get_{key} method or + from an attribute directly. + """ + getter_name = f'get_{key}' + by_attribute = functools.partial(getattr, target_obj, key) + getter = getattr(target_obj, getter_name, by_attribute) + return getter() + + +def configuration_to_dict( + handlers: Iterable[ + ConfigHandler[Distribution] | ConfigHandler[DistributionMetadata] + ], +) -> dict: + """Returns configuration data gathered by given handlers as a dict. + + :param Iterable[ConfigHandler] handlers: Handlers list, + usually from parse_configuration() + + :rtype: dict + """ + config_dict: dict = defaultdict(dict) + + for handler in handlers: + for option in handler.set_options: + value = _get_option(handler.target_obj, option) + config_dict[handler.section_prefix][option] = value + + return config_dict + + +def parse_configuration( + distribution: Distribution, + command_options: AllCommandOptions, + ignore_option_errors: bool = False, +) -> tuple[ConfigMetadataHandler, ConfigOptionsHandler]: + """Performs additional parsing of configuration options + for a distribution. + + Returns a list of used option handlers. + + :param Distribution distribution: + :param dict command_options: + :param bool ignore_option_errors: Whether to silently ignore + options, values of which could not be resolved (e.g. due to exceptions + in directives such as file:, attr:, etc.). + If False exceptions are propagated as expected. + :rtype: list + """ + with expand.EnsurePackagesDiscovered(distribution) as ensure_discovered: + options = ConfigOptionsHandler( + distribution, + command_options, + ignore_option_errors, + ensure_discovered, + ) + + options.parse() + if not distribution.package_dir: + distribution.package_dir = options.package_dir # Filled by `find_packages` + + meta = ConfigMetadataHandler( + distribution.metadata, + command_options, + ignore_option_errors, + ensure_discovered, + distribution.package_dir, + distribution.src_root, + ) + meta.parse() + distribution._referenced_files.update( + options._referenced_files, meta._referenced_files + ) + + return meta, options + + +def _warn_accidental_env_marker_misconfig(label: str, orig_value: str, parsed: list): + """Because users sometimes misinterpret this configuration: + + [options.extras_require] + foo = bar;python_version<"4" + + It looks like one requirement with an environment marker + but because there is no newline, it's parsed as two requirements + with a semicolon as separator. + + Therefore, if: + * input string does not contain a newline AND + * parsed result contains two requirements AND + * parsing of the two parts from the result (";") + leads in a valid Requirement with a valid marker + a UserWarning is shown to inform the user about the possible problem. + """ + if "\n" in orig_value or len(parsed) != 2: + return + + markers = marker_env().keys() + + try: + req = Requirement(parsed[1]) + if req.name in markers: + _AmbiguousMarker.emit(field=label, req=parsed[1]) + except InvalidRequirement as ex: + if any(parsed[1].startswith(marker) for marker in markers): + msg = _AmbiguousMarker.message(field=label, req=parsed[1]) + raise InvalidRequirement(msg) from ex + + +class ConfigHandler(Generic[Target]): + """Handles metadata supplied in configuration files.""" + + section_prefix: str + """Prefix for config sections handled by this handler. + Must be provided by class heirs. + + """ + + aliases: ClassVar[dict[str, str]] = {} + """Options aliases. + For compatibility with various packages. E.g.: d2to1 and pbr. + Note: `-` in keys is replaced with `_` by config parser. + + """ + + def __init__( + self, + target_obj: Target, + options: AllCommandOptions, + ignore_option_errors, + ensure_discovered: expand.EnsurePackagesDiscovered, + ) -> None: + self.ignore_option_errors = ignore_option_errors + self.target_obj: Target = target_obj + self.sections = dict(self._section_options(options)) + self.set_options: list[str] = [] + self.ensure_discovered = ensure_discovered + self._referenced_files = set[str]() + """After parsing configurations, this property will enumerate + all files referenced by the "file:" directive. Private API for setuptools only. + """ + + @classmethod + def _section_options( + cls, options: AllCommandOptions + ) -> Iterator[tuple[str, SingleCommandOptions]]: + for full_name, value in options.items(): + pre, _sep, name = full_name.partition(cls.section_prefix) + if pre: + continue + yield name.lstrip('.'), value + + @property + def parsers(self): + """Metadata item name to parser function mapping.""" + raise NotImplementedError( + f'{self.__class__.__name__} must provide .parsers property' + ) + + def __setitem__(self, option_name, value) -> None: + target_obj = self.target_obj + + # Translate alias into real name. + option_name = self.aliases.get(option_name, option_name) + + try: + current_value = getattr(target_obj, option_name) + except AttributeError as e: + raise KeyError(option_name) from e + + if current_value: + # Already inhabited. Skipping. + return + + try: + parsed = self.parsers.get(option_name, lambda x: x)(value) + except (Exception,) * self.ignore_option_errors: + return + + simple_setter = functools.partial(target_obj.__setattr__, option_name) + setter = getattr(target_obj, f"set_{option_name}", simple_setter) + setter(parsed) + + self.set_options.append(option_name) + + @classmethod + def _parse_list(cls, value, separator=','): + """Represents value as a list. + + Value is split either by separator (defaults to comma) or by lines. + + :param value: + :param separator: List items separator character. + :rtype: list + """ + if isinstance(value, list): # _get_parser_compound case + return value + + if '\n' in value: + value = value.splitlines() + else: + value = value.split(separator) + + return [chunk.strip() for chunk in value if chunk.strip()] + + @classmethod + def _parse_dict(cls, value): + """Represents value as a dict. + + :param value: + :rtype: dict + """ + separator = '=' + result = {} + for line in cls._parse_list(value): + key, sep, val = line.partition(separator) + if sep != separator: + raise OptionError(f"Unable to parse option value to dict: {value}") + result[key.strip()] = val.strip() + + return result + + @classmethod + def _parse_bool(cls, value): + """Represents value as boolean. + + :param value: + :rtype: bool + """ + value = value.lower() + return value in ('1', 'true', 'yes') + + @classmethod + def _exclude_files_parser(cls, key): + """Returns a parser function to make sure field inputs + are not files. + + Parses a value after getting the key so error messages are + more informative. + + :param key: + :rtype: callable + """ + + def parser(value): + exclude_directive = 'file:' + if value.startswith(exclude_directive): + raise ValueError( + f'Only strings are accepted for the {key} field, ' + 'files are not accepted' + ) + return _static.Str(value) + + return parser + + def _parse_file(self, value, root_dir: StrPath | None): + """Represents value as a string, allowing including text + from nearest files using `file:` directive. + + Directive is sandboxed and won't reach anything outside + directory with setup.py. + + Examples: + file: README.rst, CHANGELOG.md, src/file.txt + + :param str value: + :rtype: str + """ + include_directive = 'file:' + + if not isinstance(value, str): + return value + + if not value.startswith(include_directive): + return _static.Str(value) + + spec = value[len(include_directive) :] + filepaths = [path.strip() for path in spec.split(',')] + self._referenced_files.update(filepaths) + # XXX: Is marking as static contents coming from files too optimistic? + return _static.Str(expand.read_files(filepaths, root_dir)) + + def _parse_attr(self, value, package_dir, root_dir: StrPath): + """Represents value as a module attribute. + + Examples: + attr: package.attr + attr: package.module.attr + + :param str value: + :rtype: str + """ + attr_directive = 'attr:' + if not value.startswith(attr_directive): + return _static.Str(value) + + attr_desc = value.replace(attr_directive, '') + + # Make sure package_dir is populated correctly, so `attr:` directives can work + package_dir.update(self.ensure_discovered.package_dir) + return expand.read_attr(attr_desc, package_dir, root_dir) + + @classmethod + def _get_parser_compound(cls, *parse_methods): + """Returns parser function to represents value as a list. + + Parses a value applying given methods one after another. + + :param parse_methods: + :rtype: callable + """ + + def parse(value): + parsed = value + + for method in parse_methods: + parsed = method(parsed) + + return parsed + + return parse + + @classmethod + def _parse_section_to_dict_with_key(cls, section_options, values_parser): + """Parses section options into a dictionary. + + Applies a given parser to each option in a section. + + :param dict section_options: + :param callable values_parser: function with 2 args corresponding to key, value + :rtype: dict + """ + value = {} + for key, (_, val) in section_options.items(): + value[key] = values_parser(key, val) + return value + + @classmethod + def _parse_section_to_dict(cls, section_options, values_parser=None): + """Parses section options into a dictionary. + + Optionally applies a given parser to each value. + + :param dict section_options: + :param callable values_parser: function with 1 arg corresponding to option value + :rtype: dict + """ + parser = (lambda _, v: values_parser(v)) if values_parser else (lambda _, v: v) + return cls._parse_section_to_dict_with_key(section_options, parser) + + def parse_section(self, section_options) -> None: + """Parses configuration file section. + + :param dict section_options: + """ + for name, (_, value) in section_options.items(): + with contextlib.suppress(KeyError): + # Keep silent for a new option may appear anytime. + self[name] = value + + def parse(self) -> None: + """Parses configuration file items from one + or more related sections. + + """ + for section_name, section_options in self.sections.items(): + method_postfix = '' + if section_name: # [section.option] variant + method_postfix = f"_{section_name}" + + section_parser_method: Callable | None = getattr( + self, + # Dots in section names are translated into dunderscores. + f'parse_section{method_postfix}'.replace('.', '__'), + None, + ) + + if section_parser_method is None: + raise OptionError( + "Unsupported distribution option section: " + f"[{self.section_prefix}.{section_name}]" + ) + + section_parser_method(section_options) + + def _deprecated_config_handler(self, func, msg, **kw): + """this function will wrap around parameters that are deprecated + + :param msg: deprecation message + :param func: function to be wrapped around + """ + + @wraps(func) + def config_handler(*args, **kwargs): + kw.setdefault("stacklevel", 2) + _DeprecatedConfig.emit("Deprecated config in `setup.cfg`", msg, **kw) + return func(*args, **kwargs) + + return config_handler + + +class ConfigMetadataHandler(ConfigHandler["DistributionMetadata"]): + section_prefix = 'metadata' + + aliases = { + 'home_page': 'url', + 'summary': 'description', + 'classifier': 'classifiers', + 'platform': 'platforms', + } + + strict_mode = False + """We need to keep it loose, to be partially compatible with + `pbr` and `d2to1` packages which also uses `metadata` section. + + """ + + def __init__( + self, + target_obj: DistributionMetadata, + options: AllCommandOptions, + ignore_option_errors: bool, + ensure_discovered: expand.EnsurePackagesDiscovered, + package_dir: dict | None = None, + root_dir: StrPath | None = os.curdir, + ) -> None: + super().__init__(target_obj, options, ignore_option_errors, ensure_discovered) + self.package_dir = package_dir + self.root_dir = root_dir + + @property + def parsers(self): + """Metadata item name to parser function mapping.""" + parse_list_static = self._get_parser_compound(self._parse_list, _static.List) + parse_dict_static = self._get_parser_compound(self._parse_dict, _static.Dict) + parse_file = partial(self._parse_file, root_dir=self.root_dir) + exclude_files_parser = self._exclude_files_parser + + return { + 'author': _static.Str, + 'author_email': _static.Str, + 'maintainer': _static.Str, + 'maintainer_email': _static.Str, + 'platforms': parse_list_static, + 'keywords': parse_list_static, + 'provides': parse_list_static, + 'obsoletes': parse_list_static, + 'classifiers': self._get_parser_compound(parse_file, parse_list_static), + 'license': exclude_files_parser('license'), + 'license_files': parse_list_static, + 'description': parse_file, + 'long_description': parse_file, + 'long_description_content_type': _static.Str, + 'version': self._parse_version, # Cannot be marked as dynamic + 'url': _static.Str, + 'project_urls': parse_dict_static, + } + + def _parse_version(self, value): + """Parses `version` option value. + + :param value: + :rtype: str + + """ + version = self._parse_file(value, self.root_dir) + + if version != value: + version = version.strip() + # Be strict about versions loaded from file because it's easy to + # accidentally include newlines and other unintended content + try: + Version(version) + except InvalidVersion as e: + raise OptionError( + f'Version loaded from {value} does not ' + f'comply with PEP 440: {version}' + ) from e + + return version + + return expand.version(self._parse_attr(value, self.package_dir, self.root_dir)) + + +class ConfigOptionsHandler(ConfigHandler["Distribution"]): + section_prefix = 'options' + + def __init__( + self, + target_obj: Distribution, + options: AllCommandOptions, + ignore_option_errors: bool, + ensure_discovered: expand.EnsurePackagesDiscovered, + ) -> None: + super().__init__(target_obj, options, ignore_option_errors, ensure_discovered) + self.root_dir = target_obj.src_root + self.package_dir: dict[str, str] = {} # To be filled by `find_packages` + + @classmethod + def _parse_list_semicolon(cls, value): + return cls._parse_list(value, separator=';') + + def _parse_file_in_root(self, value): + return self._parse_file(value, root_dir=self.root_dir) + + def _parse_requirements_list(self, label: str, value: str): + # Parse a requirements list, either by reading in a `file:`, or a list. + parsed = self._parse_list_semicolon(self._parse_file_in_root(value)) + _warn_accidental_env_marker_misconfig(label, value, parsed) + # Filter it to only include lines that are not comments. `parse_list` + # will have stripped each line and filtered out empties. + return _static.List(line for line in parsed if not line.startswith("#")) + # ^-- Use `_static.List` to mark a non-`Dynamic` Core Metadata + + @property + def parsers(self): + """Metadata item name to parser function mapping.""" + parse_list = self._parse_list + parse_bool = self._parse_bool + parse_cmdclass = self._parse_cmdclass + + return { + 'zip_safe': parse_bool, + 'include_package_data': parse_bool, + 'package_dir': self._parse_dict, + 'scripts': parse_list, + 'eager_resources': parse_list, + 'dependency_links': parse_list, + 'namespace_packages': self._deprecated_config_handler( + parse_list, + "The namespace_packages parameter is deprecated, " + "consider using implicit namespaces instead (PEP 420).", + # TODO: define due date, see setuptools.dist:check_nsp. + ), + 'install_requires': partial( # Core Metadata + self._parse_requirements_list, "install_requires" + ), + 'setup_requires': self._parse_list_semicolon, + 'packages': self._parse_packages, + 'entry_points': self._parse_file_in_root, + 'py_modules': parse_list, + 'python_requires': _static.SpecifierSet, # Core Metadata + 'cmdclass': parse_cmdclass, + } + + def _parse_cmdclass(self, value): + package_dir = self.ensure_discovered.package_dir + return expand.cmdclass(self._parse_dict(value), package_dir, self.root_dir) + + def _parse_packages(self, value): + """Parses `packages` option value. + + :param value: + :rtype: list + """ + find_directives = ['find:', 'find_namespace:'] + trimmed_value = value.strip() + + if trimmed_value not in find_directives: + return self._parse_list(value) + + # Read function arguments from a dedicated section. + find_kwargs = self.parse_section_packages__find( + self.sections.get('packages.find', {}) + ) + + find_kwargs.update( + namespaces=(trimmed_value == find_directives[1]), + root_dir=self.root_dir, + fill_package_dir=self.package_dir, + ) + + return expand.find_packages(**find_kwargs) + + def parse_section_packages__find(self, section_options): + """Parses `packages.find` configuration file section. + + To be used in conjunction with _parse_packages(). + + :param dict section_options: + """ + section_data = self._parse_section_to_dict(section_options, self._parse_list) + + valid_keys = ['where', 'include', 'exclude'] + find_kwargs = {k: v for k, v in section_data.items() if k in valid_keys and v} + + where = find_kwargs.get('where') + if where is not None: + find_kwargs['where'] = where[0] # cast list to single val + + return find_kwargs + + def parse_section_entry_points(self, section_options) -> None: + """Parses `entry_points` configuration file section. + + :param dict section_options: + """ + parsed = self._parse_section_to_dict(section_options, self._parse_list) + self['entry_points'] = parsed + + def _parse_package_data(self, section_options): + package_data = self._parse_section_to_dict(section_options, self._parse_list) + return expand.canonic_package_data(package_data) + + def parse_section_package_data(self, section_options) -> None: + """Parses `package_data` configuration file section. + + :param dict section_options: + """ + self['package_data'] = self._parse_package_data(section_options) + + def parse_section_exclude_package_data(self, section_options) -> None: + """Parses `exclude_package_data` configuration file section. + + :param dict section_options: + """ + self['exclude_package_data'] = self._parse_package_data(section_options) + + def parse_section_extras_require(self, section_options) -> None: # Core Metadata + """Parses `extras_require` configuration file section. + + :param dict section_options: + """ + parsed = self._parse_section_to_dict_with_key( + section_options, + lambda k, v: self._parse_requirements_list(f"extras_require[{k}]", v), + ) + + self['extras_require'] = _static.Dict(parsed) + # ^-- Use `_static.Dict` to mark a non-`Dynamic` Core Metadata + + def parse_section_data_files(self, section_options) -> None: + """Parses `data_files` configuration file section. + + :param dict section_options: + """ + parsed = self._parse_section_to_dict(section_options, self._parse_list) + self['data_files'] = expand.canonic_data_files(parsed, self.root_dir) + + +class _AmbiguousMarker(SetuptoolsDeprecationWarning): + _SUMMARY = "Ambiguous requirement marker." + _DETAILS = """ + One of the parsed requirements in `{field}` looks like a valid environment marker: + + {req!r} + + Please make sure that the configuration file is correct. + You can use dangling lines to avoid this problem. + """ + _SEE_DOCS = "userguide/declarative_config.html#opt-2" + # TODO: should we include due_date here? Initially introduced in 6 Aug 2022. + # Does this make sense with latest version of packaging? + + @classmethod + def message(cls, **kw): + docs = f"https://setuptools.pypa.io/en/latest/{cls._SEE_DOCS}" + return cls._format(cls._SUMMARY, cls._DETAILS, see_url=docs, format_args=kw) + + +class _DeprecatedConfig(SetuptoolsDeprecationWarning): + _SEE_DOCS = "userguide/declarative_config.html" diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/setuptools/config/setuptools.schema.json b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/setuptools/config/setuptools.schema.json new file mode 100644 index 0000000000000000000000000000000000000000..ec887b35736e4ede6265e95e3a9e2520b43cc40f --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/setuptools/config/setuptools.schema.json @@ -0,0 +1,433 @@ +{ + "$schema": "http://json-schema.org/draft-07/schema#", + + "$id": "https://setuptools.pypa.io/en/latest/userguide/pyproject_config.html", + "title": "``tool.setuptools`` table", + "$$description": [ + "``setuptools``-specific configurations that can be set by users that require", + "customization.", + "These configurations are completely optional and probably can be skipped when", + "creating simple packages. They are equivalent to some of the `Keywords", + "`_", + "used by the ``setup.py`` file, and can be set via the ``tool.setuptools`` table.", + "It considers only ``setuptools`` `parameters", + "`_", + "that are not covered by :pep:`621`; and intentionally excludes ``dependency_links``", + "and ``setup_requires`` (incompatible with modern workflows/standards)." + ], + + "type": "object", + "additionalProperties": false, + "properties": { + "platforms": { + "type": "array", + "items": {"type": "string"} + }, + "provides": { + "$$description": [ + "Package and virtual package names contained within this package", + "**(not supported by pip)**" + ], + "type": "array", + "items": {"type": "string", "format": "pep508-identifier"} + }, + "obsoletes": { + "$$description": [ + "Packages which this package renders obsolete", + "**(not supported by pip)**" + ], + "type": "array", + "items": {"type": "string", "format": "pep508-identifier"} + }, + "zip-safe": { + "$$description": [ + "Whether the project can be safely installed and run from a zip file.", + "**OBSOLETE**: only relevant for ``pkg_resources``, ``easy_install`` and", + "``setup.py install`` in the context of ``eggs`` (**DEPRECATED**)." + ], + "type": "boolean" + }, + "script-files": { + "$$description": [ + "Legacy way of defining scripts (entry-points are preferred).", + "Equivalent to the ``script`` keyword in ``setup.py``", + "(it was renamed to avoid confusion with entry-point based ``project.scripts``", + "defined in :pep:`621`).", + "**DISCOURAGED**: generic script wrappers are tricky and may not work properly.", + "Whenever possible, please use ``project.scripts`` instead." + ], + "type": "array", + "items": {"type": "string"}, + "$comment": "TODO: is this field deprecated/should be removed?" + }, + "eager-resources": { + "$$description": [ + "Resources that should be extracted together, if any of them is needed,", + "or if any C extensions included in the project are imported.", + "**OBSOLETE**: only relevant for ``pkg_resources``, ``easy_install`` and", + "``setup.py install`` in the context of ``eggs`` (**DEPRECATED**)." + ], + "type": "array", + "items": {"type": "string"} + }, + "packages": { + "$$description": [ + "Packages that should be included in the distribution.", + "It can be given either as a list of package identifiers", + "or as a ``dict``-like structure with a single key ``find``", + "which corresponds to a dynamic call to", + "``setuptools.config.expand.find_packages`` function.", + "The ``find`` key is associated with a nested ``dict``-like structure that can", + "contain ``where``, ``include``, ``exclude`` and ``namespaces`` keys,", + "mimicking the keyword arguments of the associated function." + ], + "oneOf": [ + { + "title": "Array of Python package identifiers", + "type": "array", + "items": {"$ref": "#/definitions/package-name"} + }, + {"$ref": "#/definitions/find-directive"} + ] + }, + "package-dir": { + "$$description": [ + ":class:`dict`-like structure mapping from package names to directories where their", + "code can be found.", + "The empty string (as key) means that all packages are contained inside", + "the given directory will be included in the distribution." + ], + "type": "object", + "additionalProperties": false, + "propertyNames": { + "anyOf": [{"const": ""}, {"$ref": "#/definitions/package-name"}] + }, + "patternProperties": { + "^.*$": {"type": "string" } + } + }, + "package-data": { + "$$description": [ + "Mapping from package names to lists of glob patterns.", + "Usually this option is not needed when using ``include-package-data = true``", + "For more information on how to include data files, check ``setuptools`` `docs", + "`_." + ], + "type": "object", + "additionalProperties": false, + "propertyNames": { + "anyOf": [{"type": "string", "format": "python-module-name"}, {"const": "*"}] + }, + "patternProperties": { + "^.*$": {"type": "array", "items": {"type": "string"}} + } + }, + "include-package-data": { + "$$description": [ + "Automatically include any data files inside the package directories", + "that are specified by ``MANIFEST.in``", + "For more information on how to include data files, check ``setuptools`` `docs", + "`_." + ], + "type": "boolean" + }, + "exclude-package-data": { + "$$description": [ + "Mapping from package names to lists of glob patterns that should be excluded", + "For more information on how to include data files, check ``setuptools`` `docs", + "`_." + ], + "type": "object", + "additionalProperties": false, + "propertyNames": { + "anyOf": [{"type": "string", "format": "python-module-name"}, {"const": "*"}] + }, + "patternProperties": { + "^.*$": {"type": "array", "items": {"type": "string"}} + } + }, + "namespace-packages": { + "type": "array", + "items": {"type": "string", "format": "python-module-name-relaxed"}, + "$comment": "https://setuptools.pypa.io/en/latest/userguide/package_discovery.html", + "description": "**DEPRECATED**: use implicit namespaces instead (:pep:`420`)." + }, + "py-modules": { + "description": "Modules that setuptools will manipulate", + "type": "array", + "items": {"type": "string", "format": "python-module-name-relaxed"}, + "$comment": "TODO: clarify the relationship with ``packages``" + }, + "ext-modules": { + "description": "Extension modules to be compiled by setuptools", + "type": "array", + "items": {"$ref": "#/definitions/ext-module"} + }, + "data-files": { + "$$description": [ + "``dict``-like structure where each key represents a directory and", + "the value is a list of glob patterns that should be installed in them.", + "**DISCOURAGED**: please notice this might not work as expected with wheels.", + "Whenever possible, consider using data files inside the package directories", + "(or create a new namespace package that only contains data files).", + "See `data files support", + "`_." + ], + "type": "object", + "patternProperties": { + "^.*$": {"type": "array", "items": {"type": "string"}} + } + }, + "cmdclass": { + "$$description": [ + "Mapping of distutils-style command names to ``setuptools.Command`` subclasses", + "which in turn should be represented by strings with a qualified class name", + "(i.e., \"dotted\" form with module), e.g.::\n\n", + " cmdclass = {mycmd = \"pkg.subpkg.module.CommandClass\"}\n\n", + "The command class should be a directly defined at the top-level of the", + "containing module (no class nesting)." + ], + "type": "object", + "patternProperties": { + "^.*$": {"type": "string", "format": "python-qualified-identifier"} + } + }, + "license-files": { + "type": "array", + "items": {"type": "string"}, + "$$description": [ + "**PROVISIONAL**: list of glob patterns for all license files being distributed.", + "(likely to become standard with :pep:`639`).", + "By default: ``['LICEN[CS]E*', 'COPYING*', 'NOTICE*', 'AUTHORS*']``" + ], + "$comment": "TODO: revise if PEP 639 is accepted. Probably ``project.license-files``?" + }, + "dynamic": { + "type": "object", + "description": "Instructions for loading :pep:`621`-related metadata dynamically", + "additionalProperties": false, + "properties": { + "version": { + "$$description": [ + "A version dynamically loaded via either the ``attr:`` or ``file:``", + "directives. Please make sure the given file or attribute respects :pep:`440`.", + "Also ensure to set ``project.dynamic`` accordingly." + ], + "oneOf": [ + {"$ref": "#/definitions/attr-directive"}, + {"$ref": "#/definitions/file-directive"} + ] + }, + "classifiers": {"$ref": "#/definitions/file-directive"}, + "description": {"$ref": "#/definitions/file-directive"}, + "entry-points": {"$ref": "#/definitions/file-directive"}, + "dependencies": {"$ref": "#/definitions/file-directive-for-dependencies"}, + "optional-dependencies": { + "type": "object", + "propertyNames": {"type": "string", "format": "pep508-identifier"}, + "additionalProperties": false, + "patternProperties": { + ".+": {"$ref": "#/definitions/file-directive-for-dependencies"} + } + }, + "readme": { + "type": "object", + "anyOf": [ + {"$ref": "#/definitions/file-directive"}, + { + "type": "object", + "properties": { + "content-type": {"type": "string"}, + "file": { "$ref": "#/definitions/file-directive/properties/file" } + }, + "additionalProperties": false} + ], + "required": ["file"] + } + } + } + }, + + "definitions": { + "package-name": { + "$id": "#/definitions/package-name", + "title": "Valid package name", + "description": "Valid package name (importable or :pep:`561`).", + "type": "string", + "anyOf": [ + {"type": "string", "format": "python-module-name-relaxed"}, + {"type": "string", "format": "pep561-stub-name"} + ] + }, + "ext-module": { + "$id": "#/definitions/ext-module", + "title": "Extension module", + "description": "Parameters to construct a :class:`setuptools.Extension` object", + "type": "object", + "required": ["name", "sources"], + "additionalProperties": false, + "properties": { + "name": { + "type": "string", + "format": "python-module-name-relaxed" + }, + "sources": { + "type": "array", + "items": {"type": "string"} + }, + "include-dirs":{ + "type": "array", + "items": {"type": "string"} + }, + "define-macros": { + "type": "array", + "items": { + "type": "array", + "items": [ + {"description": "macro name", "type": "string"}, + {"description": "macro value", "oneOf": [{"type": "string"}, {"type": "null"}]} + ], + "additionalItems": false + } + }, + "undef-macros": { + "type": "array", + "items": {"type": "string"} + }, + "library-dirs": { + "type": "array", + "items": {"type": "string"} + }, + "libraries": { + "type": "array", + "items": {"type": "string"} + }, + "runtime-library-dirs": { + "type": "array", + "items": {"type": "string"} + }, + "extra-objects": { + "type": "array", + "items": {"type": "string"} + }, + "extra-compile-args": { + "type": "array", + "items": {"type": "string"} + }, + "extra-link-args": { + "type": "array", + "items": {"type": "string"} + }, + "export-symbols": { + "type": "array", + "items": {"type": "string"} + }, + "swig-opts": { + "type": "array", + "items": {"type": "string"} + }, + "depends": { + "type": "array", + "items": {"type": "string"} + }, + "language": {"type": "string"}, + "optional": {"type": "boolean"}, + "py-limited-api": {"type": "boolean"} + } + }, + "file-directive": { + "$id": "#/definitions/file-directive", + "title": "'file:' directive", + "description": + "Value is read from a file (or list of files and then concatenated)", + "type": "object", + "additionalProperties": false, + "properties": { + "file": { + "oneOf": [ + {"type": "string"}, + {"type": "array", "items": {"type": "string"}} + ] + } + }, + "required": ["file"] + }, + "file-directive-for-dependencies": { + "title": "'file:' directive for dependencies", + "allOf": [ + { + "$$description": [ + "**BETA**: subset of the ``requirements.txt`` format", + "without ``pip`` flags and options", + "(one :pep:`508`-compliant string per line,", + "lines that are blank or start with ``#`` are excluded).", + "See `dynamic metadata", + "`_." + ] + }, + {"$ref": "#/definitions/file-directive"} + ] + }, + "attr-directive": { + "title": "'attr:' directive", + "$id": "#/definitions/attr-directive", + "$$description": [ + "Value is read from a module attribute. Supports callables and iterables;", + "unsupported types are cast via ``str()``" + ], + "type": "object", + "additionalProperties": false, + "properties": { + "attr": {"type": "string", "format": "python-qualified-identifier"} + }, + "required": ["attr"] + }, + "find-directive": { + "$id": "#/definitions/find-directive", + "title": "'find:' directive", + "type": "object", + "additionalProperties": false, + "properties": { + "find": { + "type": "object", + "$$description": [ + "Dynamic `package discovery", + "`_." + ], + "additionalProperties": false, + "properties": { + "where": { + "description": + "Directories to be searched for packages (Unix-style relative path)", + "type": "array", + "items": {"type": "string"} + }, + "exclude": { + "type": "array", + "$$description": [ + "Exclude packages that match the values listed in this field.", + "Can container shell-style wildcards (e.g. ``'pkg.*'``)" + ], + "items": {"type": "string"} + }, + "include": { + "type": "array", + "$$description": [ + "Restrict the found packages to just the ones listed in this field.", + "Can container shell-style wildcards (e.g. ``'pkg.*'``)" + ], + "items": {"type": "string"} + }, + "namespaces": { + "type": "boolean", + "$$description": [ + "When ``True``, directories without a ``__init__.py`` file will also", + "be scanned for :pep:`420`-style implicit namespaces" + ] + } + } + } + } + } + } +} diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/setuptools/tests/__init__.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/setuptools/tests/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..eb70bfb7115a2a94a8b942b31cafc3a550f0c005 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/setuptools/tests/__init__.py @@ -0,0 +1,13 @@ +import locale +import sys + +import pytest + +__all__ = ['fail_on_ascii'] + +if sys.version_info >= (3, 11): + locale_encoding = locale.getencoding() +else: + locale_encoding = locale.getpreferredencoding(False) +is_ascii = locale_encoding == 'ANSI_X3.4-1968' +fail_on_ascii = pytest.mark.xfail(is_ascii, reason="Test fails in this locale") diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/setuptools/tests/contexts.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/setuptools/tests/contexts.py new file mode 100644 index 0000000000000000000000000000000000000000..97cceea0e7060c6e3568956e1ba20893301da387 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/setuptools/tests/contexts.py @@ -0,0 +1,145 @@ +import contextlib +import io +import os +import shutil +import site +import sys +import tempfile + +from filelock import FileLock + + +@contextlib.contextmanager +def tempdir(cd=lambda dir: None, **kwargs): + temp_dir = tempfile.mkdtemp(**kwargs) + orig_dir = os.getcwd() + try: + cd(temp_dir) + yield temp_dir + finally: + cd(orig_dir) + shutil.rmtree(temp_dir) + + +@contextlib.contextmanager +def environment(**replacements): + """ + In a context, patch the environment with replacements. Pass None values + to clear the values. + """ + saved = dict((key, os.environ[key]) for key in replacements if key in os.environ) + + # remove values that are null + remove = (key for (key, value) in replacements.items() if value is None) + for key in list(remove): + os.environ.pop(key, None) + replacements.pop(key) + + os.environ.update(replacements) + + try: + yield saved + finally: + for key in replacements: + os.environ.pop(key, None) + os.environ.update(saved) + + +@contextlib.contextmanager +def quiet(): + """ + Redirect stdout/stderr to StringIO objects to prevent console output from + distutils commands. + """ + + old_stdout = sys.stdout + old_stderr = sys.stderr + new_stdout = sys.stdout = io.StringIO() + new_stderr = sys.stderr = io.StringIO() + try: + yield new_stdout, new_stderr + finally: + new_stdout.seek(0) + new_stderr.seek(0) + sys.stdout = old_stdout + sys.stderr = old_stderr + + +@contextlib.contextmanager +def save_user_site_setting(): + saved = site.ENABLE_USER_SITE + try: + yield saved + finally: + site.ENABLE_USER_SITE = saved + + +@contextlib.contextmanager +def save_pkg_resources_state(): + import pkg_resources + + pr_state = pkg_resources.__getstate__() + # also save sys.path + sys_path = sys.path[:] + try: + yield pr_state, sys_path + finally: + sys.path[:] = sys_path + pkg_resources.__setstate__(pr_state) + + +@contextlib.contextmanager +def suppress_exceptions(*excs): + try: + yield + except excs: + pass + + +def multiproc(request): + """ + Return True if running under xdist and multiple + workers are used. + """ + try: + worker_id = request.getfixturevalue('worker_id') + except Exception: + return False + return worker_id != 'master' + + +@contextlib.contextmanager +def session_locked_tmp_dir(request, tmp_path_factory, name): + """Uses a file lock to guarantee only one worker can access a temp dir""" + # get the temp directory shared by all workers + base = tmp_path_factory.getbasetemp() + shared_dir = base.parent if multiproc(request) else base + + locked_dir = shared_dir / name + with FileLock(locked_dir.with_suffix(".lock")): + # ^-- prevent multiple workers to access the directory at once + locked_dir.mkdir(exist_ok=True, parents=True) + yield locked_dir + + +@contextlib.contextmanager +def save_paths(): + """Make sure ``sys.path``, ``sys.meta_path`` and ``sys.path_hooks`` are preserved""" + prev = sys.path[:], sys.meta_path[:], sys.path_hooks[:] + + try: + yield + finally: + sys.path, sys.meta_path, sys.path_hooks = prev + + +@contextlib.contextmanager +def save_sys_modules(): + """Make sure initial ``sys.modules`` is preserved""" + prev_modules = sys.modules + + try: + sys.modules = sys.modules.copy() + yield + finally: + sys.modules = prev_modules diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/setuptools/tests/environment.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/setuptools/tests/environment.py new file mode 100644 index 0000000000000000000000000000000000000000..ed5499ef7d73762d033a4877bfe586d6c0b82235 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/setuptools/tests/environment.py @@ -0,0 +1,95 @@ +import os +import subprocess +import sys +import unicodedata +from subprocess import PIPE as _PIPE, Popen as _Popen + +import jaraco.envs + + +class VirtualEnv(jaraco.envs.VirtualEnv): + name = '.env' + # Some version of PyPy will import distutils on startup, implicitly + # importing setuptools, and thus leading to BackendInvalid errors + # when upgrading Setuptools. Bypass this behavior by avoiding the + # early availability and need to upgrade. + create_opts = ['--no-setuptools'] + + def run(self, cmd, *args, **kwargs): + cmd = [self.exe(cmd[0])] + cmd[1:] + kwargs = {"cwd": self.root, "encoding": "utf-8", **kwargs} # Allow overriding + # In some environments (eg. downstream distro packaging), where: + # - tox isn't used to run tests and + # - PYTHONPATH is set to point to a specific setuptools codebase and + # - no custom env is explicitly set by a test + # PYTHONPATH will leak into the spawned processes. + # In that case tests look for module in the wrong place (on PYTHONPATH). + # Unless the test sets its own special env, pass a copy of the existing + # environment with removed PYTHONPATH to the subprocesses. + if "env" not in kwargs: + env = dict(os.environ) + if "PYTHONPATH" in env: + del env["PYTHONPATH"] + kwargs["env"] = env + return subprocess.check_output(cmd, *args, **kwargs) + + +def _which_dirs(cmd): + result = set() + for path in os.environ.get('PATH', '').split(os.pathsep): + filename = os.path.join(path, cmd) + if os.access(filename, os.X_OK): + result.add(path) + return result + + +def run_setup_py(cmd, pypath=None, path=None, data_stream=0, env=None): + """ + Execution command for tests, separate from those used by the + code directly to prevent accidental behavior issues + """ + if env is None: + env = dict() + for envname in os.environ: + env[envname] = os.environ[envname] + + # override the python path if needed + if pypath is not None: + env["PYTHONPATH"] = pypath + + # override the execution path if needed + if path is not None: + env["PATH"] = path + if not env.get("PATH", ""): + env["PATH"] = _which_dirs("tar").union(_which_dirs("gzip")) + env["PATH"] = os.pathsep.join(env["PATH"]) + + cmd = [sys.executable, "setup.py"] + list(cmd) + + # https://bugs.python.org/issue8557 + shell = sys.platform == 'win32' + + try: + proc = _Popen( + cmd, + stdout=_PIPE, + stderr=_PIPE, + shell=shell, + env=env, + encoding="utf-8", + ) + + if isinstance(data_stream, tuple): + data_stream = slice(*data_stream) + data = proc.communicate()[data_stream] + except OSError: + return 1, '' + + # decode the console string if needed + if hasattr(data, "decode"): + # use the default encoding + data = data.decode() + data = unicodedata.normalize('NFC', data) + + # communicate calls wait() + return proc.returncode, data diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/setuptools/tests/fixtures.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/setuptools/tests/fixtures.py new file mode 100644 index 0000000000000000000000000000000000000000..a5472984b5690572285a84575f6c2d598f06dd11 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/setuptools/tests/fixtures.py @@ -0,0 +1,157 @@ +import contextlib +import os +import subprocess +import sys +from pathlib import Path + +import path +import pytest + +from . import contexts, environment + + +@pytest.fixture +def user_override(monkeypatch): + """ + Override site.USER_BASE and site.USER_SITE with temporary directories in + a context. + """ + with contexts.tempdir() as user_base: + monkeypatch.setattr('site.USER_BASE', user_base) + with contexts.tempdir() as user_site: + monkeypatch.setattr('site.USER_SITE', user_site) + with contexts.save_user_site_setting(): + yield + + +@pytest.fixture +def tmpdir_cwd(tmpdir): + with tmpdir.as_cwd() as orig: + yield orig + + +@pytest.fixture(autouse=True, scope="session") +def workaround_xdist_376(request): + """ + Workaround pytest-dev/pytest-xdist#376 + + ``pytest-xdist`` tends to inject '' into ``sys.path``, + which may break certain isolation expectations. + Remove the entry so the import + machinery behaves the same irrespective of xdist. + """ + if not request.config.pluginmanager.has_plugin('xdist'): + return + + with contextlib.suppress(ValueError): + sys.path.remove('') + + +@pytest.fixture +def sample_project(tmp_path): + """ + Clone the 'sampleproject' and return a path to it. + """ + cmd = ['git', 'clone', 'https://github.com/pypa/sampleproject'] + try: + subprocess.check_call(cmd, cwd=str(tmp_path)) + except Exception: + pytest.skip("Unable to clone sampleproject") + return tmp_path / 'sampleproject' + + +# sdist and wheel artifacts should be stable across a round of tests +# so we can build them once per session and use the files as "readonly" + +# In the case of setuptools, building the wheel without sdist may cause +# it to contain the `build` directory, and therefore create situations with +# `setuptools/build/lib/build/lib/...`. To avoid that, build both artifacts at once. + + +def _build_distributions(tmp_path_factory, request): + with contexts.session_locked_tmp_dir( + request, tmp_path_factory, "dist_build" + ) as tmp: # pragma: no cover + sdist = next(tmp.glob("*.tar.gz"), None) + wheel = next(tmp.glob("*.whl"), None) + if sdist and wheel: + return (sdist, wheel) + + # Sanity check: should not create recursive setuptools/build/lib/build/lib/... + assert not Path(request.config.rootdir, "build/lib/build").exists() + + subprocess.check_output([ + sys.executable, + "-m", + "build", + "--outdir", + str(tmp), + str(request.config.rootdir), + ]) + + # Sanity check: should not create recursive setuptools/build/lib/build/lib/... + assert not Path(request.config.rootdir, "build/lib/build").exists() + + return next(tmp.glob("*.tar.gz")), next(tmp.glob("*.whl")) + + +@pytest.fixture(scope="session") +def setuptools_sdist(tmp_path_factory, request): + prebuilt = os.getenv("PRE_BUILT_SETUPTOOLS_SDIST") + if prebuilt and os.path.exists(prebuilt): # pragma: no cover + return Path(prebuilt).resolve() + + sdist, _ = _build_distributions(tmp_path_factory, request) + return sdist + + +@pytest.fixture(scope="session") +def setuptools_wheel(tmp_path_factory, request): + prebuilt = os.getenv("PRE_BUILT_SETUPTOOLS_WHEEL") + if prebuilt and os.path.exists(prebuilt): # pragma: no cover + return Path(prebuilt).resolve() + + _, wheel = _build_distributions(tmp_path_factory, request) + return wheel + + +@pytest.fixture +def venv(tmp_path, setuptools_wheel): + """Virtual env with the version of setuptools under test installed""" + env = environment.VirtualEnv() + env.root = path.Path(tmp_path / 'venv') + env.create_opts = ['--no-setuptools', '--wheel=bundle'] + # TODO: Use `--no-wheel` when setuptools implements its own bdist_wheel + env.req = str(setuptools_wheel) + # In some environments (eg. downstream distro packaging), + # where tox isn't used to run tests and PYTHONPATH is set to point to + # a specific setuptools codebase, PYTHONPATH will leak into the spawned + # processes. + # env.create() should install the just created setuptools + # wheel, but it doesn't if it finds another existing matching setuptools + # installation present on PYTHONPATH: + # `setuptools is already installed with the same version as the provided + # wheel. Use --force-reinstall to force an installation of the wheel.` + # This prevents leaking PYTHONPATH to the created environment. + with contexts.environment(PYTHONPATH=None): + return env.create() + + +@pytest.fixture +def venv_without_setuptools(tmp_path): + """Virtual env without any version of setuptools installed""" + env = environment.VirtualEnv() + env.root = path.Path(tmp_path / 'venv_without_setuptools') + env.create_opts = ['--no-setuptools', '--no-wheel'] + env.ensure_env() + return env + + +@pytest.fixture +def bare_venv(tmp_path): + """Virtual env without any common packages installed""" + env = environment.VirtualEnv() + env.root = path.Path(tmp_path / 'bare_venv') + env.create_opts = ['--no-setuptools', '--no-pip', '--no-wheel', '--no-seed'] + env.ensure_env() + return env diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/setuptools/tests/mod_with_constant.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/setuptools/tests/mod_with_constant.py new file mode 100644 index 0000000000000000000000000000000000000000..ef755dd1c7a8d1f116fe51f1b43315057f03379d --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/setuptools/tests/mod_with_constant.py @@ -0,0 +1 @@ +value = 'three, sir!' diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/setuptools/tests/namespaces.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/setuptools/tests/namespaces.py new file mode 100644 index 0000000000000000000000000000000000000000..248db98f97951aeeee0222131417e73074cc72d2 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/setuptools/tests/namespaces.py @@ -0,0 +1,90 @@ +import ast +import json +import textwrap +from pathlib import Path + + +def iter_namespace_pkgs(namespace): + parts = namespace.split(".") + for i in range(len(parts)): + yield ".".join(parts[: i + 1]) + + +def build_namespace_package(tmpdir, name, version="1.0", impl="pkg_resources"): + src_dir = tmpdir / name + src_dir.mkdir() + setup_py = src_dir / 'setup.py' + namespace, _, rest = name.rpartition('.') + namespaces = list(iter_namespace_pkgs(namespace)) + setup_args = { + "name": name, + "version": version, + "packages": namespaces, + } + + if impl == "pkg_resources": + tmpl = '__import__("pkg_resources").declare_namespace(__name__)' + setup_args["namespace_packages"] = namespaces + elif impl == "pkgutil": + tmpl = '__path__ = __import__("pkgutil").extend_path(__path__, __name__)' + else: + raise ValueError(f"Cannot recognise {impl=} when creating namespaces") + + args = json.dumps(setup_args, indent=4) + assert ast.literal_eval(args) # ensure it is valid Python + + script = textwrap.dedent( + """\ + import setuptools + args = {args} + setuptools.setup(**args) + """ + ).format(args=args) + setup_py.write_text(script, encoding='utf-8') + + ns_pkg_dir = Path(src_dir, namespace.replace(".", "/")) + ns_pkg_dir.mkdir(parents=True) + + for ns in namespaces: + pkg_init = src_dir / ns.replace(".", "/") / '__init__.py' + pkg_init.write_text(tmpl, encoding='utf-8') + + pkg_mod = ns_pkg_dir / (rest + '.py') + some_functionality = 'name = {rest!r}'.format(**locals()) + pkg_mod.write_text(some_functionality, encoding='utf-8') + return src_dir + + +def build_pep420_namespace_package(tmpdir, name): + src_dir = tmpdir / name + src_dir.mkdir() + pyproject = src_dir / "pyproject.toml" + namespace, _, rest = name.rpartition(".") + script = f"""\ + [build-system] + requires = ["setuptools"] + build-backend = "setuptools.build_meta" + + [project] + name = "{name}" + version = "3.14159" + """ + pyproject.write_text(textwrap.dedent(script), encoding='utf-8') + ns_pkg_dir = Path(src_dir, namespace.replace(".", "/")) + ns_pkg_dir.mkdir(parents=True) + pkg_mod = ns_pkg_dir / (rest + ".py") + some_functionality = f"name = {rest!r}" + pkg_mod.write_text(some_functionality, encoding='utf-8') + return src_dir + + +def make_site_dir(target): + """ + Add a sitecustomize.py module in target to cause + target to be added to site dirs such that .pth files + are processed there. + """ + sc = target / 'sitecustomize.py' + target_str = str(target) + tmpl = '__import__("site").addsitedir({target_str!r})' + sc.write_text(tmpl.format(**locals()), encoding='utf-8') diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/setuptools/tests/script-with-bom.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/setuptools/tests/script-with-bom.py new file mode 100644 index 0000000000000000000000000000000000000000..c074d263c45bcaebe32fdba328d975c73d1ad5ca --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/setuptools/tests/script-with-bom.py @@ -0,0 +1 @@ +result = 'passed' diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/setuptools/tests/server.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/setuptools/tests/server.py new file mode 100644 index 0000000000000000000000000000000000000000..623a49a550f161f117d3af2173f91a2af260181d --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/setuptools/tests/server.py @@ -0,0 +1,86 @@ +"""Basic http server for tests to simulate PyPI or custom indexes""" + +import http.server +import os +import threading +import time +import urllib.parse +import urllib.request + + +class IndexServer(http.server.HTTPServer): + """Basic single-threaded http server simulating a package index + + You can use this server in unittest like this:: + s = IndexServer() + s.start() + index_url = s.base_url() + 'mytestindex' + # do some test requests to the index + # The index files should be located in setuptools/tests/indexes + s.stop() + """ + + def __init__( + self, + server_address=('', 0), + RequestHandlerClass=http.server.SimpleHTTPRequestHandler, + ): + http.server.HTTPServer.__init__(self, server_address, RequestHandlerClass) + self._run = True + + def start(self): + self.thread = threading.Thread(target=self.serve_forever) + self.thread.start() + + def stop(self): + "Stop the server" + + # Let the server finish the last request and wait for a new one. + time.sleep(0.1) + + self.shutdown() + self.thread.join() + self.socket.close() + + def base_url(self): + port = self.server_port + return f'http://127.0.0.1:{port}/setuptools/tests/indexes/' + + +class RequestRecorder(http.server.BaseHTTPRequestHandler): + def do_GET(self): + requests = vars(self.server).setdefault('requests', []) + requests.append(self) + self.send_response(200, 'OK') + + +class MockServer(http.server.HTTPServer, threading.Thread): + """ + A simple HTTP Server that records the requests made to it. + """ + + def __init__(self, server_address=('', 0), RequestHandlerClass=RequestRecorder): + http.server.HTTPServer.__init__(self, server_address, RequestHandlerClass) + threading.Thread.__init__(self) + self.daemon = True + self.requests = [] + + def run(self): + self.serve_forever() + + @property + def netloc(self): + return f'localhost:{self.server_port}' + + @property + def url(self): + return f'http://{self.netloc}/' + + +def path_to_url(path, authority=None): + """Convert a path to a file: URL.""" + path = os.path.normpath(os.path.abspath(path)) + base = 'file:' + if authority is not None: + base += '//' + authority + return urllib.parse.urljoin(base, urllib.request.pathname2url(path)) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/setuptools/tests/test_archive_util.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/setuptools/tests/test_archive_util.py new file mode 100644 index 0000000000000000000000000000000000000000..e3efc62889994fa68bc9170e8a0e403f48a204e1 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/setuptools/tests/test_archive_util.py @@ -0,0 +1,36 @@ +import io +import tarfile + +import pytest + +from setuptools import archive_util + + +@pytest.fixture +def tarfile_with_unicode(tmpdir): + """ + Create a tarfile containing only a file whose name is + a zero byte file called testimäge.png. + """ + tarobj = io.BytesIO() + + with tarfile.open(fileobj=tarobj, mode="w:gz") as tgz: + data = b"" + + filename = "testimäge.png" + + t = tarfile.TarInfo(filename) + t.size = len(data) + + tgz.addfile(t, io.BytesIO(data)) + + target = tmpdir / 'unicode-pkg-1.0.tar.gz' + with open(str(target), mode='wb') as tf: + tf.write(tarobj.getvalue()) + return str(target) + + +@pytest.mark.xfail(reason="#710 and #712") +def test_unicode_files(tarfile_with_unicode, tmpdir): + target = tmpdir / 'out' + archive_util.unpack_archive(tarfile_with_unicode, str(target)) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/setuptools/tests/test_bdist_deprecations.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/setuptools/tests/test_bdist_deprecations.py new file mode 100644 index 0000000000000000000000000000000000000000..d9d67b06161a2b36e7b15fab09f797c5c15575c2 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/setuptools/tests/test_bdist_deprecations.py @@ -0,0 +1,28 @@ +"""develop tests""" + +import sys +from unittest import mock + +import pytest + +from setuptools import SetuptoolsDeprecationWarning +from setuptools.dist import Distribution + + +@pytest.mark.skipif(sys.platform == 'win32', reason='non-Windows only') +@pytest.mark.xfail(reason="bdist_rpm is long deprecated, should we remove it? #1988") +@mock.patch('distutils.command.bdist_rpm.bdist_rpm') +def test_bdist_rpm_warning(distutils_cmd, tmpdir_cwd): + dist = Distribution( + dict( + script_name='setup.py', + script_args=['bdist_rpm'], + name='foo', + py_modules=['hi'], + ) + ) + dist.parse_command_line() + with pytest.warns(SetuptoolsDeprecationWarning): + dist.run_commands() + + distutils_cmd.run.assert_called_once() diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/setuptools/tests/test_bdist_egg.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/setuptools/tests/test_bdist_egg.py new file mode 100644 index 0000000000000000000000000000000000000000..036167dd951e70ad543775529d5ce3f6d6544c71 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/setuptools/tests/test_bdist_egg.py @@ -0,0 +1,73 @@ +"""develop tests""" + +import os +import re +import zipfile + +import pytest + +from setuptools.dist import Distribution + +from . import contexts + +SETUP_PY = """\ +from setuptools import setup + +setup(py_modules=['hi']) +""" + + +@pytest.fixture +def setup_context(tmpdir): + with (tmpdir / 'setup.py').open('w') as f: + f.write(SETUP_PY) + with (tmpdir / 'hi.py').open('w') as f: + f.write('1\n') + with tmpdir.as_cwd(): + yield tmpdir + + +class Test: + @pytest.mark.usefixtures("user_override") + @pytest.mark.usefixtures("setup_context") + def test_bdist_egg(self): + dist = Distribution( + dict( + script_name='setup.py', + script_args=['bdist_egg'], + name='foo', + py_modules=['hi'], + ) + ) + os.makedirs(os.path.join('build', 'src')) + with contexts.quiet(): + dist.parse_command_line() + dist.run_commands() + + # let's see if we got our egg link at the right place + [content] = os.listdir('dist') + assert re.match(r'foo-0.0.0-py[23].\d+.egg$', content) + + @pytest.mark.xfail( + os.environ.get('PYTHONDONTWRITEBYTECODE', False), + reason="Byte code disabled", + ) + @pytest.mark.usefixtures("user_override") + @pytest.mark.usefixtures("setup_context") + def test_exclude_source_files(self): + dist = Distribution( + dict( + script_name='setup.py', + script_args=['bdist_egg', '--exclude-source-files'], + py_modules=['hi'], + ) + ) + with contexts.quiet(): + dist.parse_command_line() + dist.run_commands() + [dist_name] = os.listdir('dist') + dist_filename = os.path.join('dist', dist_name) + zip = zipfile.ZipFile(dist_filename) + names = list(zi.filename for zi in zip.filelist) + assert 'hi.pyc' in names + assert 'hi.py' not in names diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/setuptools/tests/test_bdist_wheel.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/setuptools/tests/test_bdist_wheel.py new file mode 100644 index 0000000000000000000000000000000000000000..776d21d72969e3620f90a9530226d673b2825cd0 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/setuptools/tests/test_bdist_wheel.py @@ -0,0 +1,623 @@ +from __future__ import annotations + +import builtins +import importlib +import os.path +import platform +import shutil +import stat +import struct +import sys +import sysconfig +from contextlib import suppress +from inspect import cleandoc +from zipfile import ZipFile + +import jaraco.path +import pytest +from packaging import tags + +import setuptools +from setuptools.command.bdist_wheel import bdist_wheel, get_abi_tag +from setuptools.dist import Distribution +from setuptools.warnings import SetuptoolsDeprecationWarning + +from distutils.core import run_setup + +DEFAULT_FILES = { + "dummy_dist-1.0.dist-info/top_level.txt", + "dummy_dist-1.0.dist-info/METADATA", + "dummy_dist-1.0.dist-info/WHEEL", + "dummy_dist-1.0.dist-info/RECORD", +} +DEFAULT_LICENSE_FILES = { + "LICENSE", + "LICENSE.txt", + "LICENCE", + "LICENCE.txt", + "COPYING", + "COPYING.md", + "NOTICE", + "NOTICE.rst", + "AUTHORS", + "AUTHORS.txt", +} +OTHER_IGNORED_FILES = { + "LICENSE~", + "AUTHORS~", +} +SETUPPY_EXAMPLE = """\ +from setuptools import setup + +setup( + name='dummy_dist', + version='1.0', +) +""" + + +EXAMPLES = { + "dummy-dist": { + "setup.py": SETUPPY_EXAMPLE, + "licenses": {"DUMMYFILE": ""}, + **dict.fromkeys(DEFAULT_LICENSE_FILES | OTHER_IGNORED_FILES, ""), + }, + "simple-dist": { + "setup.py": cleandoc( + """ + from setuptools import setup + + setup( + name="simple.dist", + version="0.1", + description="A testing distribution \N{SNOWMAN}", + extras_require={"voting": ["beaglevote"]}, + ) + """ + ), + "simpledist": "", + }, + "complex-dist": { + "setup.py": cleandoc( + """ + from setuptools import setup + + setup( + name="complex-dist", + version="0.1", + description="Another testing distribution \N{SNOWMAN}", + long_description="Another testing distribution \N{SNOWMAN}", + author="Illustrious Author", + author_email="illustrious@example.org", + url="http://example.org/exemplary", + packages=["complexdist"], + setup_requires=["setuptools"], + install_requires=["quux", "splort"], + extras_require={"simple": ["simple.dist"]}, + entry_points={ + "console_scripts": [ + "complex-dist=complexdist:main", + "complex-dist2=complexdist:main", + ], + }, + ) + """ + ), + "complexdist": {"__init__.py": "def main(): return"}, + }, + "headers-dist": { + "setup.py": cleandoc( + """ + from setuptools import setup + + setup( + name="headers.dist", + version="0.1", + description="A distribution with headers", + headers=["header.h"], + ) + """ + ), + "headersdist.py": "", + "header.h": "", + }, + "commasinfilenames-dist": { + "setup.py": cleandoc( + """ + from setuptools import setup + + setup( + name="testrepo", + version="0.1", + packages=["mypackage"], + description="A test package with commas in file names", + include_package_data=True, + package_data={"mypackage.data": ["*"]}, + ) + """ + ), + "mypackage": { + "__init__.py": "", + "data": {"__init__.py": "", "1,2,3.txt": ""}, + }, + "testrepo-0.1.0": { + "mypackage": {"__init__.py": ""}, + }, + }, + "unicode-dist": { + "setup.py": cleandoc( + """ + from setuptools import setup + + setup( + name="unicode.dist", + version="0.1", + description="A testing distribution \N{SNOWMAN}", + packages=["unicodedist"], + zip_safe=True, + ) + """ + ), + "unicodedist": {"__init__.py": "", "åäö_日本語.py": ""}, + }, + "utf8-metadata-dist": { + "setup.cfg": cleandoc( + """ + [metadata] + name = utf8-metadata-dist + version = 42 + author_email = "John X. Ãørçeč" , Γαμα קּ 東 + long_description = file: README.rst + """ + ), + "README.rst": "UTF-8 描述 説明", + }, +} + + +if sys.platform != "win32": + # ABI3 extensions don't really work on Windows + EXAMPLES["abi3extension-dist"] = { + "setup.py": cleandoc( + """ + from setuptools import Extension, setup + + setup( + name="extension.dist", + version="0.1", + description="A testing distribution \N{SNOWMAN}", + ext_modules=[ + Extension( + name="extension", sources=["extension.c"], py_limited_api=True + ) + ], + ) + """ + ), + "setup.cfg": "[bdist_wheel]\npy_limited_api=cp32", + "extension.c": "#define Py_LIMITED_API 0x03020000\n#include ", + } + + +def bdist_wheel_cmd(**kwargs): + """Run command in the same process so that it is easier to collect coverage""" + dist_obj = ( + run_setup("setup.py", stop_after="init") + if os.path.exists("setup.py") + else Distribution({"script_name": "%%build_meta%%"}) + ) + dist_obj.parse_config_files() + cmd = bdist_wheel(dist_obj) + for attr, value in kwargs.items(): + setattr(cmd, attr, value) + cmd.finalize_options() + return cmd + + +def mkexample(tmp_path_factory, name): + basedir = tmp_path_factory.mktemp(name) + jaraco.path.build(EXAMPLES[name], prefix=str(basedir)) + return basedir + + +@pytest.fixture(scope="session") +def wheel_paths(tmp_path_factory): + build_base = tmp_path_factory.mktemp("build") + dist_dir = tmp_path_factory.mktemp("dist") + for name in EXAMPLES: + example_dir = mkexample(tmp_path_factory, name) + build_dir = build_base / name + with jaraco.path.DirectoryStack().context(example_dir): + bdist_wheel_cmd(bdist_dir=str(build_dir), dist_dir=str(dist_dir)).run() + + return sorted(str(fname) for fname in dist_dir.glob("*.whl")) + + +@pytest.fixture +def dummy_dist(tmp_path_factory): + return mkexample(tmp_path_factory, "dummy-dist") + + +def test_no_scripts(wheel_paths): + """Make sure entry point scripts are not generated.""" + path = next(path for path in wheel_paths if "complex_dist" in path) + for entry in ZipFile(path).infolist(): + assert ".data/scripts/" not in entry.filename + + +def test_unicode_record(wheel_paths): + path = next(path for path in wheel_paths if "unicode_dist" in path) + with ZipFile(path) as zf: + record = zf.read("unicode_dist-0.1.dist-info/RECORD") + + assert "åäö_日本語.py".encode() in record + + +UTF8_PKG_INFO = """\ +Metadata-Version: 2.1 +Name: helloworld +Version: 42 +Author-email: "John X. Ãørçeč" , Γαμα קּ 東 + + +UTF-8 描述 説明 +""" + + +def test_preserve_unicode_metadata(monkeypatch, tmp_path): + monkeypatch.chdir(tmp_path) + egginfo = tmp_path / "dummy_dist.egg-info" + distinfo = tmp_path / "dummy_dist.dist-info" + + egginfo.mkdir() + (egginfo / "PKG-INFO").write_text(UTF8_PKG_INFO, encoding="utf-8") + (egginfo / "dependency_links.txt").touch() + + class simpler_bdist_wheel(bdist_wheel): + """Avoid messing with setuptools/distutils internals""" + + def __init__(self): + pass + + @property + def license_paths(self): + return [] + + cmd_obj = simpler_bdist_wheel() + cmd_obj.egg2dist(egginfo, distinfo) + + metadata = (distinfo / "METADATA").read_text(encoding="utf-8") + assert 'Author-email: "John X. Ãørçeč"' in metadata + assert "Γαμα קּ 東 " in metadata + assert "UTF-8 描述 説明" in metadata + + +def test_licenses_default(dummy_dist, monkeypatch, tmp_path): + monkeypatch.chdir(dummy_dist) + bdist_wheel_cmd(bdist_dir=str(tmp_path)).run() + with ZipFile("dist/dummy_dist-1.0-py3-none-any.whl") as wf: + license_files = { + "dummy_dist-1.0.dist-info/" + fname for fname in DEFAULT_LICENSE_FILES + } + assert set(wf.namelist()) == DEFAULT_FILES | license_files + + +def test_licenses_deprecated(dummy_dist, monkeypatch, tmp_path): + dummy_dist.joinpath("setup.cfg").write_text( + "[metadata]\nlicense_file=licenses/DUMMYFILE", encoding="utf-8" + ) + monkeypatch.chdir(dummy_dist) + + bdist_wheel_cmd(bdist_dir=str(tmp_path)).run() + + with ZipFile("dist/dummy_dist-1.0-py3-none-any.whl") as wf: + license_files = {"dummy_dist-1.0.dist-info/DUMMYFILE"} + assert set(wf.namelist()) == DEFAULT_FILES | license_files + + +@pytest.mark.parametrize( + ("config_file", "config"), + [ + ("setup.cfg", "[metadata]\nlicense_files=licenses/*\n LICENSE"), + ("setup.cfg", "[metadata]\nlicense_files=licenses/*, LICENSE"), + ( + "setup.py", + SETUPPY_EXAMPLE.replace( + ")", " license_files=['licenses/DUMMYFILE', 'LICENSE'])" + ), + ), + ], +) +def test_licenses_override(dummy_dist, monkeypatch, tmp_path, config_file, config): + dummy_dist.joinpath(config_file).write_text(config, encoding="utf-8") + monkeypatch.chdir(dummy_dist) + bdist_wheel_cmd(bdist_dir=str(tmp_path)).run() + with ZipFile("dist/dummy_dist-1.0-py3-none-any.whl") as wf: + license_files = { + "dummy_dist-1.0.dist-info/" + fname for fname in {"DUMMYFILE", "LICENSE"} + } + assert set(wf.namelist()) == DEFAULT_FILES | license_files + + +def test_licenses_disabled(dummy_dist, monkeypatch, tmp_path): + dummy_dist.joinpath("setup.cfg").write_text( + "[metadata]\nlicense_files=\n", encoding="utf-8" + ) + monkeypatch.chdir(dummy_dist) + bdist_wheel_cmd(bdist_dir=str(tmp_path)).run() + with ZipFile("dist/dummy_dist-1.0-py3-none-any.whl") as wf: + assert set(wf.namelist()) == DEFAULT_FILES + + +def test_build_number(dummy_dist, monkeypatch, tmp_path): + monkeypatch.chdir(dummy_dist) + bdist_wheel_cmd(bdist_dir=str(tmp_path), build_number="2").run() + with ZipFile("dist/dummy_dist-1.0-2-py3-none-any.whl") as wf: + filenames = set(wf.namelist()) + assert "dummy_dist-1.0.dist-info/RECORD" in filenames + assert "dummy_dist-1.0.dist-info/METADATA" in filenames + + +def test_universal_deprecated(dummy_dist, monkeypatch, tmp_path): + monkeypatch.chdir(dummy_dist) + with pytest.warns(SetuptoolsDeprecationWarning, match=".*universal is deprecated"): + bdist_wheel_cmd(bdist_dir=str(tmp_path), universal=True).run() + + # For now we still respect the option + assert os.path.exists("dist/dummy_dist-1.0-py2.py3-none-any.whl") + + +EXTENSION_EXAMPLE = """\ +#include + +static PyMethodDef methods[] = { + { NULL, NULL, 0, NULL } +}; + +static struct PyModuleDef module_def = { + PyModuleDef_HEAD_INIT, + "extension", + "Dummy extension module", + -1, + methods +}; + +PyMODINIT_FUNC PyInit_extension(void) { + return PyModule_Create(&module_def); +} +""" +EXTENSION_SETUPPY = """\ +from __future__ import annotations + +from setuptools import Extension, setup + +setup( + name="extension.dist", + version="0.1", + description="A testing distribution \N{SNOWMAN}", + ext_modules=[Extension(name="extension", sources=["extension.c"])], +) +""" + + +@pytest.mark.filterwarnings( + "once:Config variable '.*' is unset.*, Python ABI tag may be incorrect" +) +def test_limited_abi(monkeypatch, tmp_path, tmp_path_factory): + """Test that building a binary wheel with the limited ABI works.""" + source_dir = tmp_path_factory.mktemp("extension_dist") + (source_dir / "setup.py").write_text(EXTENSION_SETUPPY, encoding="utf-8") + (source_dir / "extension.c").write_text(EXTENSION_EXAMPLE, encoding="utf-8") + build_dir = tmp_path.joinpath("build") + dist_dir = tmp_path.joinpath("dist") + monkeypatch.chdir(source_dir) + bdist_wheel_cmd(bdist_dir=str(build_dir), dist_dir=str(dist_dir)).run() + + +def test_build_from_readonly_tree(dummy_dist, monkeypatch, tmp_path): + basedir = str(tmp_path.joinpath("dummy")) + shutil.copytree(str(dummy_dist), basedir) + monkeypatch.chdir(basedir) + + # Make the tree read-only + for root, _dirs, files in os.walk(basedir): + for fname in files: + os.chmod(os.path.join(root, fname), stat.S_IREAD) + + bdist_wheel_cmd().run() + + +@pytest.mark.parametrize( + ("option", "compress_type"), + list(bdist_wheel.supported_compressions.items()), + ids=list(bdist_wheel.supported_compressions), +) +def test_compression(dummy_dist, monkeypatch, tmp_path, option, compress_type): + monkeypatch.chdir(dummy_dist) + bdist_wheel_cmd(bdist_dir=str(tmp_path), compression=option).run() + with ZipFile("dist/dummy_dist-1.0-py3-none-any.whl") as wf: + filenames = set(wf.namelist()) + assert "dummy_dist-1.0.dist-info/RECORD" in filenames + assert "dummy_dist-1.0.dist-info/METADATA" in filenames + for zinfo in wf.filelist: + assert zinfo.compress_type == compress_type + + +def test_wheelfile_line_endings(wheel_paths): + for path in wheel_paths: + with ZipFile(path) as wf: + wheelfile = next(fn for fn in wf.filelist if fn.filename.endswith("WHEEL")) + wheelfile_contents = wf.read(wheelfile) + assert b"\r" not in wheelfile_contents + + +def test_unix_epoch_timestamps(dummy_dist, monkeypatch, tmp_path): + monkeypatch.setenv("SOURCE_DATE_EPOCH", "0") + monkeypatch.chdir(dummy_dist) + bdist_wheel_cmd(bdist_dir=str(tmp_path), build_number="2a").run() + with ZipFile("dist/dummy_dist-1.0-2a-py3-none-any.whl") as wf: + for zinfo in wf.filelist: + assert zinfo.date_time >= (1980, 1, 1, 0, 0, 0) # min epoch is used + + +def test_get_abi_tag_windows(monkeypatch): + monkeypatch.setattr(tags, "interpreter_name", lambda: "cp") + monkeypatch.setattr(sysconfig, "get_config_var", lambda x: "cp313-win_amd64") + assert get_abi_tag() == "cp313" + monkeypatch.setattr(sys, "gettotalrefcount", lambda: 1, False) + assert get_abi_tag() == "cp313d" + monkeypatch.setattr(sysconfig, "get_config_var", lambda x: "cp313t-win_amd64") + assert get_abi_tag() == "cp313td" + monkeypatch.delattr(sys, "gettotalrefcount") + assert get_abi_tag() == "cp313t" + + +def test_get_abi_tag_pypy_old(monkeypatch): + monkeypatch.setattr(tags, "interpreter_name", lambda: "pp") + monkeypatch.setattr(sysconfig, "get_config_var", lambda x: "pypy36-pp73") + assert get_abi_tag() == "pypy36_pp73" + + +def test_get_abi_tag_pypy_new(monkeypatch): + monkeypatch.setattr(sysconfig, "get_config_var", lambda x: "pypy37-pp73-darwin") + monkeypatch.setattr(tags, "interpreter_name", lambda: "pp") + assert get_abi_tag() == "pypy37_pp73" + + +def test_get_abi_tag_graalpy(monkeypatch): + monkeypatch.setattr( + sysconfig, "get_config_var", lambda x: "graalpy231-310-native-x86_64-linux" + ) + monkeypatch.setattr(tags, "interpreter_name", lambda: "graalpy") + assert get_abi_tag() == "graalpy231_310_native" + + +def test_get_abi_tag_fallback(monkeypatch): + monkeypatch.setattr(sysconfig, "get_config_var", lambda x: "unknown-python-310") + monkeypatch.setattr(tags, "interpreter_name", lambda: "unknown-python") + assert get_abi_tag() == "unknown_python_310" + + +def test_platform_with_space(dummy_dist, monkeypatch): + """Ensure building on platforms with a space in the name succeed.""" + monkeypatch.chdir(dummy_dist) + bdist_wheel_cmd(plat_name="isilon onefs").run() + + +def test_data_dir_with_tag_build(monkeypatch, tmp_path): + """ + Setuptools allow authors to set PEP 440's local version segments + using ``egg_info.tag_build``. This should be reflected not only in the + ``.whl`` file name, but also in the ``.dist-info`` and ``.data`` dirs. + See pypa/setuptools#3997. + """ + monkeypatch.chdir(tmp_path) + files = { + "setup.py": """ + from setuptools import setup + setup(headers=["hello.h"]) + """, + "setup.cfg": """ + [metadata] + name = test + version = 1.0 + + [options.data_files] + hello/world = file.txt + + [egg_info] + tag_build = +what + tag_date = 0 + """, + "file.txt": "", + "hello.h": "", + } + for file, content in files.items(): + with open(file, "w", encoding="utf-8") as fh: + fh.write(cleandoc(content)) + + bdist_wheel_cmd().run() + + # Ensure .whl, .dist-info and .data contain the local segment + wheel_path = "dist/test-1.0+what-py3-none-any.whl" + assert os.path.exists(wheel_path) + entries = set(ZipFile(wheel_path).namelist()) + for expected in ( + "test-1.0+what.data/headers/hello.h", + "test-1.0+what.data/data/hello/world/file.txt", + "test-1.0+what.dist-info/METADATA", + "test-1.0+what.dist-info/WHEEL", + ): + assert expected in entries + + for not_expected in ( + "test.data/headers/hello.h", + "test-1.0.data/data/hello/world/file.txt", + "test.dist-info/METADATA", + "test-1.0.dist-info/WHEEL", + ): + assert not_expected not in entries + + +@pytest.mark.parametrize( + ("reported", "expected"), + [("linux-x86_64", "linux_i686"), ("linux-aarch64", "linux_armv7l")], +) +@pytest.mark.skipif( + platform.system() != "Linux", reason="Only makes sense to test on Linux" +) +def test_platform_linux32(reported, expected, monkeypatch): + monkeypatch.setattr(struct, "calcsize", lambda x: 4) + dist = setuptools.Distribution() + cmd = bdist_wheel(dist) + cmd.plat_name = reported + cmd.root_is_pure = False + _, _, actual = cmd.get_tag() + assert actual == expected + + +def test_no_ctypes(monkeypatch) -> None: + def _fake_import(name: str, *args, **kwargs): + if name == "ctypes": + raise ModuleNotFoundError(f"No module named {name}") + + return importlib.__import__(name, *args, **kwargs) + + with suppress(KeyError): + monkeypatch.delitem(sys.modules, "wheel.macosx_libfile") + + # Install an importer shim that refuses to load ctypes + monkeypatch.setattr(builtins, "__import__", _fake_import) + with pytest.raises(ModuleNotFoundError, match="No module named ctypes"): + import wheel.macosx_libfile # noqa: F401 + + # Unload and reimport the bdist_wheel command module to make sure it won't try to + # import ctypes + monkeypatch.delitem(sys.modules, "setuptools.command.bdist_wheel") + + import setuptools.command.bdist_wheel # noqa: F401 + + +def test_dist_info_provided(dummy_dist, monkeypatch, tmp_path): + monkeypatch.chdir(dummy_dist) + distinfo = tmp_path / "dummy_dist.dist-info" + + distinfo.mkdir() + (distinfo / "METADATA").write_text("name: helloworld", encoding="utf-8") + + # We don't control the metadata. According to PEP-517, "The hook MAY also + # create other files inside this directory, and a build frontend MUST + # preserve". + (distinfo / "FOO").write_text("bar", encoding="utf-8") + + bdist_wheel_cmd(bdist_dir=str(tmp_path), dist_info_dir=str(distinfo)).run() + expected = { + "dummy_dist-1.0.dist-info/FOO", + "dummy_dist-1.0.dist-info/RECORD", + } + with ZipFile("dist/dummy_dist-1.0-py3-none-any.whl") as wf: + files_found = set(wf.namelist()) + # Check that all expected files are there. + assert expected - files_found == set() + # Make sure there is no accidental egg-info bleeding into the wheel. + assert not [path for path in files_found if 'egg-info' in str(path)] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/setuptools/tests/test_build.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/setuptools/tests/test_build.py new file mode 100644 index 0000000000000000000000000000000000000000..f0f1d9dcf21bafe9dc82a76d373b366ddeecfcec --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/setuptools/tests/test_build.py @@ -0,0 +1,33 @@ +from setuptools import Command +from setuptools.command.build import build +from setuptools.dist import Distribution + + +def test_distribution_gives_setuptools_build_obj(tmpdir_cwd): + """ + Check that the setuptools Distribution uses the + setuptools specific build object. + """ + + dist = Distribution( + dict( + script_name='setup.py', + script_args=['build'], + packages=[], + package_data={'': ['path/*']}, + ) + ) + assert isinstance(dist.get_command_obj("build"), build) + + +class Subcommand(Command): + """Dummy command to be used in tests""" + + def initialize_options(self): + pass + + def finalize_options(self): + pass + + def run(self): + raise NotImplementedError("just to check if the command runs") diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/setuptools/tests/test_build_clib.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/setuptools/tests/test_build_clib.py new file mode 100644 index 0000000000000000000000000000000000000000..b5315df4f6599cd376e628c0d74cb14129cd89b8 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/setuptools/tests/test_build_clib.py @@ -0,0 +1,84 @@ +import random +from unittest import mock + +import pytest + +from setuptools.command.build_clib import build_clib +from setuptools.dist import Distribution + +from distutils.errors import DistutilsSetupError + + +class TestBuildCLib: + @mock.patch('setuptools.command.build_clib.newer_pairwise_group') + def test_build_libraries(self, mock_newer): + dist = Distribution() + cmd = build_clib(dist) + + # this will be a long section, just making sure all + # exceptions are properly raised + libs = [('example', {'sources': 'broken.c'})] + with pytest.raises(DistutilsSetupError): + cmd.build_libraries(libs) + + obj_deps = 'some_string' + libs = [('example', {'sources': ['source.c'], 'obj_deps': obj_deps})] + with pytest.raises(DistutilsSetupError): + cmd.build_libraries(libs) + + obj_deps = {'': ''} + libs = [('example', {'sources': ['source.c'], 'obj_deps': obj_deps})] + with pytest.raises(DistutilsSetupError): + cmd.build_libraries(libs) + + obj_deps = {'source.c': ''} + libs = [('example', {'sources': ['source.c'], 'obj_deps': obj_deps})] + with pytest.raises(DistutilsSetupError): + cmd.build_libraries(libs) + + # with that out of the way, let's see if the crude dependency + # system works + cmd.compiler = mock.MagicMock(spec=cmd.compiler) + mock_newer.return_value = ([], []) + + obj_deps = {'': ('global.h',), 'example.c': ('example.h',)} + libs = [('example', {'sources': ['example.c'], 'obj_deps': obj_deps})] + + cmd.build_libraries(libs) + assert [['example.c', 'global.h', 'example.h']] in mock_newer.call_args[0] + assert not cmd.compiler.compile.called + assert cmd.compiler.create_static_lib.call_count == 1 + + # reset the call numbers so we can test again + cmd.compiler.reset_mock() + + mock_newer.return_value = '' # anything as long as it's not ([],[]) + cmd.build_libraries(libs) + assert cmd.compiler.compile.call_count == 1 + assert cmd.compiler.create_static_lib.call_count == 1 + + @mock.patch('setuptools.command.build_clib.newer_pairwise_group') + def test_build_libraries_reproducible(self, mock_newer): + dist = Distribution() + cmd = build_clib(dist) + + # with that out of the way, let's see if the crude dependency + # system works + cmd.compiler = mock.MagicMock(spec=cmd.compiler) + mock_newer.return_value = ([], []) + + original_sources = ['a-example.c', 'example.c'] + sources = original_sources + + obj_deps = {'': ('global.h',), 'example.c': ('example.h',)} + libs = [('example', {'sources': sources, 'obj_deps': obj_deps})] + + cmd.build_libraries(libs) + computed_call_args = mock_newer.call_args[0] + + while sources == original_sources: + sources = random.sample(original_sources, len(original_sources)) + libs = [('example', {'sources': sources, 'obj_deps': obj_deps})] + + cmd.build_libraries(libs) + assert computed_call_args == mock_newer.call_args[0] diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/setuptools/tests/test_build_ext.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/setuptools/tests/test_build_ext.py new file mode 100644 index 0000000000000000000000000000000000000000..c7b60ac32fcd6628cf96396a7e5d8fdb50f67c19 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/setuptools/tests/test_build_ext.py @@ -0,0 +1,293 @@ +from __future__ import annotations + +import os +import sys +from importlib.util import cache_from_source as _compiled_file_name + +import pytest +from jaraco import path + +from setuptools.command.build_ext import build_ext, get_abi3_suffix +from setuptools.dist import Distribution +from setuptools.errors import CompileError +from setuptools.extension import Extension + +from . import environment +from .textwrap import DALS + +import distutils.command.build_ext as orig +from distutils.sysconfig import get_config_var + +IS_PYPY = '__pypy__' in sys.builtin_module_names + + +class TestBuildExt: + def test_get_ext_filename(self): + """ + Setuptools needs to give back the same + result as distutils, even if the fullname + is not in ext_map. + """ + dist = Distribution() + cmd = build_ext(dist) + cmd.ext_map['foo/bar'] = '' + res = cmd.get_ext_filename('foo') + wanted = orig.build_ext.get_ext_filename(cmd, 'foo') + assert res == wanted + + def test_abi3_filename(self): + """ + Filename needs to be loadable by several versions + of Python 3 if 'is_abi3' is truthy on Extension() + """ + print(get_abi3_suffix()) + + extension = Extension('spam.eggs', ['eggs.c'], py_limited_api=True) + dist = Distribution(dict(ext_modules=[extension])) + cmd = build_ext(dist) + cmd.finalize_options() + assert 'spam.eggs' in cmd.ext_map + res = cmd.get_ext_filename('spam.eggs') + + if not get_abi3_suffix(): + assert res.endswith(get_config_var('EXT_SUFFIX')) + elif sys.platform == 'win32': + assert res.endswith('eggs.pyd') + else: + assert 'abi3' in res + + def test_ext_suffix_override(self): + """ + SETUPTOOLS_EXT_SUFFIX variable always overrides + default extension options. + """ + dist = Distribution() + cmd = build_ext(dist) + cmd.ext_map['for_abi3'] = ext = Extension( + 'for_abi3', + ['s.c'], + # Override shouldn't affect abi3 modules + py_limited_api=True, + ) + # Mock value needed to pass tests + ext._links_to_dynamic = False + + if not IS_PYPY: + expect = cmd.get_ext_filename('for_abi3') + else: + # PyPy builds do not use ABI3 tag, so they will + # also get the overridden suffix. + expect = 'for_abi3.test-suffix' + + try: + os.environ['SETUPTOOLS_EXT_SUFFIX'] = '.test-suffix' + res = cmd.get_ext_filename('normal') + assert 'normal.test-suffix' == res + res = cmd.get_ext_filename('for_abi3') + assert expect == res + finally: + del os.environ['SETUPTOOLS_EXT_SUFFIX'] + + def dist_with_example(self): + files = { + "src": {"mypkg": {"subpkg": {"ext2.c": ""}}}, + "c-extensions": {"ext1": {"main.c": ""}}, + } + + ext1 = Extension("mypkg.ext1", ["c-extensions/ext1/main.c"]) + ext2 = Extension("mypkg.subpkg.ext2", ["src/mypkg/subpkg/ext2.c"]) + ext3 = Extension("ext3", ["c-extension/ext3.c"]) + + path.build(files) + return Distribution({ + "script_name": "%test%", + "ext_modules": [ext1, ext2, ext3], + "package_dir": {"": "src"}, + }) + + def test_get_outputs(self, tmpdir_cwd, monkeypatch): + monkeypatch.setenv('SETUPTOOLS_EXT_SUFFIX', '.mp3') # make test OS-independent + monkeypatch.setattr('setuptools.command.build_ext.use_stubs', False) + dist = self.dist_with_example() + + # Regular build: get_outputs not empty, but get_output_mappings is empty + build_ext = dist.get_command_obj("build_ext") + build_ext.editable_mode = False + build_ext.ensure_finalized() + build_lib = build_ext.build_lib.replace(os.sep, "/") + outputs = [x.replace(os.sep, "/") for x in build_ext.get_outputs()] + assert outputs == [ + f"{build_lib}/ext3.mp3", + f"{build_lib}/mypkg/ext1.mp3", + f"{build_lib}/mypkg/subpkg/ext2.mp3", + ] + assert build_ext.get_output_mapping() == {} + + # Editable build: get_output_mappings should contain everything in get_outputs + dist.reinitialize_command("build_ext") + build_ext.editable_mode = True + build_ext.ensure_finalized() + mapping = { + k.replace(os.sep, "/"): v.replace(os.sep, "/") + for k, v in build_ext.get_output_mapping().items() + } + assert mapping == { + f"{build_lib}/ext3.mp3": "src/ext3.mp3", + f"{build_lib}/mypkg/ext1.mp3": "src/mypkg/ext1.mp3", + f"{build_lib}/mypkg/subpkg/ext2.mp3": "src/mypkg/subpkg/ext2.mp3", + } + + def test_get_output_mapping_with_stub(self, tmpdir_cwd, monkeypatch): + monkeypatch.setenv('SETUPTOOLS_EXT_SUFFIX', '.mp3') # make test OS-independent + monkeypatch.setattr('setuptools.command.build_ext.use_stubs', True) + dist = self.dist_with_example() + + # Editable build should create compiled stubs (.pyc files only, no .py) + build_ext = dist.get_command_obj("build_ext") + build_ext.editable_mode = True + build_ext.ensure_finalized() + for ext in build_ext.extensions: + monkeypatch.setattr(ext, "_needs_stub", True) + + build_lib = build_ext.build_lib.replace(os.sep, "/") + mapping = { + k.replace(os.sep, "/"): v.replace(os.sep, "/") + for k, v in build_ext.get_output_mapping().items() + } + + def C(file): + """Make it possible to do comparisons and tests in a OS-independent way""" + return _compiled_file_name(file).replace(os.sep, "/") + + assert mapping == { + C(f"{build_lib}/ext3.py"): C("src/ext3.py"), + f"{build_lib}/ext3.mp3": "src/ext3.mp3", + C(f"{build_lib}/mypkg/ext1.py"): C("src/mypkg/ext1.py"), + f"{build_lib}/mypkg/ext1.mp3": "src/mypkg/ext1.mp3", + C(f"{build_lib}/mypkg/subpkg/ext2.py"): C("src/mypkg/subpkg/ext2.py"), + f"{build_lib}/mypkg/subpkg/ext2.mp3": "src/mypkg/subpkg/ext2.mp3", + } + + # Ensure only the compiled stubs are present not the raw .py stub + assert f"{build_lib}/mypkg/ext1.py" not in mapping + assert f"{build_lib}/mypkg/subpkg/ext2.py" not in mapping + + # Visualize what the cached stub files look like + example_stub = C(f"{build_lib}/mypkg/ext1.py") + assert example_stub in mapping + assert example_stub.startswith(f"{build_lib}/mypkg/__pycache__/ext1") + assert example_stub.endswith(".pyc") + + +class TestBuildExtInplace: + def get_build_ext_cmd(self, optional: bool, **opts) -> build_ext: + files: dict[str, str | dict[str, dict[str, str]]] = { + "eggs.c": "#include missingheader.h\n", + ".build": {"lib": {}, "tmp": {}}, + } + path.build(files) + extension = Extension('spam.eggs', ['eggs.c'], optional=optional) + dist = Distribution(dict(ext_modules=[extension])) + dist.script_name = 'setup.py' + cmd = build_ext(dist) + vars(cmd).update(build_lib=".build/lib", build_temp=".build/tmp", **opts) + cmd.ensure_finalized() + return cmd + + def get_log_messages(self, caplog, capsys): + """ + Historically, distutils "logged" by printing to sys.std*. + Later versions adopted the logging framework. Grab + messages regardless of how they were captured. + """ + std = capsys.readouterr() + return std.out.splitlines() + std.err.splitlines() + caplog.messages + + def test_optional(self, tmpdir_cwd, caplog, capsys): + """ + If optional extensions fail to build, setuptools should show the error + in the logs but not fail to build + """ + cmd = self.get_build_ext_cmd(optional=True, inplace=True) + cmd.run() + assert any( + 'build_ext: building extension "spam.eggs" failed' + for msg in self.get_log_messages(caplog, capsys) + ) + # No compile error exception should be raised + + def test_non_optional(self, tmpdir_cwd): + # Non-optional extensions should raise an exception + cmd = self.get_build_ext_cmd(optional=False, inplace=True) + with pytest.raises(CompileError): + cmd.run() + + +def test_build_ext_config_handling(tmpdir_cwd): + files = { + 'setup.py': DALS( + """ + from setuptools import Extension, setup + setup( + name='foo', + version='0.0.0', + ext_modules=[Extension('foo', ['foo.c'])], + ) + """ + ), + 'foo.c': DALS( + """ + #include "Python.h" + + #if PY_MAJOR_VERSION >= 3 + + static struct PyModuleDef moduledef = { + PyModuleDef_HEAD_INIT, + "foo", + NULL, + 0, + NULL, + NULL, + NULL, + NULL, + NULL + }; + + #define INITERROR return NULL + + PyMODINIT_FUNC PyInit_foo(void) + + #else + + #define INITERROR return + + void initfoo(void) + + #endif + { + #if PY_MAJOR_VERSION >= 3 + PyObject *module = PyModule_Create(&moduledef); + #else + PyObject *module = Py_InitModule("extension", NULL); + #endif + if (module == NULL) + INITERROR; + #if PY_MAJOR_VERSION >= 3 + return module; + #endif + } + """ + ), + 'setup.cfg': DALS( + """ + [build] + build_base = foo_build + """ + ), + } + path.build(files) + code, (stdout, stderr) = environment.run_setup_py( + cmd=['build'], + data_stream=(0, 2), + ) + assert code == 0, f'\nSTDOUT:\n{stdout}\nSTDERR:\n{stderr}' diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/setuptools/tests/test_build_meta.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/setuptools/tests/test_build_meta.py new file mode 100644 index 0000000000000000000000000000000000000000..121f409057b2ef84f388db3afa93ef613bef9a37 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/setuptools/tests/test_build_meta.py @@ -0,0 +1,970 @@ +import contextlib +import importlib +import os +import re +import shutil +import signal +import sys +import tarfile +from concurrent import futures +from pathlib import Path +from typing import Any, Callable +from zipfile import ZipFile + +import pytest +from jaraco import path +from packaging.requirements import Requirement + +from .textwrap import DALS + +SETUP_SCRIPT_STUB = "__import__('setuptools').setup()" + + +TIMEOUT = int(os.getenv("TIMEOUT_BACKEND_TEST", "180")) # in seconds +IS_PYPY = '__pypy__' in sys.builtin_module_names + + +pytestmark = pytest.mark.skipif( + sys.platform == "win32" and IS_PYPY, + reason="The combination of PyPy + Windows + pytest-xdist + ProcessPoolExecutor " + "is flaky and problematic", +) + + +class BuildBackendBase: + def __init__(self, cwd='.', env=None, backend_name='setuptools.build_meta'): + self.cwd = cwd + self.env = env or {} + self.backend_name = backend_name + + +class BuildBackend(BuildBackendBase): + """PEP 517 Build Backend""" + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self.pool = futures.ProcessPoolExecutor(max_workers=1) + + def __getattr__(self, name: str) -> Callable[..., Any]: + """Handles arbitrary function invocations on the build backend.""" + + def method(*args, **kw): + root = os.path.abspath(self.cwd) + caller = BuildBackendCaller(root, self.env, self.backend_name) + pid = None + try: + pid = self.pool.submit(os.getpid).result(TIMEOUT) + return self.pool.submit(caller, name, *args, **kw).result(TIMEOUT) + except futures.TimeoutError: + self.pool.shutdown(wait=False) # doesn't stop already running processes + self._kill(pid) + pytest.xfail(f"Backend did not respond before timeout ({TIMEOUT} s)") + except (futures.process.BrokenProcessPool, MemoryError, OSError): + if IS_PYPY: + pytest.xfail("PyPy frequently fails tests with ProcessPoolExector") + raise + + return method + + def _kill(self, pid): + if pid is None: + return + with contextlib.suppress(ProcessLookupError, OSError): + os.kill(pid, signal.SIGTERM if os.name == "nt" else signal.SIGKILL) + + +class BuildBackendCaller(BuildBackendBase): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + (self.backend_name, _, self.backend_obj) = self.backend_name.partition(':') + + def __call__(self, name, *args, **kw): + """Handles arbitrary function invocations on the build backend.""" + os.chdir(self.cwd) + os.environ.update(self.env) + mod = importlib.import_module(self.backend_name) + + if self.backend_obj: + backend = getattr(mod, self.backend_obj) + else: + backend = mod + + return getattr(backend, name)(*args, **kw) + + +defns = [ + { # simple setup.py script + 'setup.py': DALS( + """ + __import__('setuptools').setup( + name='foo', + version='0.0.0', + py_modules=['hello'], + setup_requires=['six'], + ) + """ + ), + 'hello.py': DALS( + """ + def run(): + print('hello') + """ + ), + }, + { # setup.py that relies on __name__ + 'setup.py': DALS( + """ + assert __name__ == '__main__' + __import__('setuptools').setup( + name='foo', + version='0.0.0', + py_modules=['hello'], + setup_requires=['six'], + ) + """ + ), + 'hello.py': DALS( + """ + def run(): + print('hello') + """ + ), + }, + { # setup.py script that runs arbitrary code + 'setup.py': DALS( + """ + variable = True + def function(): + return variable + assert variable + __import__('setuptools').setup( + name='foo', + version='0.0.0', + py_modules=['hello'], + setup_requires=['six'], + ) + """ + ), + 'hello.py': DALS( + """ + def run(): + print('hello') + """ + ), + }, + { # setup.py script that constructs temp files to be included in the distribution + 'setup.py': DALS( + """ + # Some packages construct files on the fly, include them in the package, + # and immediately remove them after `setup()` (e.g. pybind11==2.9.1). + # Therefore, we cannot use `distutils.core.run_setup(..., stop_after=...)` + # to obtain a distribution object first, and then run the distutils + # commands later, because these files will be removed in the meantime. + + with open('world.py', 'w', encoding="utf-8") as f: + f.write('x = 42') + + try: + __import__('setuptools').setup( + name='foo', + version='0.0.0', + py_modules=['world'], + setup_requires=['six'], + ) + finally: + # Some packages will clean temporary files + __import__('os').unlink('world.py') + """ + ), + }, + { # setup.cfg only + 'setup.cfg': DALS( + """ + [metadata] + name = foo + version = 0.0.0 + + [options] + py_modules=hello + setup_requires=six + """ + ), + 'hello.py': DALS( + """ + def run(): + print('hello') + """ + ), + }, + { # setup.cfg and setup.py + 'setup.cfg': DALS( + """ + [metadata] + name = foo + version = 0.0.0 + + [options] + py_modules=hello + setup_requires=six + """ + ), + 'setup.py': "__import__('setuptools').setup()", + 'hello.py': DALS( + """ + def run(): + print('hello') + """ + ), + }, +] + + +class TestBuildMetaBackend: + backend_name = 'setuptools.build_meta' + + def get_build_backend(self): + return BuildBackend(backend_name=self.backend_name) + + @pytest.fixture(params=defns) + def build_backend(self, tmpdir, request): + path.build(request.param, prefix=str(tmpdir)) + with tmpdir.as_cwd(): + yield self.get_build_backend() + + def test_get_requires_for_build_wheel(self, build_backend): + actual = build_backend.get_requires_for_build_wheel() + expected = ['six'] + assert sorted(actual) == sorted(expected) + + def test_get_requires_for_build_sdist(self, build_backend): + actual = build_backend.get_requires_for_build_sdist() + expected = ['six'] + assert sorted(actual) == sorted(expected) + + def test_build_wheel(self, build_backend): + dist_dir = os.path.abspath('pip-wheel') + os.makedirs(dist_dir) + wheel_name = build_backend.build_wheel(dist_dir) + + wheel_file = os.path.join(dist_dir, wheel_name) + assert os.path.isfile(wheel_file) + + # Temporary files should be removed + assert not os.path.isfile('world.py') + + with ZipFile(wheel_file) as zipfile: + wheel_contents = set(zipfile.namelist()) + + # Each one of the examples have a single module + # that should be included in the distribution + python_scripts = (f for f in wheel_contents if f.endswith('.py')) + modules = [f for f in python_scripts if not f.endswith('setup.py')] + assert len(modules) == 1 + + @pytest.mark.parametrize('build_type', ('wheel', 'sdist')) + def test_build_with_existing_file_present(self, build_type, tmpdir_cwd): + # Building a sdist/wheel should still succeed if there's + # already a sdist/wheel in the destination directory. + files = { + 'setup.py': "from setuptools import setup\nsetup()", + 'VERSION': "0.0.1", + 'setup.cfg': DALS( + """ + [metadata] + name = foo + version = file: VERSION + """ + ), + 'pyproject.toml': DALS( + """ + [build-system] + requires = ["setuptools", "wheel"] + build-backend = "setuptools.build_meta" + """ + ), + } + + path.build(files) + + dist_dir = os.path.abspath('preexisting-' + build_type) + + build_backend = self.get_build_backend() + build_method = getattr(build_backend, 'build_' + build_type) + + # Build a first sdist/wheel. + # Note: this also check the destination directory is + # successfully created if it does not exist already. + first_result = build_method(dist_dir) + + # Change version. + with open("VERSION", "wt", encoding="utf-8") as version_file: + version_file.write("0.0.2") + + # Build a *second* sdist/wheel. + second_result = build_method(dist_dir) + + assert os.path.isfile(os.path.join(dist_dir, first_result)) + assert first_result != second_result + + # And if rebuilding the exact same sdist/wheel? + open(os.path.join(dist_dir, second_result), 'wb').close() + third_result = build_method(dist_dir) + assert third_result == second_result + assert os.path.getsize(os.path.join(dist_dir, third_result)) > 0 + + @pytest.mark.parametrize("setup_script", [None, SETUP_SCRIPT_STUB]) + def test_build_with_pyproject_config(self, tmpdir, setup_script): + files = { + 'pyproject.toml': DALS( + """ + [build-system] + requires = ["setuptools", "wheel"] + build-backend = "setuptools.build_meta" + + [project] + name = "foo" + license = {text = "MIT"} + description = "This is a Python package" + dynamic = ["version", "readme"] + classifiers = [ + "Development Status :: 5 - Production/Stable", + "Intended Audience :: Developers" + ] + urls = {Homepage = "http://github.com"} + dependencies = [ + "appdirs", + ] + + [project.optional-dependencies] + all = [ + "tomli>=1", + "pyscaffold>=4,<5", + 'importlib; python_version == "2.6"', + ] + + [project.scripts] + foo = "foo.cli:main" + + [tool.setuptools] + zip-safe = false + package-dir = {"" = "src"} + packages = {find = {where = ["src"]}} + license-files = ["LICENSE*"] + + [tool.setuptools.dynamic] + version = {attr = "foo.__version__"} + readme = {file = "README.rst"} + + [tool.distutils.sdist] + formats = "gztar" + """ + ), + "MANIFEST.in": DALS( + """ + global-include *.py *.txt + global-exclude *.py[cod] + """ + ), + "README.rst": "This is a ``README``", + "LICENSE.txt": "---- placeholder MIT license ----", + "src": { + "foo": { + "__init__.py": "__version__ = '0.1'", + "__init__.pyi": "__version__: str", + "cli.py": "def main(): print('hello world')", + "data.txt": "def main(): print('hello world')", + "py.typed": "", + } + }, + } + if setup_script: + files["setup.py"] = setup_script + + build_backend = self.get_build_backend() + with tmpdir.as_cwd(): + path.build(files) + sdist_path = build_backend.build_sdist("temp") + wheel_file = build_backend.build_wheel("temp") + + with tarfile.open(os.path.join(tmpdir, "temp", sdist_path)) as tar: + sdist_contents = set(tar.getnames()) + + with ZipFile(os.path.join(tmpdir, "temp", wheel_file)) as zipfile: + wheel_contents = set(zipfile.namelist()) + metadata = str(zipfile.read("foo-0.1.dist-info/METADATA"), "utf-8") + license = str(zipfile.read("foo-0.1.dist-info/LICENSE.txt"), "utf-8") + epoints = str(zipfile.read("foo-0.1.dist-info/entry_points.txt"), "utf-8") + + assert sdist_contents - {"foo-0.1/setup.py"} == { + 'foo-0.1', + 'foo-0.1/LICENSE.txt', + 'foo-0.1/MANIFEST.in', + 'foo-0.1/PKG-INFO', + 'foo-0.1/README.rst', + 'foo-0.1/pyproject.toml', + 'foo-0.1/setup.cfg', + 'foo-0.1/src', + 'foo-0.1/src/foo', + 'foo-0.1/src/foo/__init__.py', + 'foo-0.1/src/foo/__init__.pyi', + 'foo-0.1/src/foo/cli.py', + 'foo-0.1/src/foo/data.txt', + 'foo-0.1/src/foo/py.typed', + 'foo-0.1/src/foo.egg-info', + 'foo-0.1/src/foo.egg-info/PKG-INFO', + 'foo-0.1/src/foo.egg-info/SOURCES.txt', + 'foo-0.1/src/foo.egg-info/dependency_links.txt', + 'foo-0.1/src/foo.egg-info/entry_points.txt', + 'foo-0.1/src/foo.egg-info/requires.txt', + 'foo-0.1/src/foo.egg-info/top_level.txt', + 'foo-0.1/src/foo.egg-info/not-zip-safe', + } + assert wheel_contents == { + "foo/__init__.py", + "foo/__init__.pyi", # include type information by default + "foo/cli.py", + "foo/data.txt", # include_package_data defaults to True + "foo/py.typed", # include type information by default + "foo-0.1.dist-info/LICENSE.txt", + "foo-0.1.dist-info/METADATA", + "foo-0.1.dist-info/WHEEL", + "foo-0.1.dist-info/entry_points.txt", + "foo-0.1.dist-info/top_level.txt", + "foo-0.1.dist-info/RECORD", + } + assert license == "---- placeholder MIT license ----" + + for line in ( + "Summary: This is a Python package", + "License: MIT", + "Classifier: Intended Audience :: Developers", + "Requires-Dist: appdirs", + "Requires-Dist: " + str(Requirement('tomli>=1 ; extra == "all"')), + "Requires-Dist: " + + str(Requirement('importlib; python_version=="2.6" and extra =="all"')), + ): + assert line in metadata, (line, metadata) + + assert metadata.strip().endswith("This is a ``README``") + assert epoints.strip() == "[console_scripts]\nfoo = foo.cli:main" + + def test_static_metadata_in_pyproject_config(self, tmpdir): + # Make sure static metadata in pyproject.toml is not overwritten by setup.py + # as required by PEP 621 + files = { + 'pyproject.toml': DALS( + """ + [build-system] + requires = ["setuptools", "wheel"] + build-backend = "setuptools.build_meta" + + [project] + name = "foo" + description = "This is a Python package" + version = "42" + dependencies = ["six"] + """ + ), + 'hello.py': DALS( + """ + def run(): + print('hello') + """ + ), + 'setup.py': DALS( + """ + __import__('setuptools').setup( + name='bar', + version='13', + ) + """ + ), + } + build_backend = self.get_build_backend() + with tmpdir.as_cwd(): + path.build(files) + sdist_path = build_backend.build_sdist("temp") + wheel_file = build_backend.build_wheel("temp") + + assert (tmpdir / "temp/foo-42.tar.gz").exists() + assert (tmpdir / "temp/foo-42-py3-none-any.whl").exists() + assert not (tmpdir / "temp/bar-13.tar.gz").exists() + assert not (tmpdir / "temp/bar-42.tar.gz").exists() + assert not (tmpdir / "temp/foo-13.tar.gz").exists() + assert not (tmpdir / "temp/bar-13-py3-none-any.whl").exists() + assert not (tmpdir / "temp/bar-42-py3-none-any.whl").exists() + assert not (tmpdir / "temp/foo-13-py3-none-any.whl").exists() + + with tarfile.open(os.path.join(tmpdir, "temp", sdist_path)) as tar: + pkg_info = str(tar.extractfile('foo-42/PKG-INFO').read(), "utf-8") + members = tar.getnames() + assert "bar-13/PKG-INFO" not in members + + with ZipFile(os.path.join(tmpdir, "temp", wheel_file)) as zipfile: + metadata = str(zipfile.read("foo-42.dist-info/METADATA"), "utf-8") + members = zipfile.namelist() + assert "bar-13.dist-info/METADATA" not in members + + for file in pkg_info, metadata: + for line in ("Name: foo", "Version: 42"): + assert line in file + for line in ("Name: bar", "Version: 13"): + assert line not in file + + def test_build_sdist(self, build_backend): + dist_dir = os.path.abspath('pip-sdist') + os.makedirs(dist_dir) + sdist_name = build_backend.build_sdist(dist_dir) + + assert os.path.isfile(os.path.join(dist_dir, sdist_name)) + + def test_prepare_metadata_for_build_wheel(self, build_backend): + dist_dir = os.path.abspath('pip-dist-info') + os.makedirs(dist_dir) + + dist_info = build_backend.prepare_metadata_for_build_wheel(dist_dir) + + assert os.path.isfile(os.path.join(dist_dir, dist_info, 'METADATA')) + + def test_prepare_metadata_inplace(self, build_backend): + """ + Some users might pass metadata_directory pre-populated with `.tox` or `.venv`. + See issue #3523. + """ + for pre_existing in [ + ".tox/python/lib/python3.10/site-packages/attrs-22.1.0.dist-info", + ".tox/python/lib/python3.10/site-packages/autocommand-2.2.1.dist-info", + ".nox/python/lib/python3.10/site-packages/build-0.8.0.dist-info", + ".venv/python3.10/site-packages/click-8.1.3.dist-info", + "venv/python3.10/site-packages/distlib-0.3.5.dist-info", + "env/python3.10/site-packages/docutils-0.19.dist-info", + ]: + os.makedirs(pre_existing, exist_ok=True) + dist_info = build_backend.prepare_metadata_for_build_wheel(".") + assert os.path.isfile(os.path.join(dist_info, 'METADATA')) + + def test_build_sdist_explicit_dist(self, build_backend): + # explicitly specifying the dist folder should work + # the folder sdist_directory and the ``--dist-dir`` can be the same + dist_dir = os.path.abspath('dist') + sdist_name = build_backend.build_sdist(dist_dir) + assert os.path.isfile(os.path.join(dist_dir, sdist_name)) + + def test_build_sdist_version_change(self, build_backend): + sdist_into_directory = os.path.abspath("out_sdist") + os.makedirs(sdist_into_directory) + + sdist_name = build_backend.build_sdist(sdist_into_directory) + assert os.path.isfile(os.path.join(sdist_into_directory, sdist_name)) + + # if the setup.py changes subsequent call of the build meta + # should still succeed, given the + # sdist_directory the frontend specifies is empty + setup_loc = os.path.abspath("setup.py") + if not os.path.exists(setup_loc): + setup_loc = os.path.abspath("setup.cfg") + + with open(setup_loc, 'rt', encoding="utf-8") as file_handler: + content = file_handler.read() + with open(setup_loc, 'wt', encoding="utf-8") as file_handler: + file_handler.write(content.replace("version='0.0.0'", "version='0.0.1'")) + + shutil.rmtree(sdist_into_directory) + os.makedirs(sdist_into_directory) + + sdist_name = build_backend.build_sdist("out_sdist") + assert os.path.isfile(os.path.join(os.path.abspath("out_sdist"), sdist_name)) + + def test_build_sdist_pyproject_toml_exists(self, tmpdir_cwd): + files = { + 'setup.py': DALS( + """ + __import__('setuptools').setup( + name='foo', + version='0.0.0', + py_modules=['hello'] + )""" + ), + 'hello.py': '', + 'pyproject.toml': DALS( + """ + [build-system] + requires = ["setuptools", "wheel"] + build-backend = "setuptools.build_meta" + """ + ), + } + path.build(files) + build_backend = self.get_build_backend() + targz_path = build_backend.build_sdist("temp") + with tarfile.open(os.path.join("temp", targz_path)) as tar: + assert any('pyproject.toml' in name for name in tar.getnames()) + + def test_build_sdist_setup_py_exists(self, tmpdir_cwd): + # If build_sdist is called from a script other than setup.py, + # ensure setup.py is included + path.build(defns[0]) + + build_backend = self.get_build_backend() + targz_path = build_backend.build_sdist("temp") + with tarfile.open(os.path.join("temp", targz_path)) as tar: + assert any('setup.py' in name for name in tar.getnames()) + + def test_build_sdist_setup_py_manifest_excluded(self, tmpdir_cwd): + # Ensure that MANIFEST.in can exclude setup.py + files = { + 'setup.py': DALS( + """ + __import__('setuptools').setup( + name='foo', + version='0.0.0', + py_modules=['hello'] + )""" + ), + 'hello.py': '', + 'MANIFEST.in': DALS( + """ + exclude setup.py + """ + ), + } + + path.build(files) + + build_backend = self.get_build_backend() + targz_path = build_backend.build_sdist("temp") + with tarfile.open(os.path.join("temp", targz_path)) as tar: + assert not any('setup.py' in name for name in tar.getnames()) + + def test_build_sdist_builds_targz_even_if_zip_indicated(self, tmpdir_cwd): + files = { + 'setup.py': DALS( + """ + __import__('setuptools').setup( + name='foo', + version='0.0.0', + py_modules=['hello'] + )""" + ), + 'hello.py': '', + 'setup.cfg': DALS( + """ + [sdist] + formats=zip + """ + ), + } + + path.build(files) + + build_backend = self.get_build_backend() + build_backend.build_sdist("temp") + + _relative_path_import_files = { + 'setup.py': DALS( + """ + __import__('setuptools').setup( + name='foo', + version=__import__('hello').__version__, + py_modules=['hello'] + )""" + ), + 'hello.py': '__version__ = "0.0.0"', + 'setup.cfg': DALS( + """ + [sdist] + formats=zip + """ + ), + } + + def test_build_sdist_relative_path_import(self, tmpdir_cwd): + path.build(self._relative_path_import_files) + build_backend = self.get_build_backend() + with pytest.raises(ImportError, match="^No module named 'hello'$"): + build_backend.build_sdist("temp") + + _simple_pyproject_example = { + "pyproject.toml": DALS( + """ + [project] + name = "proj" + version = "42" + """ + ), + "src": {"proj": {"__init__.py": ""}}, + } + + def _assert_link_tree(self, parent_dir): + """All files in the directory should be either links or hard links""" + files = list(Path(parent_dir).glob("**/*")) + assert files # Should not be empty + for file in files: + assert file.is_symlink() or os.stat(file).st_nlink > 0 + + def test_editable_without_config_settings(self, tmpdir_cwd): + """ + Sanity check to ensure tests with --mode=strict are different from the ones + without --mode. + + --mode=strict should create a local directory with a package tree. + The directory should not get created otherwise. + """ + path.build(self._simple_pyproject_example) + build_backend = self.get_build_backend() + assert not Path("build").exists() + build_backend.build_editable("temp") + assert not Path("build").exists() + + def test_build_wheel_inplace(self, tmpdir_cwd): + config_settings = {"--build-option": ["build_ext", "--inplace"]} + path.build(self._simple_pyproject_example) + build_backend = self.get_build_backend() + assert not Path("build").exists() + Path("build").mkdir() + build_backend.prepare_metadata_for_build_wheel("build", config_settings) + build_backend.build_wheel("build", config_settings) + assert Path("build/proj-42-py3-none-any.whl").exists() + + @pytest.mark.parametrize("config_settings", [{"editable-mode": "strict"}]) + def test_editable_with_config_settings(self, tmpdir_cwd, config_settings): + path.build({**self._simple_pyproject_example, '_meta': {}}) + assert not Path("build").exists() + build_backend = self.get_build_backend() + build_backend.prepare_metadata_for_build_editable("_meta", config_settings) + build_backend.build_editable("temp", config_settings, "_meta") + self._assert_link_tree(next(Path("build").glob("__editable__.*"))) + + @pytest.mark.parametrize( + ("setup_literal", "requirements"), + [ + ("'foo'", ['foo']), + ("['foo']", ['foo']), + (r"'foo\n'", ['foo']), + (r"'foo\n\n'", ['foo']), + ("['foo', 'bar']", ['foo', 'bar']), + (r"'# Has a comment line\nfoo'", ['foo']), + (r"'foo # Has an inline comment'", ['foo']), + (r"'foo \\\n >=3.0'", ['foo>=3.0']), + (r"'foo\nbar'", ['foo', 'bar']), + (r"'foo\nbar\n'", ['foo', 'bar']), + (r"['foo\n', 'bar\n']", ['foo', 'bar']), + ], + ) + @pytest.mark.parametrize('use_wheel', [True, False]) + def test_setup_requires(self, setup_literal, requirements, use_wheel, tmpdir_cwd): + files = { + 'setup.py': DALS( + """ + from setuptools import setup + + setup( + name="qux", + version="0.0.0", + py_modules=["hello"], + setup_requires={setup_literal}, + ) + """ + ).format(setup_literal=setup_literal), + 'hello.py': DALS( + """ + def run(): + print('hello') + """ + ), + } + + path.build(files) + + build_backend = self.get_build_backend() + + if use_wheel: + get_requires = build_backend.get_requires_for_build_wheel + else: + get_requires = build_backend.get_requires_for_build_sdist + + # Ensure that the build requirements are properly parsed + expected = sorted(requirements) + actual = get_requires() + + assert expected == sorted(actual) + + def test_setup_requires_with_auto_discovery(self, tmpdir_cwd): + # Make sure patches introduced to retrieve setup_requires don't accidentally + # activate auto-discovery and cause problems due to the incomplete set of + # attributes passed to MinimalDistribution + files = { + 'pyproject.toml': DALS( + """ + [project] + name = "proj" + version = "42" + """ + ), + "setup.py": DALS( + """ + __import__('setuptools').setup( + setup_requires=["foo"], + py_modules = ["hello", "world"] + ) + """ + ), + 'hello.py': "'hello'", + 'world.py': "'world'", + } + path.build(files) + build_backend = self.get_build_backend() + setup_requires = build_backend.get_requires_for_build_wheel() + assert setup_requires == ["foo"] + + def test_dont_install_setup_requires(self, tmpdir_cwd): + files = { + 'setup.py': DALS( + """ + from setuptools import setup + + setup( + name="qux", + version="0.0.0", + py_modules=["hello"], + setup_requires=["does-not-exist >99"], + ) + """ + ), + 'hello.py': DALS( + """ + def run(): + print('hello') + """ + ), + } + + path.build(files) + + build_backend = self.get_build_backend() + + dist_dir = os.path.abspath('pip-dist-info') + os.makedirs(dist_dir) + + # does-not-exist can't be satisfied, so if it attempts to install + # setup_requires, it will fail. + build_backend.prepare_metadata_for_build_wheel(dist_dir) + + _sys_argv_0_passthrough = { + 'setup.py': DALS( + """ + import os + import sys + + __import__('setuptools').setup( + name='foo', + version='0.0.0', + ) + + sys_argv = os.path.abspath(sys.argv[0]) + file_path = os.path.abspath('setup.py') + assert sys_argv == file_path + """ + ) + } + + def test_sys_argv_passthrough(self, tmpdir_cwd): + path.build(self._sys_argv_0_passthrough) + build_backend = self.get_build_backend() + with pytest.raises(AssertionError): + build_backend.build_sdist("temp") + + _setup_py_file_abspath = { + 'setup.py': DALS( + """ + import os + assert os.path.isabs(__file__) + __import__('setuptools').setup( + name='foo', + version='0.0.0', + py_modules=['hello'], + setup_requires=['six'], + ) + """ + ) + } + + def test_setup_py_file_abspath(self, tmpdir_cwd): + path.build(self._setup_py_file_abspath) + build_backend = self.get_build_backend() + build_backend.build_sdist("temp") + + @pytest.mark.parametrize('build_hook', ('build_sdist', 'build_wheel')) + def test_build_with_empty_setuppy(self, build_backend, build_hook): + files = {'setup.py': ''} + path.build(files) + + msg = re.escape('No distribution was found.') + with pytest.raises(ValueError, match=msg): + getattr(build_backend, build_hook)("temp") + + +class TestBuildMetaLegacyBackend(TestBuildMetaBackend): + backend_name = 'setuptools.build_meta:__legacy__' + + # build_meta_legacy-specific tests + def test_build_sdist_relative_path_import(self, tmpdir_cwd): + # This must fail in build_meta, but must pass in build_meta_legacy + path.build(self._relative_path_import_files) + + build_backend = self.get_build_backend() + build_backend.build_sdist("temp") + + def test_sys_argv_passthrough(self, tmpdir_cwd): + path.build(self._sys_argv_0_passthrough) + + build_backend = self.get_build_backend() + build_backend.build_sdist("temp") + + +def test_legacy_editable_install(venv, tmpdir, tmpdir_cwd): + pyproject = """ + [build-system] + requires = ["setuptools"] + build-backend = "setuptools.build_meta" + [project] + name = "myproj" + version = "42" + """ + path.build({"pyproject.toml": DALS(pyproject), "mymod.py": ""}) + + # First: sanity check + cmd = ["pip", "install", "--no-build-isolation", "-e", "."] + output = venv.run(cmd, cwd=tmpdir).lower() + assert "running setup.py develop for myproj" not in output + assert "created wheel for myproj" in output + + # Then: real test + env = {**os.environ, "SETUPTOOLS_ENABLE_FEATURES": "legacy-editable"} + cmd = ["pip", "install", "--no-build-isolation", "-e", "."] + output = venv.run(cmd, cwd=tmpdir, env=env).lower() + assert "running setup.py develop for myproj" in output + + +@pytest.mark.filterwarnings("ignore::setuptools.SetuptoolsDeprecationWarning") +def test_sys_exit_0_in_setuppy(monkeypatch, tmp_path): + """Setuptools should be resilient to setup.py with ``sys.exit(0)`` (#3973).""" + monkeypatch.chdir(tmp_path) + setuppy = """ + import sys, setuptools + setuptools.setup(name='foo', version='0.0.0') + sys.exit(0) + """ + (tmp_path / "setup.py").write_text(DALS(setuppy), encoding="utf-8") + backend = BuildBackend(backend_name="setuptools.build_meta") + assert backend.get_requires_for_build_wheel() == [] + + +def test_system_exit_in_setuppy(monkeypatch, tmp_path): + monkeypatch.chdir(tmp_path) + setuppy = "import sys; sys.exit('some error')" + (tmp_path / "setup.py").write_text(setuppy, encoding="utf-8") + with pytest.raises(SystemExit, match="some error"): + backend = BuildBackend(backend_name="setuptools.build_meta") + backend.get_requires_for_build_wheel() diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/setuptools/tests/test_build_py.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/setuptools/tests/test_build_py.py new file mode 100644 index 0000000000000000000000000000000000000000..e64cfa2e4bee38371c0e9194c4dc41457d492f7b --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/setuptools/tests/test_build_py.py @@ -0,0 +1,480 @@ +import os +import shutil +import stat +import warnings +from pathlib import Path +from unittest.mock import Mock + +import jaraco.path +import pytest + +from setuptools import SetuptoolsDeprecationWarning +from setuptools.dist import Distribution + +from .textwrap import DALS + + +def test_directories_in_package_data_glob(tmpdir_cwd): + """ + Directories matching the glob in package_data should + not be included in the package data. + + Regression test for #261. + """ + dist = Distribution( + dict( + script_name='setup.py', + script_args=['build_py'], + packages=[''], + package_data={'': ['path/*']}, + ) + ) + os.makedirs('path/subpath') + dist.parse_command_line() + dist.run_commands() + + +def test_recursive_in_package_data_glob(tmpdir_cwd): + """ + Files matching recursive globs (**) in package_data should + be included in the package data. + + #1806 + """ + dist = Distribution( + dict( + script_name='setup.py', + script_args=['build_py'], + packages=[''], + package_data={'': ['path/**/data']}, + ) + ) + os.makedirs('path/subpath/subsubpath') + open('path/subpath/subsubpath/data', 'wb').close() + + dist.parse_command_line() + dist.run_commands() + + assert stat.S_ISREG(os.stat('build/lib/path/subpath/subsubpath/data').st_mode), ( + "File is not included" + ) + + +def test_read_only(tmpdir_cwd): + """ + Ensure read-only flag is not preserved in copy + for package modules and package data, as that + causes problems with deleting read-only files on + Windows. + + #1451 + """ + dist = Distribution( + dict( + script_name='setup.py', + script_args=['build_py'], + packages=['pkg'], + package_data={'pkg': ['data.dat']}, + ) + ) + os.makedirs('pkg') + open('pkg/__init__.py', 'wb').close() + open('pkg/data.dat', 'wb').close() + os.chmod('pkg/__init__.py', stat.S_IREAD) + os.chmod('pkg/data.dat', stat.S_IREAD) + dist.parse_command_line() + dist.run_commands() + shutil.rmtree('build') + + +@pytest.mark.xfail( + 'platform.system() == "Windows"', + reason="On Windows, files do not have executable bits", + raises=AssertionError, + strict=True, +) +def test_executable_data(tmpdir_cwd): + """ + Ensure executable bit is preserved in copy for + package data, as users rely on it for scripts. + + #2041 + """ + dist = Distribution( + dict( + script_name='setup.py', + script_args=['build_py'], + packages=['pkg'], + package_data={'pkg': ['run-me']}, + ) + ) + os.makedirs('pkg') + open('pkg/__init__.py', 'wb').close() + open('pkg/run-me', 'wb').close() + os.chmod('pkg/run-me', 0o700) + + dist.parse_command_line() + dist.run_commands() + + assert os.stat('build/lib/pkg/run-me').st_mode & stat.S_IEXEC, ( + "Script is not executable" + ) + + +EXAMPLE_WITH_MANIFEST = { + "setup.cfg": DALS( + """ + [metadata] + name = mypkg + version = 42 + + [options] + include_package_data = True + packages = find: + + [options.packages.find] + exclude = *.tests* + """ + ), + "mypkg": { + "__init__.py": "", + "resource_file.txt": "", + "tests": { + "__init__.py": "", + "test_mypkg.py": "", + "test_file.txt": "", + }, + }, + "MANIFEST.in": DALS( + """ + global-include *.py *.txt + global-exclude *.py[cod] + prune dist + prune build + prune *.egg-info + """ + ), +} + + +def test_excluded_subpackages(tmpdir_cwd): + jaraco.path.build(EXAMPLE_WITH_MANIFEST) + dist = Distribution({"script_name": "%PEP 517%"}) + dist.parse_config_files() + + build_py = dist.get_command_obj("build_py") + + msg = r"Python recognizes 'mypkg\.tests' as an importable package" + with pytest.warns(SetuptoolsDeprecationWarning, match=msg): + # TODO: To fix #3260 we need some transition period to deprecate the + # existing behavior of `include_package_data`. After the transition, we + # should remove the warning and fix the behaviour. + + if os.getenv("SETUPTOOLS_USE_DISTUTILS") == "stdlib": + # pytest.warns reset the warning filter temporarily + # https://github.com/pytest-dev/pytest/issues/4011#issuecomment-423494810 + warnings.filterwarnings( + "ignore", + "'encoding' argument not specified", + module="distutils.text_file", + # This warning is already fixed in pypa/distutils but not in stdlib + ) + + build_py.finalize_options() + build_py.run() + + build_dir = Path(dist.get_command_obj("build_py").build_lib) + assert (build_dir / "mypkg/__init__.py").exists() + assert (build_dir / "mypkg/resource_file.txt").exists() + + # Setuptools is configured to ignore `mypkg.tests`, therefore the following + # files/dirs should not be included in the distribution. + for f in [ + "mypkg/tests/__init__.py", + "mypkg/tests/test_mypkg.py", + "mypkg/tests/test_file.txt", + "mypkg/tests", + ]: + with pytest.raises(AssertionError): + # TODO: Enforce the following assertion once #3260 is fixed + # (remove context manager and the following xfail). + assert not (build_dir / f).exists() + + pytest.xfail("#3260") + + +@pytest.mark.filterwarnings("ignore::setuptools.SetuptoolsDeprecationWarning") +def test_existing_egg_info(tmpdir_cwd, monkeypatch): + """When provided with the ``existing_egg_info_dir`` attribute, build_py should not + attempt to run egg_info again. + """ + # == Pre-condition == + # Generate an egg-info dir + jaraco.path.build(EXAMPLE_WITH_MANIFEST) + dist = Distribution({"script_name": "%PEP 517%"}) + dist.parse_config_files() + assert dist.include_package_data + + egg_info = dist.get_command_obj("egg_info") + dist.run_command("egg_info") + egg_info_dir = next(Path(egg_info.egg_base).glob("*.egg-info")) + assert egg_info_dir.is_dir() + + # == Setup == + build_py = dist.get_command_obj("build_py") + build_py.finalize_options() + egg_info = dist.get_command_obj("egg_info") + egg_info_run = Mock(side_effect=egg_info.run) + monkeypatch.setattr(egg_info, "run", egg_info_run) + + # == Remove caches == + # egg_info is called when build_py looks for data_files, which gets cached. + # We need to ensure it is not cached yet, otherwise it may impact on the tests + build_py.__dict__.pop('data_files', None) + dist.reinitialize_command(egg_info) + + # == Sanity check == + # Ensure that if existing_egg_info is not given, build_py attempts to run egg_info + build_py.existing_egg_info_dir = None + build_py.run() + egg_info_run.assert_called() + + # == Remove caches == + egg_info_run.reset_mock() + build_py.__dict__.pop('data_files', None) + dist.reinitialize_command(egg_info) + + # == Actual test == + # Ensure that if existing_egg_info_dir is given, egg_info doesn't run + build_py.existing_egg_info_dir = egg_info_dir + build_py.run() + egg_info_run.assert_not_called() + assert build_py.data_files + + # Make sure the list of outputs is actually OK + outputs = map(lambda x: x.replace(os.sep, "/"), build_py.get_outputs()) + assert outputs + example = str(Path(build_py.build_lib, "mypkg/__init__.py")).replace(os.sep, "/") + assert example in outputs + + +EXAMPLE_ARBITRARY_MAPPING = { + "pyproject.toml": DALS( + """ + [project] + name = "mypkg" + version = "42" + + [tool.setuptools] + packages = ["mypkg", "mypkg.sub1", "mypkg.sub2", "mypkg.sub2.nested"] + + [tool.setuptools.package-dir] + "" = "src" + "mypkg.sub2" = "src/mypkg/_sub2" + "mypkg.sub2.nested" = "other" + """ + ), + "src": { + "mypkg": { + "__init__.py": "", + "resource_file.txt": "", + "sub1": { + "__init__.py": "", + "mod1.py": "", + }, + "_sub2": { + "mod2.py": "", + }, + }, + }, + "other": { + "__init__.py": "", + "mod3.py": "", + }, + "MANIFEST.in": DALS( + """ + global-include *.py *.txt + global-exclude *.py[cod] + """ + ), +} + + +def test_get_outputs(tmpdir_cwd): + jaraco.path.build(EXAMPLE_ARBITRARY_MAPPING) + dist = Distribution({"script_name": "%test%"}) + dist.parse_config_files() + + build_py = dist.get_command_obj("build_py") + build_py.editable_mode = True + build_py.ensure_finalized() + build_lib = build_py.build_lib.replace(os.sep, "/") + outputs = {x.replace(os.sep, "/") for x in build_py.get_outputs()} + assert outputs == { + f"{build_lib}/mypkg/__init__.py", + f"{build_lib}/mypkg/resource_file.txt", + f"{build_lib}/mypkg/sub1/__init__.py", + f"{build_lib}/mypkg/sub1/mod1.py", + f"{build_lib}/mypkg/sub2/mod2.py", + f"{build_lib}/mypkg/sub2/nested/__init__.py", + f"{build_lib}/mypkg/sub2/nested/mod3.py", + } + mapping = { + k.replace(os.sep, "/"): v.replace(os.sep, "/") + for k, v in build_py.get_output_mapping().items() + } + assert mapping == { + f"{build_lib}/mypkg/__init__.py": "src/mypkg/__init__.py", + f"{build_lib}/mypkg/resource_file.txt": "src/mypkg/resource_file.txt", + f"{build_lib}/mypkg/sub1/__init__.py": "src/mypkg/sub1/__init__.py", + f"{build_lib}/mypkg/sub1/mod1.py": "src/mypkg/sub1/mod1.py", + f"{build_lib}/mypkg/sub2/mod2.py": "src/mypkg/_sub2/mod2.py", + f"{build_lib}/mypkg/sub2/nested/__init__.py": "other/__init__.py", + f"{build_lib}/mypkg/sub2/nested/mod3.py": "other/mod3.py", + } + + +class TestTypeInfoFiles: + PYPROJECTS = { + "default_pyproject": DALS( + """ + [project] + name = "foo" + version = "1" + """ + ), + "dont_include_package_data": DALS( + """ + [project] + name = "foo" + version = "1" + + [tool.setuptools] + include-package-data = false + """ + ), + "exclude_type_info": DALS( + """ + [project] + name = "foo" + version = "1" + + [tool.setuptools] + include-package-data = false + + [tool.setuptools.exclude-package-data] + "*" = ["py.typed", "*.pyi"] + """ + ), + } + + EXAMPLES = { + "simple_namespace": { + "directory_structure": { + "foo": { + "bar.pyi": "", + "py.typed": "", + "__init__.py": "", + } + }, + "expected_type_files": {"foo/bar.pyi", "foo/py.typed"}, + }, + "nested_inside_namespace": { + "directory_structure": { + "foo": { + "bar": { + "py.typed": "", + "mod.pyi": "", + } + } + }, + "expected_type_files": {"foo/bar/mod.pyi", "foo/bar/py.typed"}, + }, + "namespace_nested_inside_regular": { + "directory_structure": { + "foo": { + "namespace": { + "foo.pyi": "", + }, + "__init__.pyi": "", + "py.typed": "", + } + }, + "expected_type_files": { + "foo/namespace/foo.pyi", + "foo/__init__.pyi", + "foo/py.typed", + }, + }, + } + + @pytest.mark.parametrize( + "pyproject", + [ + "default_pyproject", + pytest.param( + "dont_include_package_data", + marks=pytest.mark.xfail(reason="pypa/setuptools#4350"), + ), + ], + ) + @pytest.mark.parametrize("example", EXAMPLES.keys()) + def test_type_files_included_by_default(self, tmpdir_cwd, pyproject, example): + structure = { + **self.EXAMPLES[example]["directory_structure"], + "pyproject.toml": self.PYPROJECTS[pyproject], + } + expected_type_files = self.EXAMPLES[example]["expected_type_files"] + jaraco.path.build(structure) + + build_py = get_finalized_build_py() + outputs = get_outputs(build_py) + assert expected_type_files <= outputs + + @pytest.mark.parametrize("pyproject", ["exclude_type_info"]) + @pytest.mark.parametrize("example", EXAMPLES.keys()) + def test_type_files_can_be_excluded(self, tmpdir_cwd, pyproject, example): + structure = { + **self.EXAMPLES[example]["directory_structure"], + "pyproject.toml": self.PYPROJECTS[pyproject], + } + expected_type_files = self.EXAMPLES[example]["expected_type_files"] + jaraco.path.build(structure) + + build_py = get_finalized_build_py() + outputs = get_outputs(build_py) + assert expected_type_files.isdisjoint(outputs) + + def test_stub_only_package(self, tmpdir_cwd): + structure = { + "pyproject.toml": DALS( + """ + [project] + name = "foo-stubs" + version = "1" + """ + ), + "foo-stubs": {"__init__.pyi": "", "bar.pyi": ""}, + } + expected_type_files = {"foo-stubs/__init__.pyi", "foo-stubs/bar.pyi"} + jaraco.path.build(structure) + + build_py = get_finalized_build_py() + outputs = get_outputs(build_py) + assert expected_type_files <= outputs + + +def get_finalized_build_py(script_name="%build_py-test%"): + dist = Distribution({"script_name": script_name}) + dist.parse_config_files() + build_py = dist.get_command_obj("build_py") + build_py.finalize_options() + return build_py + + +def get_outputs(build_py): + build_dir = Path(build_py.build_lib) + return { + os.path.relpath(x, build_dir).replace(os.sep, "/") + for x in build_py.get_outputs() + } diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/setuptools/tests/test_config_discovery.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/setuptools/tests/test_config_discovery.py new file mode 100644 index 0000000000000000000000000000000000000000..b5df8203cdb6f9129a65d0c503c4f51e21315b1f --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/setuptools/tests/test_config_discovery.py @@ -0,0 +1,647 @@ +import os +import sys +from configparser import ConfigParser +from itertools import product +from typing import cast + +import jaraco.path +import pytest +from path import Path + +import setuptools # noqa: F401 # force distutils.core to be patched +from setuptools.command.sdist import sdist +from setuptools.discovery import find_package_path, find_parent_package +from setuptools.dist import Distribution +from setuptools.errors import PackageDiscoveryError + +from .contexts import quiet +from .integration.helpers import get_sdist_members, get_wheel_members, run +from .textwrap import DALS + +import distutils.core + + +class TestFindParentPackage: + def test_single_package(self, tmp_path): + # find_parent_package should find a non-namespace parent package + (tmp_path / "src/namespace/pkg/nested").mkdir(exist_ok=True, parents=True) + (tmp_path / "src/namespace/pkg/nested/__init__.py").touch() + (tmp_path / "src/namespace/pkg/__init__.py").touch() + packages = ["namespace", "namespace.pkg", "namespace.pkg.nested"] + assert find_parent_package(packages, {"": "src"}, tmp_path) == "namespace.pkg" + + def test_multiple_toplevel(self, tmp_path): + # find_parent_package should return null if the given list of packages does not + # have a single parent package + multiple = ["pkg", "pkg1", "pkg2"] + for name in multiple: + (tmp_path / f"src/{name}").mkdir(exist_ok=True, parents=True) + (tmp_path / f"src/{name}/__init__.py").touch() + assert find_parent_package(multiple, {"": "src"}, tmp_path) is None + + +class TestDiscoverPackagesAndPyModules: + """Make sure discovered values for ``packages`` and ``py_modules`` work + similarly to explicit configuration for the simple scenarios. + """ + + OPTIONS = { + # Different options according to the circumstance being tested + "explicit-src": {"package_dir": {"": "src"}, "packages": ["pkg"]}, + "variation-lib": { + "package_dir": {"": "lib"}, # variation of the source-layout + }, + "explicit-flat": {"packages": ["pkg"]}, + "explicit-single_module": {"py_modules": ["pkg"]}, + "explicit-namespace": {"packages": ["ns", "ns.pkg"]}, + "automatic-src": {}, + "automatic-flat": {}, + "automatic-single_module": {}, + "automatic-namespace": {}, + } + FILES = { + "src": ["src/pkg/__init__.py", "src/pkg/main.py"], + "lib": ["lib/pkg/__init__.py", "lib/pkg/main.py"], + "flat": ["pkg/__init__.py", "pkg/main.py"], + "single_module": ["pkg.py"], + "namespace": ["ns/pkg/__init__.py"], + } + + def _get_info(self, circumstance): + _, _, layout = circumstance.partition("-") + files = self.FILES[layout] + options = self.OPTIONS[circumstance] + return files, options + + @pytest.mark.parametrize("circumstance", OPTIONS.keys()) + def test_sdist_filelist(self, tmp_path, circumstance): + files, options = self._get_info(circumstance) + _populate_project_dir(tmp_path, files, options) + + _, cmd = _run_sdist_programatically(tmp_path, options) + + manifest = [f.replace(os.sep, "/") for f in cmd.filelist.files] + for file in files: + assert any(f.endswith(file) for f in manifest) + + @pytest.mark.parametrize("circumstance", OPTIONS.keys()) + def test_project(self, tmp_path, circumstance): + files, options = self._get_info(circumstance) + _populate_project_dir(tmp_path, files, options) + + # Simulate a pre-existing `build` directory + (tmp_path / "build").mkdir() + (tmp_path / "build/lib").mkdir() + (tmp_path / "build/bdist.linux-x86_64").mkdir() + (tmp_path / "build/bdist.linux-x86_64/file.py").touch() + (tmp_path / "build/lib/__init__.py").touch() + (tmp_path / "build/lib/file.py").touch() + (tmp_path / "dist").mkdir() + (tmp_path / "dist/file.py").touch() + + _run_build(tmp_path) + + sdist_files = get_sdist_members(next(tmp_path.glob("dist/*.tar.gz"))) + print("~~~~~ sdist_members ~~~~~") + print('\n'.join(sdist_files)) + assert sdist_files >= set(files) + + wheel_files = get_wheel_members(next(tmp_path.glob("dist/*.whl"))) + print("~~~~~ wheel_members ~~~~~") + print('\n'.join(wheel_files)) + orig_files = {f.replace("src/", "").replace("lib/", "") for f in files} + assert wheel_files >= orig_files + + # Make sure build files are not included by mistake + for file in wheel_files: + assert "build" not in files + assert "dist" not in files + + PURPOSEFULLY_EMPY = { + "setup.cfg": DALS( + """ + [metadata] + name = myproj + version = 0.0.0 + + [options] + {param} = + """ + ), + "setup.py": DALS( + """ + __import__('setuptools').setup( + name="myproj", + version="0.0.0", + {param}=[] + ) + """ + ), + "pyproject.toml": DALS( + """ + [build-system] + requires = [] + build-backend = 'setuptools.build_meta' + + [project] + name = "myproj" + version = "0.0.0" + + [tool.setuptools] + {param} = [] + """ + ), + "template-pyproject.toml": DALS( + """ + [build-system] + requires = [] + build-backend = 'setuptools.build_meta' + """ + ), + } + + @pytest.mark.parametrize( + ("config_file", "param", "circumstance"), + product( + ["setup.cfg", "setup.py", "pyproject.toml"], + ["packages", "py_modules"], + FILES.keys(), + ), + ) + def test_purposefully_empty(self, tmp_path, config_file, param, circumstance): + files = self.FILES[circumstance] + ["mod.py", "other.py", "src/pkg/__init__.py"] + _populate_project_dir(tmp_path, files, {}) + + if config_file == "pyproject.toml": + template_param = param.replace("_", "-") + else: + # Make sure build works with or without setup.cfg + pyproject = self.PURPOSEFULLY_EMPY["template-pyproject.toml"] + (tmp_path / "pyproject.toml").write_text(pyproject, encoding="utf-8") + template_param = param + + config = self.PURPOSEFULLY_EMPY[config_file].format(param=template_param) + (tmp_path / config_file).write_text(config, encoding="utf-8") + + dist = _get_dist(tmp_path, {}) + # When either parameter package or py_modules is an empty list, + # then there should be no discovery + assert getattr(dist, param) == [] + other = {"py_modules": "packages", "packages": "py_modules"}[param] + assert getattr(dist, other) is None + + @pytest.mark.parametrize( + ("extra_files", "pkgs"), + [ + (["venv/bin/simulate_venv"], {"pkg"}), + (["pkg-stubs/__init__.pyi"], {"pkg", "pkg-stubs"}), + (["other-stubs/__init__.pyi"], {"pkg", "other-stubs"}), + ( + # Type stubs can also be namespaced + ["namespace-stubs/pkg/__init__.pyi"], + {"pkg", "namespace-stubs", "namespace-stubs.pkg"}, + ), + ( + # Just the top-level package can have `-stubs`, ignore nested ones + ["namespace-stubs/pkg-stubs/__init__.pyi"], + {"pkg", "namespace-stubs"}, + ), + (["_hidden/file.py"], {"pkg"}), + (["news/finalize.py"], {"pkg"}), + ], + ) + def test_flat_layout_with_extra_files(self, tmp_path, extra_files, pkgs): + files = self.FILES["flat"] + extra_files + _populate_project_dir(tmp_path, files, {}) + dist = _get_dist(tmp_path, {}) + assert set(dist.packages) == pkgs + + @pytest.mark.parametrize( + "extra_files", + [ + ["other/__init__.py"], + ["other/finalize.py"], + ], + ) + def test_flat_layout_with_dangerous_extra_files(self, tmp_path, extra_files): + files = self.FILES["flat"] + extra_files + _populate_project_dir(tmp_path, files, {}) + with pytest.raises(PackageDiscoveryError, match="multiple (packages|modules)"): + _get_dist(tmp_path, {}) + + def test_flat_layout_with_single_module(self, tmp_path): + files = self.FILES["single_module"] + ["invalid-module-name.py"] + _populate_project_dir(tmp_path, files, {}) + dist = _get_dist(tmp_path, {}) + assert set(dist.py_modules) == {"pkg"} + + def test_flat_layout_with_multiple_modules(self, tmp_path): + files = self.FILES["single_module"] + ["valid_module_name.py"] + _populate_project_dir(tmp_path, files, {}) + with pytest.raises(PackageDiscoveryError, match="multiple (packages|modules)"): + _get_dist(tmp_path, {}) + + def test_py_modules_when_wheel_dir_is_cwd(self, tmp_path): + """Regression for issue 3692""" + from setuptools import build_meta + + pyproject = '[project]\nname = "test"\nversion = "1"' + (tmp_path / "pyproject.toml").write_text(DALS(pyproject), encoding="utf-8") + (tmp_path / "foo.py").touch() + with jaraco.path.DirectoryStack().context(tmp_path): + build_meta.build_wheel(".") + # Ensure py_modules are found + wheel_files = get_wheel_members(next(tmp_path.glob("*.whl"))) + assert "foo.py" in wheel_files + + +class TestNoConfig: + DEFAULT_VERSION = "0.0.0" # Default version given by setuptools + + EXAMPLES = { + "pkg1": ["src/pkg1.py"], + "pkg2": ["src/pkg2/__init__.py"], + "pkg3": ["src/pkg3/__init__.py", "src/pkg3-stubs/__init__.py"], + "pkg4": ["pkg4/__init__.py", "pkg4-stubs/__init__.py"], + "ns.nested.pkg1": ["src/ns/nested/pkg1/__init__.py"], + "ns.nested.pkg2": ["ns/nested/pkg2/__init__.py"], + } + + @pytest.mark.parametrize("example", EXAMPLES.keys()) + def test_discover_name(self, tmp_path, example): + _populate_project_dir(tmp_path, self.EXAMPLES[example], {}) + dist = _get_dist(tmp_path, {}) + assert dist.get_name() == example + + def test_build_with_discovered_name(self, tmp_path): + files = ["src/ns/nested/pkg/__init__.py"] + _populate_project_dir(tmp_path, files, {}) + _run_build(tmp_path, "--sdist") + # Expected distribution file + dist_file = tmp_path / f"dist/ns_nested_pkg-{self.DEFAULT_VERSION}.tar.gz" + assert dist_file.is_file() + + +class TestWithAttrDirective: + @pytest.mark.parametrize( + ("folder", "opts"), + [ + ("src", {}), + ("lib", {"packages": "find:", "packages.find": {"where": "lib"}}), + ], + ) + def test_setupcfg_metadata(self, tmp_path, folder, opts): + files = [f"{folder}/pkg/__init__.py", "setup.cfg"] + _populate_project_dir(tmp_path, files, opts) + + config = (tmp_path / "setup.cfg").read_text(encoding="utf-8") + overwrite = { + folder: {"pkg": {"__init__.py": "version = 42"}}, + "setup.cfg": "[metadata]\nversion = attr: pkg.version\n" + config, + } + jaraco.path.build(overwrite, prefix=tmp_path) + + dist = _get_dist(tmp_path, {}) + assert dist.get_name() == "pkg" + assert dist.get_version() == "42" + assert dist.package_dir + package_path = find_package_path("pkg", dist.package_dir, tmp_path) + assert os.path.exists(package_path) + assert folder in Path(package_path).parts() + + _run_build(tmp_path, "--sdist") + dist_file = tmp_path / "dist/pkg-42.tar.gz" + assert dist_file.is_file() + + def test_pyproject_metadata(self, tmp_path): + _populate_project_dir(tmp_path, ["src/pkg/__init__.py"], {}) + + overwrite = { + "src": {"pkg": {"__init__.py": "version = 42"}}, + "pyproject.toml": ( + "[project]\nname = 'pkg'\ndynamic = ['version']\n" + "[tool.setuptools.dynamic]\nversion = {attr = 'pkg.version'}\n" + ), + } + jaraco.path.build(overwrite, prefix=tmp_path) + + dist = _get_dist(tmp_path, {}) + assert dist.get_version() == "42" + assert dist.package_dir == {"": "src"} + + +class TestWithCExtension: + def _simulate_package_with_extension(self, tmp_path): + # This example is based on: https://github.com/nucleic/kiwi/tree/1.4.0 + files = [ + "benchmarks/file.py", + "docs/Makefile", + "docs/requirements.txt", + "docs/source/conf.py", + "proj/header.h", + "proj/file.py", + "py/proj.cpp", + "py/other.cpp", + "py/file.py", + "py/py.typed", + "py/tests/test_proj.py", + "README.rst", + ] + _populate_project_dir(tmp_path, files, {}) + + setup_script = """ + from setuptools import Extension, setup + + ext_modules = [ + Extension( + "proj", + ["py/proj.cpp", "py/other.cpp"], + include_dirs=["."], + language="c++", + ), + ] + setup(ext_modules=ext_modules) + """ + (tmp_path / "setup.py").write_text(DALS(setup_script), encoding="utf-8") + + def test_skip_discovery_with_setupcfg_metadata(self, tmp_path): + """Ensure that auto-discovery is not triggered when the project is based on + C-extensions only, for backward compatibility. + """ + self._simulate_package_with_extension(tmp_path) + + pyproject = """ + [build-system] + requires = [] + build-backend = 'setuptools.build_meta' + """ + (tmp_path / "pyproject.toml").write_text(DALS(pyproject), encoding="utf-8") + + setupcfg = """ + [metadata] + name = proj + version = 42 + """ + (tmp_path / "setup.cfg").write_text(DALS(setupcfg), encoding="utf-8") + + dist = _get_dist(tmp_path, {}) + assert dist.get_name() == "proj" + assert dist.get_version() == "42" + assert dist.py_modules is None + assert dist.packages is None + assert len(dist.ext_modules) == 1 + assert dist.ext_modules[0].name == "proj" + + def test_dont_skip_discovery_with_pyproject_metadata(self, tmp_path): + """When opting-in to pyproject.toml metadata, auto-discovery will be active if + the package lists C-extensions, but does not configure py-modules or packages. + + This way we ensure users with complex package layouts that would lead to the + discovery of multiple top-level modules/packages see errors and are forced to + explicitly set ``packages`` or ``py-modules``. + """ + self._simulate_package_with_extension(tmp_path) + + pyproject = """ + [project] + name = 'proj' + version = '42' + """ + (tmp_path / "pyproject.toml").write_text(DALS(pyproject), encoding="utf-8") + with pytest.raises(PackageDiscoveryError, match="multiple (packages|modules)"): + _get_dist(tmp_path, {}) + + +class TestWithPackageData: + def _simulate_package_with_data_files(self, tmp_path, src_root): + files = [ + f"{src_root}/proj/__init__.py", + f"{src_root}/proj/file1.txt", + f"{src_root}/proj/nested/file2.txt", + ] + _populate_project_dir(tmp_path, files, {}) + + manifest = """ + global-include *.py *.txt + """ + (tmp_path / "MANIFEST.in").write_text(DALS(manifest), encoding="utf-8") + + EXAMPLE_SETUPCFG = """ + [metadata] + name = proj + version = 42 + + [options] + include_package_data = True + """ + EXAMPLE_PYPROJECT = """ + [project] + name = "proj" + version = "42" + """ + + PYPROJECT_PACKAGE_DIR = """ + [tool.setuptools] + package-dir = {"" = "src"} + """ + + @pytest.mark.parametrize( + ("src_root", "files"), + [ + (".", {"setup.cfg": DALS(EXAMPLE_SETUPCFG)}), + (".", {"pyproject.toml": DALS(EXAMPLE_PYPROJECT)}), + ("src", {"setup.cfg": DALS(EXAMPLE_SETUPCFG)}), + ("src", {"pyproject.toml": DALS(EXAMPLE_PYPROJECT)}), + ( + "src", + { + "setup.cfg": DALS(EXAMPLE_SETUPCFG) + + DALS( + """ + packages = find: + package_dir = + =src + + [options.packages.find] + where = src + """ + ) + }, + ), + ( + "src", + { + "pyproject.toml": DALS(EXAMPLE_PYPROJECT) + + DALS( + """ + [tool.setuptools] + package-dir = {"" = "src"} + """ + ) + }, + ), + ], + ) + def test_include_package_data(self, tmp_path, src_root, files): + """ + Make sure auto-discovery does not affect package include_package_data. + See issue #3196. + """ + jaraco.path.build(files, prefix=str(tmp_path)) + self._simulate_package_with_data_files(tmp_path, src_root) + + expected = { + os.path.normpath(f"{src_root}/proj/file1.txt").replace(os.sep, "/"), + os.path.normpath(f"{src_root}/proj/nested/file2.txt").replace(os.sep, "/"), + } + + _run_build(tmp_path) + + sdist_files = get_sdist_members(next(tmp_path.glob("dist/*.tar.gz"))) + print("~~~~~ sdist_members ~~~~~") + print('\n'.join(sdist_files)) + assert sdist_files >= expected + + wheel_files = get_wheel_members(next(tmp_path.glob("dist/*.whl"))) + print("~~~~~ wheel_members ~~~~~") + print('\n'.join(wheel_files)) + orig_files = {f.replace("src/", "").replace("lib/", "") for f in expected} + assert wheel_files >= orig_files + + +def test_compatible_with_numpy_configuration(tmp_path): + files = [ + "dir1/__init__.py", + "dir2/__init__.py", + "file.py", + ] + _populate_project_dir(tmp_path, files, {}) + dist = Distribution({}) + dist.configuration = object() + dist.set_defaults() + assert dist.py_modules is None + assert dist.packages is None + + +def test_name_discovery_doesnt_break_cli(tmpdir_cwd): + jaraco.path.build({"pkg.py": ""}) + dist = Distribution({}) + dist.script_args = ["--name"] + dist.set_defaults() + dist.parse_command_line() # <-- no exception should be raised here. + assert dist.get_name() == "pkg" + + +def test_preserve_explicit_name_with_dynamic_version(tmpdir_cwd, monkeypatch): + """According to #3545 it seems that ``name`` discovery is running, + even when the project already explicitly sets it. + This seems to be related to parsing of dynamic versions (via ``attr`` directive), + which requires the auto-discovery of ``package_dir``. + """ + files = { + "src": { + "pkg": {"__init__.py": "__version__ = 42\n"}, + }, + "pyproject.toml": DALS( + """ + [project] + name = "myproj" # purposefully different from package name + dynamic = ["version"] + [tool.setuptools.dynamic] + version = {"attr" = "pkg.__version__"} + """ + ), + } + jaraco.path.build(files) + dist = Distribution({}) + orig_analyse_name = dist.set_defaults.analyse_name + + def spy_analyse_name(): + # We can check if name discovery was triggered by ensuring the original + # name remains instead of the package name. + orig_analyse_name() + assert dist.get_name() == "myproj" + + monkeypatch.setattr(dist.set_defaults, "analyse_name", spy_analyse_name) + dist.parse_config_files() + assert dist.get_version() == "42" + assert set(dist.packages) == {"pkg"} + + +def _populate_project_dir(root, files, options): + # NOTE: Currently pypa/build will refuse to build the project if no + # `pyproject.toml` or `setup.py` is found. So it is impossible to do + # completely "config-less" projects. + basic = { + "setup.py": "import setuptools\nsetuptools.setup()", + "README.md": "# Example Package", + "LICENSE": "Copyright (c) 2018", + } + jaraco.path.build(basic, prefix=root) + _write_setupcfg(root, options) + paths = (root / f for f in files) + for path in paths: + path.parent.mkdir(exist_ok=True, parents=True) + path.touch() + + +def _write_setupcfg(root, options): + if not options: + print("~~~~~ **NO** setup.cfg ~~~~~") + return + setupcfg = ConfigParser() + setupcfg.add_section("options") + for key, value in options.items(): + if key == "packages.find": + setupcfg.add_section(f"options.{key}") + setupcfg[f"options.{key}"].update(value) + elif isinstance(value, list): + setupcfg["options"][key] = ", ".join(value) + elif isinstance(value, dict): + str_value = "\n".join(f"\t{k} = {v}" for k, v in value.items()) + setupcfg["options"][key] = "\n" + str_value + else: + setupcfg["options"][key] = str(value) + with open(root / "setup.cfg", "w", encoding="utf-8") as f: + setupcfg.write(f) + print("~~~~~ setup.cfg ~~~~~") + print((root / "setup.cfg").read_text(encoding="utf-8")) + + +def _run_build(path, *flags): + cmd = [sys.executable, "-m", "build", "--no-isolation", *flags, str(path)] + return run(cmd, env={'DISTUTILS_DEBUG': ''}) + + +def _get_dist(dist_path, attrs): + root = "/".join(os.path.split(dist_path)) # POSIX-style + + script = dist_path / 'setup.py' + if script.exists(): + with Path(dist_path): + dist = cast( + Distribution, + distutils.core.run_setup("setup.py", {}, stop_after="init"), + ) + else: + dist = Distribution(attrs) + + dist.src_root = root + dist.script_name = "setup.py" + with Path(dist_path): + dist.parse_config_files() + + dist.set_defaults() + return dist + + +def _run_sdist_programatically(dist_path, attrs): + dist = _get_dist(dist_path, attrs) + cmd = sdist(dist) + cmd.ensure_finalized() + assert cmd.distribution.packages or cmd.distribution.py_modules + + with quiet(), Path(dist_path): + cmd.run() + + return dist, cmd diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/setuptools/tests/test_core_metadata.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/setuptools/tests/test_core_metadata.py new file mode 100644 index 0000000000000000000000000000000000000000..b1edb79b404dd85a38b074cf52b4299049f304c7 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/setuptools/tests/test_core_metadata.py @@ -0,0 +1,577 @@ +from __future__ import annotations + +import functools +import importlib +import io +from email import message_from_string +from email.generator import Generator +from email.message import EmailMessage, Message +from email.parser import Parser +from email.policy import EmailPolicy +from inspect import cleandoc +from pathlib import Path +from unittest.mock import Mock + +import pytest +from packaging.metadata import Metadata +from packaging.requirements import Requirement + +from setuptools import _reqs, sic +from setuptools._core_metadata import rfc822_escape, rfc822_unescape +from setuptools.command.egg_info import egg_info, write_requirements +from setuptools.config import expand, setupcfg +from setuptools.dist import Distribution + +from .config.downloads import retrieve_file, urls_from_file + +EXAMPLE_BASE_INFO = dict( + name="package", + version="0.0.1", + author="Foo Bar", + author_email="foo@bar.net", + long_description="Long\ndescription", + description="Short description", + keywords=["one", "two"], +) + + +@pytest.mark.parametrize( + ("content", "result"), + ( + pytest.param( + "Just a single line", + None, + id="single_line", + ), + pytest.param( + "Multiline\nText\nwithout\nextra indents\n", + None, + id="multiline", + ), + pytest.param( + "Multiline\n With\n\nadditional\n indentation", + None, + id="multiline_with_indentation", + ), + pytest.param( + " Leading whitespace", + "Leading whitespace", + id="remove_leading_whitespace", + ), + pytest.param( + " Leading whitespace\nIn\n Multiline comment", + "Leading whitespace\nIn\n Multiline comment", + id="remove_leading_whitespace_multiline", + ), + ), +) +def test_rfc822_unescape(content, result): + assert (result or content) == rfc822_unescape(rfc822_escape(content)) + + +def __read_test_cases(): + base = EXAMPLE_BASE_INFO + + params = functools.partial(dict, base) + + return [ + ('Metadata version 1.0', params()), + ( + 'Metadata Version 1.0: Short long description', + params( + long_description='Short long description', + ), + ), + ( + 'Metadata version 1.1: Classifiers', + params( + classifiers=[ + 'Programming Language :: Python :: 3', + 'Programming Language :: Python :: 3.7', + 'License :: OSI Approved :: MIT License', + ], + ), + ), + ( + 'Metadata version 1.1: Download URL', + params( + download_url='https://example.com', + ), + ), + ( + 'Metadata Version 1.2: Requires-Python', + params( + python_requires='>=3.7', + ), + ), + pytest.param( + 'Metadata Version 1.2: Project-Url', + params(project_urls=dict(Foo='https://example.bar')), + marks=pytest.mark.xfail( + reason="Issue #1578: project_urls not read", + ), + ), + ( + 'Metadata Version 2.1: Long Description Content Type', + params( + long_description_content_type='text/x-rst; charset=UTF-8', + ), + ), + ( + 'License', + params( + license='MIT', + ), + ), + ( + 'License multiline', + params( + license='This is a long license \nover multiple lines', + ), + ), + pytest.param( + 'Metadata Version 2.1: Provides Extra', + params(provides_extras=['foo', 'bar']), + marks=pytest.mark.xfail(reason="provides_extras not read"), + ), + ( + 'Missing author', + dict( + name='foo', + version='1.0.0', + author_email='snorri@sturluson.name', + ), + ), + ( + 'Missing author e-mail', + dict( + name='foo', + version='1.0.0', + author='Snorri Sturluson', + ), + ), + ( + 'Missing author and e-mail', + dict( + name='foo', + version='1.0.0', + ), + ), + ( + 'Bypass normalized version', + dict( + name='foo', + version=sic('1.0.0a'), + ), + ), + ] + + +@pytest.mark.parametrize(("name", "attrs"), __read_test_cases()) +def test_read_metadata(name, attrs): + dist = Distribution(attrs) + metadata_out = dist.metadata + dist_class = metadata_out.__class__ + + # Write to PKG_INFO and then load into a new metadata object + PKG_INFO = io.StringIO() + + metadata_out.write_pkg_file(PKG_INFO) + PKG_INFO.seek(0) + pkg_info = PKG_INFO.read() + assert _valid_metadata(pkg_info) + + PKG_INFO.seek(0) + metadata_in = dist_class() + metadata_in.read_pkg_file(PKG_INFO) + + tested_attrs = [ + ('name', dist_class.get_name), + ('version', dist_class.get_version), + ('author', dist_class.get_contact), + ('author_email', dist_class.get_contact_email), + ('metadata_version', dist_class.get_metadata_version), + ('provides', dist_class.get_provides), + ('description', dist_class.get_description), + ('long_description', dist_class.get_long_description), + ('download_url', dist_class.get_download_url), + ('keywords', dist_class.get_keywords), + ('platforms', dist_class.get_platforms), + ('obsoletes', dist_class.get_obsoletes), + ('requires', dist_class.get_requires), + ('classifiers', dist_class.get_classifiers), + ('project_urls', lambda s: getattr(s, 'project_urls', {})), + ('provides_extras', lambda s: getattr(s, 'provides_extras', {})), + ] + + for attr, getter in tested_attrs: + assert getter(metadata_in) == getter(metadata_out) + + +def __maintainer_test_cases(): + attrs = {"name": "package", "version": "1.0", "description": "xxx"} + + def merge_dicts(d1, d2): + d1 = d1.copy() + d1.update(d2) + + return d1 + + return [ + ('No author, no maintainer', attrs.copy()), + ( + 'Author (no e-mail), no maintainer', + merge_dicts(attrs, {'author': 'Author Name'}), + ), + ( + 'Author (e-mail), no maintainer', + merge_dicts( + attrs, {'author': 'Author Name', 'author_email': 'author@name.com'} + ), + ), + ( + 'No author, maintainer (no e-mail)', + merge_dicts(attrs, {'maintainer': 'Maintainer Name'}), + ), + ( + 'No author, maintainer (e-mail)', + merge_dicts( + attrs, + { + 'maintainer': 'Maintainer Name', + 'maintainer_email': 'maintainer@name.com', + }, + ), + ), + ( + 'Author (no e-mail), Maintainer (no-email)', + merge_dicts( + attrs, {'author': 'Author Name', 'maintainer': 'Maintainer Name'} + ), + ), + ( + 'Author (e-mail), Maintainer (e-mail)', + merge_dicts( + attrs, + { + 'author': 'Author Name', + 'author_email': 'author@name.com', + 'maintainer': 'Maintainer Name', + 'maintainer_email': 'maintainer@name.com', + }, + ), + ), + ( + 'No author (e-mail), no maintainer (e-mail)', + merge_dicts( + attrs, + { + 'author_email': 'author@name.com', + 'maintainer_email': 'maintainer@name.com', + }, + ), + ), + ('Author unicode', merge_dicts(attrs, {'author': '鉄沢寛'})), + ('Maintainer unicode', merge_dicts(attrs, {'maintainer': 'Jan Łukasiewicz'})), + ] + + +@pytest.mark.parametrize(("name", "attrs"), __maintainer_test_cases()) +def test_maintainer_author(name, attrs, tmpdir): + tested_keys = { + 'author': 'Author', + 'author_email': 'Author-email', + 'maintainer': 'Maintainer', + 'maintainer_email': 'Maintainer-email', + } + + # Generate a PKG-INFO file + dist = Distribution(attrs) + fn = tmpdir.mkdir('pkg_info') + fn_s = str(fn) + + dist.metadata.write_pkg_info(fn_s) + + with open(str(fn.join('PKG-INFO')), 'r', encoding='utf-8') as f: + pkg_info = f.read() + + assert _valid_metadata(pkg_info) + + # Drop blank lines and strip lines from default description + raw_pkg_lines = pkg_info.splitlines() + pkg_lines = list(filter(None, raw_pkg_lines[:-2])) + + pkg_lines_set = set(pkg_lines) + + # Duplicate lines should not be generated + assert len(pkg_lines) == len(pkg_lines_set) + + for fkey, dkey in tested_keys.items(): + val = attrs.get(dkey, None) + if val is None: + for line in pkg_lines: + assert not line.startswith(fkey + ':') + else: + line = f'{fkey}: {val}' + assert line in pkg_lines_set + + +class TestParityWithMetadataFromPyPaWheel: + def base_example(self): + attrs = dict( + **EXAMPLE_BASE_INFO, + # Example with complex requirement definition + python_requires=">=3.8", + install_requires=""" + packaging==23.2 + more-itertools==8.8.0; extra == "other" + jaraco.text==3.7.0 + importlib-resources==5.10.2; python_version<"3.8" + importlib-metadata==6.0.0 ; python_version<"3.8" + colorama>=0.4.4; sys_platform == "win32" + """, + extras_require={ + "testing": """ + pytest >= 6 + pytest-checkdocs >= 2.4 + tomli ; \\ + # Using stdlib when possible + python_version < "3.11" + ini2toml[lite]>=0.9 + """, + "other": [], + }, + ) + # Generate a PKG-INFO file using setuptools + return Distribution(attrs) + + def test_requires_dist(self, tmp_path): + dist = self.base_example() + pkg_info = _get_pkginfo(dist) + assert _valid_metadata(pkg_info) + + # Ensure Requires-Dist is present + expected = [ + 'Metadata-Version:', + 'Requires-Python: >=3.8', + 'Provides-Extra: other', + 'Provides-Extra: testing', + 'Requires-Dist: tomli; python_version < "3.11" and extra == "testing"', + 'Requires-Dist: more-itertools==8.8.0; extra == "other"', + 'Requires-Dist: ini2toml[lite]>=0.9; extra == "testing"', + ] + for line in expected: + assert line in pkg_info + + HERE = Path(__file__).parent + EXAMPLES_FILE = HERE / "config/setupcfg_examples.txt" + + @pytest.fixture(params=[None, *urls_from_file(EXAMPLES_FILE)]) + def dist(self, request, monkeypatch, tmp_path): + """Example of distribution with arbitrary configuration""" + monkeypatch.chdir(tmp_path) + monkeypatch.setattr(expand, "read_attr", Mock(return_value="0.42")) + monkeypatch.setattr(expand, "read_files", Mock(return_value="hello world")) + if request.param is None: + yield self.base_example() + else: + # Real-world usage + config = retrieve_file(request.param) + yield setupcfg.apply_configuration(Distribution({}), config) + + @pytest.mark.uses_network + def test_equivalent_output(self, tmp_path, dist): + """Ensure output from setuptools is equivalent to the one from `pypa/wheel`""" + # Generate a METADATA file using pypa/wheel for comparison + wheel_metadata = importlib.import_module("wheel.metadata") + pkginfo_to_metadata = getattr(wheel_metadata, "pkginfo_to_metadata", None) + + if pkginfo_to_metadata is None: # pragma: nocover + pytest.xfail( + "wheel.metadata.pkginfo_to_metadata is undefined, " + "(this is likely to be caused by API changes in pypa/wheel" + ) + + # Generate an simplified "egg-info" dir for pypa/wheel to convert + pkg_info = _get_pkginfo(dist) + egg_info_dir = tmp_path / "pkg.egg-info" + egg_info_dir.mkdir(parents=True) + (egg_info_dir / "PKG-INFO").write_text(pkg_info, encoding="utf-8") + write_requirements(egg_info(dist), egg_info_dir, egg_info_dir / "requires.txt") + + # Get pypa/wheel generated METADATA but normalize requirements formatting + metadata_msg = pkginfo_to_metadata(egg_info_dir, egg_info_dir / "PKG-INFO") + metadata_str = _normalize_metadata(metadata_msg) + pkg_info_msg = message_from_string(pkg_info) + pkg_info_str = _normalize_metadata(pkg_info_msg) + + # Compare setuptools PKG-INFO x pypa/wheel METADATA + assert metadata_str == pkg_info_str + + # Make sure it parses/serializes well in pypa/wheel + _assert_roundtrip_message(pkg_info) + + +class TestPEP643: + STATIC_CONFIG = { + "setup.cfg": cleandoc( + """ + [metadata] + name = package + version = 0.0.1 + author = Foo Bar + author_email = foo@bar.net + long_description = Long + description + description = Short description + keywords = one, two + platforms = abcd + [options] + install_requires = requests + """ + ), + "pyproject.toml": cleandoc( + """ + [project] + name = "package" + version = "0.0.1" + authors = [ + {name = "Foo Bar", email = "foo@bar.net"} + ] + description = "Short description" + readme = {text = "Long\\ndescription", content-type = "text/plain"} + keywords = ["one", "two"] + dependencies = ["requests"] + [tool.setuptools] + provides = ["abcd"] + obsoletes = ["abcd"] + """ + ), + } + + @pytest.mark.parametrize("file", STATIC_CONFIG.keys()) + def test_static_config_has_no_dynamic(self, file, tmpdir_cwd): + Path(file).write_text(self.STATIC_CONFIG[file], encoding="utf-8") + metadata = _get_metadata() + assert metadata.get_all("Dynamic") is None + assert metadata.get_all("dynamic") is None + + @pytest.mark.parametrize("file", STATIC_CONFIG.keys()) + @pytest.mark.parametrize( + "fields", + [ + # Single dynamic field + {"requires-python": ("python_requires", ">=3.12")}, + {"author-email": ("author_email", "snoopy@peanuts.com")}, + {"keywords": ("keywords", ["hello", "world"])}, + {"platform": ("platforms", ["abcd"])}, + # Multiple dynamic fields + { + "summary": ("description", "hello world"), + "description": ("long_description", "bla bla bla bla"), + "requires-dist": ("install_requires", ["hello-world"]), + }, + ], + ) + def test_modified_fields_marked_as_dynamic(self, file, fields, tmpdir_cwd): + # We start with a static config + Path(file).write_text(self.STATIC_CONFIG[file], encoding="utf-8") + dist = _makedist() + + # ... but then we simulate the effects of a plugin modifying the distribution + for attr, value in fields.values(): + # `dist` and `dist.metadata` are complicated... + # Some attributes work when set on `dist`, others on `dist.metadata`... + # Here we set in both just in case (this also avoids calling `_finalize_*`) + setattr(dist, attr, value) + setattr(dist.metadata, attr, value) + + # Then we should be able to list the modified fields as Dynamic + metadata = _get_metadata(dist) + assert set(metadata.get_all("Dynamic")) == set(fields) + + +def _makedist(**attrs): + dist = Distribution(attrs) + dist.parse_config_files() + return dist + + +def _assert_roundtrip_message(metadata: str) -> None: + """Emulate the way wheel.bdist_wheel parses and regenerates the message, + then ensures the metadata generated by setuptools is compatible. + """ + with io.StringIO(metadata) as buffer: + msg = Parser(EmailMessage).parse(buffer) + + serialization_policy = EmailPolicy( + utf8=True, + mangle_from_=False, + max_line_length=0, + ) + with io.BytesIO() as buffer: + out = io.TextIOWrapper(buffer, encoding="utf-8") + Generator(out, policy=serialization_policy).flatten(msg) + out.flush() + regenerated = buffer.getvalue() + + raw_metadata = bytes(metadata, "utf-8") + # Normalise newlines to avoid test errors on Windows: + raw_metadata = b"\n".join(raw_metadata.splitlines()) + regenerated = b"\n".join(regenerated.splitlines()) + assert regenerated == raw_metadata + + +def _normalize_metadata(msg: Message) -> str: + """Allow equivalent metadata to be compared directly""" + # The main challenge regards the requirements and extras. + # Both setuptools and wheel already apply some level of normalization + # but they differ regarding which character is chosen, according to the + # following spec it should be "-": + # https://packaging.python.org/en/latest/specifications/name-normalization/ + + # Related issues: + # https://github.com/pypa/packaging/issues/845 + # https://github.com/pypa/packaging/issues/644#issuecomment-2429813968 + + extras = {x.replace("_", "-"): x for x in msg.get_all("Provides-Extra", [])} + reqs = [ + _normalize_req(req, extras) + for req in _reqs.parse(msg.get_all("Requires-Dist", [])) + ] + del msg["Requires-Dist"] + del msg["Provides-Extra"] + + # Ensure consistent ord + for req in sorted(reqs): + msg["Requires-Dist"] = req + for extra in sorted(extras): + msg["Provides-Extra"] = extra + + # TODO: Handle lack of PEP 643 implementation in pypa/wheel? + del msg["Metadata-Version"] + + return msg.as_string() + + +def _normalize_req(req: Requirement, extras: dict[str, str]) -> str: + """Allow equivalent requirement objects to be compared directly""" + as_str = str(req).replace(req.name, req.name.replace("_", "-")) + for norm, orig in extras.items(): + as_str = as_str.replace(orig, norm) + return as_str + + +def _get_pkginfo(dist: Distribution): + with io.StringIO() as fp: + dist.metadata.write_pkg_file(fp) + return fp.getvalue() + + +def _get_metadata(dist: Distribution | None = None): + return message_from_string(_get_pkginfo(dist or _makedist())) + + +def _valid_metadata(text: str) -> bool: + metadata = Metadata.from_email(text, validate=True) # can raise exceptions + return metadata is not None diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/setuptools/tests/test_depends.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/setuptools/tests/test_depends.py new file mode 100644 index 0000000000000000000000000000000000000000..1714c041f7a23e1ecbfc3245bf964f75c13734ca --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/setuptools/tests/test_depends.py @@ -0,0 +1,15 @@ +import sys + +from setuptools import depends + + +class TestGetModuleConstant: + def test_basic(self): + """ + Invoke get_module_constant on a module in + the test package. + """ + mod_name = 'setuptools.tests.mod_with_constant' + val = depends.get_module_constant(mod_name, 'value') + assert val == 'three, sir!' + assert 'setuptools.tests.mod_with_constant' not in sys.modules diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/setuptools/tests/test_develop.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/setuptools/tests/test_develop.py new file mode 100644 index 0000000000000000000000000000000000000000..929fa9c285eb4d11e646dab2864be6d5fa023e2b --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/setuptools/tests/test_develop.py @@ -0,0 +1,175 @@ +"""develop tests""" + +import os +import pathlib +import platform +import subprocess +import sys + +import pytest + +from setuptools._path import paths_on_pythonpath +from setuptools.command.develop import develop +from setuptools.dist import Distribution + +from . import contexts, namespaces + +SETUP_PY = """\ +from setuptools import setup + +setup(name='foo', + packages=['foo'], +) +""" + +INIT_PY = """print "foo" +""" + + +@pytest.fixture +def temp_user(monkeypatch): + with contexts.tempdir() as user_base: + with contexts.tempdir() as user_site: + monkeypatch.setattr('site.USER_BASE', user_base) + monkeypatch.setattr('site.USER_SITE', user_site) + yield + + +@pytest.fixture +def test_env(tmpdir, temp_user): + target = tmpdir + foo = target.mkdir('foo') + setup = target / 'setup.py' + if setup.isfile(): + raise ValueError(dir(target)) + with setup.open('w') as f: + f.write(SETUP_PY) + init = foo / '__init__.py' + with init.open('w') as f: + f.write(INIT_PY) + with target.as_cwd(): + yield target + + +class TestDevelop: + in_virtualenv = hasattr(sys, 'real_prefix') + in_venv = hasattr(sys, 'base_prefix') and sys.base_prefix != sys.prefix + + def test_console_scripts(self, tmpdir): + """ + Test that console scripts are installed and that they reference + only the project by name and not the current version. + """ + pytest.skip( + "TODO: needs a fixture to cause 'develop' " + "to be invoked without mutating environment." + ) + settings = dict( + name='foo', + packages=['foo'], + version='0.0', + entry_points={ + 'console_scripts': [ + 'foocmd = foo:foo', + ], + }, + ) + dist = Distribution(settings) + dist.script_name = 'setup.py' + cmd = develop(dist) + cmd.ensure_finalized() + cmd.install_dir = tmpdir + cmd.run() + # assert '0.0' not in foocmd_text + + @pytest.mark.xfail(reason="legacy behavior retained for compatibility #4167") + def test_egg_link_filename(self): + settings = dict( + name='Foo $$$ Bar_baz-bing', + ) + dist = Distribution(settings) + cmd = develop(dist) + cmd.ensure_finalized() + link = pathlib.Path(cmd.egg_link) + assert link.suffix == '.egg-link' + assert link.stem == 'Foo_Bar_baz_bing' + + +class TestResolver: + """ + TODO: These tests were written with a minimal understanding + of what _resolve_setup_path is intending to do. Come up with + more meaningful cases that look like real-world scenarios. + """ + + def test_resolve_setup_path_cwd(self): + assert develop._resolve_setup_path('.', '.', '.') == '.' + + def test_resolve_setup_path_one_dir(self): + assert develop._resolve_setup_path('pkgs', '.', 'pkgs') == '../' + + def test_resolve_setup_path_one_dir_trailing_slash(self): + assert develop._resolve_setup_path('pkgs/', '.', 'pkgs') == '../' + + +class TestNamespaces: + @staticmethod + def install_develop(src_dir, target): + develop_cmd = [ + sys.executable, + 'setup.py', + 'develop', + '--install-dir', + str(target), + ] + with src_dir.as_cwd(): + with paths_on_pythonpath([str(target)]): + subprocess.check_call(develop_cmd) + + @pytest.mark.skipif( + bool(os.environ.get("APPVEYOR")), + reason="https://github.com/pypa/setuptools/issues/851", + ) + @pytest.mark.skipif( + platform.python_implementation() == 'PyPy', + reason="https://github.com/pypa/setuptools/issues/1202", + ) + def test_namespace_package_importable(self, tmpdir): + """ + Installing two packages sharing the same namespace, one installed + naturally using pip or `--single-version-externally-managed` + and the other installed using `develop` should leave the namespace + in tact and both packages reachable by import. + """ + pkg_A = namespaces.build_namespace_package(tmpdir, 'myns.pkgA') + pkg_B = namespaces.build_namespace_package(tmpdir, 'myns.pkgB') + target = tmpdir / 'packages' + # use pip to install to the target directory + install_cmd = [ + sys.executable, + '-m', + 'pip', + 'install', + str(pkg_A), + '-t', + str(target), + ] + subprocess.check_call(install_cmd) + self.install_develop(pkg_B, target) + namespaces.make_site_dir(target) + try_import = [ + sys.executable, + '-c', + 'import myns.pkgA; import myns.pkgB', + ] + with paths_on_pythonpath([str(target)]): + subprocess.check_call(try_import) + + # additionally ensure that pkg_resources import works + pkg_resources_imp = [ + sys.executable, + '-c', + 'import pkg_resources', + ] + with paths_on_pythonpath([str(target)]): + subprocess.check_call(pkg_resources_imp) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/setuptools/tests/test_dist.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/setuptools/tests/test_dist.py new file mode 100644 index 0000000000000000000000000000000000000000..e65ab310e7f2ad98b591affb352c855a2f86eda1 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/setuptools/tests/test_dist.py @@ -0,0 +1,278 @@ +import os +import re +import urllib.parse +import urllib.request + +import pytest + +from setuptools import Distribution +from setuptools.dist import check_package_data, check_specifier + +from .test_easy_install import make_trivial_sdist +from .test_find_packages import ensure_files +from .textwrap import DALS + +from distutils.errors import DistutilsSetupError + + +def test_dist_fetch_build_egg(tmpdir): + """ + Check multiple calls to `Distribution.fetch_build_egg` work as expected. + """ + index = tmpdir.mkdir('index') + index_url = urllib.parse.urljoin('file://', urllib.request.pathname2url(str(index))) + + def sdist_with_index(distname, version): + dist_dir = index.mkdir(distname) + dist_sdist = f'{distname}-{version}.tar.gz' + make_trivial_sdist(str(dist_dir.join(dist_sdist)), distname, version) + with dist_dir.join('index.html').open('w') as fp: + fp.write( + DALS( + """ + + {dist_sdist}
+ + """ + ).format(dist_sdist=dist_sdist) + ) + + sdist_with_index('barbazquux', '3.2.0') + sdist_with_index('barbazquux-runner', '2.11.1') + with tmpdir.join('setup.cfg').open('w') as fp: + fp.write( + DALS( + """ + [easy_install] + index_url = {index_url} + """ + ).format(index_url=index_url) + ) + reqs = """ + barbazquux-runner + barbazquux + """.split() + with tmpdir.as_cwd(): + dist = Distribution() + dist.parse_config_files() + resolved_dists = [dist.fetch_build_egg(r) for r in reqs] + assert [dist.key for dist in resolved_dists if dist] == reqs + + +EXAMPLE_BASE_INFO = dict( + name="package", + version="0.0.1", + author="Foo Bar", + author_email="foo@bar.net", + long_description="Long\ndescription", + description="Short description", + keywords=["one", "two"], +) + + +def test_provides_extras_deterministic_order(): + attrs = dict(extras_require=dict(a=['foo'], b=['bar'])) + dist = Distribution(attrs) + assert list(dist.metadata.provides_extras) == ['a', 'b'] + attrs['extras_require'] = dict(reversed(attrs['extras_require'].items())) + dist = Distribution(attrs) + assert list(dist.metadata.provides_extras) == ['b', 'a'] + + +CHECK_PACKAGE_DATA_TESTS = ( + # Valid. + ( + { + '': ['*.txt', '*.rst'], + 'hello': ['*.msg'], + }, + None, + ), + # Not a dictionary. + ( + ( + ('', ['*.txt', '*.rst']), + ('hello', ['*.msg']), + ), + ( + "'package_data' must be a dictionary mapping package" + " names to lists of string wildcard patterns" + ), + ), + # Invalid key type. + ( + { + 400: ['*.txt', '*.rst'], + }, + ("keys of 'package_data' dict must be strings (got 400)"), + ), + # Invalid value type. + ( + { + 'hello': '*.msg', + }, + ( + "\"values of 'package_data' dict\" must be of type " + " (got '*.msg')" + ), + ), + # Invalid value type (generators are single use) + ( + { + 'hello': (x for x in "generator"), + }, + ( + "\"values of 'package_data' dict\" must be of type " + " (got =3.0, !=3.1'} + dist = Distribution(attrs) + check_specifier(dist, attrs, attrs['python_requires']) + + attrs = {'name': 'foo', 'python_requires': ['>=3.0', '!=3.1']} + dist = Distribution(attrs) + check_specifier(dist, attrs, attrs['python_requires']) + + # invalid specifier value + attrs = {'name': 'foo', 'python_requires': '>=invalid-version'} + with pytest.raises(DistutilsSetupError): + dist = Distribution(attrs) + + +def test_metadata_name(): + with pytest.raises(DistutilsSetupError, match='missing.*name'): + Distribution()._validate_metadata() + + +@pytest.mark.parametrize( + ('dist_name', 'py_module'), + [ + ("my.pkg", "my_pkg"), + ("my-pkg", "my_pkg"), + ("my_pkg", "my_pkg"), + ("pkg", "pkg"), + ], +) +def test_dist_default_py_modules(tmp_path, dist_name, py_module): + (tmp_path / f"{py_module}.py").touch() + + (tmp_path / "setup.py").touch() + (tmp_path / "noxfile.py").touch() + # ^-- make sure common tool files are ignored + + attrs = {**EXAMPLE_BASE_INFO, "name": dist_name, "src_root": str(tmp_path)} + # Find `py_modules` corresponding to dist_name if not given + dist = Distribution(attrs) + dist.set_defaults() + assert dist.py_modules == [py_module] + # When `py_modules` is given, don't do anything + dist = Distribution({**attrs, "py_modules": ["explicity_py_module"]}) + dist.set_defaults() + assert dist.py_modules == ["explicity_py_module"] + # When `packages` is given, don't do anything + dist = Distribution({**attrs, "packages": ["explicity_package"]}) + dist.set_defaults() + assert not dist.py_modules + + +@pytest.mark.parametrize( + ('dist_name', 'package_dir', 'package_files', 'packages'), + [ + ("my.pkg", None, ["my_pkg/__init__.py", "my_pkg/mod.py"], ["my_pkg"]), + ("my-pkg", None, ["my_pkg/__init__.py", "my_pkg/mod.py"], ["my_pkg"]), + ("my_pkg", None, ["my_pkg/__init__.py", "my_pkg/mod.py"], ["my_pkg"]), + ("my.pkg", None, ["my/pkg/__init__.py"], ["my", "my.pkg"]), + ( + "my_pkg", + None, + ["src/my_pkg/__init__.py", "src/my_pkg2/__init__.py"], + ["my_pkg", "my_pkg2"], + ), + ( + "my_pkg", + {"pkg": "lib", "pkg2": "lib2"}, + ["lib/__init__.py", "lib/nested/__init__.pyt", "lib2/__init__.py"], + ["pkg", "pkg.nested", "pkg2"], + ), + ], +) +def test_dist_default_packages( + tmp_path, dist_name, package_dir, package_files, packages +): + ensure_files(tmp_path, package_files) + + (tmp_path / "setup.py").touch() + (tmp_path / "noxfile.py").touch() + # ^-- should not be included by default + + attrs = { + **EXAMPLE_BASE_INFO, + "name": dist_name, + "src_root": str(tmp_path), + "package_dir": package_dir, + } + # Find `packages` either corresponding to dist_name or inside src + dist = Distribution(attrs) + dist.set_defaults() + assert not dist.py_modules + assert not dist.py_modules + assert set(dist.packages) == set(packages) + # When `py_modules` is given, don't do anything + dist = Distribution({**attrs, "py_modules": ["explicit_py_module"]}) + dist.set_defaults() + assert not dist.packages + assert set(dist.py_modules) == {"explicit_py_module"} + # When `packages` is given, don't do anything + dist = Distribution({**attrs, "packages": ["explicit_package"]}) + dist.set_defaults() + assert not dist.py_modules + assert set(dist.packages) == {"explicit_package"} + + +@pytest.mark.parametrize( + ('dist_name', 'package_dir', 'package_files'), + [ + ("my.pkg.nested", None, ["my/pkg/nested/__init__.py"]), + ("my.pkg", None, ["my/pkg/__init__.py", "my/pkg/file.py"]), + ("my_pkg", None, ["my_pkg.py"]), + ("my_pkg", None, ["my_pkg/__init__.py", "my_pkg/nested/__init__.py"]), + ("my_pkg", None, ["src/my_pkg/__init__.py", "src/my_pkg/nested/__init__.py"]), + ( + "my_pkg", + {"my_pkg": "lib", "my_pkg.lib2": "lib2"}, + ["lib/__init__.py", "lib/nested/__init__.pyt", "lib2/__init__.py"], + ), + # Should not try to guess a name from multiple py_modules/packages + ("UNKNOWN", None, ["src/mod1.py", "src/mod2.py"]), + ("UNKNOWN", None, ["src/pkg1/__ini__.py", "src/pkg2/__init__.py"]), + ], +) +def test_dist_default_name(tmp_path, dist_name, package_dir, package_files): + """Make sure dist.name is discovered from packages/py_modules""" + ensure_files(tmp_path, package_files) + attrs = { + **EXAMPLE_BASE_INFO, + "src_root": "/".join(os.path.split(tmp_path)), # POSIX-style + "package_dir": package_dir, + } + del attrs["name"] + + dist = Distribution(attrs) + dist.set_defaults() + assert dist.py_modules or dist.packages + assert dist.get_name() == dist_name diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/setuptools/tests/test_dist_info.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/setuptools/tests/test_dist_info.py new file mode 100644 index 0000000000000000000000000000000000000000..426694e0198fdbe6ee7466822bc7c15634e2b648 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/setuptools/tests/test_dist_info.py @@ -0,0 +1,210 @@ +"""Test .dist-info style distributions.""" + +import pathlib +import re +import shutil +import subprocess +import sys +from functools import partial + +import pytest + +import pkg_resources +from setuptools.archive_util import unpack_archive + +from .textwrap import DALS + +read = partial(pathlib.Path.read_text, encoding="utf-8") + + +class TestDistInfo: + metadata_base = DALS( + """ + Metadata-Version: 1.2 + Requires-Dist: splort (==4) + Provides-Extra: baz + Requires-Dist: quux (>=1.1); extra == 'baz' + """ + ) + + @classmethod + def build_metadata(cls, **kwargs): + lines = ('{key}: {value}\n'.format(**locals()) for key, value in kwargs.items()) + return cls.metadata_base + ''.join(lines) + + @pytest.fixture + def metadata(self, tmpdir): + dist_info_name = 'VersionedDistribution-2.718.dist-info' + versioned = tmpdir / dist_info_name + versioned.mkdir() + filename = versioned / 'METADATA' + content = self.build_metadata( + Name='VersionedDistribution', + ) + filename.write_text(content, encoding='utf-8') + + dist_info_name = 'UnversionedDistribution.dist-info' + unversioned = tmpdir / dist_info_name + unversioned.mkdir() + filename = unversioned / 'METADATA' + content = self.build_metadata( + Name='UnversionedDistribution', + Version='0.3', + ) + filename.write_text(content, encoding='utf-8') + + return str(tmpdir) + + def test_distinfo(self, metadata): + dists = dict( + (d.project_name, d) for d in pkg_resources.find_distributions(metadata) + ) + + assert len(dists) == 2, dists + + unversioned = dists['UnversionedDistribution'] + versioned = dists['VersionedDistribution'] + + assert versioned.version == '2.718' # from filename + assert unversioned.version == '0.3' # from METADATA + + def test_conditional_dependencies(self, metadata): + specs = 'splort==4', 'quux>=1.1' + requires = list(map(pkg_resources.Requirement.parse, specs)) + + for d in pkg_resources.find_distributions(metadata): + assert d.requires() == requires[:1] + assert d.requires(extras=('baz',)) == [ + requires[0], + pkg_resources.Requirement.parse('quux>=1.1;extra=="baz"'), + ] + assert d.extras == ['baz'] + + def test_invalid_version(self, tmp_path): + """ + Supplying an invalid version crashes dist_info. + """ + config = "[metadata]\nname=proj\nversion=42\n[egg_info]\ntag_build=invalid!!!\n" + (tmp_path / "setup.cfg").write_text(config, encoding="utf-8") + msg = re.compile("invalid version", re.M | re.I) + proc = run_command_inner("dist_info", cwd=tmp_path, check=False) + assert proc.returncode + assert msg.search(proc.stdout) + assert not list(tmp_path.glob("*.dist-info")) + + def test_tag_arguments(self, tmp_path): + config = """ + [metadata] + name=proj + version=42 + [egg_info] + tag_date=1 + tag_build=.post + """ + (tmp_path / "setup.cfg").write_text(config, encoding="utf-8") + + print(run_command("dist_info", "--no-date", cwd=tmp_path)) + dist_info = next(tmp_path.glob("*.dist-info")) + assert dist_info.name.startswith("proj-42") + shutil.rmtree(dist_info) + + print(run_command("dist_info", "--tag-build", ".a", cwd=tmp_path)) + dist_info = next(tmp_path.glob("*.dist-info")) + assert dist_info.name.startswith("proj-42a") + + @pytest.mark.parametrize("keep_egg_info", (False, True)) + def test_output_dir(self, tmp_path, keep_egg_info): + config = "[metadata]\nname=proj\nversion=42\n" + (tmp_path / "setup.cfg").write_text(config, encoding="utf-8") + out = tmp_path / "__out" + out.mkdir() + opts = ["--keep-egg-info"] if keep_egg_info else [] + run_command("dist_info", "--output-dir", out, *opts, cwd=tmp_path) + assert len(list(out.glob("*.dist-info"))) == 1 + assert len(list(tmp_path.glob("*.dist-info"))) == 0 + expected_egg_info = int(keep_egg_info) + assert len(list(out.glob("*.egg-info"))) == expected_egg_info + assert len(list(tmp_path.glob("*.egg-info"))) == 0 + assert len(list(out.glob("*.__bkp__"))) == 0 + assert len(list(tmp_path.glob("*.__bkp__"))) == 0 + + +class TestWheelCompatibility: + """Make sure the .dist-info directory produced with the ``dist_info`` command + is the same as the one produced by ``bdist_wheel``. + """ + + SETUPCFG = DALS( + """ + [metadata] + name = {name} + version = {version} + + [options] + install_requires = + foo>=12; sys_platform != "linux" + + [options.extras_require] + test = pytest + + [options.entry_points] + console_scripts = + executable-name = my_package.module:function + discover = + myproj = my_package.other_module:function + """ + ) + + EGG_INFO_OPTS = [ + # Related: #3088 #2872 + ("", ""), + (".post", "[egg_info]\ntag_build = post\n"), + (".post", "[egg_info]\ntag_build = .post\n"), + (".post", "[egg_info]\ntag_build = post\ntag_date = 1\n"), + (".dev", "[egg_info]\ntag_build = .dev\n"), + (".dev", "[egg_info]\ntag_build = .dev\ntag_date = 1\n"), + ("a1", "[egg_info]\ntag_build = .a1\n"), + ("+local", "[egg_info]\ntag_build = +local\n"), + ] + + @pytest.mark.parametrize("name", "my-proj my_proj my.proj My.Proj".split()) + @pytest.mark.parametrize("version", ["0.42.13"]) + @pytest.mark.parametrize(("suffix", "cfg"), EGG_INFO_OPTS) + def test_dist_info_is_the_same_as_in_wheel( + self, name, version, tmp_path, suffix, cfg + ): + config = self.SETUPCFG.format(name=name, version=version) + cfg + + for i in "dir_wheel", "dir_dist": + (tmp_path / i).mkdir() + (tmp_path / i / "setup.cfg").write_text(config, encoding="utf-8") + + run_command("bdist_wheel", cwd=tmp_path / "dir_wheel") + wheel = next(tmp_path.glob("dir_wheel/dist/*.whl")) + unpack_archive(wheel, tmp_path / "unpack") + wheel_dist_info = next(tmp_path.glob("unpack/*.dist-info")) + + run_command("dist_info", cwd=tmp_path / "dir_dist") + dist_info = next(tmp_path.glob("dir_dist/*.dist-info")) + + assert dist_info.name == wheel_dist_info.name + assert dist_info.name.startswith(f"my_proj-{version}{suffix}") + for file in "METADATA", "entry_points.txt": + assert read(dist_info / file) == read(wheel_dist_info / file) + + +def run_command_inner(*cmd, **kwargs): + opts = { + "stderr": subprocess.STDOUT, + "stdout": subprocess.PIPE, + "text": True, + "encoding": "utf-8", + "check": True, + **kwargs, + } + cmd = [sys.executable, "-c", "__import__('setuptools').setup()", *map(str, cmd)] + return subprocess.run(cmd, **opts) + + +def run_command(*args, **kwargs): + return run_command_inner(*args, **kwargs).stdout diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/setuptools/tests/test_distutils_adoption.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/setuptools/tests/test_distutils_adoption.py new file mode 100644 index 0000000000000000000000000000000000000000..f99a58849950029f53322c19cde1c171fe26622c --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/setuptools/tests/test_distutils_adoption.py @@ -0,0 +1,198 @@ +import os +import platform +import sys +import textwrap + +import pytest + +IS_PYPY = '__pypy__' in sys.builtin_module_names + +_TEXT_KWARGS = {"text": True, "encoding": "utf-8"} # For subprocess.run + + +def win_sr(env): + """ + On Windows, SYSTEMROOT must be present to avoid + + > Fatal Python error: _Py_HashRandomization_Init: failed to + > get random numbers to initialize Python + """ + if env and platform.system() == 'Windows': + env['SYSTEMROOT'] = os.environ['SYSTEMROOT'] + return env + + +def find_distutils(venv, imports='distutils', env=None, **kwargs): + py_cmd = 'import {imports}; print(distutils.__file__)'.format(**locals()) + cmd = ['python', '-c', py_cmd] + return venv.run(cmd, env=win_sr(env), **_TEXT_KWARGS, **kwargs) + + +def count_meta_path(venv, env=None): + py_cmd = textwrap.dedent( + """ + import sys + is_distutils = lambda finder: finder.__class__.__name__ == "DistutilsMetaFinder" + print(len(list(filter(is_distutils, sys.meta_path)))) + """ + ) + cmd = ['python', '-c', py_cmd] + return int(venv.run(cmd, env=win_sr(env), **_TEXT_KWARGS)) + + +skip_without_stdlib_distutils = pytest.mark.skipif( + sys.version_info >= (3, 12), + reason='stdlib distutils is removed from Python 3.12+', +) + + +@skip_without_stdlib_distutils +def test_distutils_stdlib(venv): + """ + Ensure stdlib distutils is used when appropriate. + """ + env = dict(SETUPTOOLS_USE_DISTUTILS='stdlib') + assert venv.name not in find_distutils(venv, env=env).split(os.sep) + assert count_meta_path(venv, env=env) == 0 + + +def test_distutils_local_with_setuptools(venv): + """ + Ensure local distutils is used when appropriate. + """ + env = dict(SETUPTOOLS_USE_DISTUTILS='local') + loc = find_distutils(venv, imports='setuptools, distutils', env=env) + assert venv.name in loc.split(os.sep) + assert count_meta_path(venv, env=env) <= 1 + + +@pytest.mark.xfail('IS_PYPY', reason='pypy imports distutils on startup') +def test_distutils_local(venv): + """ + Even without importing, the setuptools-local copy of distutils is + preferred. + """ + env = dict(SETUPTOOLS_USE_DISTUTILS='local') + assert venv.name in find_distutils(venv, env=env).split(os.sep) + assert count_meta_path(venv, env=env) <= 1 + + +def test_pip_import(venv): + """ + Ensure pip can be imported. + Regression test for #3002. + """ + cmd = ['python', '-c', 'import pip'] + venv.run(cmd, **_TEXT_KWARGS) + + +def test_distutils_has_origin(): + """ + Distutils module spec should have an origin. #2990. + """ + assert __import__('distutils').__spec__.origin + + +ENSURE_IMPORTS_ARE_NOT_DUPLICATED = r""" +# Depending on the importlib machinery and _distutils_hack, some imports are +# duplicated resulting in different module objects being loaded, which prevents +# patches as shown in #3042. +# This script provides a way of verifying if this duplication is happening. + +from distutils import cmd +import distutils.command.sdist as sdist + +# import last to prevent caching +from distutils import {imported_module} + +for mod in (cmd, sdist): + assert mod.{imported_module} == {imported_module}, ( + f"\n{{mod.dir_util}}\n!=\n{{{imported_module}}}" + ) + +print("success") +""" + + +@pytest.mark.usefixtures("tmpdir_cwd") +@pytest.mark.parametrize( + ('distutils_version', 'imported_module'), + [ + pytest.param("stdlib", "dir_util", marks=skip_without_stdlib_distutils), + pytest.param("stdlib", "file_util", marks=skip_without_stdlib_distutils), + pytest.param("stdlib", "archive_util", marks=skip_without_stdlib_distutils), + ("local", "dir_util"), + ("local", "file_util"), + ("local", "archive_util"), + ], +) +def test_modules_are_not_duplicated_on_import(distutils_version, imported_module, venv): + env = dict(SETUPTOOLS_USE_DISTUTILS=distutils_version) + script = ENSURE_IMPORTS_ARE_NOT_DUPLICATED.format(imported_module=imported_module) + cmd = ['python', '-c', script] + output = venv.run(cmd, env=win_sr(env), **_TEXT_KWARGS).strip() + assert output == "success" + + +ENSURE_LOG_IMPORT_IS_NOT_DUPLICATED = r""" +import types +import distutils.dist as dist +from distutils import log +if isinstance(dist.log, types.ModuleType): + assert dist.log == log, f"\n{dist.log}\n!=\n{log}" +print("success") +""" + + +@pytest.mark.usefixtures("tmpdir_cwd") +@pytest.mark.parametrize( + "distutils_version", + [ + "local", + pytest.param("stdlib", marks=skip_without_stdlib_distutils), + ], +) +def test_log_module_is_not_duplicated_on_import(distutils_version, venv): + env = dict(SETUPTOOLS_USE_DISTUTILS=distutils_version) + cmd = ['python', '-c', ENSURE_LOG_IMPORT_IS_NOT_DUPLICATED] + output = venv.run(cmd, env=win_sr(env), **_TEXT_KWARGS).strip() + assert output == "success" + + +ENSURE_CONSISTENT_ERROR_FROM_MODIFIED_PY = r""" +from setuptools.modified import newer +from {imported_module}.errors import DistutilsError + +# Can't use pytest.raises in this context +try: + newer("", "") +except DistutilsError: + print("success") +else: + raise AssertionError("Expected to raise") +""" + + +@pytest.mark.usefixtures("tmpdir_cwd") +@pytest.mark.parametrize( + ('distutils_version', 'imported_module'), + [ + ("local", "distutils"), + # Unfortunately we still get ._distutils.errors.DistutilsError with SETUPTOOLS_USE_DISTUTILS=stdlib + # But that's a deprecated use-case we don't mind not fully supporting in newer code + pytest.param( + "stdlib", "setuptools._distutils", marks=skip_without_stdlib_distutils + ), + ], +) +def test_consistent_error_from_modified_py(distutils_version, imported_module, venv): + env = dict(SETUPTOOLS_USE_DISTUTILS=distutils_version) + cmd = [ + 'python', + '-c', + ENSURE_CONSISTENT_ERROR_FROM_MODIFIED_PY.format( + imported_module=imported_module + ), + ] + output = venv.run(cmd, env=win_sr(env), **_TEXT_KWARGS).strip() + assert output == "success" diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/setuptools/tests/test_easy_install.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/setuptools/tests/test_easy_install.py new file mode 100644 index 0000000000000000000000000000000000000000..b58b0b66663e4bfb6e26bd730e62bff8267eeff5 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/setuptools/tests/test_easy_install.py @@ -0,0 +1,1476 @@ +"""Easy install Tests""" + +import contextlib +import io +import itertools +import logging +import os +import pathlib +import re +import site +import subprocess +import sys +import tarfile +import tempfile +import time +import warnings +import zipfile +from pathlib import Path +from typing import NamedTuple +from unittest import mock + +import pytest +from jaraco import path + +import pkg_resources +import setuptools.command.easy_install as ei +from pkg_resources import Distribution as PRDistribution, normalize_path, working_set +from setuptools import sandbox +from setuptools._normalization import safer_name +from setuptools.command.easy_install import PthDistributions +from setuptools.dist import Distribution +from setuptools.sandbox import run_setup +from setuptools.tests import fail_on_ascii +from setuptools.tests.server import MockServer, path_to_url + +from . import contexts +from .textwrap import DALS + +import distutils.errors + + +@pytest.fixture(autouse=True) +def pip_disable_index(monkeypatch): + """ + Important: Disable the default index for pip to avoid + querying packages in the index and potentially resolving + and installing packages there. + """ + monkeypatch.setenv('PIP_NO_INDEX', 'true') + + +class FakeDist: + def get_entry_map(self, group): + if group != 'console_scripts': + return {} + return {'name': 'ep'} + + def as_requirement(self): + return 'spec' + + +SETUP_PY = DALS( + """ + from setuptools import setup + + setup() + """ +) + + +class TestEasyInstallTest: + def test_get_script_args(self): + header = ei.CommandSpec.best().from_environment().as_header() + dist = FakeDist() + args = next(ei.ScriptWriter.get_args(dist)) + _name, script = itertools.islice(args, 2) + assert script.startswith(header) + assert "'spec'" in script + assert "'console_scripts'" in script + assert "'name'" in script + assert re.search('^# EASY-INSTALL-ENTRY-SCRIPT', script, flags=re.MULTILINE) + + def test_no_find_links(self): + # new option '--no-find-links', that blocks find-links added at + # the project level + dist = Distribution() + cmd = ei.easy_install(dist) + cmd.check_pth_processing = lambda: True + cmd.no_find_links = True + cmd.find_links = ['link1', 'link2'] + cmd.install_dir = os.path.join(tempfile.mkdtemp(), 'ok') + cmd.args = ['ok'] + cmd.ensure_finalized() + assert cmd.package_index.scanned_urls == {} + + # let's try without it (default behavior) + cmd = ei.easy_install(dist) + cmd.check_pth_processing = lambda: True + cmd.find_links = ['link1', 'link2'] + cmd.install_dir = os.path.join(tempfile.mkdtemp(), 'ok') + cmd.args = ['ok'] + cmd.ensure_finalized() + keys = sorted(cmd.package_index.scanned_urls.keys()) + assert keys == ['link1', 'link2'] + + def test_write_exception(self): + """ + Test that `cant_write_to_target` is rendered as a DistutilsError. + """ + dist = Distribution() + cmd = ei.easy_install(dist) + cmd.install_dir = os.getcwd() + with pytest.raises(distutils.errors.DistutilsError): + cmd.cant_write_to_target() + + def test_all_site_dirs(self, monkeypatch): + """ + get_site_dirs should always return site dirs reported by + site.getsitepackages. + """ + path = normalize_path('/setuptools/test/site-packages') + + def mock_gsp(): + return [path] + + monkeypatch.setattr(site, 'getsitepackages', mock_gsp, raising=False) + assert path in ei.get_site_dirs() + + def test_all_site_dirs_works_without_getsitepackages(self, monkeypatch): + monkeypatch.delattr(site, 'getsitepackages', raising=False) + assert ei.get_site_dirs() + + @pytest.fixture + def sdist_unicode(self, tmpdir): + files = [ + ( + 'setup.py', + DALS( + """ + import setuptools + setuptools.setup( + name="setuptools-test-unicode", + version="1.0", + packages=["mypkg"], + include_package_data=True, + ) + """ + ), + ), + ( + 'mypkg/__init__.py', + "", + ), + ( + 'mypkg/☃.txt', + "", + ), + ] + sdist_name = 'setuptools-test-unicode-1.0.zip' + sdist = tmpdir / sdist_name + # can't use make_sdist, because the issue only occurs + # with zip sdists. + sdist_zip = zipfile.ZipFile(str(sdist), 'w') + for filename, content in files: + sdist_zip.writestr(filename, content) + sdist_zip.close() + return str(sdist) + + @fail_on_ascii + def test_unicode_filename_in_sdist(self, sdist_unicode, tmpdir, monkeypatch): + """ + The install command should execute correctly even if + the package has unicode filenames. + """ + dist = Distribution({'script_args': ['easy_install']}) + target = (tmpdir / 'target').ensure_dir() + cmd = ei.easy_install( + dist, + install_dir=str(target), + args=['x'], + ) + monkeypatch.setitem(os.environ, 'PYTHONPATH', str(target)) + cmd.ensure_finalized() + cmd.easy_install(sdist_unicode) + + @pytest.fixture + def sdist_unicode_in_script(self, tmpdir): + files = [ + ( + "setup.py", + DALS( + """ + import setuptools + setuptools.setup( + name="setuptools-test-unicode", + version="1.0", + packages=["mypkg"], + include_package_data=True, + scripts=['mypkg/unicode_in_script'], + ) + """ + ), + ), + ("mypkg/__init__.py", ""), + ( + "mypkg/unicode_in_script", + DALS( + """ + #!/bin/sh + # á + + non_python_fn() { + } + """ + ), + ), + ] + sdist_name = "setuptools-test-unicode-script-1.0.zip" + sdist = tmpdir / sdist_name + # can't use make_sdist, because the issue only occurs + # with zip sdists. + sdist_zip = zipfile.ZipFile(str(sdist), "w") + for filename, content in files: + sdist_zip.writestr(filename, content.encode('utf-8')) + sdist_zip.close() + return str(sdist) + + @fail_on_ascii + def test_unicode_content_in_sdist( + self, sdist_unicode_in_script, tmpdir, monkeypatch + ): + """ + The install command should execute correctly even if + the package has unicode in scripts. + """ + dist = Distribution({"script_args": ["easy_install"]}) + target = (tmpdir / "target").ensure_dir() + cmd = ei.easy_install(dist, install_dir=str(target), args=["x"]) + monkeypatch.setitem(os.environ, "PYTHONPATH", str(target)) + cmd.ensure_finalized() + cmd.easy_install(sdist_unicode_in_script) + + @pytest.fixture + def sdist_script(self, tmpdir): + files = [ + ( + 'setup.py', + DALS( + """ + import setuptools + setuptools.setup( + name="setuptools-test-script", + version="1.0", + scripts=["mypkg_script"], + ) + """ + ), + ), + ( + 'mypkg_script', + DALS( + """ + #/usr/bin/python + print('mypkg_script') + """ + ), + ), + ] + sdist_name = 'setuptools-test-script-1.0.zip' + sdist = str(tmpdir / sdist_name) + make_sdist(sdist, files) + return sdist + + @pytest.mark.skipif( + not sys.platform.startswith('linux'), reason="Test can only be run on Linux" + ) + def test_script_install(self, sdist_script, tmpdir, monkeypatch): + """ + Check scripts are installed. + """ + dist = Distribution({'script_args': ['easy_install']}) + target = (tmpdir / 'target').ensure_dir() + cmd = ei.easy_install( + dist, + install_dir=str(target), + args=['x'], + ) + monkeypatch.setitem(os.environ, 'PYTHONPATH', str(target)) + cmd.ensure_finalized() + cmd.easy_install(sdist_script) + assert (target / 'mypkg_script').exists() + + +@pytest.mark.filterwarnings('ignore:Unbuilt egg') +class TestPTHFileWriter: + def test_add_from_cwd_site_sets_dirty(self): + """a pth file manager should set dirty + if a distribution is in site but also the cwd + """ + pth = PthDistributions('does-not_exist', [os.getcwd()]) + assert not pth.dirty + pth.add(PRDistribution(os.getcwd())) + assert pth.dirty + + def test_add_from_site_is_ignored(self): + location = '/test/location/does-not-have-to-exist' + # PthDistributions expects all locations to be normalized + location = pkg_resources.normalize_path(location) + pth = PthDistributions( + 'does-not_exist', + [ + location, + ], + ) + assert not pth.dirty + pth.add(PRDistribution(location)) + assert not pth.dirty + + def test_many_pth_distributions_merge_together(self, tmpdir): + """ + If the pth file is modified under the hood, then PthDistribution + will refresh its content before saving, merging contents when + necessary. + """ + # putting the pth file in a dedicated sub-folder, + pth_subdir = tmpdir.join("pth_subdir") + pth_subdir.mkdir() + pth_path = str(pth_subdir.join("file1.pth")) + pth1 = PthDistributions(pth_path) + pth2 = PthDistributions(pth_path) + assert pth1.paths == pth2.paths == [], ( + "unless there would be some default added at some point" + ) + # and so putting the src_subdir in folder distinct than the pth one, + # so to keep it absolute by PthDistributions + new_src_path = tmpdir.join("src_subdir") + new_src_path.mkdir() # must exist to be accounted + new_src_path_str = str(new_src_path) + pth1.paths.append(new_src_path_str) + pth1.save() + assert pth1.paths, ( + "the new_src_path added must still be present/valid in pth1 after save" + ) + # now, + assert new_src_path_str not in pth2.paths, ( + "right before we save the entry should still not be present" + ) + pth2.save() + assert new_src_path_str in pth2.paths, ( + "the new_src_path entry should have been added by pth2 with its save() call" + ) + assert pth2.paths[-1] == new_src_path, ( + "and it should match exactly on the last entry actually " + "given we append to it in save()" + ) + # finally, + assert PthDistributions(pth_path).paths == pth2.paths, ( + "and we should have the exact same list at the end " + "with a fresh PthDistributions instance" + ) + + +@pytest.fixture +def setup_context(tmpdir): + with (tmpdir / 'setup.py').open('w', encoding="utf-8") as f: + f.write(SETUP_PY) + with tmpdir.as_cwd(): + yield tmpdir + + +@pytest.mark.usefixtures("user_override") +@pytest.mark.usefixtures("setup_context") +class TestUserInstallTest: + # prevent check that site-packages is writable. easy_install + # shouldn't be writing to system site-packages during finalize + # options, but while it does, bypass the behavior. + prev_sp_write = mock.patch( + 'setuptools.command.easy_install.easy_install.check_site_dir', + mock.Mock(), + ) + + # simulate setuptools installed in user site packages + @mock.patch('setuptools.command.easy_install.__file__', site.USER_SITE) + @mock.patch('site.ENABLE_USER_SITE', True) + @prev_sp_write + def test_user_install_not_implied_user_site_enabled(self): + self.assert_not_user_site() + + @mock.patch('site.ENABLE_USER_SITE', False) + @prev_sp_write + def test_user_install_not_implied_user_site_disabled(self): + self.assert_not_user_site() + + @staticmethod + def assert_not_user_site(): + # create a finalized easy_install command + dist = Distribution() + dist.script_name = 'setup.py' + cmd = ei.easy_install(dist) + cmd.args = ['py'] + cmd.ensure_finalized() + assert not cmd.user, 'user should not be implied' + + def test_multiproc_atexit(self): + pytest.importorskip('multiprocessing') + + log = logging.getLogger('test_easy_install') + logging.basicConfig(level=logging.INFO, stream=sys.stderr) + log.info('this should not break') + + @pytest.fixture + def foo_package(self, tmpdir): + egg_file = tmpdir / 'foo-1.0.egg-info' + with egg_file.open('w') as f: + f.write('Name: foo\n') + return str(tmpdir) + + @pytest.fixture + def install_target(self, tmpdir): + target = str(tmpdir) + with mock.patch('sys.path', sys.path + [target]): + python_path = os.path.pathsep.join(sys.path) + with mock.patch.dict(os.environ, PYTHONPATH=python_path): + yield target + + def test_local_index(self, foo_package, install_target): + """ + The local index must be used when easy_install locates installed + packages. + """ + dist = Distribution() + dist.script_name = 'setup.py' + cmd = ei.easy_install(dist) + cmd.install_dir = install_target + cmd.args = ['foo'] + cmd.ensure_finalized() + cmd.local_index.scan([foo_package]) + res = cmd.easy_install('foo') + actual = os.path.normcase(os.path.realpath(res.location)) + expected = os.path.normcase(os.path.realpath(foo_package)) + assert actual == expected + + @contextlib.contextmanager + def user_install_setup_context(self, *args, **kwargs): + """ + Wrap sandbox.setup_context to patch easy_install in that context to + appear as user-installed. + """ + with self.orig_context(*args, **kwargs): + import setuptools.command.easy_install as ei + + ei.__file__ = site.USER_SITE + yield + + def patched_setup_context(self): + self.orig_context = sandbox.setup_context + + return mock.patch( + 'setuptools.sandbox.setup_context', + self.user_install_setup_context, + ) + + +@pytest.fixture +def distutils_package(): + distutils_setup_py = SETUP_PY.replace( + 'from setuptools import setup', + 'from distutils.core import setup', + ) + with contexts.tempdir(cd=os.chdir): + with open('setup.py', 'w', encoding="utf-8") as f: + f.write(distutils_setup_py) + yield + + +@pytest.mark.usefixtures("distutils_package") +class TestDistutilsPackage: + def test_bdist_egg_available_on_distutils_pkg(self): + run_setup('setup.py', ['bdist_egg']) + + +@pytest.fixture +def mock_index(): + # set up a server which will simulate an alternate package index. + p_index = MockServer() + if p_index.server_port == 0: + # Some platforms (Jython) don't find a port to which to bind, + # so skip test for them. + pytest.skip("could not find a valid port") + p_index.start() + return p_index + + +class TestInstallRequires: + def test_setup_install_includes_dependencies(self, tmp_path, mock_index): + """ + When ``python setup.py install`` is called directly, it will use easy_install + to fetch dependencies. + """ + # TODO: Remove these tests once `setup.py install` is completely removed + project_root = tmp_path / "project" + project_root.mkdir(exist_ok=True) + install_root = tmp_path / "install" + install_root.mkdir(exist_ok=True) + + self.create_project(project_root) + cmd = [ + sys.executable, + '-c', + '__import__("setuptools").setup()', + 'install', + '--install-base', + str(install_root), + '--install-lib', + str(install_root), + '--install-headers', + str(install_root), + '--install-scripts', + str(install_root), + '--install-data', + str(install_root), + '--install-purelib', + str(install_root), + '--install-platlib', + str(install_root), + ] + env = {**os.environ, "__EASYINSTALL_INDEX": mock_index.url} + cp = subprocess.run( + cmd, + cwd=str(project_root), + env=env, + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT, + text=True, + encoding="utf-8", + ) + assert cp.returncode != 0 + try: + assert '/does-not-exist/' in {r.path for r in mock_index.requests} + assert next( + line + for line in cp.stdout.splitlines() + if "not find suitable distribution for" in line + and "does-not-exist" in line + ) + except Exception: + if "failed to get random numbers" in cp.stdout: + pytest.xfail(f"{sys.platform} failure - {cp.stdout}") + raise + + def create_project(self, root): + config = """ + [metadata] + name = project + version = 42 + + [options] + install_requires = does-not-exist + py_modules = mod + """ + (root / 'setup.cfg').write_text(DALS(config), encoding="utf-8") + (root / 'mod.py').touch() + + +class TestSetupRequires: + def test_setup_requires_honors_fetch_params(self, mock_index, monkeypatch): + """ + When easy_install installs a source distribution which specifies + setup_requires, it should honor the fetch parameters (such as + index-url, and find-links). + """ + monkeypatch.setenv('PIP_RETRIES', '0') + monkeypatch.setenv('PIP_TIMEOUT', '0') + monkeypatch.setenv('PIP_NO_INDEX', 'false') + with contexts.quiet(): + # create an sdist that has a build-time dependency. + with TestSetupRequires.create_sdist() as dist_file: + with contexts.tempdir() as temp_install_dir: + with contexts.environment(PYTHONPATH=temp_install_dir): + cmd = [ + sys.executable, + '-c', + '__import__("setuptools").setup()', + 'easy_install', + '--index-url', + mock_index.url, + '--exclude-scripts', + '--install-dir', + temp_install_dir, + dist_file, + ] + subprocess.Popen(cmd).wait() + # there should have been one requests to the server + assert [r.path for r in mock_index.requests] == ['/does-not-exist/'] + + @staticmethod + @contextlib.contextmanager + def create_sdist(): + """ + Return an sdist with a setup_requires dependency (of something that + doesn't exist) + """ + with contexts.tempdir() as dir: + dist_path = os.path.join(dir, 'setuptools-test-fetcher-1.0.tar.gz') + make_sdist( + dist_path, + [ + ( + 'setup.py', + DALS( + """ + import setuptools + setuptools.setup( + name="setuptools-test-fetcher", + version="1.0", + setup_requires = ['does-not-exist'], + ) + """ + ), + ), + ('setup.cfg', ''), + ], + ) + yield dist_path + + use_setup_cfg = ( + (), + ('dependency_links',), + ('setup_requires',), + ('dependency_links', 'setup_requires'), + ) + + @pytest.mark.parametrize('use_setup_cfg', use_setup_cfg) + def test_setup_requires_overrides_version_conflict(self, use_setup_cfg): + """ + Regression test for distribution issue 323: + https://bitbucket.org/tarek/distribute/issues/323 + + Ensures that a distribution's setup_requires requirements can still be + installed and used locally even if a conflicting version of that + requirement is already on the path. + """ + + fake_dist = PRDistribution( + 'does-not-matter', project_name='foobar', version='0.0' + ) + working_set.add(fake_dist) + + with contexts.save_pkg_resources_state(): + with contexts.tempdir() as temp_dir: + test_pkg = create_setup_requires_package( + temp_dir, use_setup_cfg=use_setup_cfg + ) + test_setup_py = os.path.join(test_pkg, 'setup.py') + with contexts.quiet() as (stdout, _stderr): + # Don't even need to install the package, just + # running the setup.py at all is sufficient + run_setup(test_setup_py, ['--name']) + + lines = stdout.readlines() + assert len(lines) > 0 + assert lines[-1].strip() == 'test_pkg' + + @pytest.mark.parametrize('use_setup_cfg', use_setup_cfg) + def test_setup_requires_override_nspkg(self, use_setup_cfg): + """ + Like ``test_setup_requires_overrides_version_conflict`` but where the + ``setup_requires`` package is part of a namespace package that has + *already* been imported. + """ + + with contexts.save_pkg_resources_state(): + with contexts.tempdir() as temp_dir: + foobar_1_archive = os.path.join(temp_dir, 'foo_bar-0.1.tar.gz') + make_nspkg_sdist(foobar_1_archive, 'foo.bar', '0.1') + # Now actually go ahead an extract to the temp dir and add the + # extracted path to sys.path so foo.bar v0.1 is importable + foobar_1_dir = os.path.join(temp_dir, 'foo_bar-0.1') + os.mkdir(foobar_1_dir) + with tarfile.open(foobar_1_archive) as tf: + tf.extraction_filter = lambda member, path: member + tf.extractall(foobar_1_dir) + sys.path.insert(1, foobar_1_dir) + + dist = PRDistribution( + foobar_1_dir, project_name='foo.bar', version='0.1' + ) + working_set.add(dist) + + template = DALS( + """\ + import foo # Even with foo imported first the + # setup_requires package should override + import setuptools + setuptools.setup(**%r) + + if not (hasattr(foo, '__path__') and + len(foo.__path__) == 2): + print('FAIL') + + if 'foo_bar-0.2' not in foo.__path__[0]: + print('FAIL') + """ + ) + + test_pkg = create_setup_requires_package( + temp_dir, + 'foo.bar', + '0.2', + make_nspkg_sdist, + template, + use_setup_cfg=use_setup_cfg, + ) + + test_setup_py = os.path.join(test_pkg, 'setup.py') + + with contexts.quiet() as (stdout, _stderr): + try: + # Don't even need to install the package, just + # running the setup.py at all is sufficient + run_setup(test_setup_py, ['--name']) + except pkg_resources.VersionConflict: # pragma: nocover + pytest.fail( + 'Installing setup.py requirements caused a VersionConflict' + ) + + assert 'FAIL' not in stdout.getvalue() + lines = stdout.readlines() + assert len(lines) > 0 + assert lines[-1].strip() == 'test_pkg' + + @pytest.mark.parametrize('use_setup_cfg', use_setup_cfg) + def test_setup_requires_with_attr_version(self, use_setup_cfg): + def make_dependency_sdist(dist_path, distname, version): + files = [ + ( + 'setup.py', + DALS( + f""" + import setuptools + setuptools.setup( + name={distname!r}, + version={version!r}, + py_modules=[{distname!r}], + ) + """ + ), + ), + ( + distname + '.py', + DALS( + """ + version = 42 + """ + ), + ), + ] + make_sdist(dist_path, files) + + with contexts.save_pkg_resources_state(): + with contexts.tempdir() as temp_dir: + test_pkg = create_setup_requires_package( + temp_dir, + setup_attrs=dict(version='attr: foobar.version'), + make_package=make_dependency_sdist, + use_setup_cfg=use_setup_cfg + ('version',), + ) + test_setup_py = os.path.join(test_pkg, 'setup.py') + with contexts.quiet() as (stdout, _stderr): + run_setup(test_setup_py, ['--version']) + lines = stdout.readlines() + assert len(lines) > 0 + assert lines[-1].strip() == '42' + + def test_setup_requires_honors_pip_env(self, mock_index, monkeypatch): + monkeypatch.setenv('PIP_RETRIES', '0') + monkeypatch.setenv('PIP_TIMEOUT', '0') + monkeypatch.setenv('PIP_NO_INDEX', 'false') + monkeypatch.setenv('PIP_INDEX_URL', mock_index.url) + with contexts.save_pkg_resources_state(): + with contexts.tempdir() as temp_dir: + test_pkg = create_setup_requires_package( + temp_dir, + 'python-xlib', + '0.19', + setup_attrs=dict(dependency_links=[]), + ) + test_setup_cfg = os.path.join(test_pkg, 'setup.cfg') + with open(test_setup_cfg, 'w', encoding="utf-8") as fp: + fp.write( + DALS( + """ + [easy_install] + index_url = https://pypi.org/legacy/ + """ + ) + ) + test_setup_py = os.path.join(test_pkg, 'setup.py') + with pytest.raises(distutils.errors.DistutilsError): + run_setup(test_setup_py, ['--version']) + assert len(mock_index.requests) == 1 + assert mock_index.requests[0].path == '/python-xlib/' + + def test_setup_requires_with_pep508_url(self, mock_index, monkeypatch): + monkeypatch.setenv('PIP_RETRIES', '0') + monkeypatch.setenv('PIP_TIMEOUT', '0') + monkeypatch.setenv('PIP_INDEX_URL', mock_index.url) + with contexts.save_pkg_resources_state(): + with contexts.tempdir() as temp_dir: + dep_sdist = os.path.join(temp_dir, 'dep.tar.gz') + make_trivial_sdist(dep_sdist, 'dependency', '42') + dep_url = path_to_url(dep_sdist, authority='localhost') + test_pkg = create_setup_requires_package( + temp_dir, + # Ignored (overridden by setup_attrs) + 'python-xlib', + '0.19', + setup_attrs=dict(setup_requires=f'dependency @ {dep_url}'), + ) + test_setup_py = os.path.join(test_pkg, 'setup.py') + run_setup(test_setup_py, ['--version']) + assert len(mock_index.requests) == 0 + + def test_setup_requires_with_allow_hosts(self, mock_index): + """The `allow-hosts` option in not supported anymore.""" + files = { + 'test_pkg': { + 'setup.py': DALS( + """ + from setuptools import setup + setup(setup_requires='python-xlib') + """ + ), + 'setup.cfg': DALS( + """ + [easy_install] + allow_hosts = * + """ + ), + } + } + with contexts.save_pkg_resources_state(): + with contexts.tempdir() as temp_dir: + path.build(files, prefix=temp_dir) + setup_py = str(pathlib.Path(temp_dir, 'test_pkg', 'setup.py')) + with pytest.raises(distutils.errors.DistutilsError): + run_setup(setup_py, ['--version']) + assert len(mock_index.requests) == 0 + + def test_setup_requires_with_python_requires(self, monkeypatch, tmpdir): + """Check `python_requires` is honored.""" + monkeypatch.setenv('PIP_RETRIES', '0') + monkeypatch.setenv('PIP_TIMEOUT', '0') + monkeypatch.setenv('PIP_NO_INDEX', '1') + monkeypatch.setenv('PIP_VERBOSE', '1') + dep_1_0_sdist = 'dep-1.0.tar.gz' + dep_1_0_url = path_to_url(str(tmpdir / dep_1_0_sdist)) + dep_1_0_python_requires = '>=2.7' + make_python_requires_sdist( + str(tmpdir / dep_1_0_sdist), 'dep', '1.0', dep_1_0_python_requires + ) + dep_2_0_sdist = 'dep-2.0.tar.gz' + dep_2_0_url = path_to_url(str(tmpdir / dep_2_0_sdist)) + dep_2_0_python_requires = ( + f'!={sys.version_info.major}.{sys.version_info.minor}.*' + ) + make_python_requires_sdist( + str(tmpdir / dep_2_0_sdist), 'dep', '2.0', dep_2_0_python_requires + ) + index = tmpdir / 'index.html' + index.write_text( + DALS( + """ + + Links for dep + +

Links for dep

+ {dep_1_0_sdist}
+ {dep_2_0_sdist}
+ + + """ + ).format( + dep_1_0_url=dep_1_0_url, + dep_1_0_sdist=dep_1_0_sdist, + dep_1_0_python_requires=dep_1_0_python_requires, + dep_2_0_url=dep_2_0_url, + dep_2_0_sdist=dep_2_0_sdist, + dep_2_0_python_requires=dep_2_0_python_requires, + ), + 'utf-8', + ) + index_url = path_to_url(str(index)) + with contexts.save_pkg_resources_state(): + test_pkg = create_setup_requires_package( + str(tmpdir), + 'python-xlib', + '0.19', # Ignored (overridden by setup_attrs). + setup_attrs=dict(setup_requires='dep', dependency_links=[index_url]), + ) + test_setup_py = os.path.join(test_pkg, 'setup.py') + run_setup(test_setup_py, ['--version']) + eggs = list( + map(str, pkg_resources.find_distributions(os.path.join(test_pkg, '.eggs'))) + ) + assert eggs == ['dep 1.0'] + + @pytest.mark.parametrize('with_dependency_links_in_setup_py', (False, True)) + def test_setup_requires_with_find_links_in_setup_cfg( + self, monkeypatch, with_dependency_links_in_setup_py + ): + monkeypatch.setenv('PIP_RETRIES', '0') + monkeypatch.setenv('PIP_TIMEOUT', '0') + with contexts.save_pkg_resources_state(): + with contexts.tempdir() as temp_dir: + make_trivial_sdist( + os.path.join(temp_dir, 'python-xlib-42.tar.gz'), 'python-xlib', '42' + ) + test_pkg = os.path.join(temp_dir, 'test_pkg') + test_setup_py = os.path.join(test_pkg, 'setup.py') + test_setup_cfg = os.path.join(test_pkg, 'setup.cfg') + os.mkdir(test_pkg) + with open(test_setup_py, 'w', encoding="utf-8") as fp: + if with_dependency_links_in_setup_py: + dependency_links = [os.path.join(temp_dir, 'links')] + else: + dependency_links = [] + fp.write( + DALS( + """ + from setuptools import installer, setup + setup(setup_requires='python-xlib==42', + dependency_links={dependency_links!r}) + """ + ).format(dependency_links=dependency_links) + ) + with open(test_setup_cfg, 'w', encoding="utf-8") as fp: + fp.write( + DALS( + """ + [easy_install] + index_url = {index_url} + find_links = {find_links} + """ + ).format( + index_url=os.path.join(temp_dir, 'index'), + find_links=temp_dir, + ) + ) + run_setup(test_setup_py, ['--version']) + + def test_setup_requires_with_transitive_extra_dependency(self, monkeypatch): + """ + Use case: installing a package with a build dependency on + an already installed `dep[extra]`, which in turn depends + on `extra_dep` (whose is not already installed). + """ + with contexts.save_pkg_resources_state(): + with contexts.tempdir() as temp_dir: + # Create source distribution for `extra_dep`. + make_trivial_sdist( + os.path.join(temp_dir, 'extra_dep-1.0.tar.gz'), 'extra_dep', '1.0' + ) + # Create source tree for `dep`. + dep_pkg = os.path.join(temp_dir, 'dep') + os.mkdir(dep_pkg) + path.build( + { + 'setup.py': DALS( + """ + import setuptools + setuptools.setup( + name='dep', version='2.0', + extras_require={'extra': ['extra_dep']}, + ) + """ + ), + 'setup.cfg': '', + }, + prefix=dep_pkg, + ) + # "Install" dep. + run_setup(os.path.join(dep_pkg, 'setup.py'), ['dist_info']) + working_set.add_entry(dep_pkg) + # Create source tree for test package. + test_pkg = os.path.join(temp_dir, 'test_pkg') + test_setup_py = os.path.join(test_pkg, 'setup.py') + os.mkdir(test_pkg) + with open(test_setup_py, 'w', encoding="utf-8") as fp: + fp.write( + DALS( + """ + from setuptools import installer, setup + setup(setup_requires='dep[extra]') + """ + ) + ) + # Check... + monkeypatch.setenv('PIP_FIND_LINKS', str(temp_dir)) + monkeypatch.setenv('PIP_NO_INDEX', '1') + monkeypatch.setenv('PIP_RETRIES', '0') + monkeypatch.setenv('PIP_TIMEOUT', '0') + run_setup(test_setup_py, ['--version']) + + def test_setup_requires_with_distutils_command_dep(self, monkeypatch): + """ + Use case: ensure build requirements' extras + are properly installed and activated. + """ + with contexts.save_pkg_resources_state(): + with contexts.tempdir() as temp_dir: + # Create source distribution for `extra_dep`. + make_sdist( + os.path.join(temp_dir, 'extra_dep-1.0.tar.gz'), + [ + ( + 'setup.py', + DALS( + """ + import setuptools + setuptools.setup( + name='extra_dep', + version='1.0', + py_modules=['extra_dep'], + ) + """ + ), + ), + ('setup.cfg', ''), + ('extra_dep.py', ''), + ], + ) + # Create source tree for `epdep`. + dep_pkg = os.path.join(temp_dir, 'epdep') + os.mkdir(dep_pkg) + path.build( + { + 'setup.py': DALS( + """ + import setuptools + setuptools.setup( + name='dep', version='2.0', + py_modules=['epcmd'], + extras_require={'extra': ['extra_dep']}, + entry_points=''' + [distutils.commands] + epcmd = epcmd:epcmd [extra] + ''', + ) + """ + ), + 'setup.cfg': '', + 'epcmd.py': DALS( + """ + from distutils.command.build_py import build_py + + import extra_dep + + class epcmd(build_py): + pass + """ + ), + }, + prefix=dep_pkg, + ) + # "Install" dep. + run_setup(os.path.join(dep_pkg, 'setup.py'), ['dist_info']) + working_set.add_entry(dep_pkg) + # Create source tree for test package. + test_pkg = os.path.join(temp_dir, 'test_pkg') + test_setup_py = os.path.join(test_pkg, 'setup.py') + os.mkdir(test_pkg) + with open(test_setup_py, 'w', encoding="utf-8") as fp: + fp.write( + DALS( + """ + from setuptools import installer, setup + setup(setup_requires='dep[extra]') + """ + ) + ) + # Check... + monkeypatch.setenv('PIP_FIND_LINKS', str(temp_dir)) + monkeypatch.setenv('PIP_NO_INDEX', '1') + monkeypatch.setenv('PIP_RETRIES', '0') + monkeypatch.setenv('PIP_TIMEOUT', '0') + run_setup(test_setup_py, ['epcmd']) + + +def make_trivial_sdist(dist_path, distname, version): + """ + Create a simple sdist tarball at dist_path, containing just a simple + setup.py. + """ + + make_sdist( + dist_path, + [ + ( + 'setup.py', + DALS( + f"""\ + import setuptools + setuptools.setup( + name={distname!r}, + version={version!r} + ) + """ + ), + ), + ('setup.cfg', ''), + ], + ) + + +def make_nspkg_sdist(dist_path, distname, version): + """ + Make an sdist tarball with distname and version which also contains one + package with the same name as distname. The top-level package is + designated a namespace package). + """ + # Assert that the distname contains at least one period + assert '.' in distname + + parts = distname.split('.') + nspackage = parts[0] + + packages = ['.'.join(parts[:idx]) for idx in range(1, len(parts) + 1)] + + setup_py = DALS( + f"""\ + import setuptools + setuptools.setup( + name={distname!r}, + version={version!r}, + packages={packages!r}, + namespace_packages=[{nspackage!r}] + ) + """ + ) + + init = "__import__('pkg_resources').declare_namespace(__name__)" + + files = [('setup.py', setup_py), (os.path.join(nspackage, '__init__.py'), init)] + for package in packages[1:]: + filename = os.path.join(*(package.split('.') + ['__init__.py'])) + files.append((filename, '')) + + make_sdist(dist_path, files) + + +def make_python_requires_sdist(dist_path, distname, version, python_requires): + make_sdist( + dist_path, + [ + ( + 'setup.py', + DALS( + """\ + import setuptools + setuptools.setup( + name={name!r}, + version={version!r}, + python_requires={python_requires!r}, + ) + """ + ).format( + name=distname, version=version, python_requires=python_requires + ), + ), + ('setup.cfg', ''), + ], + ) + + +def make_sdist(dist_path, files): + """ + Create a simple sdist tarball at dist_path, containing the files + listed in ``files`` as ``(filename, content)`` tuples. + """ + + # Distributions with only one file don't play well with pip. + assert len(files) > 1 + with tarfile.open(dist_path, 'w:gz') as dist: + for filename, content in files: + file_bytes = io.BytesIO(content.encode('utf-8')) + file_info = tarfile.TarInfo(name=filename) + file_info.size = len(file_bytes.getvalue()) + file_info.mtime = int(time.time()) + dist.addfile(file_info, fileobj=file_bytes) + + +def create_setup_requires_package( + path, + distname='foobar', + version='0.1', + make_package=make_trivial_sdist, + setup_py_template=None, + setup_attrs=None, + use_setup_cfg=(), +): + """Creates a source tree under path for a trivial test package that has a + single requirement in setup_requires--a tarball for that requirement is + also created and added to the dependency_links argument. + + ``distname`` and ``version`` refer to the name/version of the package that + the test package requires via ``setup_requires``. The name of the test + package itself is just 'test_pkg'. + """ + + normalized_distname = safer_name(distname) + test_setup_attrs = { + 'name': 'test_pkg', + 'version': '0.0', + 'setup_requires': [f'{normalized_distname}=={version}'], + 'dependency_links': [os.path.abspath(path)], + } + if setup_attrs: + test_setup_attrs.update(setup_attrs) + + test_pkg = os.path.join(path, 'test_pkg') + os.mkdir(test_pkg) + + # setup.cfg + if use_setup_cfg: + options = [] + metadata = [] + for name in use_setup_cfg: + value = test_setup_attrs.pop(name) + if name in 'name version'.split(): + section = metadata + else: + section = options + if isinstance(value, (tuple, list)): + value = ';'.join(value) + section.append(f'{name}: {value}') + test_setup_cfg_contents = DALS( + """ + [metadata] + {metadata} + [options] + {options} + """ + ).format( + options='\n'.join(options), + metadata='\n'.join(metadata), + ) + else: + test_setup_cfg_contents = '' + with open(os.path.join(test_pkg, 'setup.cfg'), 'w', encoding="utf-8") as f: + f.write(test_setup_cfg_contents) + + # setup.py + if setup_py_template is None: + setup_py_template = DALS( + """\ + import setuptools + setuptools.setup(**%r) + """ + ) + with open(os.path.join(test_pkg, 'setup.py'), 'w', encoding="utf-8") as f: + f.write(setup_py_template % test_setup_attrs) + + foobar_path = os.path.join(path, f'{normalized_distname}-{version}.tar.gz') + make_package(foobar_path, distname, version) + + return test_pkg + + +@pytest.mark.skipif( + sys.platform.startswith('java') and ei.is_sh(sys.executable), + reason="Test cannot run under java when executable is sh", +) +class TestScriptHeader: + non_ascii_exe = '/Users/José/bin/python' + exe_with_spaces = r'C:\Program Files\Python36\python.exe' + + def test_get_script_header(self): + expected = f'#!{ei.nt_quote_arg(os.path.normpath(sys.executable))}\n' + actual = ei.ScriptWriter.get_header('#!/usr/local/bin/python') + assert actual == expected + + def test_get_script_header_args(self): + expected = f'#!{ei.nt_quote_arg(os.path.normpath(sys.executable))} -x\n' + actual = ei.ScriptWriter.get_header('#!/usr/bin/python -x') + assert actual == expected + + def test_get_script_header_non_ascii_exe(self): + actual = ei.ScriptWriter.get_header( + '#!/usr/bin/python', executable=self.non_ascii_exe + ) + expected = f'#!{self.non_ascii_exe} -x\n' + assert actual == expected + + def test_get_script_header_exe_with_spaces(self): + actual = ei.ScriptWriter.get_header( + '#!/usr/bin/python', executable='"' + self.exe_with_spaces + '"' + ) + expected = f'#!"{self.exe_with_spaces}"\n' + assert actual == expected + + +class TestCommandSpec: + def test_custom_launch_command(self): + """ + Show how a custom CommandSpec could be used to specify a #! executable + which takes parameters. + """ + cmd = ei.CommandSpec(['/usr/bin/env', 'python3']) + assert cmd.as_header() == '#!/usr/bin/env python3\n' + + def test_from_param_for_CommandSpec_is_passthrough(self): + """ + from_param should return an instance of a CommandSpec + """ + cmd = ei.CommandSpec(['python']) + cmd_new = ei.CommandSpec.from_param(cmd) + assert cmd is cmd_new + + @mock.patch('sys.executable', TestScriptHeader.exe_with_spaces) + @mock.patch.dict(os.environ) + def test_from_environment_with_spaces_in_executable(self): + os.environ.pop('__PYVENV_LAUNCHER__', None) + cmd = ei.CommandSpec.from_environment() + assert len(cmd) == 1 + assert cmd.as_header().startswith('#!"') + + def test_from_simple_string_uses_shlex(self): + """ + In order to support `executable = /usr/bin/env my-python`, make sure + from_param invokes shlex on that input. + """ + cmd = ei.CommandSpec.from_param('/usr/bin/env my-python') + assert len(cmd) == 2 + assert '"' not in cmd.as_header() + + def test_from_param_raises_expected_error(self) -> None: + """ + from_param should raise its own TypeError when the argument's type is unsupported + """ + with pytest.raises(TypeError) as exc_info: + ei.CommandSpec.from_param(object()) # type: ignore[arg-type] # We want a type error here + assert ( + str(exc_info.value) == "Argument has an unsupported type " + ), exc_info.value + + +class TestWindowsScriptWriter: + def test_header(self): + hdr = ei.WindowsScriptWriter.get_header('') + assert hdr.startswith('#!') + assert hdr.endswith('\n') + hdr = hdr.lstrip('#!') + hdr = hdr.rstrip('\n') + # header should not start with an escaped quote + assert not hdr.startswith('\\"') + + +class VersionStub(NamedTuple): + major: int + minor: int + micro: int + releaselevel: str + serial: int + + +def test_use_correct_python_version_string(tmpdir, tmpdir_cwd, monkeypatch): + # In issue #3001, easy_install wrongly uses the `python3.1` directory + # when the interpreter is `python3.10` and the `--user` option is given. + # See pypa/setuptools#3001. + dist = Distribution() + cmd = dist.get_command_obj('easy_install') + cmd.args = ['ok'] + cmd.optimize = 0 + cmd.user = True + cmd.install_userbase = str(tmpdir) + cmd.install_usersite = None + install_cmd = dist.get_command_obj('install') + install_cmd.install_userbase = str(tmpdir) + install_cmd.install_usersite = None + + with monkeypatch.context() as patch, warnings.catch_warnings(): + warnings.simplefilter("ignore") + version = '3.10.1 (main, Dec 21 2021, 09:17:12) [GCC 10.2.1 20210110]' + info = VersionStub(3, 10, 1, "final", 0) + patch.setattr('site.ENABLE_USER_SITE', True) + patch.setattr('sys.version', version) + patch.setattr('sys.version_info', info) + patch.setattr(cmd, 'create_home_path', mock.Mock()) + cmd.finalize_options() + + name = "pypy" if hasattr(sys, 'pypy_version_info') else "python" + install_dir = cmd.install_dir.lower() + + # In some platforms (e.g. Windows), install_dir is mostly determined + # via `sysconfig`, which define constants eagerly at module creation. + # This means that monkeypatching `sys.version` to emulate 3.10 for testing + # may have no effect. + # The safest test here is to rely on the fact that 3.1 is no longer + # supported/tested, and make sure that if 'python3.1' ever appears in the string + # it is followed by another digit (e.g. 'python3.10'). + if re.search(name + r'3\.?1', install_dir): + assert re.search(name + r'3\.?1\d', install_dir) + + # The following "variables" are used for interpolation in distutils + # installation schemes, so it should be fair to treat them as "semi-public", + # or at least public enough so we can have a test to make sure they are correct + assert cmd.config_vars['py_version'] == '3.10.1' + assert cmd.config_vars['py_version_short'] == '3.10' + assert cmd.config_vars['py_version_nodot'] == '310' + + +@pytest.mark.xfail( + sys.platform == "darwin", + reason="https://github.com/pypa/setuptools/pull/4716#issuecomment-2447624418", +) +def test_editable_user_and_build_isolation(setup_context, monkeypatch, tmp_path): + """`setup.py develop` should honor `--user` even under build isolation""" + + # == Arrange == + # Pretend that build isolation was enabled + # e.g pip sets the environment variable PYTHONNOUSERSITE=1 + monkeypatch.setattr('site.ENABLE_USER_SITE', False) + + # Patching $HOME for 2 reasons: + # 1. setuptools/command/easy_install.py:create_home_path + # tries creating directories in $HOME. + # Given:: + # self.config_vars['DESTDIRS'] = ( + # "/home/user/.pyenv/versions/3.9.10 " + # "/home/user/.pyenv/versions/3.9.10/lib " + # "/home/user/.pyenv/versions/3.9.10/lib/python3.9 " + # "/home/user/.pyenv/versions/3.9.10/lib/python3.9/lib-dynload") + # `create_home_path` will:: + # makedirs( + # "/home/user/.pyenv/versions/3.9.10 " + # "/home/user/.pyenv/versions/3.9.10/lib " + # "/home/user/.pyenv/versions/3.9.10/lib/python3.9 " + # "/home/user/.pyenv/versions/3.9.10/lib/python3.9/lib-dynload") + # + # 2. We are going to force `site` to update site.USER_BASE and site.USER_SITE + # To point inside our new home + monkeypatch.setenv('HOME', str(tmp_path / '.home')) + monkeypatch.setenv('USERPROFILE', str(tmp_path / '.home')) + monkeypatch.setenv('APPDATA', str(tmp_path / '.home')) + monkeypatch.setattr('site.USER_BASE', None) + monkeypatch.setattr('site.USER_SITE', None) + user_site = Path(site.getusersitepackages()) + user_site.mkdir(parents=True, exist_ok=True) + + sys_prefix = tmp_path / '.sys_prefix' + sys_prefix.mkdir(parents=True, exist_ok=True) + monkeypatch.setattr('sys.prefix', str(sys_prefix)) + + setup_script = ( + "__import__('setuptools').setup(name='aproj', version=42, packages=[])\n" + ) + (tmp_path / "setup.py").write_text(setup_script, encoding="utf-8") + + # == Sanity check == + assert list(sys_prefix.glob("*")) == [] + assert list(user_site.glob("*")) == [] + + # == Act == + run_setup('setup.py', ['develop', '--user']) + + # == Assert == + # Should not install to sys.prefix + assert list(sys_prefix.glob("*")) == [] + # Should install to user site + installed = {f.name for f in user_site.glob("*")} + # sometimes easy-install.pth is created and sometimes not + installed = installed - {"easy-install.pth"} + assert installed == {'aproj.egg-link'} diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/setuptools/tests/test_editable_install.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/setuptools/tests/test_editable_install.py new file mode 100644 index 0000000000000000000000000000000000000000..038dcadf934af20dbe11dcb13ba324ceb30bf90d --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/setuptools/tests/test_editable_install.py @@ -0,0 +1,1289 @@ +from __future__ import annotations + +import os +import platform +import stat +import subprocess +import sys +from copy import deepcopy +from importlib import import_module +from importlib.machinery import EXTENSION_SUFFIXES +from pathlib import Path +from textwrap import dedent +from typing import Any +from unittest.mock import Mock +from uuid import uuid4 + +import jaraco.envs +import jaraco.path +import pytest +from path import Path as _Path + +from setuptools._importlib import resources as importlib_resources +from setuptools.command.editable_wheel import ( + _DebuggingTips, + _encode_pth, + _find_namespaces, + _find_package_roots, + _find_virtual_namespaces, + _finder_template, + _LinkTree, + _TopLevelFinder, + editable_wheel, +) +from setuptools.dist import Distribution +from setuptools.extension import Extension +from setuptools.warnings import SetuptoolsDeprecationWarning + +from . import contexts, namespaces + +from distutils.core import run_setup + + +@pytest.fixture(params=["strict", "lenient"]) +def editable_opts(request): + if request.param == "strict": + return ["--config-settings", "editable-mode=strict"] + return [] + + +EXAMPLE = { + 'pyproject.toml': dedent( + """\ + [build-system] + requires = ["setuptools"] + build-backend = "setuptools.build_meta" + + [project] + name = "mypkg" + version = "3.14159" + license = {text = "MIT"} + description = "This is a Python package" + dynamic = ["readme"] + classifiers = [ + "Development Status :: 5 - Production/Stable", + "Intended Audience :: Developers" + ] + urls = {Homepage = "https://github.com"} + + [tool.setuptools] + package-dir = {"" = "src"} + packages = {find = {where = ["src"]}} + license-files = ["LICENSE*"] + + [tool.setuptools.dynamic] + readme = {file = "README.rst"} + + [tool.distutils.egg_info] + tag-build = ".post0" + """ + ), + "MANIFEST.in": dedent( + """\ + global-include *.py *.txt + global-exclude *.py[cod] + prune dist + prune build + """ + ).strip(), + "README.rst": "This is a ``README``", + "LICENSE.txt": "---- placeholder MIT license ----", + "src": { + "mypkg": { + "__init__.py": dedent( + """\ + import sys + from importlib.metadata import PackageNotFoundError, version + + try: + __version__ = version(__name__) + except PackageNotFoundError: + __version__ = "unknown" + """ + ), + "__main__.py": dedent( + """\ + from importlib.resources import read_text + from . import __version__, __name__ as parent + from .mod import x + + data = read_text(parent, "data.txt") + print(__version__, data, x) + """ + ), + "mod.py": "x = ''", + "data.txt": "Hello World", + } + }, +} + + +SETUP_SCRIPT_STUB = "__import__('setuptools').setup()" + + +@pytest.mark.xfail(sys.platform == "darwin", reason="pypa/setuptools#4328") +@pytest.mark.parametrize( + "files", + [ + {**EXAMPLE, "setup.py": SETUP_SCRIPT_STUB}, + EXAMPLE, # No setup.py script + ], +) +def test_editable_with_pyproject(tmp_path, venv, files, editable_opts): + project = tmp_path / "mypkg" + project.mkdir() + jaraco.path.build(files, prefix=project) + + cmd = [ + "python", + "-m", + "pip", + "install", + "--no-build-isolation", # required to force current version of setuptools + "-e", + str(project), + *editable_opts, + ] + print(venv.run(cmd)) + + cmd = ["python", "-m", "mypkg"] + assert venv.run(cmd).strip() == "3.14159.post0 Hello World" + + (project / "src/mypkg/data.txt").write_text("foobar", encoding="utf-8") + (project / "src/mypkg/mod.py").write_text("x = 42", encoding="utf-8") + assert venv.run(cmd).strip() == "3.14159.post0 foobar 42" + + +def test_editable_with_flat_layout(tmp_path, venv, editable_opts): + files = { + "mypkg": { + "pyproject.toml": dedent( + """\ + [build-system] + requires = ["setuptools", "wheel"] + build-backend = "setuptools.build_meta" + + [project] + name = "mypkg" + version = "3.14159" + + [tool.setuptools] + packages = ["pkg"] + py-modules = ["mod"] + """ + ), + "pkg": {"__init__.py": "a = 4"}, + "mod.py": "b = 2", + }, + } + jaraco.path.build(files, prefix=tmp_path) + project = tmp_path / "mypkg" + + cmd = [ + "python", + "-m", + "pip", + "install", + "--no-build-isolation", # required to force current version of setuptools + "-e", + str(project), + *editable_opts, + ] + print(venv.run(cmd)) + cmd = ["python", "-c", "import pkg, mod; print(pkg.a, mod.b)"] + assert venv.run(cmd).strip() == "4 2" + + +def test_editable_with_single_module(tmp_path, venv, editable_opts): + files = { + "mypkg": { + "pyproject.toml": dedent( + """\ + [build-system] + requires = ["setuptools", "wheel"] + build-backend = "setuptools.build_meta" + + [project] + name = "mod" + version = "3.14159" + + [tool.setuptools] + py-modules = ["mod"] + """ + ), + "mod.py": "b = 2", + }, + } + jaraco.path.build(files, prefix=tmp_path) + project = tmp_path / "mypkg" + + cmd = [ + "python", + "-m", + "pip", + "install", + "--no-build-isolation", # required to force current version of setuptools + "-e", + str(project), + *editable_opts, + ] + print(venv.run(cmd)) + cmd = ["python", "-c", "import mod; print(mod.b)"] + assert venv.run(cmd).strip() == "2" + + +class TestLegacyNamespaces: + # legacy => pkg_resources.declare_namespace(...) + setup(namespace_packages=...) + + def test_nspkg_file_is_unique(self, tmp_path, monkeypatch): + deprecation = pytest.warns( + SetuptoolsDeprecationWarning, match=".*namespace_packages parameter.*" + ) + installation_dir = tmp_path / ".installation_dir" + installation_dir.mkdir() + examples = ( + "myns.pkgA", + "myns.pkgB", + "myns.n.pkgA", + "myns.n.pkgB", + ) + + for name in examples: + pkg = namespaces.build_namespace_package(tmp_path, name, version="42") + with deprecation, monkeypatch.context() as ctx: + ctx.chdir(pkg) + dist = run_setup("setup.py", stop_after="config") + cmd = editable_wheel(dist) + cmd.finalize_options() + editable_name = cmd.get_finalized_command("dist_info").name + cmd._install_namespaces(installation_dir, editable_name) + + files = list(installation_dir.glob("*-nspkg.pth")) + assert len(files) == len(examples) + + @pytest.mark.parametrize( + "impl", + ( + "pkg_resources", + # "pkgutil", => does not work + ), + ) + @pytest.mark.parametrize("ns", ("myns.n",)) + def test_namespace_package_importable( + self, venv, tmp_path, ns, impl, editable_opts + ): + """ + Installing two packages sharing the same namespace, one installed + naturally using pip or `--single-version-externally-managed` + and the other installed in editable mode should leave the namespace + intact and both packages reachable by import. + (Ported from test_develop). + """ + build_system = """\ + [build-system] + requires = ["setuptools"] + build-backend = "setuptools.build_meta" + """ + pkg_A = namespaces.build_namespace_package(tmp_path, f"{ns}.pkgA", impl=impl) + pkg_B = namespaces.build_namespace_package(tmp_path, f"{ns}.pkgB", impl=impl) + (pkg_A / "pyproject.toml").write_text(build_system, encoding="utf-8") + (pkg_B / "pyproject.toml").write_text(build_system, encoding="utf-8") + # use pip to install to the target directory + opts = editable_opts[:] + opts.append("--no-build-isolation") # force current version of setuptools + venv.run(["python", "-m", "pip", "install", str(pkg_A), *opts]) + venv.run(["python", "-m", "pip", "install", "-e", str(pkg_B), *opts]) + venv.run(["python", "-c", f"import {ns}.pkgA; import {ns}.pkgB"]) + # additionally ensure that pkg_resources import works + venv.run(["python", "-c", "import pkg_resources"]) + + +class TestPep420Namespaces: + def test_namespace_package_importable(self, venv, tmp_path, editable_opts): + """ + Installing two packages sharing the same namespace, one installed + normally using pip and the other installed in editable mode + should allow importing both packages. + """ + pkg_A = namespaces.build_pep420_namespace_package(tmp_path, 'myns.n.pkgA') + pkg_B = namespaces.build_pep420_namespace_package(tmp_path, 'myns.n.pkgB') + # use pip to install to the target directory + opts = editable_opts[:] + opts.append("--no-build-isolation") # force current version of setuptools + venv.run(["python", "-m", "pip", "install", str(pkg_A), *opts]) + venv.run(["python", "-m", "pip", "install", "-e", str(pkg_B), *opts]) + venv.run(["python", "-c", "import myns.n.pkgA; import myns.n.pkgB"]) + + def test_namespace_created_via_package_dir(self, venv, tmp_path, editable_opts): + """Currently users can create a namespace by tweaking `package_dir`""" + files = { + "pkgA": { + "pyproject.toml": dedent( + """\ + [build-system] + requires = ["setuptools", "wheel"] + build-backend = "setuptools.build_meta" + + [project] + name = "pkgA" + version = "3.14159" + + [tool.setuptools] + package-dir = {"myns.n.pkgA" = "src"} + """ + ), + "src": {"__init__.py": "a = 1"}, + }, + } + jaraco.path.build(files, prefix=tmp_path) + pkg_A = tmp_path / "pkgA" + pkg_B = namespaces.build_pep420_namespace_package(tmp_path, 'myns.n.pkgB') + pkg_C = namespaces.build_pep420_namespace_package(tmp_path, 'myns.n.pkgC') + + # use pip to install to the target directory + opts = editable_opts[:] + opts.append("--no-build-isolation") # force current version of setuptools + venv.run(["python", "-m", "pip", "install", str(pkg_A), *opts]) + venv.run(["python", "-m", "pip", "install", "-e", str(pkg_B), *opts]) + venv.run(["python", "-m", "pip", "install", "-e", str(pkg_C), *opts]) + venv.run(["python", "-c", "from myns.n import pkgA, pkgB, pkgC"]) + + def test_namespace_accidental_config_in_lenient_mode(self, venv, tmp_path): + """Sometimes users might specify an ``include`` pattern that ignores parent + packages. In a normal installation this would ignore all modules inside the + parent packages, and make them namespaces (reported in issue #3504), + so the editable mode should preserve this behaviour. + """ + files = { + "pkgA": { + "pyproject.toml": dedent( + """\ + [build-system] + requires = ["setuptools", "wheel"] + build-backend = "setuptools.build_meta" + + [project] + name = "pkgA" + version = "3.14159" + + [tool.setuptools] + packages.find.include = ["mypkg.*"] + """ + ), + "mypkg": { + "__init__.py": "", + "other.py": "b = 1", + "n": { + "__init__.py": "", + "pkgA.py": "a = 1", + }, + }, + "MANIFEST.in": EXAMPLE["MANIFEST.in"], + }, + } + jaraco.path.build(files, prefix=tmp_path) + pkg_A = tmp_path / "pkgA" + + # use pip to install to the target directory + opts = ["--no-build-isolation"] # force current version of setuptools + venv.run(["python", "-m", "pip", "-v", "install", "-e", str(pkg_A), *opts]) + out = venv.run(["python", "-c", "from mypkg.n import pkgA; print(pkgA.a)"]) + assert out.strip() == "1" + cmd = """\ + try: + import mypkg.other + except ImportError: + print("mypkg.other not defined") + """ + out = venv.run(["python", "-c", dedent(cmd)]) + assert "mypkg.other not defined" in out + + +def test_editable_with_prefix(tmp_path, sample_project, editable_opts): + """ + Editable install to a prefix should be discoverable. + """ + prefix = tmp_path / 'prefix' + + # figure out where pip will likely install the package + site_packages_all = [ + prefix / Path(path).relative_to(sys.prefix) + for path in sys.path + if 'site-packages' in path and path.startswith(sys.prefix) + ] + + for sp in site_packages_all: + sp.mkdir(parents=True) + + # install workaround + _addsitedirs(site_packages_all) + + env = dict(os.environ, PYTHONPATH=os.pathsep.join(map(str, site_packages_all))) + cmd = [ + sys.executable, + '-m', + 'pip', + 'install', + '--editable', + str(sample_project), + '--prefix', + str(prefix), + '--no-build-isolation', + *editable_opts, + ] + subprocess.check_call(cmd, env=env) + + # now run 'sample' with the prefix on the PYTHONPATH + bin = 'Scripts' if platform.system() == 'Windows' else 'bin' + exe = prefix / bin / 'sample' + subprocess.check_call([exe], env=env) + + +class TestFinderTemplate: + """This test focus in getting a particular implementation detail right. + If at some point in time the implementation is changed for something different, + this test can be modified or even excluded. + """ + + def install_finder(self, finder): + loc = {} + exec(finder, loc, loc) + loc["install"]() + + def test_packages(self, tmp_path): + files = { + "src1": { + "pkg1": { + "__init__.py": "", + "subpkg": {"mod1.py": "a = 42"}, + }, + }, + "src2": {"mod2.py": "a = 43"}, + } + jaraco.path.build(files, prefix=tmp_path) + + mapping = { + "pkg1": str(tmp_path / "src1/pkg1"), + "mod2": str(tmp_path / "src2/mod2"), + } + template = _finder_template(str(uuid4()), mapping, {}) + + with contexts.save_paths(), contexts.save_sys_modules(): + for mod in ("pkg1", "pkg1.subpkg", "pkg1.subpkg.mod1", "mod2"): + sys.modules.pop(mod, None) + + self.install_finder(template) + mod1 = import_module("pkg1.subpkg.mod1") + mod2 = import_module("mod2") + subpkg = import_module("pkg1.subpkg") + + assert mod1.a == 42 + assert mod2.a == 43 + expected = str((tmp_path / "src1/pkg1/subpkg").resolve()) + assert_path(subpkg, expected) + + def test_namespace(self, tmp_path): + files = {"pkg": {"__init__.py": "a = 13", "text.txt": "abc"}} + jaraco.path.build(files, prefix=tmp_path) + + mapping = {"ns.othername": str(tmp_path / "pkg")} + namespaces = {"ns": []} + + template = _finder_template(str(uuid4()), mapping, namespaces) + with contexts.save_paths(), contexts.save_sys_modules(): + for mod in ("ns", "ns.othername"): + sys.modules.pop(mod, None) + + self.install_finder(template) + pkg = import_module("ns.othername") + text = importlib_resources.files(pkg) / "text.txt" + + expected = str((tmp_path / "pkg").resolve()) + assert_path(pkg, expected) + assert pkg.a == 13 + + # Make sure resources can also be found + assert text.read_text(encoding="utf-8") == "abc" + + def test_combine_namespaces(self, tmp_path): + files = { + "src1": {"ns": {"pkg1": {"__init__.py": "a = 13"}}}, + "src2": {"ns": {"mod2.py": "b = 37"}}, + } + jaraco.path.build(files, prefix=tmp_path) + + mapping = { + "ns.pkgA": str(tmp_path / "src1/ns/pkg1"), + "ns": str(tmp_path / "src2/ns"), + } + namespaces_ = {"ns": [str(tmp_path / "src1"), str(tmp_path / "src2")]} + template = _finder_template(str(uuid4()), mapping, namespaces_) + + with contexts.save_paths(), contexts.save_sys_modules(): + for mod in ("ns", "ns.pkgA", "ns.mod2"): + sys.modules.pop(mod, None) + + self.install_finder(template) + pkgA = import_module("ns.pkgA") + mod2 = import_module("ns.mod2") + + expected = str((tmp_path / "src1/ns/pkg1").resolve()) + assert_path(pkgA, expected) + assert pkgA.a == 13 + assert mod2.b == 37 + + def test_combine_namespaces_nested(self, tmp_path): + """ + Users may attempt to combine namespace packages in a nested way via + ``package_dir`` as shown in pypa/setuptools#4248. + """ + + files = { + "src": {"my_package": {"my_module.py": "a = 13"}}, + "src2": {"my_package2": {"my_module2.py": "b = 37"}}, + } + + stack = jaraco.path.DirectoryStack() + with stack.context(tmp_path): + jaraco.path.build(files) + attrs = { + "script_name": "%PEP 517%", + "package_dir": { + "different_name": "src/my_package", + "different_name.subpkg": "src2/my_package2", + }, + "packages": ["different_name", "different_name.subpkg"], + } + dist = Distribution(attrs) + finder = _TopLevelFinder(dist, str(uuid4())) + code = next(v for k, v in finder.get_implementation() if k.endswith(".py")) + + with contexts.save_paths(), contexts.save_sys_modules(): + for mod in attrs["packages"]: + sys.modules.pop(mod, None) + + self.install_finder(code) + mod1 = import_module("different_name.my_module") + mod2 = import_module("different_name.subpkg.my_module2") + + expected = str((tmp_path / "src/my_package/my_module.py").resolve()) + assert str(Path(mod1.__file__).resolve()) == expected + + expected = str((tmp_path / "src2/my_package2/my_module2.py").resolve()) + assert str(Path(mod2.__file__).resolve()) == expected + + assert mod1.a == 13 + assert mod2.b == 37 + + def test_dynamic_path_computation(self, tmp_path): + # Follows the example in PEP 420 + files = { + "project1": {"parent": {"child": {"one.py": "x = 1"}}}, + "project2": {"parent": {"child": {"two.py": "x = 2"}}}, + "project3": {"parent": {"child": {"three.py": "x = 3"}}}, + } + jaraco.path.build(files, prefix=tmp_path) + mapping = {} + namespaces_ = {"parent": [str(tmp_path / "project1/parent")]} + template = _finder_template(str(uuid4()), mapping, namespaces_) + + mods = (f"parent.child.{name}" for name in ("one", "two", "three")) + with contexts.save_paths(), contexts.save_sys_modules(): + for mod in ("parent", "parent.child", "parent.child", *mods): + sys.modules.pop(mod, None) + + self.install_finder(template) + + one = import_module("parent.child.one") + assert one.x == 1 + + with pytest.raises(ImportError): + import_module("parent.child.two") + + sys.path.append(str(tmp_path / "project2")) + two = import_module("parent.child.two") + assert two.x == 2 + + with pytest.raises(ImportError): + import_module("parent.child.three") + + sys.path.append(str(tmp_path / "project3")) + three = import_module("parent.child.three") + assert three.x == 3 + + def test_no_recursion(self, tmp_path): + # See issue #3550 + files = { + "pkg": { + "__init__.py": "from . import pkg", + }, + } + jaraco.path.build(files, prefix=tmp_path) + + mapping = { + "pkg": str(tmp_path / "pkg"), + } + template = _finder_template(str(uuid4()), mapping, {}) + + with contexts.save_paths(), contexts.save_sys_modules(): + sys.modules.pop("pkg", None) + + self.install_finder(template) + with pytest.raises(ImportError, match="pkg"): + import_module("pkg") + + def test_similar_name(self, tmp_path): + files = { + "foo": { + "__init__.py": "", + "bar": { + "__init__.py": "", + }, + }, + } + jaraco.path.build(files, prefix=tmp_path) + + mapping = { + "foo": str(tmp_path / "foo"), + } + template = _finder_template(str(uuid4()), mapping, {}) + + with contexts.save_paths(), contexts.save_sys_modules(): + sys.modules.pop("foo", None) + sys.modules.pop("foo.bar", None) + + self.install_finder(template) + with pytest.raises(ImportError, match="foobar"): + import_module("foobar") + + def test_case_sensitivity(self, tmp_path): + files = { + "foo": { + "__init__.py": "", + "lowercase.py": "x = 1", + "bar": { + "__init__.py": "", + "lowercase.py": "x = 2", + }, + }, + } + jaraco.path.build(files, prefix=tmp_path) + mapping = { + "foo": str(tmp_path / "foo"), + } + template = _finder_template(str(uuid4()), mapping, {}) + with contexts.save_paths(), contexts.save_sys_modules(): + sys.modules.pop("foo", None) + + self.install_finder(template) + with pytest.raises(ImportError, match="'FOO'"): + import_module("FOO") + + with pytest.raises(ImportError, match="'foo\\.LOWERCASE'"): + import_module("foo.LOWERCASE") + + with pytest.raises(ImportError, match="'foo\\.bar\\.Lowercase'"): + import_module("foo.bar.Lowercase") + + with pytest.raises(ImportError, match="'foo\\.BAR'"): + import_module("foo.BAR.lowercase") + + with pytest.raises(ImportError, match="'FOO'"): + import_module("FOO.bar.lowercase") + + mod = import_module("foo.lowercase") + assert mod.x == 1 + + mod = import_module("foo.bar.lowercase") + assert mod.x == 2 + + def test_namespace_case_sensitivity(self, tmp_path): + files = { + "pkg": { + "__init__.py": "a = 13", + "foo": { + "__init__.py": "b = 37", + "bar.py": "c = 42", + }, + }, + } + jaraco.path.build(files, prefix=tmp_path) + + mapping = {"ns.othername": str(tmp_path / "pkg")} + namespaces = {"ns": []} + + template = _finder_template(str(uuid4()), mapping, namespaces) + with contexts.save_paths(), contexts.save_sys_modules(): + for mod in ("ns", "ns.othername"): + sys.modules.pop(mod, None) + + self.install_finder(template) + pkg = import_module("ns.othername") + expected = str((tmp_path / "pkg").resolve()) + assert_path(pkg, expected) + assert pkg.a == 13 + + foo = import_module("ns.othername.foo") + assert foo.b == 37 + + bar = import_module("ns.othername.foo.bar") + assert bar.c == 42 + + with pytest.raises(ImportError, match="'NS'"): + import_module("NS.othername.foo") + + with pytest.raises(ImportError, match="'ns\\.othername\\.FOO\\'"): + import_module("ns.othername.FOO") + + with pytest.raises(ImportError, match="'ns\\.othername\\.foo\\.BAR\\'"): + import_module("ns.othername.foo.BAR") + + def test_intermediate_packages(self, tmp_path): + """ + The finder should not import ``fullname`` if the intermediate segments + don't exist (see pypa/setuptools#4019). + """ + files = { + "src": { + "mypkg": { + "__init__.py": "", + "config.py": "a = 13", + "helloworld.py": "b = 13", + "components": { + "config.py": "a = 37", + }, + }, + } + } + jaraco.path.build(files, prefix=tmp_path) + + mapping = {"mypkg": str(tmp_path / "src/mypkg")} + template = _finder_template(str(uuid4()), mapping, {}) + + with contexts.save_paths(), contexts.save_sys_modules(): + for mod in ( + "mypkg", + "mypkg.config", + "mypkg.helloworld", + "mypkg.components", + "mypkg.components.config", + "mypkg.components.helloworld", + ): + sys.modules.pop(mod, None) + + self.install_finder(template) + + config = import_module("mypkg.components.config") + assert config.a == 37 + + helloworld = import_module("mypkg.helloworld") + assert helloworld.b == 13 + + with pytest.raises(ImportError): + import_module("mypkg.components.helloworld") + + +def test_pkg_roots(tmp_path): + """This test focus in getting a particular implementation detail right. + If at some point in time the implementation is changed for something different, + this test can be modified or even excluded. + """ + files = { + "a": {"b": {"__init__.py": "ab = 1"}, "__init__.py": "a = 1"}, + "d": {"__init__.py": "d = 1", "e": {"__init__.py": "de = 1"}}, + "f": {"g": {"h": {"__init__.py": "fgh = 1"}}}, + "other": {"__init__.py": "abc = 1"}, + "another": {"__init__.py": "abcxyz = 1"}, + "yet_another": {"__init__.py": "mnopq = 1"}, + } + jaraco.path.build(files, prefix=tmp_path) + package_dir = { + "a.b.c": "other", + "a.b.c.x.y.z": "another", + "m.n.o.p.q": "yet_another", + } + packages = [ + "a", + "a.b", + "a.b.c", + "a.b.c.x.y", + "a.b.c.x.y.z", + "d", + "d.e", + "f", + "f.g", + "f.g.h", + "m.n.o.p.q", + ] + roots = _find_package_roots(packages, package_dir, tmp_path) + assert roots == { + "a": str(tmp_path / "a"), + "a.b.c": str(tmp_path / "other"), + "a.b.c.x.y.z": str(tmp_path / "another"), + "d": str(tmp_path / "d"), + "f": str(tmp_path / "f"), + "m.n.o.p.q": str(tmp_path / "yet_another"), + } + + ns = set(dict(_find_namespaces(packages, roots))) + assert ns == {"f", "f.g"} + + ns = set(_find_virtual_namespaces(roots)) + assert ns == {"a.b", "a.b.c.x", "a.b.c.x.y", "m", "m.n", "m.n.o", "m.n.o.p"} + + +class TestOverallBehaviour: + PYPROJECT = """\ + [build-system] + requires = ["setuptools"] + build-backend = "setuptools.build_meta" + + [project] + name = "mypkg" + version = "3.14159" + """ + + # Any: Would need a TypedDict. Keep it simple for tests + FLAT_LAYOUT: dict[str, Any] = { + "pyproject.toml": dedent(PYPROJECT), + "MANIFEST.in": EXAMPLE["MANIFEST.in"], + "otherfile.py": "", + "mypkg": { + "__init__.py": "", + "mod1.py": "var = 42", + "subpackage": { + "__init__.py": "", + "mod2.py": "var = 13", + "resource_file.txt": "resource 39", + }, + }, + } + + EXAMPLES = { + "flat-layout": FLAT_LAYOUT, + "src-layout": { + "pyproject.toml": dedent(PYPROJECT), + "MANIFEST.in": EXAMPLE["MANIFEST.in"], + "otherfile.py": "", + "src": {"mypkg": FLAT_LAYOUT["mypkg"]}, + }, + "custom-layout": { + "pyproject.toml": dedent(PYPROJECT) + + dedent( + """\ + [tool.setuptools] + packages = ["mypkg", "mypkg.subpackage"] + + [tool.setuptools.package-dir] + "mypkg.subpackage" = "other" + """ + ), + "MANIFEST.in": EXAMPLE["MANIFEST.in"], + "otherfile.py": "", + "mypkg": { + "__init__.py": "", + "mod1.py": FLAT_LAYOUT["mypkg"]["mod1.py"], + }, + "other": FLAT_LAYOUT["mypkg"]["subpackage"], + }, + "namespace": { + "pyproject.toml": dedent(PYPROJECT), + "MANIFEST.in": EXAMPLE["MANIFEST.in"], + "otherfile.py": "", + "src": { + "mypkg": { + "mod1.py": FLAT_LAYOUT["mypkg"]["mod1.py"], + "subpackage": FLAT_LAYOUT["mypkg"]["subpackage"], + }, + }, + }, + } + + @pytest.mark.xfail(sys.platform == "darwin", reason="pypa/setuptools#4328") + @pytest.mark.parametrize("layout", EXAMPLES.keys()) + def test_editable_install(self, tmp_path, venv, layout, editable_opts): + project, _ = install_project( + "mypkg", venv, tmp_path, self.EXAMPLES[layout], *editable_opts + ) + + # Ensure stray files are not importable + cmd_import_error = """\ + try: + import otherfile + except ImportError as ex: + print(ex) + """ + out = venv.run(["python", "-c", dedent(cmd_import_error)]) + assert "No module named 'otherfile'" in out + + # Ensure the modules are importable + cmd_get_vars = """\ + import mypkg, mypkg.mod1, mypkg.subpackage.mod2 + print(mypkg.mod1.var, mypkg.subpackage.mod2.var) + """ + out = venv.run(["python", "-c", dedent(cmd_get_vars)]) + assert "42 13" in out + + # Ensure resources are reachable + cmd_get_resource = """\ + import mypkg.subpackage + from setuptools._importlib import resources as importlib_resources + text = importlib_resources.files(mypkg.subpackage) / "resource_file.txt" + print(text.read_text(encoding="utf-8")) + """ + out = venv.run(["python", "-c", dedent(cmd_get_resource)]) + assert "resource 39" in out + + # Ensure files are editable + mod1 = next(project.glob("**/mod1.py")) + mod2 = next(project.glob("**/mod2.py")) + resource_file = next(project.glob("**/resource_file.txt")) + + mod1.write_text("var = 17", encoding="utf-8") + mod2.write_text("var = 781", encoding="utf-8") + resource_file.write_text("resource 374", encoding="utf-8") + + out = venv.run(["python", "-c", dedent(cmd_get_vars)]) + assert "42 13" not in out + assert "17 781" in out + + out = venv.run(["python", "-c", dedent(cmd_get_resource)]) + assert "resource 39" not in out + assert "resource 374" in out + + +class TestLinkTree: + FILES = deepcopy(TestOverallBehaviour.EXAMPLES["src-layout"]) + FILES["pyproject.toml"] += dedent( + """\ + [tool.setuptools] + # Temporary workaround: both `include-package-data` and `package-data` configs + # can be removed after #3260 is fixed. + include-package-data = false + package-data = {"*" = ["*.txt"]} + + [tool.setuptools.packages.find] + where = ["src"] + exclude = ["*.subpackage*"] + """ + ) + FILES["src"]["mypkg"]["resource.not_in_manifest"] = "abc" + + def test_generated_tree(self, tmp_path): + jaraco.path.build(self.FILES, prefix=tmp_path) + + with _Path(tmp_path): + name = "mypkg-3.14159" + dist = Distribution({"script_name": "%PEP 517%"}) + dist.parse_config_files() + + wheel = Mock() + aux = tmp_path / ".aux" + build = tmp_path / ".build" + aux.mkdir() + build.mkdir() + + build_py = dist.get_command_obj("build_py") + build_py.editable_mode = True + build_py.build_lib = str(build) + build_py.ensure_finalized() + outputs = build_py.get_outputs() + output_mapping = build_py.get_output_mapping() + + make_tree = _LinkTree(dist, name, aux, build) + make_tree(wheel, outputs, output_mapping) + + mod1 = next(aux.glob("**/mod1.py")) + expected = tmp_path / "src/mypkg/mod1.py" + assert_link_to(mod1, expected) + + assert next(aux.glob("**/subpackage"), None) is None + assert next(aux.glob("**/mod2.py"), None) is None + assert next(aux.glob("**/resource_file.txt"), None) is None + + assert next(aux.glob("**/resource.not_in_manifest"), None) is None + + def test_strict_install(self, tmp_path, venv): + opts = ["--config-settings", "editable-mode=strict"] + install_project("mypkg", venv, tmp_path, self.FILES, *opts) + + out = venv.run(["python", "-c", "import mypkg.mod1; print(mypkg.mod1.var)"]) + assert "42" in out + + # Ensure packages excluded from distribution are not importable + cmd_import_error = """\ + try: + from mypkg import subpackage + except ImportError as ex: + print(ex) + """ + out = venv.run(["python", "-c", dedent(cmd_import_error)]) + assert "cannot import name 'subpackage'" in out + + # Ensure resource files excluded from distribution are not reachable + cmd_get_resource = """\ + import mypkg + from setuptools._importlib import resources as importlib_resources + try: + text = importlib_resources.files(mypkg) / "resource.not_in_manifest" + print(text.read_text(encoding="utf-8")) + except FileNotFoundError as ex: + print(ex) + """ + out = venv.run(["python", "-c", dedent(cmd_get_resource)]) + assert "No such file or directory" in out + assert "resource.not_in_manifest" in out + + +@pytest.mark.filterwarnings("ignore:.*compat.*:setuptools.SetuptoolsDeprecationWarning") +def test_compat_install(tmp_path, venv): + # TODO: Remove `compat` after Dec/2022. + opts = ["--config-settings", "editable-mode=compat"] + files = TestOverallBehaviour.EXAMPLES["custom-layout"] + install_project("mypkg", venv, tmp_path, files, *opts) + + out = venv.run(["python", "-c", "import mypkg.mod1; print(mypkg.mod1.var)"]) + assert "42" in out + + expected_path = comparable_path(str(tmp_path)) + + # Compatible behaviour will make spurious modules and excluded + # files importable directly from the original path + for cmd in ( + "import otherfile; print(otherfile)", + "import other; print(other)", + "import mypkg; print(mypkg)", + ): + out = comparable_path(venv.run(["python", "-c", cmd])) + assert expected_path in out + + # Compatible behaviour will not consider custom mappings + cmd = """\ + try: + from mypkg import subpackage; + except ImportError as ex: + print(ex) + """ + out = venv.run(["python", "-c", dedent(cmd)]) + assert "cannot import name 'subpackage'" in out + + +def test_pbr_integration(tmp_path, venv, editable_opts): + """Ensure editable installs work with pbr, issue #3500""" + files = { + "pyproject.toml": dedent( + """\ + [build-system] + requires = ["setuptools"] + build-backend = "setuptools.build_meta" + """ + ), + "setup.py": dedent( + """\ + __import__('setuptools').setup( + pbr=True, + setup_requires=["pbr"], + ) + """ + ), + "setup.cfg": dedent( + """\ + [metadata] + name = mypkg + + [files] + packages = + mypkg + """ + ), + "mypkg": { + "__init__.py": "", + "hello.py": "print('Hello world!')", + }, + "other": {"test.txt": "Another file in here."}, + } + venv.run(["python", "-m", "pip", "install", "pbr"]) + + with contexts.environment(PBR_VERSION="0.42"): + install_project("mypkg", venv, tmp_path, files, *editable_opts) + + out = venv.run(["python", "-c", "import mypkg.hello"]) + assert "Hello world!" in out + + +class TestCustomBuildPy: + """ + Issue #3501 indicates that some plugins/customizations might rely on: + + 1. ``build_py`` not running + 2. ``build_py`` always copying files to ``build_lib`` + + During the transition period setuptools should prevent potential errors from + happening due to those assumptions. + """ + + # TODO: Remove tests after _run_build_steps is removed. + + FILES = { + **TestOverallBehaviour.EXAMPLES["flat-layout"], + "setup.py": dedent( + """\ + import pathlib + from setuptools import setup + from setuptools.command.build_py import build_py as orig + + class my_build_py(orig): + def run(self): + super().run() + raise ValueError("TEST_RAISE") + + setup(cmdclass={"build_py": my_build_py}) + """ + ), + } + + def test_safeguarded_from_errors(self, tmp_path, venv): + """Ensure that errors in custom build_py are reported as warnings""" + # Warnings should show up + _, out = install_project("mypkg", venv, tmp_path, self.FILES) + assert "SetuptoolsDeprecationWarning" in out + assert "ValueError: TEST_RAISE" in out + # but installation should be successful + out = venv.run(["python", "-c", "import mypkg.mod1; print(mypkg.mod1.var)"]) + assert "42" in out + + +class TestCustomBuildWheel: + def install_custom_build_wheel(self, dist): + bdist_wheel_cls = dist.get_command_class("bdist_wheel") + + class MyBdistWheel(bdist_wheel_cls): + def get_tag(self): + # In issue #3513, we can see that some extensions may try to access + # the `plat_name` property in bdist_wheel + if self.plat_name.startswith("macosx-"): + _ = "macOS platform" + return super().get_tag() + + dist.cmdclass["bdist_wheel"] = MyBdistWheel + + def test_access_plat_name(self, tmpdir_cwd): + # Even when a custom bdist_wheel tries to access plat_name the build should + # be successful + jaraco.path.build({"module.py": "x = 42"}) + dist = Distribution() + dist.script_name = "setup.py" + dist.set_defaults() + self.install_custom_build_wheel(dist) + cmd = editable_wheel(dist) + cmd.ensure_finalized() + cmd.run() + wheel_file = str(next(Path().glob('dist/*.whl'))) + assert "editable" in wheel_file + + +class TestCustomBuildExt: + def install_custom_build_ext_distutils(self, dist): + from distutils.command.build_ext import build_ext as build_ext_cls + + class MyBuildExt(build_ext_cls): + pass + + dist.cmdclass["build_ext"] = MyBuildExt + + @pytest.mark.skipif( + sys.platform != "linux", reason="compilers may fail without correct setup" + ) + def test_distutils_leave_inplace_files(self, tmpdir_cwd): + jaraco.path.build({"module.c": ""}) + attrs = { + "ext_modules": [Extension("module", ["module.c"])], + } + dist = Distribution(attrs) + dist.script_name = "setup.py" + dist.set_defaults() + self.install_custom_build_ext_distutils(dist) + cmd = editable_wheel(dist) + cmd.ensure_finalized() + cmd.run() + wheel_file = str(next(Path().glob('dist/*.whl'))) + assert "editable" in wheel_file + files = [p for p in Path().glob("module.*") if p.suffix != ".c"] + assert len(files) == 1 + name = files[0].name + assert any(name.endswith(ext) for ext in EXTENSION_SUFFIXES) + + +def test_debugging_tips(tmpdir_cwd, monkeypatch): + """Make sure to display useful debugging tips to the user.""" + jaraco.path.build({"module.py": "x = 42"}) + dist = Distribution() + dist.script_name = "setup.py" + dist.set_defaults() + cmd = editable_wheel(dist) + cmd.ensure_finalized() + + SimulatedErr = type("SimulatedErr", (Exception,), {}) + simulated_failure = Mock(side_effect=SimulatedErr()) + monkeypatch.setattr(cmd, "get_finalized_command", simulated_failure) + + expected_msg = "following steps are recommended to help debug" + with pytest.raises(SimulatedErr), pytest.warns(_DebuggingTips, match=expected_msg): + cmd.run() + + +@pytest.mark.filterwarnings("error") +def test_encode_pth(): + """Ensure _encode_pth function does not produce encoding warnings""" + content = _encode_pth("tkmilan_ç_utf8") # no warnings (would be turned into errors) + assert isinstance(content, bytes) + + +def install_project(name, venv, tmp_path, files, *opts): + project = tmp_path / name + project.mkdir() + jaraco.path.build(files, prefix=project) + opts = [*opts, "--no-build-isolation"] # force current version of setuptools + out = venv.run( + ["python", "-m", "pip", "-v", "install", "-e", str(project), *opts], + stderr=subprocess.STDOUT, + ) + return project, out + + +def _addsitedirs(new_dirs): + """To use this function, it is necessary to insert new_dir in front of sys.path. + The Python process will try to import a ``sitecustomize`` module on startup. + If we manipulate sys.path/PYTHONPATH, we can force it to run our code, + which invokes ``addsitedir`` and ensure ``.pth`` files are loaded. + """ + content = '\n'.join( + ("import site",) + + tuple(f"site.addsitedir({os.fspath(new_dir)!r})" for new_dir in new_dirs) + ) + (new_dirs[0] / "sitecustomize.py").write_text(content, encoding="utf-8") + + +# ---- Assertion Helpers ---- + + +def assert_path(pkg, expected): + # __path__ is not guaranteed to exist, so we have to account for that + if pkg.__path__: + path = next(iter(pkg.__path__), None) + if path: + assert str(Path(path).resolve()) == expected + + +def assert_link_to(file: Path, other: Path) -> None: + if file.is_symlink(): + assert str(file.resolve()) == str(other.resolve()) + else: + file_stat = file.stat() + other_stat = other.stat() + assert file_stat[stat.ST_INO] == other_stat[stat.ST_INO] + assert file_stat[stat.ST_DEV] == other_stat[stat.ST_DEV] + + +def comparable_path(str_with_path: str) -> str: + return str_with_path.lower().replace(os.sep, "/").replace("//", "/") diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/setuptools/tests/test_egg_info.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/setuptools/tests/test_egg_info.py new file mode 100644 index 0000000000000000000000000000000000000000..8879ec58cec4c6e98d60da7a014653c7d9e48f36 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/setuptools/tests/test_egg_info.py @@ -0,0 +1,1285 @@ +from __future__ import annotations + +import ast +import glob +import os +import re +import stat +import sys +import time +from pathlib import Path +from unittest import mock + +import pytest +from jaraco import path + +from setuptools import errors +from setuptools.command.egg_info import egg_info, manifest_maker, write_entries +from setuptools.dist import Distribution + +from . import contexts, environment +from .textwrap import DALS + + +class Environment(str): + pass + + +@pytest.fixture +def env(): + with contexts.tempdir(prefix='setuptools-test.') as env_dir: + env = Environment(env_dir) + os.chmod(env_dir, stat.S_IRWXU) + subs = 'home', 'lib', 'scripts', 'data', 'egg-base' + env.paths = dict((dirname, os.path.join(env_dir, dirname)) for dirname in subs) + list(map(os.mkdir, env.paths.values())) + path.build({ + env.paths['home']: { + '.pydistutils.cfg': DALS( + """ + [egg_info] + egg-base = {egg-base} + """.format(**env.paths) + ) + } + }) + yield env + + +class TestEggInfo: + setup_script = DALS( + """ + from setuptools import setup + + setup( + name='foo', + py_modules=['hello'], + entry_points={'console_scripts': ['hi = hello.run']}, + zip_safe=False, + ) + """ + ) + + def _create_project(self): + path.build({ + 'setup.py': self.setup_script, + 'hello.py': DALS( + """ + def run(): + print('hello') + """ + ), + }) + + @staticmethod + def _extract_mv_version(pkg_info_lines: list[str]) -> tuple[int, int]: + version_str = pkg_info_lines[0].split(' ')[1] + major, minor = map(int, version_str.split('.')[:2]) + return major, minor + + def test_egg_info_save_version_info_setup_empty(self, tmpdir_cwd, env): + """ + When the egg_info section is empty or not present, running + save_version_info should add the settings to the setup.cfg + in a deterministic order. + """ + setup_cfg = os.path.join(env.paths['home'], 'setup.cfg') + dist = Distribution() + ei = egg_info(dist) + ei.initialize_options() + ei.save_version_info(setup_cfg) + + with open(setup_cfg, 'r', encoding="utf-8") as f: + content = f.read() + + assert '[egg_info]' in content + assert 'tag_build =' in content + assert 'tag_date = 0' in content + + expected_order = ( + 'tag_build', + 'tag_date', + ) + + self._validate_content_order(content, expected_order) + + @staticmethod + def _validate_content_order(content, expected): + """ + Assert that the strings in expected appear in content + in order. + """ + pattern = '.*'.join(expected) + flags = re.MULTILINE | re.DOTALL + assert re.search(pattern, content, flags) + + def test_egg_info_save_version_info_setup_defaults(self, tmpdir_cwd, env): + """ + When running save_version_info on an existing setup.cfg + with the 'default' values present from a previous run, + the file should remain unchanged. + """ + setup_cfg = os.path.join(env.paths['home'], 'setup.cfg') + path.build({ + setup_cfg: DALS( + """ + [egg_info] + tag_build = + tag_date = 0 + """ + ), + }) + dist = Distribution() + ei = egg_info(dist) + ei.initialize_options() + ei.save_version_info(setup_cfg) + + with open(setup_cfg, 'r', encoding="utf-8") as f: + content = f.read() + + assert '[egg_info]' in content + assert 'tag_build =' in content + assert 'tag_date = 0' in content + + expected_order = ( + 'tag_build', + 'tag_date', + ) + + self._validate_content_order(content, expected_order) + + def test_expected_files_produced(self, tmpdir_cwd, env): + self._create_project() + + self._run_egg_info_command(tmpdir_cwd, env) + actual = os.listdir('foo.egg-info') + + expected = [ + 'PKG-INFO', + 'SOURCES.txt', + 'dependency_links.txt', + 'entry_points.txt', + 'not-zip-safe', + 'top_level.txt', + ] + assert sorted(actual) == expected + + def test_handling_utime_error(self, tmpdir_cwd, env): + dist = Distribution() + ei = egg_info(dist) + utime_patch = mock.patch('os.utime', side_effect=OSError("TEST")) + mkpath_patch = mock.patch( + 'setuptools.command.egg_info.egg_info.mkpath', return_val=None + ) + + with utime_patch, mkpath_patch: + import distutils.errors + + msg = r"Cannot update time stamp of directory 'None'" + with pytest.raises(distutils.errors.DistutilsFileError, match=msg): + ei.run() + + def test_license_is_a_string(self, tmpdir_cwd, env): + setup_config = DALS( + """ + [metadata] + name=foo + version=0.0.1 + license=file:MIT + """ + ) + + setup_script = DALS( + """ + from setuptools import setup + + setup() + """ + ) + + path.build({ + 'setup.py': setup_script, + 'setup.cfg': setup_config, + }) + + # This command should fail with a ValueError, but because it's + # currently configured to use a subprocess, the actual traceback + # object is lost and we need to parse it from stderr + with pytest.raises(AssertionError) as exc: + self._run_egg_info_command(tmpdir_cwd, env) + + # The only argument to the assertion error should be a traceback + # containing a ValueError + assert 'ValueError' in exc.value.args[0] + + def test_rebuilt(self, tmpdir_cwd, env): + """Ensure timestamps are updated when the command is re-run.""" + self._create_project() + + self._run_egg_info_command(tmpdir_cwd, env) + timestamp_a = os.path.getmtime('foo.egg-info') + + # arbitrary sleep just to handle *really* fast systems + time.sleep(0.001) + + self._run_egg_info_command(tmpdir_cwd, env) + timestamp_b = os.path.getmtime('foo.egg-info') + + assert timestamp_a != timestamp_b + + def test_manifest_template_is_read(self, tmpdir_cwd, env): + self._create_project() + path.build({ + 'MANIFEST.in': DALS( + """ + recursive-include docs *.rst + """ + ), + 'docs': { + 'usage.rst': "Run 'hi'", + }, + }) + self._run_egg_info_command(tmpdir_cwd, env) + egg_info_dir = os.path.join('.', 'foo.egg-info') + sources_txt = os.path.join(egg_info_dir, 'SOURCES.txt') + with open(sources_txt, encoding="utf-8") as f: + assert 'docs/usage.rst' in f.read().split('\n') + + def _setup_script_with_requires(self, requires, use_setup_cfg=False): + setup_script = DALS( + """ + from setuptools import setup + + setup(name='foo', zip_safe=False, %s) + """ + ) % ('' if use_setup_cfg else requires) + setup_config = requires if use_setup_cfg else '' + path.build({ + 'setup.py': setup_script, + 'setup.cfg': setup_config, + }) + + mismatch_marker = f"python_version<'{sys.version_info[0]}'" + # Alternate equivalent syntax. + mismatch_marker_alternate = f'python_version < "{sys.version_info[0]}"' + invalid_marker = "<=>++" + + class RequiresTestHelper: + @staticmethod + def parametrize(*test_list, **format_dict): + idlist = [] + argvalues = [] + for test in test_list: + test_params = test.lstrip().split('\n\n', 3) + name_kwargs = test_params.pop(0).split('\n') + if len(name_kwargs) > 1: + val = name_kwargs[1].strip() + install_cmd_kwargs = ast.literal_eval(val) + else: + install_cmd_kwargs = {} + name = name_kwargs[0].strip() + setup_py_requires, setup_cfg_requires, expected_requires = [ + DALS(a).format(**format_dict) for a in test_params + ] + for id_, requires, use_cfg in ( + (name, setup_py_requires, False), + (name + '_in_setup_cfg', setup_cfg_requires, True), + ): + idlist.append(id_) + marks = () + if requires.startswith('@xfail\n'): + requires = requires[7:] + marks = pytest.mark.xfail + argvalues.append( + pytest.param( + requires, + use_cfg, + expected_requires, + install_cmd_kwargs, + marks=marks, + ) + ) + return pytest.mark.parametrize( + ( + "requires", + "use_setup_cfg", + "expected_requires", + "install_cmd_kwargs", + ), + argvalues, + ids=idlist, + ) + + @RequiresTestHelper.parametrize( + # Format of a test: + # + # id + # install_cmd_kwargs [optional] + # + # requires block (when used in setup.py) + # + # requires block (when used in setup.cfg) + # + # expected contents of requires.txt + """ + install_requires_deterministic + + install_requires=["wheel>=0.5", "pytest"] + + [options] + install_requires = + wheel>=0.5 + pytest + + wheel>=0.5 + pytest + """, + """ + install_requires_ordered + + install_requires=["pytest>=3.0.2,!=10.9999"] + + [options] + install_requires = + pytest>=3.0.2,!=10.9999 + + pytest!=10.9999,>=3.0.2 + """, + """ + install_requires_with_marker + + install_requires=["barbazquux;{mismatch_marker}"], + + [options] + install_requires = + barbazquux; {mismatch_marker} + + [:{mismatch_marker_alternate}] + barbazquux + """, + """ + install_requires_with_extra + {'cmd': ['egg_info']} + + install_requires=["barbazquux [test]"], + + [options] + install_requires = + barbazquux [test] + + barbazquux[test] + """, + """ + install_requires_with_extra_and_marker + + install_requires=["barbazquux [test]; {mismatch_marker}"], + + [options] + install_requires = + barbazquux [test]; {mismatch_marker} + + [:{mismatch_marker_alternate}] + barbazquux[test] + """, + """ + setup_requires_with_markers + + setup_requires=["barbazquux;{mismatch_marker}"], + + [options] + setup_requires = + barbazquux; {mismatch_marker} + + """, + """ + extras_require_with_extra + {'cmd': ['egg_info']} + + extras_require={{"extra": ["barbazquux [test]"]}}, + + [options.extras_require] + extra = barbazquux [test] + + [extra] + barbazquux[test] + """, + """ + extras_require_with_extra_and_marker_in_req + + extras_require={{"extra": ["barbazquux [test]; {mismatch_marker}"]}}, + + [options.extras_require] + extra = + barbazquux [test]; {mismatch_marker} + + [extra] + + [extra:{mismatch_marker_alternate}] + barbazquux[test] + """, + # FIXME: ConfigParser does not allow : in key names! + """ + extras_require_with_marker + + extras_require={{":{mismatch_marker}": ["barbazquux"]}}, + + @xfail + [options.extras_require] + :{mismatch_marker} = barbazquux + + [:{mismatch_marker}] + barbazquux + """, + """ + extras_require_with_marker_in_req + + extras_require={{"extra": ["barbazquux; {mismatch_marker}"]}}, + + [options.extras_require] + extra = + barbazquux; {mismatch_marker} + + [extra] + + [extra:{mismatch_marker_alternate}] + barbazquux + """, + """ + extras_require_with_empty_section + + extras_require={{"empty": []}}, + + [options.extras_require] + empty = + + [empty] + """, + # Format arguments. + invalid_marker=invalid_marker, + mismatch_marker=mismatch_marker, + mismatch_marker_alternate=mismatch_marker_alternate, + ) + def test_requires( + self, + tmpdir_cwd, + env, + requires, + use_setup_cfg, + expected_requires, + install_cmd_kwargs, + ): + self._setup_script_with_requires(requires, use_setup_cfg) + self._run_egg_info_command(tmpdir_cwd, env, **install_cmd_kwargs) + egg_info_dir = os.path.join('.', 'foo.egg-info') + requires_txt = os.path.join(egg_info_dir, 'requires.txt') + if os.path.exists(requires_txt): + with open(requires_txt, encoding="utf-8") as fp: + install_requires = fp.read() + else: + install_requires = '' + assert install_requires.lstrip() == expected_requires + assert glob.glob(os.path.join(env.paths['lib'], 'barbazquux*')) == [] + + def test_install_requires_unordered_disallowed(self, tmpdir_cwd, env): + """ + Packages that pass unordered install_requires sequences + should be rejected as they produce non-deterministic + builds. See #458. + """ + req = 'install_requires={"fake-factory==0.5.2", "pytz"}' + self._setup_script_with_requires(req) + with pytest.raises(AssertionError): + self._run_egg_info_command(tmpdir_cwd, env) + + def test_extras_require_with_invalid_marker(self, tmpdir_cwd, env): + tmpl = 'extras_require={{":{marker}": ["barbazquux"]}},' + req = tmpl.format(marker=self.invalid_marker) + self._setup_script_with_requires(req) + with pytest.raises(AssertionError): + self._run_egg_info_command(tmpdir_cwd, env) + assert glob.glob(os.path.join(env.paths['lib'], 'barbazquux*')) == [] + + def test_extras_require_with_invalid_marker_in_req(self, tmpdir_cwd, env): + tmpl = 'extras_require={{"extra": ["barbazquux; {marker}"]}},' + req = tmpl.format(marker=self.invalid_marker) + self._setup_script_with_requires(req) + with pytest.raises(AssertionError): + self._run_egg_info_command(tmpdir_cwd, env) + assert glob.glob(os.path.join(env.paths['lib'], 'barbazquux*')) == [] + + def test_provides_extra(self, tmpdir_cwd, env): + self._setup_script_with_requires('extras_require={"foobar": ["barbazquux"]},') + environ = os.environ.copy().update( + HOME=env.paths['home'], + ) + environment.run_setup_py( + cmd=['egg_info'], + pypath=os.pathsep.join([env.paths['lib'], str(tmpdir_cwd)]), + data_stream=1, + env=environ, + ) + egg_info_dir = os.path.join('.', 'foo.egg-info') + with open(os.path.join(egg_info_dir, 'PKG-INFO'), encoding="utf-8") as fp: + pkg_info_lines = fp.read().split('\n') + assert 'Provides-Extra: foobar' in pkg_info_lines + assert 'Metadata-Version: 2.2' in pkg_info_lines + + def test_doesnt_provides_extra(self, tmpdir_cwd, env): + self._setup_script_with_requires( + """install_requires=["spam ; python_version<'3.6'"]""" + ) + environ = os.environ.copy().update( + HOME=env.paths['home'], + ) + environment.run_setup_py( + cmd=['egg_info'], + pypath=os.pathsep.join([env.paths['lib'], str(tmpdir_cwd)]), + data_stream=1, + env=environ, + ) + egg_info_dir = os.path.join('.', 'foo.egg-info') + with open(os.path.join(egg_info_dir, 'PKG-INFO'), encoding="utf-8") as fp: + pkg_info_text = fp.read() + assert 'Provides-Extra:' not in pkg_info_text + + @pytest.mark.parametrize( + ('files', 'license_in_sources'), + [ + ( + { + 'setup.cfg': DALS( + """ + [metadata] + license_file = LICENSE + """ + ), + 'LICENSE': "Test license", + }, + True, + ), # with license + ( + { + 'setup.cfg': DALS( + """ + [metadata] + license_file = INVALID_LICENSE + """ + ), + 'LICENSE': "Test license", + }, + False, + ), # with an invalid license + ( + { + 'setup.cfg': DALS( + """ + """ + ), + 'LICENSE': "Test license", + }, + True, + ), # no license_file attribute, LICENSE auto-included + ( + { + 'setup.cfg': DALS( + """ + [metadata] + license_file = LICENSE + """ + ), + 'MANIFEST.in': "exclude LICENSE", + 'LICENSE': "Test license", + }, + True, + ), # manifest is overwritten by license_file + pytest.param( + { + 'setup.cfg': DALS( + """ + [metadata] + license_file = LICEN[CS]E* + """ + ), + 'LICENSE': "Test license", + }, + True, + id="glob_pattern", + ), + ], + ) + def test_setup_cfg_license_file(self, tmpdir_cwd, env, files, license_in_sources): + self._create_project() + path.build(files) + + environment.run_setup_py( + cmd=['egg_info'], + pypath=os.pathsep.join([env.paths['lib'], str(tmpdir_cwd)]), + ) + egg_info_dir = os.path.join('.', 'foo.egg-info') + + sources_text = Path(egg_info_dir, "SOURCES.txt").read_text(encoding="utf-8") + + if license_in_sources: + assert 'LICENSE' in sources_text + else: + assert 'LICENSE' not in sources_text + # for invalid license test + assert 'INVALID_LICENSE' not in sources_text + + @pytest.mark.parametrize( + ('files', 'incl_licenses', 'excl_licenses'), + [ + ( + { + 'setup.cfg': DALS( + """ + [metadata] + license_files = + LICENSE-ABC + LICENSE-XYZ + """ + ), + 'LICENSE-ABC': "ABC license", + 'LICENSE-XYZ': "XYZ license", + }, + ['LICENSE-ABC', 'LICENSE-XYZ'], + [], + ), # with licenses + ( + { + 'setup.cfg': DALS( + """ + [metadata] + license_files = LICENSE-ABC, LICENSE-XYZ + """ + ), + 'LICENSE-ABC': "ABC license", + 'LICENSE-XYZ': "XYZ license", + }, + ['LICENSE-ABC', 'LICENSE-XYZ'], + [], + ), # with commas + ( + { + 'setup.cfg': DALS( + """ + [metadata] + license_files = + LICENSE-ABC + """ + ), + 'LICENSE-ABC': "ABC license", + 'LICENSE-XYZ': "XYZ license", + }, + ['LICENSE-ABC'], + ['LICENSE-XYZ'], + ), # with one license + ( + { + 'setup.cfg': DALS( + """ + [metadata] + license_files = + """ + ), + 'LICENSE-ABC': "ABC license", + 'LICENSE-XYZ': "XYZ license", + }, + [], + ['LICENSE-ABC', 'LICENSE-XYZ'], + ), # empty + ( + { + 'setup.cfg': DALS( + """ + [metadata] + license_files = LICENSE-XYZ + """ + ), + 'LICENSE-ABC': "ABC license", + 'LICENSE-XYZ': "XYZ license", + }, + ['LICENSE-XYZ'], + ['LICENSE-ABC'], + ), # on same line + ( + { + 'setup.cfg': DALS( + """ + [metadata] + license_files = + LICENSE-ABC + INVALID_LICENSE + """ + ), + 'LICENSE-ABC': "Test license", + }, + ['LICENSE-ABC'], + ['INVALID_LICENSE'], + ), # with an invalid license + ( + { + 'setup.cfg': DALS( + """ + """ + ), + 'LICENSE': "Test license", + }, + ['LICENSE'], + [], + ), # no license_files attribute, LICENSE auto-included + ( + { + 'setup.cfg': DALS( + """ + [metadata] + license_files = LICENSE + """ + ), + 'MANIFEST.in': "exclude LICENSE", + 'LICENSE': "Test license", + }, + ['LICENSE'], + [], + ), # manifest is overwritten by license_files + ( + { + 'setup.cfg': DALS( + """ + [metadata] + license_files = + LICENSE-ABC + LICENSE-XYZ + """ + ), + 'MANIFEST.in': "exclude LICENSE-XYZ", + 'LICENSE-ABC': "ABC license", + 'LICENSE-XYZ': "XYZ license", + # manifest is overwritten by license_files + }, + ['LICENSE-ABC', 'LICENSE-XYZ'], + [], + ), + pytest.param( + { + 'setup.cfg': "", + 'LICENSE-ABC': "ABC license", + 'COPYING-ABC': "ABC copying", + 'NOTICE-ABC': "ABC notice", + 'AUTHORS-ABC': "ABC authors", + 'LICENCE-XYZ': "XYZ license", + 'LICENSE': "License", + 'INVALID-LICENSE': "Invalid license", + }, + [ + 'LICENSE-ABC', + 'COPYING-ABC', + 'NOTICE-ABC', + 'AUTHORS-ABC', + 'LICENCE-XYZ', + 'LICENSE', + ], + ['INVALID-LICENSE'], + # ('LICEN[CS]E*', 'COPYING*', 'NOTICE*', 'AUTHORS*') + id="default_glob_patterns", + ), + pytest.param( + { + 'setup.cfg': DALS( + """ + [metadata] + license_files = + LICENSE* + """ + ), + 'LICENSE-ABC': "ABC license", + 'NOTICE-XYZ': "XYZ notice", + }, + ['LICENSE-ABC'], + ['NOTICE-XYZ'], + id="no_default_glob_patterns", + ), + pytest.param( + { + 'setup.cfg': DALS( + """ + [metadata] + license_files = + LICENSE-ABC + LICENSE* + """ + ), + 'LICENSE-ABC': "ABC license", + }, + ['LICENSE-ABC'], + [], + id="files_only_added_once", + ), + ], + ) + def test_setup_cfg_license_files( + self, tmpdir_cwd, env, files, incl_licenses, excl_licenses + ): + self._create_project() + path.build(files) + + environment.run_setup_py( + cmd=['egg_info'], + pypath=os.pathsep.join([env.paths['lib'], str(tmpdir_cwd)]), + ) + egg_info_dir = os.path.join('.', 'foo.egg-info') + + sources_text = Path(egg_info_dir, "SOURCES.txt").read_text(encoding="utf-8") + sources_lines = [line.strip() for line in sources_text.splitlines()] + + for lf in incl_licenses: + assert sources_lines.count(lf) == 1 + + for lf in excl_licenses: + assert sources_lines.count(lf) == 0 + + @pytest.mark.parametrize( + ('files', 'incl_licenses', 'excl_licenses'), + [ + ( + { + 'setup.cfg': DALS( + """ + [metadata] + license_file = + license_files = + """ + ), + 'LICENSE-ABC': "ABC license", + 'LICENSE-XYZ': "XYZ license", + }, + [], + ['LICENSE-ABC', 'LICENSE-XYZ'], + ), # both empty + ( + { + 'setup.cfg': DALS( + """ + [metadata] + license_file = + LICENSE-ABC + LICENSE-XYZ + """ + ), + 'LICENSE-ABC': "ABC license", + 'LICENSE-XYZ': "XYZ license", + # license_file is still singular + }, + [], + ['LICENSE-ABC', 'LICENSE-XYZ'], + ), + ( + { + 'setup.cfg': DALS( + """ + [metadata] + license_file = LICENSE-ABC + license_files = + LICENSE-XYZ + LICENSE-PQR + """ + ), + 'LICENSE-ABC': "ABC license", + 'LICENSE-PQR': "PQR license", + 'LICENSE-XYZ': "XYZ license", + }, + ['LICENSE-ABC', 'LICENSE-PQR', 'LICENSE-XYZ'], + [], + ), # combined + ( + { + 'setup.cfg': DALS( + """ + [metadata] + license_file = LICENSE-ABC + license_files = + LICENSE-ABC + LICENSE-XYZ + LICENSE-PQR + """ + ), + 'LICENSE-ABC': "ABC license", + 'LICENSE-PQR': "PQR license", + 'LICENSE-XYZ': "XYZ license", + # duplicate license + }, + ['LICENSE-ABC', 'LICENSE-PQR', 'LICENSE-XYZ'], + [], + ), + ( + { + 'setup.cfg': DALS( + """ + [metadata] + license_file = LICENSE-ABC + license_files = + LICENSE-XYZ + """ + ), + 'LICENSE-ABC': "ABC license", + 'LICENSE-PQR': "PQR license", + 'LICENSE-XYZ': "XYZ license", + # combined subset + }, + ['LICENSE-ABC', 'LICENSE-XYZ'], + ['LICENSE-PQR'], + ), + ( + { + 'setup.cfg': DALS( + """ + [metadata] + license_file = LICENSE-ABC + license_files = + LICENSE-XYZ + LICENSE-PQR + """ + ), + 'LICENSE-PQR': "Test license", + # with invalid licenses + }, + ['LICENSE-PQR'], + ['LICENSE-ABC', 'LICENSE-XYZ'], + ), + ( + { + 'setup.cfg': DALS( + """ + [metadata] + license_file = LICENSE-ABC + license_files = + LICENSE-PQR + LICENSE-XYZ + """ + ), + 'MANIFEST.in': "exclude LICENSE-ABC\nexclude LICENSE-PQR", + 'LICENSE-ABC': "ABC license", + 'LICENSE-PQR': "PQR license", + 'LICENSE-XYZ': "XYZ license", + # manifest is overwritten + }, + ['LICENSE-ABC', 'LICENSE-PQR', 'LICENSE-XYZ'], + [], + ), + pytest.param( + { + 'setup.cfg': DALS( + """ + [metadata] + license_file = LICENSE* + """ + ), + 'LICENSE-ABC': "ABC license", + 'NOTICE-XYZ': "XYZ notice", + }, + ['LICENSE-ABC'], + ['NOTICE-XYZ'], + id="no_default_glob_patterns", + ), + pytest.param( + { + 'setup.cfg': DALS( + """ + [metadata] + license_file = LICENSE* + license_files = + NOTICE* + """ + ), + 'LICENSE-ABC': "ABC license", + 'NOTICE-ABC': "ABC notice", + 'AUTHORS-ABC': "ABC authors", + }, + ['LICENSE-ABC', 'NOTICE-ABC'], + ['AUTHORS-ABC'], + id="combined_glob_patterrns", + ), + ], + ) + def test_setup_cfg_license_file_license_files( + self, tmpdir_cwd, env, files, incl_licenses, excl_licenses + ): + self._create_project() + path.build(files) + + environment.run_setup_py( + cmd=['egg_info'], + pypath=os.pathsep.join([env.paths['lib'], str(tmpdir_cwd)]), + ) + egg_info_dir = os.path.join('.', 'foo.egg-info') + + sources_text = Path(egg_info_dir, "SOURCES.txt").read_text(encoding="utf-8") + sources_lines = [line.strip() for line in sources_text.splitlines()] + + for lf in incl_licenses: + assert sources_lines.count(lf) == 1 + + for lf in excl_licenses: + assert sources_lines.count(lf) == 0 + + def test_license_file_attr_pkg_info(self, tmpdir_cwd, env): + """All matched license files should have a corresponding License-File.""" + self._create_project() + path.build({ + "setup.cfg": DALS( + """ + [metadata] + license_files = + NOTICE* + LICENSE* + """ + ), + "LICENSE-ABC": "ABC license", + "LICENSE-XYZ": "XYZ license", + "NOTICE": "included", + "IGNORE": "not include", + }) + + environment.run_setup_py( + cmd=['egg_info'], + pypath=os.pathsep.join([env.paths['lib'], str(tmpdir_cwd)]), + ) + egg_info_dir = os.path.join('.', 'foo.egg-info') + with open(os.path.join(egg_info_dir, 'PKG-INFO'), encoding="utf-8") as fp: + pkg_info_lines = fp.read().split('\n') + license_file_lines = [ + line for line in pkg_info_lines if line.startswith('License-File:') + ] + + # Only 'NOTICE', LICENSE-ABC', and 'LICENSE-XYZ' should have been matched + # Also assert that order from license_files is keeped + assert "License-File: NOTICE" == license_file_lines[0] + assert "License-File: LICENSE-ABC" in license_file_lines[1:] + assert "License-File: LICENSE-XYZ" in license_file_lines[1:] + + def test_metadata_version(self, tmpdir_cwd, env): + """Make sure latest metadata version is used by default.""" + self._setup_script_with_requires("") + environment.run_setup_py( + cmd=['egg_info'], + pypath=os.pathsep.join([env.paths['lib'], str(tmpdir_cwd)]), + data_stream=1, + ) + egg_info_dir = os.path.join('.', 'foo.egg-info') + with open(os.path.join(egg_info_dir, 'PKG-INFO'), encoding="utf-8") as fp: + pkg_info_lines = fp.read().split('\n') + # Update metadata version if changed + assert self._extract_mv_version(pkg_info_lines) == (2, 2) + + def test_long_description_content_type(self, tmpdir_cwd, env): + # Test that specifying a `long_description_content_type` keyword arg to + # the `setup` function results in writing a `Description-Content-Type` + # line to the `PKG-INFO` file in the `.egg-info` + # directory. + # `Description-Content-Type` is described at + # https://github.com/pypa/python-packaging-user-guide/pull/258 + + self._setup_script_with_requires( + """long_description_content_type='text/markdown',""" + ) + environ = os.environ.copy().update( + HOME=env.paths['home'], + ) + environment.run_setup_py( + cmd=['egg_info'], + pypath=os.pathsep.join([env.paths['lib'], str(tmpdir_cwd)]), + data_stream=1, + env=environ, + ) + egg_info_dir = os.path.join('.', 'foo.egg-info') + with open(os.path.join(egg_info_dir, 'PKG-INFO'), encoding="utf-8") as fp: + pkg_info_lines = fp.read().split('\n') + expected_line = 'Description-Content-Type: text/markdown' + assert expected_line in pkg_info_lines + assert 'Metadata-Version: 2.2' in pkg_info_lines + + def test_long_description(self, tmpdir_cwd, env): + # Test that specifying `long_description` and `long_description_content_type` + # keyword args to the `setup` function results in writing + # the description in the message payload of the `PKG-INFO` file + # in the `.egg-info` directory. + self._setup_script_with_requires( + "long_description='This is a long description\\nover multiple lines'," + "long_description_content_type='text/markdown'," + ) + environment.run_setup_py( + cmd=['egg_info'], + pypath=os.pathsep.join([env.paths['lib'], str(tmpdir_cwd)]), + data_stream=1, + ) + egg_info_dir = os.path.join('.', 'foo.egg-info') + with open(os.path.join(egg_info_dir, 'PKG-INFO'), encoding="utf-8") as fp: + pkg_info_lines = fp.read().split('\n') + assert 'Metadata-Version: 2.2' in pkg_info_lines + assert '' == pkg_info_lines[-1] # last line should be empty + long_desc_lines = pkg_info_lines[pkg_info_lines.index('') :] + assert 'This is a long description' in long_desc_lines + assert 'over multiple lines' in long_desc_lines + + def test_project_urls(self, tmpdir_cwd, env): + # Test that specifying a `project_urls` dict to the `setup` + # function results in writing multiple `Project-URL` lines to + # the `PKG-INFO` file in the `.egg-info` + # directory. + # `Project-URL` is described at https://packaging.python.org + # /specifications/core-metadata/#project-url-multiple-use + + self._setup_script_with_requires( + """project_urls={ + 'Link One': 'https://example.com/one/', + 'Link Two': 'https://example.com/two/', + },""" + ) + environ = os.environ.copy().update( + HOME=env.paths['home'], + ) + environment.run_setup_py( + cmd=['egg_info'], + pypath=os.pathsep.join([env.paths['lib'], str(tmpdir_cwd)]), + data_stream=1, + env=environ, + ) + egg_info_dir = os.path.join('.', 'foo.egg-info') + with open(os.path.join(egg_info_dir, 'PKG-INFO'), encoding="utf-8") as fp: + pkg_info_lines = fp.read().split('\n') + expected_line = 'Project-URL: Link One, https://example.com/one/' + assert expected_line in pkg_info_lines + expected_line = 'Project-URL: Link Two, https://example.com/two/' + assert expected_line in pkg_info_lines + assert self._extract_mv_version(pkg_info_lines) >= (1, 2) + + def test_license(self, tmpdir_cwd, env): + """Test single line license.""" + self._setup_script_with_requires("license='MIT',") + environment.run_setup_py( + cmd=['egg_info'], + pypath=os.pathsep.join([env.paths['lib'], str(tmpdir_cwd)]), + data_stream=1, + ) + egg_info_dir = os.path.join('.', 'foo.egg-info') + with open(os.path.join(egg_info_dir, 'PKG-INFO'), encoding="utf-8") as fp: + pkg_info_lines = fp.read().split('\n') + assert 'License: MIT' in pkg_info_lines + + def test_license_escape(self, tmpdir_cwd, env): + """Test license is escaped correctly if longer than one line.""" + self._setup_script_with_requires( + "license='This is a long license text \\nover multiple lines'," + ) + environment.run_setup_py( + cmd=['egg_info'], + pypath=os.pathsep.join([env.paths['lib'], str(tmpdir_cwd)]), + data_stream=1, + ) + egg_info_dir = os.path.join('.', 'foo.egg-info') + with open(os.path.join(egg_info_dir, 'PKG-INFO'), encoding="utf-8") as fp: + pkg_info_lines = fp.read().split('\n') + + assert 'License: This is a long license text ' in pkg_info_lines + assert ' over multiple lines' in pkg_info_lines + assert 'text \n over multiple' in '\n'.join(pkg_info_lines) + + def test_python_requires_egg_info(self, tmpdir_cwd, env): + self._setup_script_with_requires("""python_requires='>=2.7.12',""") + environ = os.environ.copy().update( + HOME=env.paths['home'], + ) + environment.run_setup_py( + cmd=['egg_info'], + pypath=os.pathsep.join([env.paths['lib'], str(tmpdir_cwd)]), + data_stream=1, + env=environ, + ) + egg_info_dir = os.path.join('.', 'foo.egg-info') + with open(os.path.join(egg_info_dir, 'PKG-INFO'), encoding="utf-8") as fp: + pkg_info_lines = fp.read().split('\n') + assert 'Requires-Python: >=2.7.12' in pkg_info_lines + assert self._extract_mv_version(pkg_info_lines) >= (1, 2) + + def test_manifest_maker_warning_suppression(self): + fixtures = [ + "standard file not found: should have one of foo.py, bar.py", + "standard file 'setup.py' not found", + ] + + for msg in fixtures: + assert manifest_maker._should_suppress_warning(msg) + + def test_egg_info_includes_setup_py(self, tmpdir_cwd): + self._create_project() + dist = Distribution({"name": "foo", "version": "0.0.1"}) + dist.script_name = "non_setup.py" + egg_info_instance = egg_info(dist) + egg_info_instance.finalize_options() + egg_info_instance.run() + + assert 'setup.py' in egg_info_instance.filelist.files + + with open(egg_info_instance.egg_info + "/SOURCES.txt", encoding="utf-8") as f: + sources = f.read().split('\n') + assert 'setup.py' in sources + + def _run_egg_info_command(self, tmpdir_cwd, env, cmd=None, output=None): + environ = os.environ.copy().update( + HOME=env.paths['home'], + ) + if cmd is None: + cmd = [ + 'egg_info', + ] + code, data = environment.run_setup_py( + cmd=cmd, + pypath=os.pathsep.join([env.paths['lib'], str(tmpdir_cwd)]), + data_stream=1, + env=environ, + ) + assert not code, data + + if output: + assert output in data + + def test_egg_info_tag_only_once(self, tmpdir_cwd, env): + self._create_project() + path.build({ + 'setup.cfg': DALS( + """ + [egg_info] + tag_build = dev + tag_date = 0 + tag_svn_revision = 0 + """ + ), + }) + self._run_egg_info_command(tmpdir_cwd, env) + egg_info_dir = os.path.join('.', 'foo.egg-info') + with open(os.path.join(egg_info_dir, 'PKG-INFO'), encoding="utf-8") as fp: + pkg_info_lines = fp.read().split('\n') + assert 'Version: 0.0.0.dev0' in pkg_info_lines + + +class TestWriteEntries: + def test_invalid_entry_point(self, tmpdir_cwd, env): + dist = Distribution({"name": "foo", "version": "0.0.1"}) + dist.entry_points = {"foo": "foo = invalid-identifier:foo"} + cmd = dist.get_command_obj("egg_info") + expected_msg = r"Problems to parse .*invalid-identifier.*" + with pytest.raises(errors.OptionError, match=expected_msg) as ex: + write_entries(cmd, "entry_points", "entry_points.txt") + assert "ensure entry-point follows the spec" in ex.value.args[0] + + def test_valid_entry_point(self, tmpdir_cwd, env): + dist = Distribution({"name": "foo", "version": "0.0.1"}) + dist.entry_points = { + "abc": "foo = bar:baz", + "def": ["faa = bor:boz"], + } + cmd = dist.get_command_obj("egg_info") + write_entries(cmd, "entry_points", "entry_points.txt") + content = Path("entry_points.txt").read_text(encoding="utf-8") + assert "[abc]\nfoo = bar:baz\n" in content + assert "[def]\nfaa = bor:boz\n" in content diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/setuptools/tests/test_extern.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/setuptools/tests/test_extern.py new file mode 100644 index 0000000000000000000000000000000000000000..d7eb3c62c190dacdd4a054d2934962be2f4ee860 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/setuptools/tests/test_extern.py @@ -0,0 +1,15 @@ +import importlib +import pickle + +import packaging + +from setuptools import Distribution + + +def test_reimport_extern(): + packaging2 = importlib.import_module(packaging.__name__) + assert packaging is packaging2 + + +def test_distribution_picklable(): + pickle.loads(pickle.dumps(Distribution())) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/setuptools/tests/test_find_packages.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/setuptools/tests/test_find_packages.py new file mode 100644 index 0000000000000000000000000000000000000000..9fd9f8f6637d13cb898fcd446a6b090323d02014 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/setuptools/tests/test_find_packages.py @@ -0,0 +1,218 @@ +"""Tests for automatic package discovery""" + +import os +import shutil +import tempfile + +import pytest + +from setuptools import find_namespace_packages, find_packages +from setuptools.discovery import FlatLayoutPackageFinder + +from .compat.py39 import os_helper + + +class TestFindPackages: + def setup_method(self, method): + self.dist_dir = tempfile.mkdtemp() + self._make_pkg_structure() + + def teardown_method(self, method): + shutil.rmtree(self.dist_dir) + + def _make_pkg_structure(self): + """Make basic package structure. + + dist/ + docs/ + conf.py + pkg/ + __pycache__/ + nspkg/ + mod.py + subpkg/ + assets/ + asset + __init__.py + setup.py + + """ + self.docs_dir = self._mkdir('docs', self.dist_dir) + self._touch('conf.py', self.docs_dir) + self.pkg_dir = self._mkdir('pkg', self.dist_dir) + self._mkdir('__pycache__', self.pkg_dir) + self.ns_pkg_dir = self._mkdir('nspkg', self.pkg_dir) + self._touch('mod.py', self.ns_pkg_dir) + self.sub_pkg_dir = self._mkdir('subpkg', self.pkg_dir) + self.asset_dir = self._mkdir('assets', self.sub_pkg_dir) + self._touch('asset', self.asset_dir) + self._touch('__init__.py', self.sub_pkg_dir) + self._touch('setup.py', self.dist_dir) + + def _mkdir(self, path, parent_dir=None): + if parent_dir: + path = os.path.join(parent_dir, path) + os.mkdir(path) + return path + + def _touch(self, path, dir_=None): + if dir_: + path = os.path.join(dir_, path) + open(path, 'wb').close() + return path + + def test_regular_package(self): + self._touch('__init__.py', self.pkg_dir) + packages = find_packages(self.dist_dir) + assert packages == ['pkg', 'pkg.subpkg'] + + def test_exclude(self): + self._touch('__init__.py', self.pkg_dir) + packages = find_packages(self.dist_dir, exclude=('pkg.*',)) + assert packages == ['pkg'] + + def test_exclude_recursive(self): + """ + Excluding a parent package should not exclude child packages as well. + """ + self._touch('__init__.py', self.pkg_dir) + self._touch('__init__.py', self.sub_pkg_dir) + packages = find_packages(self.dist_dir, exclude=('pkg',)) + assert packages == ['pkg.subpkg'] + + def test_include_excludes_other(self): + """ + If include is specified, other packages should be excluded. + """ + self._touch('__init__.py', self.pkg_dir) + alt_dir = self._mkdir('other_pkg', self.dist_dir) + self._touch('__init__.py', alt_dir) + packages = find_packages(self.dist_dir, include=['other_pkg']) + assert packages == ['other_pkg'] + + def test_dir_with_dot_is_skipped(self): + shutil.rmtree(os.path.join(self.dist_dir, 'pkg/subpkg/assets')) + data_dir = self._mkdir('some.data', self.pkg_dir) + self._touch('__init__.py', data_dir) + self._touch('file.dat', data_dir) + packages = find_packages(self.dist_dir) + assert 'pkg.some.data' not in packages + + def test_dir_with_packages_in_subdir_is_excluded(self): + """ + Ensure that a package in a non-package such as build/pkg/__init__.py + is excluded. + """ + build_dir = self._mkdir('build', self.dist_dir) + build_pkg_dir = self._mkdir('pkg', build_dir) + self._touch('__init__.py', build_pkg_dir) + packages = find_packages(self.dist_dir) + assert 'build.pkg' not in packages + + @pytest.mark.skipif(not os_helper.can_symlink(), reason='Symlink support required') + def test_symlinked_packages_are_included(self): + """ + A symbolically-linked directory should be treated like any other + directory when matched as a package. + + Create a link from lpkg -> pkg. + """ + self._touch('__init__.py', self.pkg_dir) + linked_pkg = os.path.join(self.dist_dir, 'lpkg') + os.symlink('pkg', linked_pkg) + assert os.path.isdir(linked_pkg) + packages = find_packages(self.dist_dir) + assert 'lpkg' in packages + + def _assert_packages(self, actual, expected): + assert set(actual) == set(expected) + + def test_pep420_ns_package(self): + packages = find_namespace_packages( + self.dist_dir, include=['pkg*'], exclude=['pkg.subpkg.assets'] + ) + self._assert_packages(packages, ['pkg', 'pkg.nspkg', 'pkg.subpkg']) + + def test_pep420_ns_package_no_includes(self): + packages = find_namespace_packages(self.dist_dir, exclude=['pkg.subpkg.assets']) + self._assert_packages(packages, ['docs', 'pkg', 'pkg.nspkg', 'pkg.subpkg']) + + def test_pep420_ns_package_no_includes_or_excludes(self): + packages = find_namespace_packages(self.dist_dir) + expected = ['docs', 'pkg', 'pkg.nspkg', 'pkg.subpkg', 'pkg.subpkg.assets'] + self._assert_packages(packages, expected) + + def test_regular_package_with_nested_pep420_ns_packages(self): + self._touch('__init__.py', self.pkg_dir) + packages = find_namespace_packages( + self.dist_dir, exclude=['docs', 'pkg.subpkg.assets'] + ) + self._assert_packages(packages, ['pkg', 'pkg.nspkg', 'pkg.subpkg']) + + def test_pep420_ns_package_no_non_package_dirs(self): + shutil.rmtree(self.docs_dir) + shutil.rmtree(os.path.join(self.dist_dir, 'pkg/subpkg/assets')) + packages = find_namespace_packages(self.dist_dir) + self._assert_packages(packages, ['pkg', 'pkg.nspkg', 'pkg.subpkg']) + + +class TestFlatLayoutPackageFinder: + EXAMPLES = { + "hidden-folders": ( + [".pkg/__init__.py", "pkg/__init__.py", "pkg/nested/file.txt"], + ["pkg", "pkg.nested"], + ), + "private-packages": ( + ["_pkg/__init__.py", "pkg/_private/__init__.py"], + ["pkg", "pkg._private"], + ), + "invalid-name": ( + ["invalid-pkg/__init__.py", "other.pkg/__init__.py", "yet,another/file.py"], + [], + ), + "docs": (["pkg/__init__.py", "docs/conf.py", "docs/readme.rst"], ["pkg"]), + "tests": ( + ["pkg/__init__.py", "tests/test_pkg.py", "tests/__init__.py"], + ["pkg"], + ), + "examples": ( + [ + "pkg/__init__.py", + "examples/__init__.py", + "examples/file.py", + "example/other_file.py", + # Sub-packages should always be fine + "pkg/example/__init__.py", + "pkg/examples/__init__.py", + ], + ["pkg", "pkg.examples", "pkg.example"], + ), + "tool-specific": ( + [ + "htmlcov/index.html", + "pkg/__init__.py", + "tasks/__init__.py", + "tasks/subpackage/__init__.py", + "fabfile/__init__.py", + "fabfile/subpackage/__init__.py", + # Sub-packages should always be fine + "pkg/tasks/__init__.py", + "pkg/fabfile/__init__.py", + ], + ["pkg", "pkg.tasks", "pkg.fabfile"], + ), + } + + @pytest.mark.parametrize("example", EXAMPLES.keys()) + def test_unwanted_directories_not_included(self, tmp_path, example): + files, expected_packages = self.EXAMPLES[example] + ensure_files(tmp_path, files) + found_packages = FlatLayoutPackageFinder.find(str(tmp_path)) + assert set(found_packages) == set(expected_packages) + + +def ensure_files(root_path, files): + for file in files: + path = root_path / file + path.parent.mkdir(parents=True, exist_ok=True) + path.touch() diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/setuptools/tests/test_find_py_modules.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/setuptools/tests/test_find_py_modules.py new file mode 100644 index 0000000000000000000000000000000000000000..8034b544294e5d30274bac82f24f93613120a0d4 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/setuptools/tests/test_find_py_modules.py @@ -0,0 +1,73 @@ +"""Tests for automatic discovery of modules""" + +import os + +import pytest + +from setuptools.discovery import FlatLayoutModuleFinder, ModuleFinder + +from .compat.py39 import os_helper +from .test_find_packages import ensure_files + + +class TestModuleFinder: + def find(self, path, *args, **kwargs): + return set(ModuleFinder.find(str(path), *args, **kwargs)) + + EXAMPLES = { + # circumstance: (files, kwargs, expected_modules) + "simple_folder": ( + ["file.py", "other.py"], + {}, # kwargs + ["file", "other"], + ), + "exclude": ( + ["file.py", "other.py"], + {"exclude": ["f*"]}, + ["other"], + ), + "include": ( + ["file.py", "fole.py", "other.py"], + {"include": ["f*"], "exclude": ["fo*"]}, + ["file"], + ), + "invalid-name": (["my-file.py", "other.file.py"], {}, []), + } + + @pytest.mark.parametrize("example", EXAMPLES.keys()) + def test_finder(self, tmp_path, example): + files, kwargs, expected_modules = self.EXAMPLES[example] + ensure_files(tmp_path, files) + assert self.find(tmp_path, **kwargs) == set(expected_modules) + + @pytest.mark.skipif(not os_helper.can_symlink(), reason='Symlink support required') + def test_symlinked_packages_are_included(self, tmp_path): + src = "_myfiles/file.py" + ensure_files(tmp_path, [src]) + os.symlink(tmp_path / src, tmp_path / "link.py") + assert self.find(tmp_path) == {"link"} + + +class TestFlatLayoutModuleFinder: + def find(self, path, *args, **kwargs): + return set(FlatLayoutModuleFinder.find(str(path))) + + EXAMPLES = { + # circumstance: (files, expected_modules) + "hidden-files": ([".module.py"], []), + "private-modules": (["_module.py"], []), + "common-names": ( + ["setup.py", "conftest.py", "test.py", "tests.py", "example.py", "mod.py"], + ["mod"], + ), + "tool-specific": ( + ["tasks.py", "fabfile.py", "noxfile.py", "dodo.py", "manage.py", "mod.py"], + ["mod"], + ), + } + + @pytest.mark.parametrize("example", EXAMPLES.keys()) + def test_unwanted_files_not_included(self, tmp_path, example): + files, expected_modules = self.EXAMPLES[example] + ensure_files(tmp_path, files) + assert self.find(tmp_path) == set(expected_modules) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/setuptools/tests/test_glob.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/setuptools/tests/test_glob.py new file mode 100644 index 0000000000000000000000000000000000000000..8d225a44610163c7d56d65b07c06f0f598ccfe84 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/setuptools/tests/test_glob.py @@ -0,0 +1,45 @@ +import pytest +from jaraco import path + +from setuptools.glob import glob + + +@pytest.mark.parametrize( + ('tree', 'pattern', 'matches'), + ( + ('', b'', []), + ('', '', []), + ( + """ + appveyor.yml + CHANGES.rst + LICENSE + MANIFEST.in + pyproject.toml + README.rst + setup.cfg + setup.py + """, + '*.rst', + ('CHANGES.rst', 'README.rst'), + ), + ( + """ + appveyor.yml + CHANGES.rst + LICENSE + MANIFEST.in + pyproject.toml + README.rst + setup.cfg + setup.py + """, + b'*.rst', + (b'CHANGES.rst', b'README.rst'), + ), + ), +) +def test_glob(monkeypatch, tmpdir, tree, pattern, matches): + monkeypatch.chdir(tmpdir) + path.build({name: '' for name in tree.split()}) + assert list(sorted(glob(pattern))) == list(sorted(matches)) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/setuptools/tests/test_install_scripts.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/setuptools/tests/test_install_scripts.py new file mode 100644 index 0000000000000000000000000000000000000000..e62a6b7f318df2da0cf29e53c41f74e5525e78ac --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/setuptools/tests/test_install_scripts.py @@ -0,0 +1,89 @@ +"""install_scripts tests""" + +import sys + +import pytest + +from setuptools.command.install_scripts import install_scripts +from setuptools.dist import Distribution + +from . import contexts + + +class TestInstallScripts: + settings = dict( + name='foo', + entry_points={'console_scripts': ['foo=foo:foo']}, + version='0.0', + ) + unix_exe = '/usr/dummy-test-path/local/bin/python' + unix_spaces_exe = '/usr/bin/env dummy-test-python' + win32_exe = 'C:\\Dummy Test Path\\Program Files\\Python 3.6\\python.exe' + + def _run_install_scripts(self, install_dir, executable=None): + dist = Distribution(self.settings) + dist.script_name = 'setup.py' + cmd = install_scripts(dist) + cmd.install_dir = install_dir + if executable is not None: + bs = cmd.get_finalized_command('build_scripts') + bs.executable = executable + cmd.ensure_finalized() + with contexts.quiet(): + cmd.run() + + @pytest.mark.skipif(sys.platform == 'win32', reason='non-Windows only') + def test_sys_executable_escaping_unix(self, tmpdir, monkeypatch): + """ + Ensure that shebang is not quoted on Unix when getting the Python exe + from sys.executable. + """ + expected = f'#!{self.unix_exe}\n' + monkeypatch.setattr('sys.executable', self.unix_exe) + with tmpdir.as_cwd(): + self._run_install_scripts(str(tmpdir)) + with open(str(tmpdir.join('foo')), 'r', encoding="utf-8") as f: + actual = f.readline() + assert actual == expected + + @pytest.mark.skipif(sys.platform != 'win32', reason='Windows only') + def test_sys_executable_escaping_win32(self, tmpdir, monkeypatch): + """ + Ensure that shebang is quoted on Windows when getting the Python exe + from sys.executable and it contains a space. + """ + expected = f'#!"{self.win32_exe}"\n' + monkeypatch.setattr('sys.executable', self.win32_exe) + with tmpdir.as_cwd(): + self._run_install_scripts(str(tmpdir)) + with open(str(tmpdir.join('foo-script.py')), 'r', encoding="utf-8") as f: + actual = f.readline() + assert actual == expected + + @pytest.mark.skipif(sys.platform == 'win32', reason='non-Windows only') + def test_executable_with_spaces_escaping_unix(self, tmpdir): + """ + Ensure that shebang on Unix is not quoted, even when + a value with spaces + is specified using --executable. + """ + expected = f'#!{self.unix_spaces_exe}\n' + with tmpdir.as_cwd(): + self._run_install_scripts(str(tmpdir), self.unix_spaces_exe) + with open(str(tmpdir.join('foo')), 'r', encoding="utf-8") as f: + actual = f.readline() + assert actual == expected + + @pytest.mark.skipif(sys.platform != 'win32', reason='Windows only') + def test_executable_arg_escaping_win32(self, tmpdir): + """ + Ensure that shebang on Windows is quoted when + getting a path with spaces + from --executable, that is itself properly quoted. + """ + expected = f'#!"{self.win32_exe}"\n' + with tmpdir.as_cwd(): + self._run_install_scripts(str(tmpdir), '"' + self.win32_exe + '"') + with open(str(tmpdir.join('foo-script.py')), 'r', encoding="utf-8") as f: + actual = f.readline() + assert actual == expected diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/setuptools/tests/test_logging.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/setuptools/tests/test_logging.py new file mode 100644 index 0000000000000000000000000000000000000000..ea58001e93d8e4bcbd50bfd49303324d71858d16 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/setuptools/tests/test_logging.py @@ -0,0 +1,76 @@ +import functools +import inspect +import logging +import sys + +import pytest + +IS_PYPY = '__pypy__' in sys.builtin_module_names + + +setup_py = """\ +from setuptools import setup + +setup( + name="test_logging", + version="0.0" +) +""" + + +@pytest.mark.parametrize( + ('flag', 'expected_level'), [("--dry-run", "INFO"), ("--verbose", "DEBUG")] +) +def test_verbosity_level(tmp_path, monkeypatch, flag, expected_level): + """Make sure the correct verbosity level is set (issue #3038)""" + import setuptools # noqa: F401 # import setuptools to monkeypatch distutils + + import distutils # <- load distutils after all the patches take place + + logger = logging.Logger(__name__) + monkeypatch.setattr(logging, "root", logger) + unset_log_level = logger.getEffectiveLevel() + assert logging.getLevelName(unset_log_level) == "NOTSET" + + setup_script = tmp_path / "setup.py" + setup_script.write_text(setup_py, encoding="utf-8") + dist = distutils.core.run_setup(setup_script, stop_after="init") + dist.script_args = [flag, "sdist"] + dist.parse_command_line() # <- where the log level is set + log_level = logger.getEffectiveLevel() + log_level_name = logging.getLevelName(log_level) + assert log_level_name == expected_level + + +def flaky_on_pypy(func): + @functools.wraps(func) + def _func(): + try: + func() + except AssertionError: # pragma: no cover + if IS_PYPY: + msg = "Flaky monkeypatch on PyPy (#4124)" + pytest.xfail(f"{msg}. Original discussion in #3707, #3709.") + raise + + return _func + + +@flaky_on_pypy +def test_patching_does_not_cause_problems(): + # Ensure `dist.log` is only patched if necessary + + import _distutils_hack + + import setuptools.logging + + from distutils import dist + + setuptools.logging.configure() + + if _distutils_hack.enabled(): + # Modern logging infra, no problematic patching. + assert dist.__file__ is None or "setuptools" in dist.__file__ + assert isinstance(dist.log, logging.Logger) + else: + assert inspect.ismodule(dist.log) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/setuptools/tests/test_manifest.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/setuptools/tests/test_manifest.py new file mode 100644 index 0000000000000000000000000000000000000000..903a528db0cc2bba27bbcef24aa1c59dc4156528 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/setuptools/tests/test_manifest.py @@ -0,0 +1,622 @@ +"""sdist tests""" + +from __future__ import annotations + +import contextlib +import io +import itertools +import logging +import os +import shutil +import sys +import tempfile + +import pytest + +from setuptools.command.egg_info import FileList, egg_info, translate_pattern +from setuptools.dist import Distribution +from setuptools.tests.textwrap import DALS + +from distutils import log +from distutils.errors import DistutilsTemplateError + +IS_PYPY = '__pypy__' in sys.builtin_module_names + + +def make_local_path(s): + """Converts '/' in a string to os.sep""" + return s.replace('/', os.sep) + + +SETUP_ATTRS = { + 'name': 'app', + 'version': '0.0', + 'packages': ['app'], +} + +SETUP_PY = f"""\ +from setuptools import setup + +setup(**{SETUP_ATTRS!r}) +""" + + +@contextlib.contextmanager +def quiet(): + old_stdout, old_stderr = sys.stdout, sys.stderr + sys.stdout, sys.stderr = io.StringIO(), io.StringIO() + try: + yield + finally: + sys.stdout, sys.stderr = old_stdout, old_stderr + + +def touch(filename): + open(filename, 'wb').close() + + +# The set of files always in the manifest, including all files in the +# .egg-info directory +default_files = frozenset( + map( + make_local_path, + [ + 'README.rst', + 'MANIFEST.in', + 'setup.py', + 'app.egg-info/PKG-INFO', + 'app.egg-info/SOURCES.txt', + 'app.egg-info/dependency_links.txt', + 'app.egg-info/top_level.txt', + 'app/__init__.py', + ], + ) +) + + +translate_specs: list[tuple[str, list[str], list[str]]] = [ + ('foo', ['foo'], ['bar', 'foobar']), + ('foo/bar', ['foo/bar'], ['foo/bar/baz', './foo/bar', 'foo']), + # Glob matching + ('*.txt', ['foo.txt', 'bar.txt'], ['foo/foo.txt']), + ('dir/*.txt', ['dir/foo.txt', 'dir/bar.txt', 'dir/.txt'], ['notdir/foo.txt']), + ('*/*.py', ['bin/start.py'], []), + ('docs/page-?.txt', ['docs/page-9.txt'], ['docs/page-10.txt']), + # Globstars change what they mean depending upon where they are + ( + 'foo/**/bar', + ['foo/bing/bar', 'foo/bing/bang/bar', 'foo/bar'], + ['foo/abar'], + ), + ( + 'foo/**', + ['foo/bar/bing.py', 'foo/x'], + ['/foo/x'], + ), + ( + '**', + ['x', 'abc/xyz', '@nything'], + [], + ), + # Character classes + ( + 'pre[one]post', + ['preopost', 'prenpost', 'preepost'], + ['prepost', 'preonepost'], + ), + ( + 'hello[!one]world', + ['helloxworld', 'helloyworld'], + ['hellooworld', 'helloworld', 'hellooneworld'], + ), + ( + '[]one].txt', + ['o.txt', '].txt', 'e.txt'], + ['one].txt'], + ), + ( + 'foo[!]one]bar', + ['fooybar'], + ['foo]bar', 'fooobar', 'fooebar'], + ), +] +""" +A spec of inputs for 'translate_pattern' and matches and mismatches +for that input. +""" + +match_params = itertools.chain.from_iterable( + zip(itertools.repeat(pattern), matches) + for pattern, matches, mismatches in translate_specs +) + + +@pytest.fixture(params=match_params) +def pattern_match(request): + return map(make_local_path, request.param) + + +mismatch_params = itertools.chain.from_iterable( + zip(itertools.repeat(pattern), mismatches) + for pattern, matches, mismatches in translate_specs +) + + +@pytest.fixture(params=mismatch_params) +def pattern_mismatch(request): + return map(make_local_path, request.param) + + +def test_translated_pattern_match(pattern_match): + pattern, target = pattern_match + assert translate_pattern(pattern).match(target) + + +def test_translated_pattern_mismatch(pattern_mismatch): + pattern, target = pattern_mismatch + assert not translate_pattern(pattern).match(target) + + +class TempDirTestCase: + def setup_method(self, method): + self.temp_dir = tempfile.mkdtemp() + self.old_cwd = os.getcwd() + os.chdir(self.temp_dir) + + def teardown_method(self, method): + os.chdir(self.old_cwd) + shutil.rmtree(self.temp_dir) + + +class TestManifestTest(TempDirTestCase): + def setup_method(self, method): + super().setup_method(method) + + f = open(os.path.join(self.temp_dir, 'setup.py'), 'w', encoding="utf-8") + f.write(SETUP_PY) + f.close() + """ + Create a file tree like: + - LICENSE + - README.rst + - testing.rst + - .hidden.rst + - app/ + - __init__.py + - a.txt + - b.txt + - c.rst + - static/ + - app.js + - app.js.map + - app.css + - app.css.map + """ + + for fname in ['README.rst', '.hidden.rst', 'testing.rst', 'LICENSE']: + touch(os.path.join(self.temp_dir, fname)) + + # Set up the rest of the test package + test_pkg = os.path.join(self.temp_dir, 'app') + os.mkdir(test_pkg) + for fname in ['__init__.py', 'a.txt', 'b.txt', 'c.rst']: + touch(os.path.join(test_pkg, fname)) + + # Some compiled front-end assets to include + static = os.path.join(test_pkg, 'static') + os.mkdir(static) + for fname in ['app.js', 'app.js.map', 'app.css', 'app.css.map']: + touch(os.path.join(static, fname)) + + def make_manifest(self, contents): + """Write a MANIFEST.in.""" + manifest = os.path.join(self.temp_dir, 'MANIFEST.in') + with open(manifest, 'w', encoding="utf-8") as f: + f.write(DALS(contents)) + + def get_files(self): + """Run egg_info and get all the files to include, as a set""" + dist = Distribution(SETUP_ATTRS) + dist.script_name = 'setup.py' + cmd = egg_info(dist) + cmd.ensure_finalized() + + cmd.run() + + return set(cmd.filelist.files) + + def test_no_manifest(self): + """Check a missing MANIFEST.in includes only the standard files.""" + assert (default_files - set(['MANIFEST.in'])) == self.get_files() + + def test_empty_files(self): + """Check an empty MANIFEST.in includes only the standard files.""" + self.make_manifest("") + assert default_files == self.get_files() + + def test_include(self): + """Include extra rst files in the project root.""" + self.make_manifest("include *.rst") + files = default_files | set(['testing.rst', '.hidden.rst']) + assert files == self.get_files() + + def test_exclude(self): + """Include everything in app/ except the text files""" + ml = make_local_path + self.make_manifest( + """ + include app/* + exclude app/*.txt + """ + ) + files = default_files | set([ml('app/c.rst')]) + assert files == self.get_files() + + def test_include_multiple(self): + """Include with multiple patterns.""" + ml = make_local_path + self.make_manifest("include app/*.txt app/static/*") + files = default_files | set([ + ml('app/a.txt'), + ml('app/b.txt'), + ml('app/static/app.js'), + ml('app/static/app.js.map'), + ml('app/static/app.css'), + ml('app/static/app.css.map'), + ]) + assert files == self.get_files() + + def test_graft(self): + """Include the whole app/static/ directory.""" + ml = make_local_path + self.make_manifest("graft app/static") + files = default_files | set([ + ml('app/static/app.js'), + ml('app/static/app.js.map'), + ml('app/static/app.css'), + ml('app/static/app.css.map'), + ]) + assert files == self.get_files() + + def test_graft_glob_syntax(self): + """Include the whole app/static/ directory.""" + ml = make_local_path + self.make_manifest("graft */static") + files = default_files | set([ + ml('app/static/app.js'), + ml('app/static/app.js.map'), + ml('app/static/app.css'), + ml('app/static/app.css.map'), + ]) + assert files == self.get_files() + + def test_graft_global_exclude(self): + """Exclude all *.map files in the project.""" + ml = make_local_path + self.make_manifest( + """ + graft app/static + global-exclude *.map + """ + ) + files = default_files | set([ml('app/static/app.js'), ml('app/static/app.css')]) + assert files == self.get_files() + + def test_global_include(self): + """Include all *.rst, *.js, and *.css files in the whole tree.""" + ml = make_local_path + self.make_manifest( + """ + global-include *.rst *.js *.css + """ + ) + files = default_files | set([ + '.hidden.rst', + 'testing.rst', + ml('app/c.rst'), + ml('app/static/app.js'), + ml('app/static/app.css'), + ]) + assert files == self.get_files() + + def test_graft_prune(self): + """Include all files in app/, except for the whole app/static/ dir.""" + ml = make_local_path + self.make_manifest( + """ + graft app + prune app/static + """ + ) + files = default_files | set([ml('app/a.txt'), ml('app/b.txt'), ml('app/c.rst')]) + assert files == self.get_files() + + +class TestFileListTest(TempDirTestCase): + """ + A copy of the relevant bits of distutils/tests/test_filelist.py, + to ensure setuptools' version of FileList keeps parity with distutils. + """ + + @pytest.fixture(autouse=os.getenv("SETUPTOOLS_USE_DISTUTILS") == "stdlib") + def _compat_record_logs(self, monkeypatch, caplog): + """Account for stdlib compatibility""" + + def _log(_logger, level, msg, args): + exc = sys.exc_info() + rec = logging.LogRecord("distutils", level, "", 0, msg, args, exc) + caplog.records.append(rec) + + monkeypatch.setattr(log.Log, "_log", _log) + + def get_records(self, caplog, *levels): + return [r for r in caplog.records if r.levelno in levels] + + def assertNoWarnings(self, caplog): + assert self.get_records(caplog, log.WARN) == [] + caplog.clear() + + def assertWarnings(self, caplog): + if IS_PYPY and not caplog.records: + pytest.xfail("caplog checks may not work well in PyPy") + else: + assert len(self.get_records(caplog, log.WARN)) > 0 + caplog.clear() + + def make_files(self, files): + for file in files: + file = os.path.join(self.temp_dir, file) + dirname, _basename = os.path.split(file) + os.makedirs(dirname, exist_ok=True) + touch(file) + + def test_process_template_line(self): + # testing all MANIFEST.in template patterns + file_list = FileList() + ml = make_local_path + + # simulated file list + self.make_files([ + 'foo.tmp', + 'ok', + 'xo', + 'four.txt', + 'buildout.cfg', + # filelist does not filter out VCS directories, + # it's sdist that does + ml('.hg/last-message.txt'), + ml('global/one.txt'), + ml('global/two.txt'), + ml('global/files.x'), + ml('global/here.tmp'), + ml('f/o/f.oo'), + ml('dir/graft-one'), + ml('dir/dir2/graft2'), + ml('dir3/ok'), + ml('dir3/sub/ok.txt'), + ]) + + MANIFEST_IN = DALS( + """\ + include ok + include xo + exclude xo + include foo.tmp + include buildout.cfg + global-include *.x + global-include *.txt + global-exclude *.tmp + recursive-include f *.oo + recursive-exclude global *.x + graft dir + prune dir3 + """ + ) + + for line in MANIFEST_IN.split('\n'): + if not line: + continue + file_list.process_template_line(line) + + wanted = [ + 'buildout.cfg', + 'four.txt', + 'ok', + ml('.hg/last-message.txt'), + ml('dir/graft-one'), + ml('dir/dir2/graft2'), + ml('f/o/f.oo'), + ml('global/one.txt'), + ml('global/two.txt'), + ] + + file_list.sort() + assert file_list.files == wanted + + def test_exclude_pattern(self): + # return False if no match + file_list = FileList() + assert not file_list.exclude_pattern('*.py') + + # return True if files match + file_list = FileList() + file_list.files = ['a.py', 'b.py'] + assert file_list.exclude_pattern('*.py') + + # test excludes + file_list = FileList() + file_list.files = ['a.py', 'a.txt'] + file_list.exclude_pattern('*.py') + file_list.sort() + assert file_list.files == ['a.txt'] + + def test_include_pattern(self): + # return False if no match + file_list = FileList() + self.make_files([]) + assert not file_list.include_pattern('*.py') + + # return True if files match + file_list = FileList() + self.make_files(['a.py', 'b.txt']) + assert file_list.include_pattern('*.py') + + # test * matches all files + file_list = FileList() + self.make_files(['a.py', 'b.txt']) + file_list.include_pattern('*') + file_list.sort() + assert file_list.files == ['a.py', 'b.txt'] + + def test_process_template_line_invalid(self): + # invalid lines + file_list = FileList() + for action in ( + 'include', + 'exclude', + 'global-include', + 'global-exclude', + 'recursive-include', + 'recursive-exclude', + 'graft', + 'prune', + 'blarg', + ): + with pytest.raises(DistutilsTemplateError): + file_list.process_template_line(action) + + def test_include(self, caplog): + caplog.set_level(logging.DEBUG) + ml = make_local_path + # include + file_list = FileList() + self.make_files(['a.py', 'b.txt', ml('d/c.py')]) + + file_list.process_template_line('include *.py') + file_list.sort() + assert file_list.files == ['a.py'] + self.assertNoWarnings(caplog) + + file_list.process_template_line('include *.rb') + file_list.sort() + assert file_list.files == ['a.py'] + self.assertWarnings(caplog) + + def test_exclude(self, caplog): + caplog.set_level(logging.DEBUG) + ml = make_local_path + # exclude + file_list = FileList() + file_list.files = ['a.py', 'b.txt', ml('d/c.py')] + + file_list.process_template_line('exclude *.py') + file_list.sort() + assert file_list.files == ['b.txt', ml('d/c.py')] + self.assertNoWarnings(caplog) + + file_list.process_template_line('exclude *.rb') + file_list.sort() + assert file_list.files == ['b.txt', ml('d/c.py')] + self.assertWarnings(caplog) + + def test_global_include(self, caplog): + caplog.set_level(logging.DEBUG) + ml = make_local_path + # global-include + file_list = FileList() + self.make_files(['a.py', 'b.txt', ml('d/c.py')]) + + file_list.process_template_line('global-include *.py') + file_list.sort() + assert file_list.files == ['a.py', ml('d/c.py')] + self.assertNoWarnings(caplog) + + file_list.process_template_line('global-include *.rb') + file_list.sort() + assert file_list.files == ['a.py', ml('d/c.py')] + self.assertWarnings(caplog) + + def test_global_exclude(self, caplog): + caplog.set_level(logging.DEBUG) + ml = make_local_path + # global-exclude + file_list = FileList() + file_list.files = ['a.py', 'b.txt', ml('d/c.py')] + + file_list.process_template_line('global-exclude *.py') + file_list.sort() + assert file_list.files == ['b.txt'] + self.assertNoWarnings(caplog) + + file_list.process_template_line('global-exclude *.rb') + file_list.sort() + assert file_list.files == ['b.txt'] + self.assertWarnings(caplog) + + def test_recursive_include(self, caplog): + caplog.set_level(logging.DEBUG) + ml = make_local_path + # recursive-include + file_list = FileList() + self.make_files(['a.py', ml('d/b.py'), ml('d/c.txt'), ml('d/d/e.py')]) + + file_list.process_template_line('recursive-include d *.py') + file_list.sort() + assert file_list.files == [ml('d/b.py'), ml('d/d/e.py')] + self.assertNoWarnings(caplog) + + file_list.process_template_line('recursive-include e *.py') + file_list.sort() + assert file_list.files == [ml('d/b.py'), ml('d/d/e.py')] + self.assertWarnings(caplog) + + def test_recursive_exclude(self, caplog): + caplog.set_level(logging.DEBUG) + ml = make_local_path + # recursive-exclude + file_list = FileList() + file_list.files = ['a.py', ml('d/b.py'), ml('d/c.txt'), ml('d/d/e.py')] + + file_list.process_template_line('recursive-exclude d *.py') + file_list.sort() + assert file_list.files == ['a.py', ml('d/c.txt')] + self.assertNoWarnings(caplog) + + file_list.process_template_line('recursive-exclude e *.py') + file_list.sort() + assert file_list.files == ['a.py', ml('d/c.txt')] + self.assertWarnings(caplog) + + def test_graft(self, caplog): + caplog.set_level(logging.DEBUG) + ml = make_local_path + # graft + file_list = FileList() + self.make_files(['a.py', ml('d/b.py'), ml('d/d/e.py'), ml('f/f.py')]) + + file_list.process_template_line('graft d') + file_list.sort() + assert file_list.files == [ml('d/b.py'), ml('d/d/e.py')] + self.assertNoWarnings(caplog) + + file_list.process_template_line('graft e') + file_list.sort() + assert file_list.files == [ml('d/b.py'), ml('d/d/e.py')] + self.assertWarnings(caplog) + + def test_prune(self, caplog): + caplog.set_level(logging.DEBUG) + ml = make_local_path + # prune + file_list = FileList() + file_list.files = ['a.py', ml('d/b.py'), ml('d/d/e.py'), ml('f/f.py')] + + file_list.process_template_line('prune d') + file_list.sort() + assert file_list.files == ['a.py', ml('f/f.py')] + self.assertNoWarnings(caplog) + + file_list.process_template_line('prune e') + file_list.sort() + assert file_list.files == ['a.py', ml('f/f.py')] + self.assertWarnings(caplog) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/setuptools/tests/test_namespaces.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/setuptools/tests/test_namespaces.py new file mode 100644 index 0000000000000000000000000000000000000000..a0f4120bf7900b2118cc066034e036ab7af1798b --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/setuptools/tests/test_namespaces.py @@ -0,0 +1,138 @@ +import subprocess +import sys + +from setuptools._path import paths_on_pythonpath + +from . import namespaces + + +class TestNamespaces: + def test_mixed_site_and_non_site(self, tmpdir): + """ + Installing two packages sharing the same namespace, one installed + to a site dir and the other installed just to a path on PYTHONPATH + should leave the namespace in tact and both packages reachable by + import. + """ + pkg_A = namespaces.build_namespace_package(tmpdir, 'myns.pkgA') + pkg_B = namespaces.build_namespace_package(tmpdir, 'myns.pkgB') + site_packages = tmpdir / 'site-packages' + path_packages = tmpdir / 'path-packages' + targets = site_packages, path_packages + # use pip to install to the target directory + install_cmd = [ + sys.executable, + '-m', + 'pip.__main__', + 'install', + str(pkg_A), + '-t', + str(site_packages), + ] + subprocess.check_call(install_cmd) + namespaces.make_site_dir(site_packages) + install_cmd = [ + sys.executable, + '-m', + 'pip.__main__', + 'install', + str(pkg_B), + '-t', + str(path_packages), + ] + subprocess.check_call(install_cmd) + try_import = [ + sys.executable, + '-c', + 'import myns.pkgA; import myns.pkgB', + ] + with paths_on_pythonpath(map(str, targets)): + subprocess.check_call(try_import) + + def test_pkg_resources_import(self, tmpdir): + """ + Ensure that a namespace package doesn't break on import + of pkg_resources. + """ + pkg = namespaces.build_namespace_package(tmpdir, 'myns.pkgA') + target = tmpdir / 'packages' + target.mkdir() + install_cmd = [ + sys.executable, + '-m', + 'pip', + 'install', + '-t', + str(target), + str(pkg), + ] + with paths_on_pythonpath([str(target)]): + subprocess.check_call(install_cmd) + namespaces.make_site_dir(target) + try_import = [ + sys.executable, + '-c', + 'import pkg_resources', + ] + with paths_on_pythonpath([str(target)]): + subprocess.check_call(try_import) + + def test_namespace_package_installed_and_cwd(self, tmpdir): + """ + Installing a namespace packages but also having it in the current + working directory, only one version should take precedence. + """ + pkg_A = namespaces.build_namespace_package(tmpdir, 'myns.pkgA') + target = tmpdir / 'packages' + # use pip to install to the target directory + install_cmd = [ + sys.executable, + '-m', + 'pip.__main__', + 'install', + str(pkg_A), + '-t', + str(target), + ] + subprocess.check_call(install_cmd) + namespaces.make_site_dir(target) + + # ensure that package imports and pkg_resources imports + pkg_resources_imp = [ + sys.executable, + '-c', + 'import pkg_resources; import myns.pkgA', + ] + with paths_on_pythonpath([str(target)]): + subprocess.check_call(pkg_resources_imp, cwd=str(pkg_A)) + + def test_packages_in_the_same_namespace_installed_and_cwd(self, tmpdir): + """ + Installing one namespace package and also have another in the same + namespace in the current working directory, both of them must be + importable. + """ + pkg_A = namespaces.build_namespace_package(tmpdir, 'myns.pkgA') + pkg_B = namespaces.build_namespace_package(tmpdir, 'myns.pkgB') + target = tmpdir / 'packages' + # use pip to install to the target directory + install_cmd = [ + sys.executable, + '-m', + 'pip.__main__', + 'install', + str(pkg_A), + '-t', + str(target), + ] + subprocess.check_call(install_cmd) + namespaces.make_site_dir(target) + + # ensure that all packages import and pkg_resources imports + pkg_resources_imp = [ + sys.executable, + '-c', + 'import pkg_resources; import myns.pkgA; import myns.pkgB', + ] + with paths_on_pythonpath([str(target)]): + subprocess.check_call(pkg_resources_imp, cwd=str(pkg_B)) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/setuptools/tests/test_packageindex.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/setuptools/tests/test_packageindex.py new file mode 100644 index 0000000000000000000000000000000000000000..2a6e5917a8733c71a5d9b29d4bde6fc12cc1e183 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/setuptools/tests/test_packageindex.py @@ -0,0 +1,267 @@ +import http.client +import re +import urllib.error +import urllib.request +from inspect import cleandoc + +import pytest + +import setuptools.package_index + +import distutils.errors + + +class TestPackageIndex: + def test_regex(self): + hash_url = 'http://other_url?:action=show_md5&' + hash_url += 'digest=0123456789abcdef0123456789abcdef' + doc = """ + Name + (md5) + """.lstrip().format(**locals()) + assert setuptools.package_index.PYPI_MD5.match(doc) + + def test_bad_url_bad_port(self): + index = setuptools.package_index.PackageIndex() + url = 'http://127.0.0.1:0/nonesuch/test_package_index' + with pytest.raises(Exception, match=re.escape(url)): + v = index.open_url(url) + assert isinstance(v, urllib.error.HTTPError) + + def test_bad_url_typo(self): + # issue 16 + # easy_install inquant.contentmirror.plone breaks because of a typo + # in its home URL + index = setuptools.package_index.PackageIndex(hosts=('www.example.com',)) + + url = 'url:%20https://svn.plone.org/svn/collective/inquant.contentmirror.plone/trunk' + + with pytest.raises(Exception, match=re.escape(url)): + v = index.open_url(url) + assert isinstance(v, urllib.error.HTTPError) + + def test_bad_url_bad_status_line(self): + index = setuptools.package_index.PackageIndex(hosts=('www.example.com',)) + + def _urlopen(*args): + raise http.client.BadStatusLine('line') + + index.opener = _urlopen + url = 'http://example.com' + with pytest.raises(Exception, match=r'line'): + index.open_url(url) + + def test_bad_url_double_scheme(self): + """ + A bad URL with a double scheme should raise a DistutilsError. + """ + index = setuptools.package_index.PackageIndex(hosts=('www.example.com',)) + + # issue 20 + url = 'http://http://svn.pythonpaste.org/Paste/wphp/trunk' + try: + index.open_url(url) + except distutils.errors.DistutilsError as error: + msg = str(error) + assert ( + 'nonnumeric port' in msg + or 'getaddrinfo failed' in msg + or 'Name or service not known' in msg + ) + return + raise RuntimeError("Did not raise") + + def test_url_ok(self): + index = setuptools.package_index.PackageIndex(hosts=('www.example.com',)) + url = 'file:///tmp/test_package_index' + assert index.url_ok(url, True) + + def test_parse_bdist_wininst(self): + parse = setuptools.package_index.parse_bdist_wininst + + actual = parse('reportlab-2.5.win32-py2.4.exe') + expected = 'reportlab-2.5', '2.4', 'win32' + assert actual == expected + + actual = parse('reportlab-2.5.win32.exe') + expected = 'reportlab-2.5', None, 'win32' + assert actual == expected + + actual = parse('reportlab-2.5.win-amd64-py2.7.exe') + expected = 'reportlab-2.5', '2.7', 'win-amd64' + assert actual == expected + + actual = parse('reportlab-2.5.win-amd64.exe') + expected = 'reportlab-2.5', None, 'win-amd64' + assert actual == expected + + def test__vcs_split_rev_from_url(self): + """ + Test the basic usage of _vcs_split_rev_from_url + """ + vsrfu = setuptools.package_index.PackageIndex._vcs_split_rev_from_url + url, rev = vsrfu('https://example.com/bar@2995') + assert url == 'https://example.com/bar' + assert rev == '2995' + + def test_local_index(self, tmpdir): + """ + local_open should be able to read an index from the file system. + """ + index_file = tmpdir / 'index.html' + with index_file.open('w') as f: + f.write('
content
') + url = 'file:' + urllib.request.pathname2url(str(tmpdir)) + '/' + res = setuptools.package_index.local_open(url) + assert 'content' in res.read() + + def test_egg_fragment(self): + """ + EGG fragments must comply to PEP 440 + """ + epoch = [ + '', + '1!', + ] + releases = [ + '0', + '0.0', + '0.0.0', + ] + pre = [ + 'a0', + 'b0', + 'rc0', + ] + post = ['.post0'] + dev = [ + '.dev0', + ] + local = [ + ('', ''), + ('+ubuntu.0', '+ubuntu.0'), + ('+ubuntu-0', '+ubuntu.0'), + ('+ubuntu_0', '+ubuntu.0'), + ] + versions = [ + [''.join([e, r, p, loc]) for loc in locs] + for e in epoch + for r in releases + for p in sum([pre, post, dev], ['']) + for locs in local + ] + for v, vc in versions: + dists = list( + setuptools.package_index.distros_for_url( + 'http://example.com/example-foo.zip#egg=example-foo-' + v + ) + ) + assert dists[0].version == '' + assert dists[1].version == vc + + def test_download_git_with_rev(self, tmp_path, fp): + url = 'git+https://github.example/group/project@master#egg=foo' + index = setuptools.package_index.PackageIndex() + + expected_dir = tmp_path / 'project@master' + fp.register([ + 'git', + 'clone', + '--quiet', + 'https://github.example/group/project', + expected_dir, + ]) + fp.register(['git', '-C', expected_dir, 'checkout', '--quiet', 'master']) + + result = index.download(url, tmp_path) + + assert result == str(expected_dir) + assert len(fp.calls) == 2 + + def test_download_git_no_rev(self, tmp_path, fp): + url = 'git+https://github.example/group/project#egg=foo' + index = setuptools.package_index.PackageIndex() + + expected_dir = tmp_path / 'project' + fp.register([ + 'git', + 'clone', + '--quiet', + 'https://github.example/group/project', + expected_dir, + ]) + index.download(url, tmp_path) + + def test_download_svn(self, tmp_path): + url = 'svn+https://svn.example/project#egg=foo' + index = setuptools.package_index.PackageIndex() + + msg = r".*SVN download is not supported.*" + with pytest.raises(distutils.errors.DistutilsError, match=msg): + index.download(url, tmp_path) + + +class TestContentCheckers: + def test_md5(self): + checker = setuptools.package_index.HashChecker.from_url( + 'http://foo/bar#md5=f12895fdffbd45007040d2e44df98478' + ) + checker.feed('You should probably not be using MD5'.encode('ascii')) + assert checker.hash.hexdigest() == 'f12895fdffbd45007040d2e44df98478' + assert checker.is_valid() + + def test_other_fragment(self): + "Content checks should succeed silently if no hash is present" + checker = setuptools.package_index.HashChecker.from_url( + 'http://foo/bar#something%20completely%20different' + ) + checker.feed('anything'.encode('ascii')) + assert checker.is_valid() + + def test_blank_md5(self): + "Content checks should succeed if a hash is empty" + checker = setuptools.package_index.HashChecker.from_url('http://foo/bar#md5=') + checker.feed('anything'.encode('ascii')) + assert checker.is_valid() + + def test_get_hash_name_md5(self): + checker = setuptools.package_index.HashChecker.from_url( + 'http://foo/bar#md5=f12895fdffbd45007040d2e44df98478' + ) + assert checker.hash_name == 'md5' + + def test_report(self): + checker = setuptools.package_index.HashChecker.from_url( + 'http://foo/bar#md5=f12895fdffbd45007040d2e44df98478' + ) + rep = checker.report(lambda x: x, 'My message about %s') + assert rep == 'My message about md5' + + +class TestPyPIConfig: + def test_percent_in_password(self, tmp_home_dir): + pypirc = tmp_home_dir / '.pypirc' + pypirc.write_text( + cleandoc( + """ + [pypi] + repository=https://pypi.org + username=jaraco + password=pity% + """ + ), + encoding="utf-8", + ) + cfg = setuptools.package_index.PyPIConfig() + cred = cfg.creds_by_repository['https://pypi.org'] + assert cred.username == 'jaraco' + assert cred.password == 'pity%' + + +@pytest.mark.timeout(1) +def test_REL_DoS(): + """ + REL should not hang on a contrived attack string. + """ + setuptools.package_index.REL.search('< rel=' + ' ' * 2**12) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/setuptools/tests/test_sandbox.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/setuptools/tests/test_sandbox.py new file mode 100644 index 0000000000000000000000000000000000000000..a476b7c93d7441834f415223cb386859581724f8 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/setuptools/tests/test_sandbox.py @@ -0,0 +1,134 @@ +"""develop tests""" + +import os +import types + +import pytest + +import pkg_resources +import setuptools.sandbox + + +class TestSandbox: + def test_devnull(self, tmpdir): + with setuptools.sandbox.DirectorySandbox(str(tmpdir)): + self._file_writer(os.devnull) + + @staticmethod + def _file_writer(path): + def do_write(): + with open(path, 'w', encoding="utf-8") as f: + f.write('xxx') + + return do_write + + def test_setup_py_with_BOM(self): + """ + It should be possible to execute a setup.py with a Byte Order Mark + """ + target = pkg_resources.resource_filename(__name__, 'script-with-bom.py') + namespace = types.ModuleType('namespace') + setuptools.sandbox._execfile(target, vars(namespace)) + assert namespace.result == 'passed' + + def test_setup_py_with_CRLF(self, tmpdir): + setup_py = tmpdir / 'setup.py' + with setup_py.open('wb') as stream: + stream.write(b'"degenerate script"\r\n') + setuptools.sandbox._execfile(str(setup_py), globals()) + + +class TestExceptionSaver: + def test_exception_trapped(self): + with setuptools.sandbox.ExceptionSaver(): + raise ValueError("details") + + def test_exception_resumed(self): + with setuptools.sandbox.ExceptionSaver() as saved_exc: + raise ValueError("details") + + with pytest.raises(ValueError) as caught: + saved_exc.resume() + + assert isinstance(caught.value, ValueError) + assert str(caught.value) == 'details' + + def test_exception_reconstructed(self): + orig_exc = ValueError("details") + + with setuptools.sandbox.ExceptionSaver() as saved_exc: + raise orig_exc + + with pytest.raises(ValueError) as caught: + saved_exc.resume() + + assert isinstance(caught.value, ValueError) + assert caught.value is not orig_exc + + def test_no_exception_passes_quietly(self): + with setuptools.sandbox.ExceptionSaver() as saved_exc: + pass + + saved_exc.resume() + + def test_unpickleable_exception(self): + class CantPickleThis(Exception): + "This Exception is unpickleable because it's not in globals" + + def __repr__(self) -> str: + return f'CantPickleThis{self.args!r}' + + with setuptools.sandbox.ExceptionSaver() as saved_exc: + raise CantPickleThis('detail') + + with pytest.raises(setuptools.sandbox.UnpickleableException) as caught: + saved_exc.resume() + + assert str(caught.value) == "CantPickleThis('detail',)" + + def test_unpickleable_exception_when_hiding_setuptools(self): + """ + As revealed in #440, an infinite recursion can occur if an unpickleable + exception while setuptools is hidden. Ensure this doesn't happen. + """ + + class ExceptionUnderTest(Exception): + """ + An unpickleable exception (not in globals). + """ + + with pytest.raises(setuptools.sandbox.UnpickleableException) as caught: + with setuptools.sandbox.save_modules(): + setuptools.sandbox.hide_setuptools() + raise ExceptionUnderTest + + (msg,) = caught.value.args + assert msg == 'ExceptionUnderTest()' + + def test_sandbox_violation_raised_hiding_setuptools(self, tmpdir): + """ + When in a sandbox with setuptools hidden, a SandboxViolation + should reflect a proper exception and not be wrapped in + an UnpickleableException. + """ + + def write_file(): + "Trigger a SandboxViolation by writing outside the sandbox" + with open('/etc/foo', 'w', encoding="utf-8"): + pass + + with pytest.raises(setuptools.sandbox.SandboxViolation) as caught: + with setuptools.sandbox.save_modules(): + setuptools.sandbox.hide_setuptools() + with setuptools.sandbox.DirectorySandbox(str(tmpdir)): + write_file() + + cmd, args, kwargs = caught.value.args + assert cmd == 'open' + assert args == ('/etc/foo', 'w') + assert kwargs == {"encoding": "utf-8"} + + msg = str(caught.value) + assert 'open' in msg + assert "('/etc/foo', 'w')" in msg + assert "{'encoding': 'utf-8'}" in msg diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/setuptools/tests/test_sdist.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/setuptools/tests/test_sdist.py new file mode 100644 index 0000000000000000000000000000000000000000..3ee0511b1c485a1e4a9f028a29b7a17d8b4c8130 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/setuptools/tests/test_sdist.py @@ -0,0 +1,972 @@ +"""sdist tests""" + +import contextlib +import io +import logging +import os +import pathlib +import sys +import tarfile +import tempfile +import unicodedata +from inspect import cleandoc +from pathlib import Path +from unittest import mock + +import jaraco.path +import pytest + +from setuptools import Command, SetuptoolsDeprecationWarning +from setuptools._importlib import metadata +from setuptools.command.egg_info import manifest_maker +from setuptools.command.sdist import sdist +from setuptools.dist import Distribution +from setuptools.extension import Extension +from setuptools.tests import fail_on_ascii + +from .text import Filenames + +import distutils +from distutils.core import run_setup + +SETUP_ATTRS = { + 'name': 'sdist_test', + 'version': '0.0', + 'packages': ['sdist_test'], + 'package_data': {'sdist_test': ['*.txt']}, + 'data_files': [("data", [os.path.join("d", "e.dat")])], +} + +SETUP_PY = f"""\ +from setuptools import setup + +setup(**{SETUP_ATTRS!r}) +""" + +EXTENSION = Extension( + name="sdist_test.f", + sources=[os.path.join("sdist_test", "f.c")], + depends=[os.path.join("sdist_test", "f.h")], +) +EXTENSION_SOURCES = EXTENSION.sources + EXTENSION.depends + + +@contextlib.contextmanager +def quiet(): + old_stdout, old_stderr = sys.stdout, sys.stderr + sys.stdout, sys.stderr = io.StringIO(), io.StringIO() + try: + yield + finally: + sys.stdout, sys.stderr = old_stdout, old_stderr + + +# Convert to POSIX path +def posix(path): + if not isinstance(path, str): + return path.replace(os.sep.encode('ascii'), b'/') + else: + return path.replace(os.sep, '/') + + +# HFS Plus uses decomposed UTF-8 +def decompose(path): + if isinstance(path, str): + return unicodedata.normalize('NFD', path) + try: + path = path.decode('utf-8') + path = unicodedata.normalize('NFD', path) + path = path.encode('utf-8') + except UnicodeError: + pass # Not UTF-8 + return path + + +def read_all_bytes(filename): + with open(filename, 'rb') as fp: + return fp.read() + + +def latin1_fail(): + try: + desc, filename = tempfile.mkstemp(suffix=Filenames.latin_1) + os.close(desc) + os.remove(filename) + except Exception: + return True + + +fail_on_latin1_encoded_filenames = pytest.mark.xfail( + latin1_fail(), + reason="System does not support latin-1 filenames", +) + + +skip_under_xdist = pytest.mark.skipif( + "os.environ.get('PYTEST_XDIST_WORKER')", + reason="pytest-dev/pytest-xdist#843", +) +skip_under_stdlib_distutils = pytest.mark.skipif( + not distutils.__package__.startswith('setuptools'), + reason="the test is not supported with stdlib distutils", +) + + +def touch(path): + open(path, 'wb').close() + return path + + +def symlink_or_skip_test(src, dst): + try: + os.symlink(src, dst) + except (OSError, NotImplementedError): + pytest.skip("symlink not supported in OS") + return None + return dst + + +class TestSdistTest: + @pytest.fixture(autouse=True) + def source_dir(self, tmpdir): + tmpdir = tmpdir / "project_root" + tmpdir.mkdir() + + (tmpdir / 'setup.py').write_text(SETUP_PY, encoding='utf-8') + + # Set up the rest of the test package + test_pkg = tmpdir / 'sdist_test' + test_pkg.mkdir() + data_folder = tmpdir / 'd' + data_folder.mkdir() + # *.rst was not included in package_data, so c.rst should not be + # automatically added to the manifest when not under version control + for fname in ['__init__.py', 'a.txt', 'b.txt', 'c.rst']: + touch(test_pkg / fname) + touch(data_folder / 'e.dat') + # C sources are not included by default, but they will be, + # if an extension module uses them as sources or depends + for fname in EXTENSION_SOURCES: + touch(tmpdir / fname) + + with tmpdir.as_cwd(): + yield tmpdir + + def assert_package_data_in_manifest(self, cmd): + manifest = cmd.filelist.files + assert os.path.join('sdist_test', 'a.txt') in manifest + assert os.path.join('sdist_test', 'b.txt') in manifest + assert os.path.join('sdist_test', 'c.rst') not in manifest + assert os.path.join('d', 'e.dat') in manifest + + def setup_with_extension(self): + setup_attrs = {**SETUP_ATTRS, 'ext_modules': [EXTENSION]} + + dist = Distribution(setup_attrs) + dist.script_name = 'setup.py' + cmd = sdist(dist) + cmd.ensure_finalized() + + with quiet(): + cmd.run() + + return cmd + + def test_package_data_in_sdist(self): + """Regression test for pull request #4: ensures that files listed in + package_data are included in the manifest even if they're not added to + version control. + """ + + dist = Distribution(SETUP_ATTRS) + dist.script_name = 'setup.py' + cmd = sdist(dist) + cmd.ensure_finalized() + + with quiet(): + cmd.run() + + self.assert_package_data_in_manifest(cmd) + + def test_package_data_and_include_package_data_in_sdist(self): + """ + Ensure package_data and include_package_data work + together. + """ + setup_attrs = {**SETUP_ATTRS, 'include_package_data': True} + assert setup_attrs['package_data'] + + dist = Distribution(setup_attrs) + dist.script_name = 'setup.py' + cmd = sdist(dist) + cmd.ensure_finalized() + + with quiet(): + cmd.run() + + self.assert_package_data_in_manifest(cmd) + + def test_extension_sources_in_sdist(self): + """ + Ensure that the files listed in Extension.sources and Extension.depends + are automatically included in the manifest. + """ + cmd = self.setup_with_extension() + self.assert_package_data_in_manifest(cmd) + manifest = cmd.filelist.files + for path in EXTENSION_SOURCES: + assert path in manifest + + def test_missing_extension_sources(self): + """ + Similar to test_extension_sources_in_sdist but the referenced files don't exist. + Missing files should not be included in distribution (with no error raised). + """ + for path in EXTENSION_SOURCES: + os.remove(path) + + cmd = self.setup_with_extension() + self.assert_package_data_in_manifest(cmd) + manifest = cmd.filelist.files + for path in EXTENSION_SOURCES: + assert path not in manifest + + def test_symlinked_extension_sources(self): + """ + Similar to test_extension_sources_in_sdist but the referenced files are + instead symbolic links to project-local files. Referenced file paths + should be included. Symlink targets themselves should NOT be included. + """ + symlinked = [] + for path in EXTENSION_SOURCES: + base, ext = os.path.splitext(path) + target = base + "_target." + ext + + os.rename(path, target) + symlink_or_skip_test(os.path.basename(target), path) + symlinked.append(target) + + cmd = self.setup_with_extension() + self.assert_package_data_in_manifest(cmd) + manifest = cmd.filelist.files + for path in EXTENSION_SOURCES: + assert path in manifest + for path in symlinked: + assert path not in manifest + + _INVALID_PATHS = { + "must be relative": lambda: ( + os.path.abspath(os.path.join("sdist_test", "f.h")) + ), + "can't have `..` segments": lambda: ( + os.path.join("sdist_test", "..", "sdist_test", "f.h") + ), + "doesn't exist": lambda: ( + os.path.join("sdist_test", "this_file_does_not_exist.h") + ), + "must be inside the project root": lambda: ( + symlink_or_skip_test( + touch(os.path.join("..", "outside_of_project_root.h")), + "symlink.h", + ) + ), + } + + @skip_under_stdlib_distutils + @pytest.mark.parametrize("reason", _INVALID_PATHS.keys()) + def test_invalid_extension_depends(self, reason, caplog): + """ + Due to backwards compatibility reasons, `Extension.depends` should accept + invalid/weird paths, but then ignore them when building a sdist. + + This test verifies that the source distribution is still built + successfully with such paths, but that instead of adding these paths to + the manifest, we emit an informational message, notifying the user that + the invalid path won't be automatically included. + """ + invalid_path = self._INVALID_PATHS[reason]() + extension = Extension( + name="sdist_test.f", + sources=[], + depends=[invalid_path], + ) + setup_attrs = {**SETUP_ATTRS, 'ext_modules': [extension]} + + dist = Distribution(setup_attrs) + dist.script_name = 'setup.py' + cmd = sdist(dist) + cmd.ensure_finalized() + + with quiet(), caplog.at_level(logging.INFO): + cmd.run() + + self.assert_package_data_in_manifest(cmd) + manifest = cmd.filelist.files + assert invalid_path not in manifest + + expected_message = [ + message + for (logger, level, message) in caplog.record_tuples + if ( + logger == "root" # + and level == logging.INFO # + and invalid_path in message # + ) + ] + assert len(expected_message) == 1 + (expected_message,) = expected_message + assert reason in expected_message + + def test_custom_build_py(self): + """ + Ensure projects defining custom build_py don't break + when creating sdists (issue #2849) + """ + from distutils.command.build_py import build_py as OrigBuildPy + + using_custom_command_guard = mock.Mock() + + class CustomBuildPy(OrigBuildPy): + """ + Some projects have custom commands inheriting from `distutils` + """ + + def get_data_files(self): + using_custom_command_guard() + return super().get_data_files() + + setup_attrs = {**SETUP_ATTRS, 'include_package_data': True} + assert setup_attrs['package_data'] + + dist = Distribution(setup_attrs) + dist.script_name = 'setup.py' + cmd = sdist(dist) + cmd.ensure_finalized() + + # Make sure we use the custom command + cmd.cmdclass = {'build_py': CustomBuildPy} + cmd.distribution.cmdclass = {'build_py': CustomBuildPy} + assert cmd.distribution.get_command_class('build_py') == CustomBuildPy + + msg = "setuptools instead of distutils" + with quiet(), pytest.warns(SetuptoolsDeprecationWarning, match=msg): + cmd.run() + + using_custom_command_guard.assert_called() + self.assert_package_data_in_manifest(cmd) + + def test_setup_py_exists(self): + dist = Distribution(SETUP_ATTRS) + dist.script_name = 'foo.py' + cmd = sdist(dist) + cmd.ensure_finalized() + + with quiet(): + cmd.run() + + manifest = cmd.filelist.files + assert 'setup.py' in manifest + + def test_setup_py_missing(self): + dist = Distribution(SETUP_ATTRS) + dist.script_name = 'foo.py' + cmd = sdist(dist) + cmd.ensure_finalized() + + if os.path.exists("setup.py"): + os.remove("setup.py") + with quiet(): + cmd.run() + + manifest = cmd.filelist.files + assert 'setup.py' not in manifest + + def test_setup_py_excluded(self): + with open("MANIFEST.in", "w", encoding="utf-8") as manifest_file: + manifest_file.write("exclude setup.py") + + dist = Distribution(SETUP_ATTRS) + dist.script_name = 'foo.py' + cmd = sdist(dist) + cmd.ensure_finalized() + + with quiet(): + cmd.run() + + manifest = cmd.filelist.files + assert 'setup.py' not in manifest + + def test_defaults_case_sensitivity(self, source_dir): + """ + Make sure default files (README.*, etc.) are added in a case-sensitive + way to avoid problems with packages built on Windows. + """ + + touch(source_dir / 'readme.rst') + touch(source_dir / 'SETUP.cfg') + + dist = Distribution(SETUP_ATTRS) + # the extension deliberately capitalized for this test + # to make sure the actual filename (not capitalized) gets added + # to the manifest + dist.script_name = 'setup.PY' + cmd = sdist(dist) + cmd.ensure_finalized() + + with quiet(): + cmd.run() + + # lowercase all names so we can test in a + # case-insensitive way to make sure the files + # are not included. + manifest = map(lambda x: x.lower(), cmd.filelist.files) + assert 'readme.rst' not in manifest, manifest + assert 'setup.py' not in manifest, manifest + assert 'setup.cfg' not in manifest, manifest + + def test_exclude_dev_only_cache_folders(self, source_dir): + included = { + # Emulate problem in https://github.com/pypa/setuptools/issues/4601 + "MANIFEST.in": ( + "global-include LICEN[CS]E* COPYING* NOTICE* AUTHORS*\n" + "global-include *.txt\n" + ), + # For the sake of being conservative and limiting unforeseen side-effects + # we just exclude dev-only cache folders at the root of the repository: + "test/.venv/lib/python3.9/site-packages/bar-2.dist-info/AUTHORS.rst": "", + "src/.nox/py/lib/python3.12/site-packages/bar-2.dist-info/COPYING.txt": "", + "doc/.tox/default/lib/python3.11/site-packages/foo-4.dist-info/LICENSE": "", + # Let's test against false positives with similarly named files: + ".venv-requirements.txt": "", + ".tox-coveragerc.txt": "", + ".noxy/coveragerc.txt": "", + } + + excluded = { + # .tox/.nox/.venv are well-know folders present at the root of Python repos + # and therefore should be excluded + ".tox/release/lib/python3.11/site-packages/foo-4.dist-info/LICENSE": "", + ".nox/py/lib/python3.12/site-packages/bar-2.dist-info/COPYING.txt": "", + ".venv/lib/python3.9/site-packages/bar-2.dist-info/AUTHORS.rst": "", + } + + for file, content in {**excluded, **included}.items(): + Path(source_dir, file).parent.mkdir(parents=True, exist_ok=True) + Path(source_dir, file).write_text(content, encoding="utf-8") + + cmd = self.setup_with_extension() + self.assert_package_data_in_manifest(cmd) + manifest = {f.replace(os.sep, '/') for f in cmd.filelist.files} + for path in excluded: + assert os.path.exists(path) + assert path not in manifest, (path, manifest) + for path in included: + assert os.path.exists(path) + assert path in manifest, (path, manifest) + + @fail_on_ascii + def test_manifest_is_written_with_utf8_encoding(self): + # Test for #303. + dist = Distribution(SETUP_ATTRS) + dist.script_name = 'setup.py' + mm = manifest_maker(dist) + mm.manifest = os.path.join('sdist_test.egg-info', 'SOURCES.txt') + os.mkdir('sdist_test.egg-info') + + # UTF-8 filename + filename = os.path.join('sdist_test', 'smörbröd.py') + + # Must create the file or it will get stripped. + touch(filename) + + # Add UTF-8 filename and write manifest + with quiet(): + mm.run() + mm.filelist.append(filename) + mm.write_manifest() + + contents = read_all_bytes(mm.manifest) + + # The manifest should be UTF-8 encoded + u_contents = contents.decode('UTF-8') + + # The manifest should contain the UTF-8 filename + assert posix(filename) in u_contents + + @fail_on_ascii + def test_write_manifest_allows_utf8_filenames(self): + # Test for #303. + dist = Distribution(SETUP_ATTRS) + dist.script_name = 'setup.py' + mm = manifest_maker(dist) + mm.manifest = os.path.join('sdist_test.egg-info', 'SOURCES.txt') + os.mkdir('sdist_test.egg-info') + + filename = os.path.join(b'sdist_test', Filenames.utf_8) + + # Must touch the file or risk removal + touch(filename) + + # Add filename and write manifest + with quiet(): + mm.run() + u_filename = filename.decode('utf-8') + mm.filelist.files.append(u_filename) + # Re-write manifest + mm.write_manifest() + + contents = read_all_bytes(mm.manifest) + + # The manifest should be UTF-8 encoded + contents.decode('UTF-8') + + # The manifest should contain the UTF-8 filename + assert posix(filename) in contents + + # The filelist should have been updated as well + assert u_filename in mm.filelist.files + + @skip_under_xdist + def test_write_manifest_skips_non_utf8_filenames(self): + """ + Files that cannot be encoded to UTF-8 (specifically, those that + weren't originally successfully decoded and have surrogate + escapes) should be omitted from the manifest. + See https://bitbucket.org/tarek/distribute/issue/303 for history. + """ + dist = Distribution(SETUP_ATTRS) + dist.script_name = 'setup.py' + mm = manifest_maker(dist) + mm.manifest = os.path.join('sdist_test.egg-info', 'SOURCES.txt') + os.mkdir('sdist_test.egg-info') + + # Latin-1 filename + filename = os.path.join(b'sdist_test', Filenames.latin_1) + + # Add filename with surrogates and write manifest + with quiet(): + mm.run() + u_filename = filename.decode('utf-8', 'surrogateescape') + mm.filelist.append(u_filename) + # Re-write manifest + mm.write_manifest() + + contents = read_all_bytes(mm.manifest) + + # The manifest should be UTF-8 encoded + contents.decode('UTF-8') + + # The Latin-1 filename should have been skipped + assert posix(filename) not in contents + + # The filelist should have been updated as well + assert u_filename not in mm.filelist.files + + @fail_on_ascii + def test_manifest_is_read_with_utf8_encoding(self): + # Test for #303. + dist = Distribution(SETUP_ATTRS) + dist.script_name = 'setup.py' + cmd = sdist(dist) + cmd.ensure_finalized() + + # Create manifest + with quiet(): + cmd.run() + + # Add UTF-8 filename to manifest + filename = os.path.join(b'sdist_test', Filenames.utf_8) + cmd.manifest = os.path.join('sdist_test.egg-info', 'SOURCES.txt') + manifest = open(cmd.manifest, 'ab') + manifest.write(b'\n' + filename) + manifest.close() + + # The file must exist to be included in the filelist + touch(filename) + + # Re-read manifest + cmd.filelist.files = [] + with quiet(): + cmd.read_manifest() + + # The filelist should contain the UTF-8 filename + filename = filename.decode('utf-8') + assert filename in cmd.filelist.files + + @fail_on_latin1_encoded_filenames + def test_read_manifest_skips_non_utf8_filenames(self): + # Test for #303. + dist = Distribution(SETUP_ATTRS) + dist.script_name = 'setup.py' + cmd = sdist(dist) + cmd.ensure_finalized() + + # Create manifest + with quiet(): + cmd.run() + + # Add Latin-1 filename to manifest + filename = os.path.join(b'sdist_test', Filenames.latin_1) + cmd.manifest = os.path.join('sdist_test.egg-info', 'SOURCES.txt') + manifest = open(cmd.manifest, 'ab') + manifest.write(b'\n' + filename) + manifest.close() + + # The file must exist to be included in the filelist + touch(filename) + + # Re-read manifest + cmd.filelist.files = [] + with quiet(): + cmd.read_manifest() + + # The Latin-1 filename should have been skipped + filename = filename.decode('latin-1') + assert filename not in cmd.filelist.files + + @fail_on_ascii + @fail_on_latin1_encoded_filenames + def test_sdist_with_utf8_encoded_filename(self): + # Test for #303. + dist = Distribution(self.make_strings(SETUP_ATTRS)) + dist.script_name = 'setup.py' + cmd = sdist(dist) + cmd.ensure_finalized() + + filename = os.path.join(b'sdist_test', Filenames.utf_8) + touch(filename) + + with quiet(): + cmd.run() + + if sys.platform == 'darwin': + filename = decompose(filename) + + fs_enc = sys.getfilesystemencoding() + + if sys.platform == 'win32': + if fs_enc == 'cp1252': + # Python mangles the UTF-8 filename + filename = filename.decode('cp1252') + assert filename in cmd.filelist.files + else: + filename = filename.decode('mbcs') + assert filename in cmd.filelist.files + else: + filename = filename.decode('utf-8') + assert filename in cmd.filelist.files + + @classmethod + def make_strings(cls, item): + if isinstance(item, dict): + return {key: cls.make_strings(value) for key, value in item.items()} + if isinstance(item, list): + return list(map(cls.make_strings, item)) + return str(item) + + @fail_on_latin1_encoded_filenames + @skip_under_xdist + def test_sdist_with_latin1_encoded_filename(self): + # Test for #303. + dist = Distribution(self.make_strings(SETUP_ATTRS)) + dist.script_name = 'setup.py' + cmd = sdist(dist) + cmd.ensure_finalized() + + # Latin-1 filename + filename = os.path.join(b'sdist_test', Filenames.latin_1) + touch(filename) + assert os.path.isfile(filename) + + with quiet(): + cmd.run() + + # not all windows systems have a default FS encoding of cp1252 + if sys.platform == 'win32': + # Latin-1 is similar to Windows-1252 however + # on mbcs filesys it is not in latin-1 encoding + fs_enc = sys.getfilesystemencoding() + if fs_enc != 'mbcs': + fs_enc = 'latin-1' + filename = filename.decode(fs_enc) + + assert filename in cmd.filelist.files + else: + # The Latin-1 filename should have been skipped + filename = filename.decode('latin-1') + assert filename not in cmd.filelist.files + + _EXAMPLE_DIRECTIVES = { + "setup.cfg - long_description and version": """ + [metadata] + name = testing + version = file: src/VERSION.txt + license_files = DOWHATYOUWANT + long_description = file: README.rst, USAGE.rst + """, + "pyproject.toml - static readme/license files and dynamic version": """ + [project] + name = "testing" + readme = "USAGE.rst" + license = {file = "DOWHATYOUWANT"} + dynamic = ["version"] + [tool.setuptools.dynamic] + version = {file = ["src/VERSION.txt"]} + """, + "pyproject.toml - directive with str instead of list": """ + [project] + name = "testing" + readme = "USAGE.rst" + license = {file = "DOWHATYOUWANT"} + dynamic = ["version"] + [tool.setuptools.dynamic] + version = {file = "src/VERSION.txt"} + """, + } + + @pytest.mark.parametrize("config", _EXAMPLE_DIRECTIVES.keys()) + def test_add_files_referenced_by_config_directives(self, source_dir, config): + config_file, _, _ = config.partition(" - ") + config_text = self._EXAMPLE_DIRECTIVES[config] + (source_dir / 'src').mkdir() + (source_dir / 'src/VERSION.txt').write_text("0.42", encoding="utf-8") + (source_dir / 'README.rst').write_text("hello world!", encoding="utf-8") + (source_dir / 'USAGE.rst').write_text("hello world!", encoding="utf-8") + (source_dir / 'DOWHATYOUWANT').write_text("hello world!", encoding="utf-8") + (source_dir / config_file).write_text(config_text, encoding="utf-8") + + dist = Distribution({"packages": []}) + dist.script_name = 'setup.py' + dist.parse_config_files() + + cmd = sdist(dist) + cmd.ensure_finalized() + with quiet(): + cmd.run() + + assert ( + 'src/VERSION.txt' in cmd.filelist.files + or 'src\\VERSION.txt' in cmd.filelist.files + ) + assert 'USAGE.rst' in cmd.filelist.files + assert 'DOWHATYOUWANT' in cmd.filelist.files + assert '/' not in cmd.filelist.files + assert '\\' not in cmd.filelist.files + + def test_pyproject_toml_in_sdist(self, source_dir): + """ + Check if pyproject.toml is included in source distribution if present + """ + touch(source_dir / 'pyproject.toml') + dist = Distribution(SETUP_ATTRS) + dist.script_name = 'setup.py' + cmd = sdist(dist) + cmd.ensure_finalized() + with quiet(): + cmd.run() + manifest = cmd.filelist.files + assert 'pyproject.toml' in manifest + + def test_pyproject_toml_excluded(self, source_dir): + """ + Check that pyproject.toml can excluded even if present + """ + touch(source_dir / 'pyproject.toml') + with open('MANIFEST.in', 'w', encoding="utf-8") as mts: + print('exclude pyproject.toml', file=mts) + dist = Distribution(SETUP_ATTRS) + dist.script_name = 'setup.py' + cmd = sdist(dist) + cmd.ensure_finalized() + with quiet(): + cmd.run() + manifest = cmd.filelist.files + assert 'pyproject.toml' not in manifest + + def test_build_subcommand_source_files(self, source_dir): + touch(source_dir / '.myfile~') + + # Sanity check: without custom commands file list should not be affected + dist = Distribution({**SETUP_ATTRS, "script_name": "setup.py"}) + cmd = sdist(dist) + cmd.ensure_finalized() + with quiet(): + cmd.run() + manifest = cmd.filelist.files + assert '.myfile~' not in manifest + + # Test: custom command should be able to augment file list + dist = Distribution({**SETUP_ATTRS, "script_name": "setup.py"}) + build = dist.get_command_obj("build") + build.sub_commands = [*build.sub_commands, ("build_custom", None)] + + class build_custom(Command): + def initialize_options(self): ... + + def finalize_options(self): ... + + def run(self): ... + + def get_source_files(self): + return ['.myfile~'] + + dist.cmdclass.update(build_custom=build_custom) + + cmd = sdist(dist) + cmd.use_defaults = True + cmd.ensure_finalized() + with quiet(): + cmd.run() + manifest = cmd.filelist.files + assert '.myfile~' in manifest + + @pytest.mark.skipif("os.environ.get('SETUPTOOLS_USE_DISTUTILS') == 'stdlib'") + def test_build_base_pathlib(self, source_dir): + """ + Ensure if build_base is a pathlib.Path, the build still succeeds. + """ + dist = Distribution({ + **SETUP_ATTRS, + "script_name": "setup.py", + "options": {"build": {"build_base": pathlib.Path('build')}}, + }) + cmd = sdist(dist) + cmd.ensure_finalized() + with quiet(): + cmd.run() + + +def test_default_revctrl(): + """ + When _default_revctrl was removed from the `setuptools.command.sdist` + module in 10.0, it broke some systems which keep an old install of + setuptools (Distribute) around. Those old versions require that the + setuptools package continue to implement that interface, so this + function provides that interface, stubbed. See #320 for details. + + This interface must be maintained until Ubuntu 12.04 is no longer + supported (by Setuptools). + """ + (ep,) = metadata.EntryPoints._from_text( + """ + [setuptools.file_finders] + svn_cvs = setuptools.command.sdist:_default_revctrl + """ + ) + res = ep.load() + assert hasattr(res, '__iter__') + + +class TestRegressions: + """ + Can be removed/changed if the project decides to change how it handles symlinks + or external files. + """ + + @staticmethod + def files_for_symlink_in_extension_depends(tmp_path, dep_path): + return { + "external": { + "dir": {"file.h": ""}, + }, + "project": { + "setup.py": cleandoc( + f""" + from setuptools import Extension, setup + setup( + name="myproj", + version="42", + ext_modules=[ + Extension( + "hello", sources=["hello.pyx"], + depends=[{dep_path!r}] + ) + ], + ) + """ + ), + "hello.pyx": "", + "MANIFEST.in": "global-include *.h", + }, + } + + @pytest.mark.parametrize( + "dep_path", ("myheaders/dir/file.h", "myheaders/dir/../dir/file.h") + ) + def test_symlink_in_extension_depends(self, monkeypatch, tmp_path, dep_path): + # Given a project with a symlinked dir and a "depends" targeting that dir + files = self.files_for_symlink_in_extension_depends(tmp_path, dep_path) + jaraco.path.build(files, prefix=str(tmp_path)) + symlink_or_skip_test(tmp_path / "external", tmp_path / "project/myheaders") + + # When `sdist` runs, there should be no error + members = run_sdist(monkeypatch, tmp_path / "project") + # and the sdist should contain the symlinked files + for expected in ( + "myproj-42/hello.pyx", + "myproj-42/myheaders/dir/file.h", + ): + assert expected in members + + @staticmethod + def files_for_external_path_in_extension_depends(tmp_path, dep_path): + head, _, tail = dep_path.partition("$tmp_path$/") + dep_path = tmp_path / tail if tail else head + + return { + "external": { + "dir": {"file.h": ""}, + }, + "project": { + "setup.py": cleandoc( + f""" + from setuptools import Extension, setup + setup( + name="myproj", + version="42", + ext_modules=[ + Extension( + "hello", sources=["hello.pyx"], + depends=[{str(dep_path)!r}] + ) + ], + ) + """ + ), + "hello.pyx": "", + "MANIFEST.in": "global-include *.h", + }, + } + + @pytest.mark.parametrize( + "dep_path", ("$tmp_path$/external/dir/file.h", "../external/dir/file.h") + ) + def test_external_path_in_extension_depends(self, monkeypatch, tmp_path, dep_path): + # Given a project with a "depends" targeting an external dir + files = self.files_for_external_path_in_extension_depends(tmp_path, dep_path) + jaraco.path.build(files, prefix=str(tmp_path)) + # When `sdist` runs, there should be no error + members = run_sdist(monkeypatch, tmp_path / "project") + # and the sdist should not contain the external file + for name in members: + assert "file.h" not in name + + +def run_sdist(monkeypatch, project): + """Given a project directory, run the sdist and return its contents""" + monkeypatch.chdir(project) + with quiet(): + run_setup("setup.py", ["sdist"]) + + archive = next((project / "dist").glob("*.tar.gz")) + with tarfile.open(str(archive)) as tar: + return set(tar.getnames()) + + +def test_sanity_check_setuptools_own_sdist(setuptools_sdist): + with tarfile.open(setuptools_sdist) as tar: + files = tar.getnames() + + # setuptools sdist should not include the .tox folder + tox_files = [name for name in files if ".tox" in name] + assert len(tox_files) == 0, f"not empty {tox_files}" diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/setuptools/tests/test_setopt.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/setuptools/tests/test_setopt.py new file mode 100644 index 0000000000000000000000000000000000000000..ccf25618a5d6e255ad0be0fbed51fc29c179dcfe --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/setuptools/tests/test_setopt.py @@ -0,0 +1,40 @@ +import configparser + +from setuptools.command import setopt + + +class TestEdit: + @staticmethod + def parse_config(filename): + parser = configparser.ConfigParser() + with open(filename, encoding='utf-8') as reader: + parser.read_file(reader) + return parser + + @staticmethod + def write_text(file, content): + with open(file, 'wb') as strm: + strm.write(content.encode('utf-8')) + + def test_utf8_encoding_retained(self, tmpdir): + """ + When editing a file, non-ASCII characters encoded in + UTF-8 should be retained. + """ + config = tmpdir.join('setup.cfg') + self.write_text(str(config), '[names]\njaraco=джарако') + setopt.edit_config(str(config), dict(names=dict(other='yes'))) + parser = self.parse_config(str(config)) + assert parser.get('names', 'jaraco') == 'джарако' + assert parser.get('names', 'other') == 'yes' + + def test_case_retained(self, tmpdir): + """ + When editing a file, case of keys should be retained. + """ + config = tmpdir.join('setup.cfg') + self.write_text(str(config), '[names]\nFoO=bAr') + setopt.edit_config(str(config), dict(names=dict(oTher='yes'))) + actual = config.read_text(encoding='ascii') + assert 'FoO' in actual + assert 'oTher' in actual diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/setuptools/tests/test_setuptools.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/setuptools/tests/test_setuptools.py new file mode 100644 index 0000000000000000000000000000000000000000..1d56e1a8a4ebc5c7aaeb9902ef9972f7de97ecbf --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/setuptools/tests/test_setuptools.py @@ -0,0 +1,290 @@ +"""Tests for the 'setuptools' package""" + +import os +import re +import sys +from zipfile import ZipFile + +import pytest +from packaging.version import Version + +import setuptools +import setuptools.depends as dep +import setuptools.dist +from setuptools.depends import Require + +import distutils.cmd +import distutils.core +from distutils.core import Extension +from distutils.errors import DistutilsSetupError + + +@pytest.fixture(autouse=True) +def isolated_dir(tmpdir_cwd): + return + + +def makeSetup(**args): + """Return distribution from 'setup(**args)', without executing commands""" + + distutils.core._setup_stop_after = "commandline" + + # Don't let system command line leak into tests! + args.setdefault('script_args', ['install']) + + try: + return setuptools.setup(**args) + finally: + distutils.core._setup_stop_after = None + + +needs_bytecode = pytest.mark.skipif( + not hasattr(dep, 'get_module_constant'), + reason="bytecode support not available", +) + + +class TestDepends: + def testExtractConst(self): + if not hasattr(dep, 'extract_constant'): + # skip on non-bytecode platforms + return + + def f1(): + global x, y, z + x = "test" + y = z # pyright: ignore[reportUnboundVariable] # Explicitly testing for this runtime issue + + fc = f1.__code__ + + # unrecognized name + assert dep.extract_constant(fc, 'q', -1) is None + + # constant assigned + assert dep.extract_constant(fc, 'x', -1) == "test" + + # expression assigned + assert dep.extract_constant(fc, 'y', -1) == -1 + + # recognized name, not assigned + assert dep.extract_constant(fc, 'z', -1) is None + + def testFindModule(self): + with pytest.raises(ImportError): + dep.find_module('no-such.-thing') + with pytest.raises(ImportError): + dep.find_module('setuptools.non-existent') + f, _p, _i = dep.find_module('setuptools.tests') + f.close() + + @needs_bytecode + def testModuleExtract(self): + from json import __version__ + + assert dep.get_module_constant('json', '__version__') == __version__ + assert dep.get_module_constant('sys', 'version') == sys.version + assert ( + dep.get_module_constant('setuptools.tests.test_setuptools', '__doc__') + == __doc__ + ) + + @needs_bytecode + def testRequire(self): + req = Require('Json', '1.0.3', 'json') + + assert req.name == 'Json' + assert req.module == 'json' + assert req.requested_version == Version('1.0.3') + assert req.attribute == '__version__' + assert req.full_name() == 'Json-1.0.3' + + from json import __version__ + + assert str(req.get_version()) == __version__ + assert req.version_ok('1.0.9') + assert not req.version_ok('0.9.1') + assert not req.version_ok('unknown') + + assert req.is_present() + assert req.is_current() + + req = Require('Do-what-I-mean', '1.0', 'd-w-i-m') + assert not req.is_present() + assert not req.is_current() + + @needs_bytecode + def test_require_present(self): + # In #1896, this test was failing for months with the only + # complaint coming from test runners (not end users). + # TODO: Evaluate if this code is needed at all. + req = Require('Tests', None, 'tests', homepage="http://example.com") + assert req.format is None + assert req.attribute is None + assert req.requested_version is None + assert req.full_name() == 'Tests' + assert req.homepage == 'http://example.com' + + from setuptools.tests import __path__ + + paths = [os.path.dirname(p) for p in __path__] + assert req.is_present(paths) + assert req.is_current(paths) + + +class TestDistro: + def setup_method(self, method): + self.e1 = Extension('bar.ext', ['bar.c']) + self.e2 = Extension('c.y', ['y.c']) + + self.dist = makeSetup( + packages=['a', 'a.b', 'a.b.c', 'b', 'c'], + py_modules=['b.d', 'x'], + ext_modules=(self.e1, self.e2), + package_dir={}, + ) + + def testDistroType(self): + assert isinstance(self.dist, setuptools.dist.Distribution) + + def testExcludePackage(self): + self.dist.exclude_package('a') + assert self.dist.packages == ['b', 'c'] + + self.dist.exclude_package('b') + assert self.dist.packages == ['c'] + assert self.dist.py_modules == ['x'] + assert self.dist.ext_modules == [self.e1, self.e2] + + self.dist.exclude_package('c') + assert self.dist.packages == [] + assert self.dist.py_modules == ['x'] + assert self.dist.ext_modules == [self.e1] + + # test removals from unspecified options + makeSetup().exclude_package('x') + + def testIncludeExclude(self): + # remove an extension + self.dist.exclude(ext_modules=[self.e1]) + assert self.dist.ext_modules == [self.e2] + + # add it back in + self.dist.include(ext_modules=[self.e1]) + assert self.dist.ext_modules == [self.e2, self.e1] + + # should not add duplicate + self.dist.include(ext_modules=[self.e1]) + assert self.dist.ext_modules == [self.e2, self.e1] + + def testExcludePackages(self): + self.dist.exclude(packages=['c', 'b', 'a']) + assert self.dist.packages == [] + assert self.dist.py_modules == ['x'] + assert self.dist.ext_modules == [self.e1] + + def testEmpty(self): + dist = makeSetup() + dist.include(packages=['a'], py_modules=['b'], ext_modules=[self.e2]) + dist = makeSetup() + dist.exclude(packages=['a'], py_modules=['b'], ext_modules=[self.e2]) + + def testContents(self): + assert self.dist.has_contents_for('a') + self.dist.exclude_package('a') + assert not self.dist.has_contents_for('a') + + assert self.dist.has_contents_for('b') + self.dist.exclude_package('b') + assert not self.dist.has_contents_for('b') + + assert self.dist.has_contents_for('c') + self.dist.exclude_package('c') + assert not self.dist.has_contents_for('c') + + def testInvalidIncludeExclude(self): + with pytest.raises(DistutilsSetupError): + self.dist.include(nonexistent_option='x') + with pytest.raises(DistutilsSetupError): + self.dist.exclude(nonexistent_option='x') + with pytest.raises(DistutilsSetupError): + self.dist.include(packages={'x': 'y'}) + with pytest.raises(DistutilsSetupError): + self.dist.exclude(packages={'x': 'y'}) + with pytest.raises(DistutilsSetupError): + self.dist.include(ext_modules={'x': 'y'}) + with pytest.raises(DistutilsSetupError): + self.dist.exclude(ext_modules={'x': 'y'}) + + with pytest.raises(DistutilsSetupError): + self.dist.include(package_dir=['q']) + with pytest.raises(DistutilsSetupError): + self.dist.exclude(package_dir=['q']) + + +@pytest.fixture +def example_source(tmpdir): + tmpdir.mkdir('foo') + (tmpdir / 'foo/bar.py').write('') + (tmpdir / 'readme.txt').write('') + return tmpdir + + +def test_findall(example_source): + found = list(setuptools.findall(str(example_source))) + expected = ['readme.txt', 'foo/bar.py'] + expected = [example_source.join(fn) for fn in expected] + assert found == expected + + +def test_findall_curdir(example_source): + with example_source.as_cwd(): + found = list(setuptools.findall()) + expected = ['readme.txt', os.path.join('foo', 'bar.py')] + assert found == expected + + +@pytest.fixture +def can_symlink(tmpdir): + """ + Skip if cannot create a symbolic link + """ + link_fn = 'link' + target_fn = 'target' + try: + os.symlink(target_fn, link_fn) + except (OSError, NotImplementedError, AttributeError): + pytest.skip("Cannot create symbolic links") + os.remove(link_fn) + + +@pytest.mark.usefixtures("can_symlink") +def test_findall_missing_symlink(tmpdir): + with tmpdir.as_cwd(): + os.symlink('foo', 'bar') + found = list(setuptools.findall()) + assert found == [] + + +@pytest.mark.xfail(reason="unable to exclude tests; #4475 #3260") +def test_its_own_wheel_does_not_contain_tests(setuptools_wheel): + with ZipFile(setuptools_wheel) as zipfile: + contents = [f.replace(os.sep, '/') for f in zipfile.namelist()] + + for member in contents: + assert '/tests/' not in member + + +def test_wheel_includes_cli_scripts(setuptools_wheel): + with ZipFile(setuptools_wheel) as zipfile: + contents = [f.replace(os.sep, '/') for f in zipfile.namelist()] + + assert any('cli-64.exe' in member for member in contents) + + +def test_wheel_includes_vendored_metadata(setuptools_wheel): + with ZipFile(setuptools_wheel) as zipfile: + contents = [f.replace(os.sep, '/') for f in zipfile.namelist()] + + assert any( + re.search(r'_vendor/.*\.dist-info/METADATA', member) for member in contents + ) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/setuptools/tests/test_shutil_wrapper.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/setuptools/tests/test_shutil_wrapper.py new file mode 100644 index 0000000000000000000000000000000000000000..74ff7e9a896328a3d57ca3639658e3b9d538585f --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/setuptools/tests/test_shutil_wrapper.py @@ -0,0 +1,23 @@ +import stat +import sys +from unittest.mock import Mock + +from setuptools import _shutil + + +def test_rmtree_readonly(monkeypatch, tmp_path): + """Verify onerr works as expected""" + + tmp_dir = tmp_path / "with_readonly" + tmp_dir.mkdir() + some_file = tmp_dir.joinpath("file.txt") + some_file.touch() + some_file.chmod(stat.S_IREAD) + + expected_count = 1 if sys.platform.startswith("win") else 0 + chmod_fn = Mock(wraps=_shutil.attempt_chmod_verbose) + monkeypatch.setattr(_shutil, "attempt_chmod_verbose", chmod_fn) + + _shutil.rmtree(tmp_dir) + assert chmod_fn.call_count == expected_count + assert not tmp_dir.is_dir() diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/setuptools/tests/test_unicode_utils.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/setuptools/tests/test_unicode_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..a24a9bd5305d1de7c1c925466bb7d222c85864a7 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/setuptools/tests/test_unicode_utils.py @@ -0,0 +1,10 @@ +from setuptools import unicode_utils + + +def test_filesys_decode_fs_encoding_is_None(monkeypatch): + """ + Test filesys_decode does not raise TypeError when + getfilesystemencoding returns None. + """ + monkeypatch.setattr('sys.getfilesystemencoding', lambda: None) + unicode_utils.filesys_decode(b'test') diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/setuptools/tests/test_virtualenv.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/setuptools/tests/test_virtualenv.py new file mode 100644 index 0000000000000000000000000000000000000000..b02949baf9cef8eb4df9c697add28f79a93b64d4 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/setuptools/tests/test_virtualenv.py @@ -0,0 +1,113 @@ +import os +import subprocess +import sys +from urllib.error import URLError +from urllib.request import urlopen + +import pytest + + +@pytest.fixture(autouse=True) +def pytest_virtualenv_works(venv): + """ + pytest_virtualenv may not work. if it doesn't, skip these + tests. See #1284. + """ + venv_prefix = venv.run(["python", "-c", "import sys; print(sys.prefix)"]).strip() + if venv_prefix == sys.prefix: + pytest.skip("virtualenv is broken (see pypa/setuptools#1284)") + + +def test_clean_env_install(venv_without_setuptools, setuptools_wheel): + """ + Check setuptools can be installed in a clean environment. + """ + cmd = ["python", "-m", "pip", "install", str(setuptools_wheel)] + venv_without_setuptools.run(cmd) + + +def access_pypi(): + # Detect if tests are being run without connectivity + if not os.environ.get('NETWORK_REQUIRED', False): # pragma: nocover + try: + urlopen('https://pypi.org', timeout=1) + except URLError: + # No network, disable most of these tests + return False + + return True + + +@pytest.mark.skipif( + 'platform.python_implementation() == "PyPy"', + reason="https://github.com/pypa/setuptools/pull/2865#issuecomment-965834995", +) +@pytest.mark.skipif(not access_pypi(), reason="no network") +# ^-- Even when it is not necessary to install a different version of `pip` +# the build process will still try to download `wheel`, see #3147 and #2986. +@pytest.mark.parametrize( + 'pip_version', + [ + None, + pytest.param( + 'pip<20.1', + marks=pytest.mark.xfail( + 'sys.version_info >= (3, 12)', + reason="pip 23.1.2 required for Python 3.12 and later", + ), + ), + pytest.param( + 'pip<21', + marks=pytest.mark.xfail( + 'sys.version_info >= (3, 12)', + reason="pip 23.1.2 required for Python 3.12 and later", + ), + ), + pytest.param( + 'pip<22', + marks=pytest.mark.xfail( + 'sys.version_info >= (3, 12)', + reason="pip 23.1.2 required for Python 3.12 and later", + ), + ), + pytest.param( + 'pip<23', + marks=pytest.mark.xfail( + 'sys.version_info >= (3, 12)', + reason="pip 23.1.2 required for Python 3.12 and later", + ), + ), + pytest.param( + 'https://github.com/pypa/pip/archive/main.zip', + marks=pytest.mark.xfail(reason='#2975'), + ), + ], +) +def test_pip_upgrade_from_source( + pip_version, venv_without_setuptools, setuptools_wheel, setuptools_sdist +): + """ + Check pip can upgrade setuptools from source. + """ + # Install pip/wheel, in a venv without setuptools (as it + # should not be needed for bootstrapping from source) + venv = venv_without_setuptools + venv.run(["pip", "install", "-U", "wheel"]) + if pip_version is not None: + venv.run(["python", "-m", "pip", "install", "-U", pip_version, "--retries=1"]) + with pytest.raises(subprocess.CalledProcessError): + # Meta-test to make sure setuptools is not installed + venv.run(["python", "-c", "import setuptools"]) + + # Then install from wheel. + venv.run(["pip", "install", str(setuptools_wheel)]) + # And finally try to upgrade from source. + venv.run(["pip", "install", "--no-cache-dir", "--upgrade", str(setuptools_sdist)]) + + +def test_no_missing_dependencies(bare_venv, request): + """ + Quick and dirty test to ensure all external dependencies are vendored. + """ + setuptools_dir = request.config.rootdir + bare_venv.run(['python', 'setup.py', '--help'], cwd=setuptools_dir) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/setuptools/tests/test_warnings.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/setuptools/tests/test_warnings.py new file mode 100644 index 0000000000000000000000000000000000000000..41193d4f717344546f8c70ad18b268a04740129b --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/setuptools/tests/test_warnings.py @@ -0,0 +1,106 @@ +from inspect import cleandoc + +import pytest + +from setuptools.warnings import SetuptoolsDeprecationWarning, SetuptoolsWarning + +_EXAMPLES = { + "default": dict( + args=("Hello {x}", "\n\t{target} {v:.1f}"), + kwargs={"x": 5, "v": 3, "target": "World"}, + expected=""" + Hello 5 + !! + + ******************************************************************************** + World 3.0 + ******************************************************************************** + + !! + """, + ), + "futue_due_date": dict( + args=("Summary", "Lorem ipsum"), + kwargs={"due_date": (9999, 11, 22)}, + expected=""" + Summary + !! + + ******************************************************************************** + Lorem ipsum + + By 9999-Nov-22, you need to update your project and remove deprecated calls + or your builds will no longer be supported. + ******************************************************************************** + + !! + """, + ), + "past_due_date_with_docs": dict( + args=("Summary", "Lorem ipsum"), + kwargs={"due_date": (2000, 11, 22), "see_docs": "some_page.html"}, + expected=""" + Summary + !! + + ******************************************************************************** + Lorem ipsum + + This deprecation is overdue, please update your project and remove deprecated + calls to avoid build errors in the future. + + See https://setuptools.pypa.io/en/latest/some_page.html for details. + ******************************************************************************** + + !! + """, + ), +} + + +@pytest.mark.parametrize("example_name", _EXAMPLES.keys()) +def test_formatting(monkeypatch, example_name): + """ + It should automatically handle indentation, interpolation and things like due date. + """ + args = _EXAMPLES[example_name]["args"] + kwargs = _EXAMPLES[example_name]["kwargs"] + expected = _EXAMPLES[example_name]["expected"] + + monkeypatch.setenv("SETUPTOOLS_ENFORCE_DEPRECATION", "false") + with pytest.warns(SetuptoolsWarning) as warn_info: + SetuptoolsWarning.emit(*args, **kwargs) + assert _get_message(warn_info) == cleandoc(expected) + + +def test_due_date_enforcement(monkeypatch): + class _MyDeprecation(SetuptoolsDeprecationWarning): + _SUMMARY = "Summary" + _DETAILS = "Lorem ipsum" + _DUE_DATE = (2000, 11, 22) + _SEE_DOCS = "some_page.html" + + monkeypatch.setenv("SETUPTOOLS_ENFORCE_DEPRECATION", "true") + with pytest.raises(SetuptoolsDeprecationWarning) as exc_info: + _MyDeprecation.emit() + + expected = """ + Summary + !! + + ******************************************************************************** + Lorem ipsum + + This deprecation is overdue, please update your project and remove deprecated + calls to avoid build errors in the future. + + See https://setuptools.pypa.io/en/latest/some_page.html for details. + ******************************************************************************** + + !! + """ + assert str(exc_info.value) == cleandoc(expected) + + +def _get_message(warn_info): + return next(warn.message.args[0] for warn in warn_info) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/setuptools/tests/test_wheel.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/setuptools/tests/test_wheel.py new file mode 100644 index 0000000000000000000000000000000000000000..70165c608b60bc70a908a8b56e1f1105feeb4712 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/setuptools/tests/test_wheel.py @@ -0,0 +1,714 @@ +"""wheel tests""" + +from __future__ import annotations + +import contextlib +import glob +import inspect +import os +import pathlib +import shutil +import stat +import subprocess +import sys +import zipfile +from typing import Any + +import pytest +from jaraco import path +from packaging.tags import parse_tag +from packaging.utils import canonicalize_name + +from pkg_resources import PY_MAJOR, Distribution, PathMetadata +from setuptools.wheel import Wheel + +from .contexts import tempdir +from .textwrap import DALS + +from distutils.sysconfig import get_config_var +from distutils.util import get_platform + +WHEEL_INFO_TESTS = ( + ('invalid.whl', ValueError), + ( + 'simplewheel-2.0-1-py2.py3-none-any.whl', + { + 'project_name': 'simplewheel', + 'version': '2.0', + 'build': '1', + 'py_version': 'py2.py3', + 'abi': 'none', + 'platform': 'any', + }, + ), + ( + 'simple.dist-0.1-py2.py3-none-any.whl', + { + 'project_name': 'simple.dist', + 'version': '0.1', + 'build': None, + 'py_version': 'py2.py3', + 'abi': 'none', + 'platform': 'any', + }, + ), + ( + 'example_pkg_a-1-py3-none-any.whl', + { + 'project_name': 'example_pkg_a', + 'version': '1', + 'build': None, + 'py_version': 'py3', + 'abi': 'none', + 'platform': 'any', + }, + ), + ( + 'PyQt5-5.9-5.9.1-cp35.cp36.cp37-abi3-manylinux1_x86_64.whl', + { + 'project_name': 'PyQt5', + 'version': '5.9', + 'build': '5.9.1', + 'py_version': 'cp35.cp36.cp37', + 'abi': 'abi3', + 'platform': 'manylinux1_x86_64', + }, + ), +) + + +@pytest.mark.parametrize( + ('filename', 'info'), WHEEL_INFO_TESTS, ids=[t[0] for t in WHEEL_INFO_TESTS] +) +def test_wheel_info(filename, info): + if inspect.isclass(info): + with pytest.raises(info): + Wheel(filename) + return + w = Wheel(filename) + assert {k: getattr(w, k) for k in info.keys()} == info + + +@contextlib.contextmanager +def build_wheel(extra_file_defs=None, **kwargs): + file_defs = { + 'setup.py': ( + DALS( + """ + # -*- coding: utf-8 -*- + from setuptools import setup + import setuptools + setup(**%r) + """ + ) + % kwargs + ).encode('utf-8'), + } + if extra_file_defs: + file_defs.update(extra_file_defs) + with tempdir() as source_dir: + path.build(file_defs, source_dir) + subprocess.check_call( + (sys.executable, 'setup.py', '-q', 'bdist_wheel'), cwd=source_dir + ) + yield glob.glob(os.path.join(source_dir, 'dist', '*.whl'))[0] + + +def tree_set(root): + contents = set() + for dirpath, dirnames, filenames in os.walk(root): + for filename in filenames: + contents.add(os.path.join(os.path.relpath(dirpath, root), filename)) + return contents + + +def flatten_tree(tree): + """Flatten nested dicts and lists into a full list of paths""" + output = set() + for node, contents in tree.items(): + if isinstance(contents, dict): + contents = flatten_tree(contents) + + for elem in contents: + if isinstance(elem, dict): + output |= {os.path.join(node, val) for val in flatten_tree(elem)} + else: + output.add(os.path.join(node, elem)) + return output + + +def format_install_tree(tree): + return { + x.format( + py_version=PY_MAJOR, + platform=get_platform(), + shlib_ext=get_config_var('EXT_SUFFIX') or get_config_var('SO'), + ) + for x in tree + } + + +def _check_wheel_install( + filename, install_dir, install_tree_includes, project_name, version, requires_txt +): + w = Wheel(filename) + egg_path = os.path.join(install_dir, w.egg_name()) + w.install_as_egg(egg_path) + if install_tree_includes is not None: + install_tree = format_install_tree(install_tree_includes) + exp = tree_set(install_dir) + assert install_tree.issubset(exp), install_tree - exp + + metadata = PathMetadata(egg_path, os.path.join(egg_path, 'EGG-INFO')) + dist = Distribution.from_filename(egg_path, metadata=metadata) + assert dist.project_name == project_name + assert dist.version == version + if requires_txt is None: + assert not dist.has_metadata('requires.txt') + else: + # Order must match to ensure reproducibility. + assert requires_txt == dist.get_metadata('requires.txt').lstrip() + + +class Record: + def __init__(self, id, **kwargs): + self._id = id + self._fields = kwargs + + def __repr__(self) -> str: + return f'{self._id}(**{self._fields!r})' + + +# Using Any to avoid possible type union issues later in test +# making a TypedDict is not worth in a test and anonymous/inline TypedDict are experimental +# https://github.com/python/mypy/issues/9884 +WHEEL_INSTALL_TESTS: tuple[dict[str, Any], ...] = ( + dict( + id='basic', + file_defs={'foo': {'__init__.py': ''}}, + setup_kwargs=dict( + packages=['foo'], + ), + install_tree=flatten_tree({ + 'foo-1.0-py{py_version}.egg': { + 'EGG-INFO': ['PKG-INFO', 'RECORD', 'WHEEL', 'top_level.txt'], + 'foo': ['__init__.py'], + } + }), + ), + dict( + id='utf-8', + setup_kwargs=dict( + description='Description accentuée', + ), + ), + dict( + id='data', + file_defs={ + 'data.txt': DALS( + """ + Some data... + """ + ), + }, + setup_kwargs=dict( + data_files=[('data_dir', ['data.txt'])], + ), + install_tree=flatten_tree({ + 'foo-1.0-py{py_version}.egg': { + 'EGG-INFO': ['PKG-INFO', 'RECORD', 'WHEEL', 'top_level.txt'], + 'data_dir': ['data.txt'], + } + }), + ), + dict( + id='extension', + file_defs={ + 'extension.c': DALS( + """ + #include "Python.h" + + #if PY_MAJOR_VERSION >= 3 + + static struct PyModuleDef moduledef = { + PyModuleDef_HEAD_INIT, + "extension", + NULL, + 0, + NULL, + NULL, + NULL, + NULL, + NULL + }; + + #define INITERROR return NULL + + PyMODINIT_FUNC PyInit_extension(void) + + #else + + #define INITERROR return + + void initextension(void) + + #endif + { + #if PY_MAJOR_VERSION >= 3 + PyObject *module = PyModule_Create(&moduledef); + #else + PyObject *module = Py_InitModule("extension", NULL); + #endif + if (module == NULL) + INITERROR; + #if PY_MAJOR_VERSION >= 3 + return module; + #endif + } + """ + ), + }, + setup_kwargs=dict( + ext_modules=[ + Record( + 'setuptools.Extension', name='extension', sources=['extension.c'] + ) + ], + ), + install_tree=flatten_tree({ + 'foo-1.0-py{py_version}-{platform}.egg': [ + 'extension{shlib_ext}', + { + 'EGG-INFO': [ + 'PKG-INFO', + 'RECORD', + 'WHEEL', + 'top_level.txt', + ] + }, + ] + }), + ), + dict( + id='header', + file_defs={ + 'header.h': DALS( + """ + """ + ), + }, + setup_kwargs=dict( + headers=['header.h'], + ), + install_tree=flatten_tree({ + 'foo-1.0-py{py_version}.egg': [ + 'header.h', + { + 'EGG-INFO': [ + 'PKG-INFO', + 'RECORD', + 'WHEEL', + 'top_level.txt', + ] + }, + ] + }), + ), + dict( + id='script', + file_defs={ + 'script.py': DALS( + """ + #/usr/bin/python + print('hello world!') + """ + ), + 'script.sh': DALS( + """ + #/bin/sh + echo 'hello world!' + """ + ), + }, + setup_kwargs=dict( + scripts=['script.py', 'script.sh'], + ), + install_tree=flatten_tree({ + 'foo-1.0-py{py_version}.egg': { + 'EGG-INFO': [ + 'PKG-INFO', + 'RECORD', + 'WHEEL', + 'top_level.txt', + {'scripts': ['script.py', 'script.sh']}, + ] + } + }), + ), + dict( + id='requires1', + install_requires='foobar==2.0', + install_tree=flatten_tree({ + 'foo-1.0-py{py_version}.egg': { + 'EGG-INFO': [ + 'PKG-INFO', + 'RECORD', + 'WHEEL', + 'requires.txt', + 'top_level.txt', + ] + } + }), + requires_txt=DALS( + """ + foobar==2.0 + """ + ), + ), + dict( + id='requires2', + install_requires=f""" + bar + foo<=2.0; {sys.platform!r} in sys_platform + """, + requires_txt=DALS( + """ + bar + foo<=2.0 + """ + ), + ), + dict( + id='requires3', + install_requires=f""" + bar; {sys.platform!r} != sys_platform + """, + ), + dict( + id='requires4', + install_requires=""" + foo + """, + extras_require={ + 'extra': 'foobar>3', + }, + requires_txt=DALS( + """ + foo + + [extra] + foobar>3 + """ + ), + ), + dict( + id='requires5', + extras_require={ + 'extra': f'foobar; {sys.platform!r} != sys_platform', + }, + requires_txt=DALS( + """ + [extra] + """ + ), + ), + dict( + id='requires_ensure_order', + install_requires=""" + foo + bar + baz + qux + """, + extras_require={ + 'extra': """ + foobar>3 + barbaz>4 + bazqux>5 + quxzap>6 + """, + }, + requires_txt=DALS( + """ + foo + bar + baz + qux + + [extra] + foobar>3 + barbaz>4 + bazqux>5 + quxzap>6 + """ + ), + ), + dict( + id='namespace_package', + file_defs={ + 'foo': { + 'bar': {'__init__.py': ''}, + }, + }, + setup_kwargs=dict( + namespace_packages=['foo'], + packages=['foo.bar'], + ), + install_tree=flatten_tree({ + 'foo-1.0-py{py_version}.egg': [ + 'foo-1.0-py{py_version}-nspkg.pth', + { + 'EGG-INFO': [ + 'PKG-INFO', + 'RECORD', + 'WHEEL', + 'namespace_packages.txt', + 'top_level.txt', + ] + }, + { + 'foo': [ + '__init__.py', + {'bar': ['__init__.py']}, + ] + }, + ] + }), + ), + dict( + id='empty_namespace_package', + file_defs={ + 'foobar': { + '__init__.py': ( + "__import__('pkg_resources').declare_namespace(__name__)" + ) + }, + }, + setup_kwargs=dict( + namespace_packages=['foobar'], + packages=['foobar'], + ), + install_tree=flatten_tree({ + 'foo-1.0-py{py_version}.egg': [ + 'foo-1.0-py{py_version}-nspkg.pth', + { + 'EGG-INFO': [ + 'PKG-INFO', + 'RECORD', + 'WHEEL', + 'namespace_packages.txt', + 'top_level.txt', + ] + }, + { + 'foobar': [ + '__init__.py', + ] + }, + ] + }), + ), + dict( + id='data_in_package', + file_defs={ + 'foo': { + '__init__.py': '', + 'data_dir': { + 'data.txt': DALS( + """ + Some data... + """ + ), + }, + } + }, + setup_kwargs=dict( + packages=['foo'], + data_files=[('foo/data_dir', ['foo/data_dir/data.txt'])], + ), + install_tree=flatten_tree({ + 'foo-1.0-py{py_version}.egg': { + 'EGG-INFO': [ + 'PKG-INFO', + 'RECORD', + 'WHEEL', + 'top_level.txt', + ], + 'foo': [ + '__init__.py', + { + 'data_dir': [ + 'data.txt', + ] + }, + ], + } + }), + ), +) + + +@pytest.mark.parametrize( + 'params', + WHEEL_INSTALL_TESTS, + ids=[params['id'] for params in WHEEL_INSTALL_TESTS], +) +def test_wheel_install(params): + project_name = params.get('name', 'foo') + version = params.get('version', '1.0') + install_requires = params.get('install_requires', []) + extras_require = params.get('extras_require', {}) + requires_txt = params.get('requires_txt', None) + install_tree = params.get('install_tree') + file_defs = params.get('file_defs', {}) + setup_kwargs = params.get('setup_kwargs', {}) + with ( + build_wheel( + name=project_name, + version=version, + install_requires=install_requires, + extras_require=extras_require, + extra_file_defs=file_defs, + **setup_kwargs, + ) as filename, + tempdir() as install_dir, + ): + _check_wheel_install( + filename, install_dir, install_tree, project_name, version, requires_txt + ) + + +def test_wheel_install_pep_503(): + project_name = 'Foo_Bar' # PEP 503 canonicalized name is "foo-bar" + version = '1.0' + with ( + build_wheel( + name=project_name, + version=version, + ) as filename, + tempdir() as install_dir, + ): + new_filename = filename.replace(project_name, canonicalize_name(project_name)) + shutil.move(filename, new_filename) + _check_wheel_install( + new_filename, + install_dir, + None, + canonicalize_name(project_name), + version, + None, + ) + + +def test_wheel_no_dist_dir(): + project_name = 'nodistinfo' + version = '1.0' + wheel_name = f'{project_name}-{version}-py2.py3-none-any.whl' + with tempdir() as source_dir: + wheel_path = os.path.join(source_dir, wheel_name) + # create an empty zip file + zipfile.ZipFile(wheel_path, 'w').close() + with tempdir() as install_dir: + with pytest.raises(ValueError): + _check_wheel_install( + wheel_path, install_dir, None, project_name, version, None + ) + + +def test_wheel_is_compatible(monkeypatch): + def sys_tags(): + return { + (t.interpreter, t.abi, t.platform) + for t in parse_tag('cp36-cp36m-manylinux1_x86_64') + } + + monkeypatch.setattr('setuptools.wheel._get_supported_tags', sys_tags) + assert Wheel('onnxruntime-0.1.2-cp36-cp36m-manylinux1_x86_64.whl').is_compatible() + + +def test_wheel_mode(): + @contextlib.contextmanager + def build_wheel(extra_file_defs=None, **kwargs): + file_defs = { + 'setup.py': ( + DALS( + """ + # -*- coding: utf-8 -*- + from setuptools import setup + import setuptools + setup(**%r) + """ + ) + % kwargs + ).encode('utf-8'), + } + if extra_file_defs: + file_defs.update(extra_file_defs) + with tempdir() as source_dir: + path.build(file_defs, source_dir) + runsh = pathlib.Path(source_dir) / "script.sh" + os.chmod(runsh, 0o777) + subprocess.check_call( + (sys.executable, 'setup.py', '-q', 'bdist_wheel'), cwd=source_dir + ) + yield glob.glob(os.path.join(source_dir, 'dist', '*.whl'))[0] + + params = dict( + id='script', + file_defs={ + 'script.py': DALS( + """ + #/usr/bin/python + print('hello world!') + """ + ), + 'script.sh': DALS( + """ + #/bin/sh + echo 'hello world!' + """ + ), + }, + setup_kwargs=dict( + scripts=['script.py', 'script.sh'], + ), + install_tree=flatten_tree({ + 'foo-1.0-py{py_version}.egg': { + 'EGG-INFO': [ + 'PKG-INFO', + 'RECORD', + 'WHEEL', + 'top_level.txt', + {'scripts': ['script.py', 'script.sh']}, + ] + } + }), + ) + + project_name = params.get('name', 'foo') + version = params.get('version', '1.0') + install_tree = params.get('install_tree') + file_defs = params.get('file_defs', {}) + setup_kwargs = params.get('setup_kwargs', {}) + + with ( + build_wheel( + name=project_name, + version=version, + install_requires=[], + extras_require={}, + extra_file_defs=file_defs, + **setup_kwargs, + ) as filename, + tempdir() as install_dir, + ): + _check_wheel_install( + filename, install_dir, install_tree, project_name, version, None + ) + w = Wheel(filename) + base = pathlib.Path(install_dir) / w.egg_name() + script_sh = base / "EGG-INFO" / "scripts" / "script.sh" + assert script_sh.exists() + if sys.platform != 'win32': + # Editable file mode has no effect on Windows + assert oct(stat.S_IMODE(script_sh.stat().st_mode)) == "0o777" diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/setuptools/tests/test_windows_wrappers.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/setuptools/tests/test_windows_wrappers.py new file mode 100644 index 0000000000000000000000000000000000000000..f895485387f2f9f673f96f7114629297a381e66c --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/setuptools/tests/test_windows_wrappers.py @@ -0,0 +1,259 @@ +""" +Python Script Wrapper for Windows +================================= + +setuptools includes wrappers for Python scripts that allows them to be +executed like regular windows programs. There are 2 wrappers, one +for command-line programs, cli.exe, and one for graphical programs, +gui.exe. These programs are almost identical, function pretty much +the same way, and are generated from the same source file. The +wrapper programs are used by copying them to the directory containing +the script they are to wrap and with the same name as the script they +are to wrap. +""" + +import pathlib +import platform +import subprocess +import sys +import textwrap + +import pytest + +import pkg_resources +from setuptools.command.easy_install import nt_quote_arg + +pytestmark = pytest.mark.skipif(sys.platform != 'win32', reason="Windows only") + + +class WrapperTester: + @classmethod + def prep_script(cls, template): + python_exe = nt_quote_arg(sys.executable) + return template % locals() + + @classmethod + def create_script(cls, tmpdir): + """ + Create a simple script, foo-script.py + + Note that the script starts with a Unix-style '#!' line saying which + Python executable to run. The wrapper will use this line to find the + correct Python executable. + """ + + script = cls.prep_script(cls.script_tmpl) + + with (tmpdir / cls.script_name).open('w') as f: + f.write(script) + + # also copy cli.exe to the sample directory + with (tmpdir / cls.wrapper_name).open('wb') as f: + w = pkg_resources.resource_string('setuptools', cls.wrapper_source) + f.write(w) + + +def win_launcher_exe(prefix): + """A simple routine to select launcher script based on platform.""" + assert prefix in ('cli', 'gui') + if platform.machine() == "ARM64": + return f"{prefix}-arm64.exe" + else: + return f"{prefix}-32.exe" + + +class TestCLI(WrapperTester): + script_name = 'foo-script.py' + wrapper_name = 'foo.exe' + wrapper_source = win_launcher_exe('cli') + + script_tmpl = textwrap.dedent( + """ + #!%(python_exe)s + import sys + input = repr(sys.stdin.read()) + print(sys.argv[0][-14:]) + print(sys.argv[1:]) + print(input) + if __debug__: + print('non-optimized') + """ + ).lstrip() + + def test_basic(self, tmpdir): + """ + When the copy of cli.exe, foo.exe in this example, runs, it examines + the path name it was run with and computes a Python script path name + by removing the '.exe' suffix and adding the '-script.py' suffix. (For + GUI programs, the suffix '-script.pyw' is added.) This is why we + named out script the way we did. Now we can run out script by running + the wrapper: + + This example was a little pathological in that it exercised windows + (MS C runtime) quoting rules: + + - Strings containing spaces are surrounded by double quotes. + + - Double quotes in strings need to be escaped by preceding them with + back slashes. + + - One or more backslashes preceding double quotes need to be escaped + by preceding each of them with back slashes. + """ + self.create_script(tmpdir) + cmd = [ + str(tmpdir / 'foo.exe'), + 'arg1', + 'arg 2', + 'arg "2\\"', + 'arg 4\\', + 'arg5 a\\\\b', + ] + proc = subprocess.Popen( + cmd, + stdout=subprocess.PIPE, + stdin=subprocess.PIPE, + text=True, + encoding="utf-8", + ) + stdout, _stderr = proc.communicate('hello\nworld\n') + actual = stdout.replace('\r\n', '\n') + expected = textwrap.dedent( + r""" + \foo-script.py + ['arg1', 'arg 2', 'arg "2\\"', 'arg 4\\', 'arg5 a\\\\b'] + 'hello\nworld\n' + non-optimized + """ + ).lstrip() + assert actual == expected + + def test_symlink(self, tmpdir): + """ + Ensure that symlink for the foo.exe is working correctly. + """ + script_dir = tmpdir / "script_dir" + script_dir.mkdir() + self.create_script(script_dir) + symlink = pathlib.Path(tmpdir / "foo.exe") + symlink.symlink_to(script_dir / "foo.exe") + + cmd = [ + str(tmpdir / 'foo.exe'), + 'arg1', + 'arg 2', + 'arg "2\\"', + 'arg 4\\', + 'arg5 a\\\\b', + ] + proc = subprocess.Popen( + cmd, + stdout=subprocess.PIPE, + stdin=subprocess.PIPE, + text=True, + encoding="utf-8", + ) + stdout, _stderr = proc.communicate('hello\nworld\n') + actual = stdout.replace('\r\n', '\n') + expected = textwrap.dedent( + r""" + \foo-script.py + ['arg1', 'arg 2', 'arg "2\\"', 'arg 4\\', 'arg5 a\\\\b'] + 'hello\nworld\n' + non-optimized + """ + ).lstrip() + assert actual == expected + + def test_with_options(self, tmpdir): + """ + Specifying Python Command-line Options + -------------------------------------- + + You can specify a single argument on the '#!' line. This can be used + to specify Python options like -O, to run in optimized mode or -i + to start the interactive interpreter. You can combine multiple + options as usual. For example, to run in optimized mode and + enter the interpreter after running the script, you could use -Oi: + """ + self.create_script(tmpdir) + tmpl = textwrap.dedent( + """ + #!%(python_exe)s -Oi + import sys + input = repr(sys.stdin.read()) + print(sys.argv[0][-14:]) + print(sys.argv[1:]) + print(input) + if __debug__: + print('non-optimized') + sys.ps1 = '---' + """ + ).lstrip() + with (tmpdir / 'foo-script.py').open('w') as f: + f.write(self.prep_script(tmpl)) + cmd = [str(tmpdir / 'foo.exe')] + proc = subprocess.Popen( + cmd, + stdout=subprocess.PIPE, + stdin=subprocess.PIPE, + stderr=subprocess.STDOUT, + text=True, + encoding="utf-8", + ) + stdout, _stderr = proc.communicate() + actual = stdout.replace('\r\n', '\n') + expected = textwrap.dedent( + r""" + \foo-script.py + [] + '' + --- + """ + ).lstrip() + assert actual == expected + + +class TestGUI(WrapperTester): + """ + Testing the GUI Version + ----------------------- + """ + + script_name = 'bar-script.pyw' + wrapper_source = win_launcher_exe('gui') + wrapper_name = 'bar.exe' + + script_tmpl = textwrap.dedent( + """ + #!%(python_exe)s + import sys + f = open(sys.argv[1], 'wb') + bytes_written = f.write(repr(sys.argv[2]).encode('utf-8')) + f.close() + """ + ).strip() + + def test_basic(self, tmpdir): + """Test the GUI version with the simple script, bar-script.py""" + self.create_script(tmpdir) + + cmd = [ + str(tmpdir / 'bar.exe'), + str(tmpdir / 'test_output.txt'), + 'Test Argument', + ] + proc = subprocess.Popen( + cmd, + stdout=subprocess.PIPE, + stdin=subprocess.PIPE, + stderr=subprocess.STDOUT, + text=True, + encoding="utf-8", + ) + stdout, stderr = proc.communicate() + assert not stdout + assert not stderr + with (tmpdir / 'test_output.txt').open('rb') as f_out: + actual = f_out.read().decode('ascii') + assert actual == repr('Test Argument') diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/setuptools/tests/text.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/setuptools/tests/text.py new file mode 100644 index 0000000000000000000000000000000000000000..e05cc633ede9e5ce4f74b66a7bf76327c2000caa --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/setuptools/tests/text.py @@ -0,0 +1,4 @@ +class Filenames: + unicode = 'smörbröd.py' + latin_1 = unicode.encode('latin-1') + utf_8 = unicode.encode('utf-8') diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/setuptools/tests/textwrap.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/setuptools/tests/textwrap.py new file mode 100644 index 0000000000000000000000000000000000000000..5e39618dca4ad6c3f0d4c8cb20af59ab85fb0eba --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/setuptools/tests/textwrap.py @@ -0,0 +1,6 @@ +import textwrap + + +def DALS(s): + "dedent and left-strip" + return textwrap.dedent(s).lstrip() diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/shellingham/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/shellingham/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..40bfccc43ceb8316324c87de2cbb3671623d599a Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/shellingham/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/shellingham/__pycache__/_core.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/shellingham/__pycache__/_core.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..dbb68a4c74d352f7e00516cb071ba3bb2ec834ef Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/shellingham/__pycache__/_core.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/shellingham/__pycache__/nt.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/shellingham/__pycache__/nt.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..8adbee5b6d9a7e2744627f50ed4bfa64533e05ac Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/shellingham/__pycache__/nt.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/shellingham/posix/__init__.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/shellingham/posix/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..5bd2070db27189e62a1867e4de49f16f8c8841ff --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/shellingham/posix/__init__.py @@ -0,0 +1,112 @@ +import os +import re + +from .._core import SHELL_NAMES, ShellDetectionFailure +from . import proc, ps + +# Based on QEMU docs: https://www.qemu.org/docs/master/user/main.html +QEMU_BIN_REGEX = re.compile( + r"""qemu- + (alpha + |armeb + |arm + |m68k + |cris + |i386 + |x86_64 + |microblaze + |mips + |mipsel + |mips64 + |mips64el + |mipsn32 + |mipsn32el + |nios2 + |ppc64 + |ppc + |sh4eb + |sh4 + |sparc + |sparc32plus + |sparc64 + )""", + re.VERBOSE, +) + + +def _iter_process_parents(pid, max_depth=10): + """Select a way to obtain process information from the system. + + * `/proc` is used if supported. + * The system `ps` utility is used as a fallback option. + """ + for impl in (proc, ps): + try: + iterator = impl.iter_process_parents(pid, max_depth) + except EnvironmentError: + continue + return iterator + raise ShellDetectionFailure("compatible proc fs or ps utility is required") + + +def _get_login_shell(proc_cmd): + """Form shell information from SHELL environ if possible.""" + login_shell = os.environ.get("SHELL", "") + if login_shell: + proc_cmd = login_shell + else: + proc_cmd = proc_cmd[1:] + return (os.path.basename(proc_cmd).lower(), proc_cmd) + + +_INTERPRETER_SHELL_NAMES = [ + (re.compile(r"^python(\d+(\.\d+)?)?$"), {"xonsh"}), +] + + +def _get_interpreter_shell(proc_name, proc_args): + """Get shell invoked via an interpreter. + + Some shells are implemented on, and invoked with an interpreter, e.g. xonsh + is commonly executed with an executable Python script. This detects what + script the interpreter is actually running, and check whether that looks + like a shell. + + See sarugaku/shellingham#26 for rational. + """ + for pattern, shell_names in _INTERPRETER_SHELL_NAMES: + if not pattern.match(proc_name): + continue + for arg in proc_args: + name = os.path.basename(arg).lower() + if os.path.isfile(arg) and name in shell_names: + return (name, arg) + return None + + +def _get_shell(cmd, *args): + if cmd.startswith("-"): # Login shell! Let's use this. + return _get_login_shell(cmd) + name = os.path.basename(cmd).lower() + if name == "rosetta" or QEMU_BIN_REGEX.fullmatch(name): + # If the current process is Rosetta or QEMU, this likely is a + # containerized process. Parse out the actual command instead. + cmd = args[0] + args = args[1:] + name = os.path.basename(cmd).lower() + if name in SHELL_NAMES: # Command looks like a shell. + return (name, cmd) + shell = _get_interpreter_shell(name, args) + if shell: + return shell + return None + + +def get_shell(pid=None, max_depth=10): + """Get the shell that the supplied pid or os.getpid() is running in.""" + pid = str(pid or os.getpid()) + for proc_args, _, _ in _iter_process_parents(pid, max_depth): + shell = _get_shell(*proc_args) + if shell: + return shell + return None diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/shellingham/posix/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/shellingham/posix/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c172c0719af537e0992ea31f2f1c52fd7a6a53be Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/shellingham/posix/__pycache__/__init__.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/shellingham/posix/__pycache__/_core.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/shellingham/posix/__pycache__/_core.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..5cb36e2c9ad4dcf58f7de47773f94c17b5f2fd7b Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/shellingham/posix/__pycache__/_core.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/shellingham/posix/__pycache__/proc.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/shellingham/posix/__pycache__/proc.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..203901d689eb75f99d1454722682e91dec7b694b Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/shellingham/posix/__pycache__/proc.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/shellingham/posix/__pycache__/ps.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/shellingham/posix/__pycache__/ps.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f676264dd21a24e32641d6e0abe5eef6d2aabc88 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/shellingham/posix/__pycache__/ps.cpython-312.pyc differ diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/shellingham/posix/_core.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/shellingham/posix/_core.py new file mode 100644 index 0000000000000000000000000000000000000000..adc49e6e7a9d3edf062c55e0078136899f78d30d --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/shellingham/posix/_core.py @@ -0,0 +1,3 @@ +import collections + +Process = collections.namedtuple("Process", "args pid ppid") diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/shellingham/posix/proc.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/shellingham/posix/proc.py new file mode 100644 index 0000000000000000000000000000000000000000..950f63228e5b328f82b70da8851ec60c6a2ff029 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/shellingham/posix/proc.py @@ -0,0 +1,83 @@ +import io +import os +import re +import sys + +from ._core import Process + +# FreeBSD: https://www.freebsd.org/cgi/man.cgi?query=procfs +# NetBSD: https://man.netbsd.org/NetBSD-9.3-STABLE/mount_procfs.8 +# DragonFlyBSD: https://www.dragonflybsd.org/cgi/web-man?command=procfs +BSD_STAT_PPID = 2 + +# See https://docs.kernel.org/filesystems/proc.html +LINUX_STAT_PPID = 3 + +STAT_PATTERN = re.compile(r"\(.+\)|\S+") + + +def detect_proc(): + """Detect /proc filesystem style. + + This checks the /proc/{pid} directory for possible formats. Returns one of + the following as str: + + * `stat`: Linux-style, i.e. ``/proc/{pid}/stat``. + * `status`: BSD-style, i.e. ``/proc/{pid}/status``. + """ + pid = os.getpid() + for name in ("stat", "status"): + if os.path.exists(os.path.join("/proc", str(pid), name)): + return name + raise ProcFormatError("unsupported proc format") + + +def _use_bsd_stat_format(): + try: + return os.uname().sysname.lower() in ("freebsd", "netbsd", "dragonfly") + except Exception: + return False + + +def _get_ppid(pid, name): + path = os.path.join("/proc", str(pid), name) + with io.open(path, encoding="ascii", errors="replace") as f: + parts = STAT_PATTERN.findall(f.read()) + # We only care about TTY and PPID -- both are numbers. + if _use_bsd_stat_format(): + return parts[BSD_STAT_PPID] + return parts[LINUX_STAT_PPID] + + +def _get_cmdline(pid): + path = os.path.join("/proc", str(pid), "cmdline") + encoding = sys.getfilesystemencoding() or "utf-8" + with io.open(path, encoding=encoding, errors="replace") as f: + # XXX: Command line arguments can be arbitrary byte sequences, not + # necessarily decodable. For Shellingham's purpose, however, we don't + # care. (pypa/pipenv#2820) + # cmdline appends an extra NULL at the end, hence the [:-1]. + return tuple(f.read().split("\0")[:-1]) + + +class ProcFormatError(EnvironmentError): + pass + + +def iter_process_parents(pid, max_depth=10): + """Try to look up the process tree via the /proc interface.""" + stat_name = detect_proc() + + # Inner generator function so we correctly throw an error eagerly if proc + # is not supported, rather than on the first call to the iterator. This + # allows the call site detects the correct implementation. + def _iter_process_parents(pid, max_depth): + for _ in range(max_depth): + ppid = _get_ppid(pid, stat_name) + args = _get_cmdline(pid) + yield Process(args=args, pid=pid, ppid=ppid) + if ppid == "0": + break + pid = ppid + + return _iter_process_parents(pid, max_depth) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/shellingham/posix/ps.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/shellingham/posix/ps.py new file mode 100644 index 0000000000000000000000000000000000000000..3bc39a74a56390c263e63bfead028f6bce4df3cb --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/shellingham/posix/ps.py @@ -0,0 +1,51 @@ +import errno +import subprocess +import sys + +from ._core import Process + + +class PsNotAvailable(EnvironmentError): + pass + + +def iter_process_parents(pid, max_depth=10): + """Try to look up the process tree via the output of `ps`.""" + try: + cmd = ["ps", "-ww", "-o", "pid=", "-o", "ppid=", "-o", "args="] + output = subprocess.check_output(cmd) + except OSError as e: # Python 2-compatible FileNotFoundError. + if e.errno != errno.ENOENT: + raise + raise PsNotAvailable("ps not found") + except subprocess.CalledProcessError as e: + # `ps` can return 1 if the process list is completely empty. + # (sarugaku/shellingham#15) + if not e.output.strip(): + return + raise + if not isinstance(output, str): + encoding = sys.getfilesystemencoding() or sys.getdefaultencoding() + output = output.decode(encoding) + + processes_mapping = {} + for line in output.split("\n"): + try: + _pid, ppid, args = line.strip().split(None, 2) + # XXX: This is not right, but we are really out of options. + # ps does not offer a sane way to decode the argument display, + # and this is "Good Enough" for obtaining shell names. Hopefully + # people don't name their shell with a space, or have something + # like "/usr/bin/xonsh is uber". (sarugaku/shellingham#14) + args = tuple(a.strip() for a in args.split(" ")) + except ValueError: + continue + processes_mapping[_pid] = Process(args=args, pid=_pid, ppid=ppid) + + for _ in range(max_depth): + try: + process = processes_mapping[pid] + except KeyError: + return + yield process + pid = process.ppid diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/tabledata/_logger/__pycache__/_logger.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/tabledata/_logger/__pycache__/_logger.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..8d0482997be7a6b05cf24781d43f77385272dfe7 Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/tabledata/_logger/__pycache__/_logger.cpython-312.pyc differ